From 6b3574e68704d58ba41efe0ea4fe928cc166afcd Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Sat, 4 Jul 2015 01:10:52 -0700
Subject: [PATCH 001/549] [SPARK-8270][SQL] levenshtein distance

Jira: https://issues.apache.org/jira/browse/SPARK-8270

Info: I can not build the latest master, it stucks during the build process: `[INFO] Dependency-reduced POM written at: /Users/tarek/test/spark/bagel/dependency-reduced-pom.xml`

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7214 from tarekauel/SPARK-8270 and squashes the following commits:

ab348b9 [Tarek Auel] Merge branch 'master' into SPARK-8270
a2ad318 [Tarek Auel] [SPARK-8270] changed order of fields
d91b12c [Tarek Auel] [SPARK-8270] python fix
adbd075 [Tarek Auel] [SPARK-8270] fixed typo
23185c9 [Tarek Auel] [SPARK-8270] levenshtein distance
---
 python/pyspark/sql/functions.py               | 14 ++++++++
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../expressions/stringOperations.scala        | 32 +++++++++++++++++++
 .../expressions/StringFunctionsSuite.scala    |  9 ++++++
 .../org/apache/spark/sql/functions.scala      | 23 ++++++++++---
 .../spark/sql/DataFrameFunctionsSuite.scala   |  6 ++++
 6 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 69e563ef36e87..49dd0332afe74 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -323,6 +323,20 @@ def explode(col):
     return Column(jc)
 
 
+@ignore_unicode_prefix
+@since(1.5)
+def levenshtein(left, right):
+    """Computes the Levenshtein distance of the two given strings.
+
+    >>> df0 = sqlContext.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])
+    >>> df0.select(levenshtein('l', 'r').alias('d')).collect()
+    [Row(d=3)]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.levenshtein(_to_java_column(left), _to_java_column(right))
+    return Column(jc)
+
+
 @ignore_unicode_prefix
 @since(1.5)
 def md5(col):
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index e249b58927cc4..92a50e7092317 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -163,6 +163,7 @@ object FunctionRegistry {
     expression[Lower]("lcase"),
     expression[Lower]("lower"),
     expression[StringLength]("length"),
+    expression[Levenshtein]("levenshtein"),
     expression[Substring]("substr"),
     expression[Substring]("substring"),
     expression[UnBase64]("unbase64"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 154ac3508c0c5..6de40629ff27e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.util.regex.Pattern
 
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
@@ -299,6 +300,37 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI
   override def prettyName: String = "length"
 }
 
+/**
+ * A function that return the Levenshtein distance between the two given strings.
+ */
+case class Levenshtein(left: Expression, right: Expression) extends BinaryExpression
+    with ExpectsInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+
+  override def dataType: DataType = IntegerType
+
+  override def eval(input: InternalRow): Any = {
+    val leftValue = left.eval(input)
+    if (leftValue == null) {
+      null
+    } else {
+      val rightValue = right.eval(input)
+      if(rightValue == null) {
+        null
+      } else {
+        StringUtils.getLevenshteinDistance(leftValue.toString, rightValue.toString)
+      }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val stringUtils = classOf[StringUtils].getName
+    nullSafeCodeGen(ctx, ev, (res, left, right) =>
+      s"$res = $stringUtils.getLevenshteinDistance($left.toString(), $right.toString());")
+  }
+}
+
 /**
  * Returns the numeric value of the first character of str.
  */
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
index 468df20442d38..1efbe1a245e83 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -274,4 +274,13 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Decode(Literal.create(null, BinaryType), Literal("utf-8")), null)
     checkEvaluation(Decode(b, Literal.create(null, StringType)), null, create_row(null))
   }
+
+  test("Levenshtein distance") {
+    checkEvaluation(Levenshtein(Literal.create(null, StringType), Literal("")), null)
+    checkEvaluation(Levenshtein(Literal(""), Literal.create(null, StringType)), null)
+    checkEvaluation(Levenshtein(Literal(""), Literal("")), 0)
+    checkEvaluation(Levenshtein(Literal("abc"), Literal("abc")), 0)
+    checkEvaluation(Levenshtein(Literal("kitten"), Literal("sitting")), 3)
+    checkEvaluation(Levenshtein(Literal("frog"), Literal("fog")), 1)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index b63c6ee8aba4b..e4109da08e0a4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1580,21 +1580,36 @@ object functions {
   //////////////////////////////////////////////////////////////////////////////////////////////
 
   /**
-   * Computes the length of a given string value
-   *
+   * Computes the length of a given string value.
+   * 
    * @group string_funcs
    * @since 1.5.0
    */
   def strlen(e: Column): Column = StringLength(e.expr)
 
   /**
-   * Computes the length of a given string column
-   *
+   * Computes the length of a given string column.
+   * 
    * @group string_funcs
    * @since 1.5.0
    */
   def strlen(columnName: String): Column = strlen(Column(columnName))
 
+  /**
+   * Computes the Levenshtein distance of the two given strings.
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def levenshtein(l: Column, r: Column): Column = Levenshtein(l.expr, r.expr)
+
+  /**
+   * Computes the Levenshtein distance of the two given strings.
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def levenshtein(leftColumnName: String, rightColumnName: String): Column =
+    levenshtein(Column(leftColumnName), Column(rightColumnName))
+
   /**
    * Computes the numeric value of the first character of the specified string value.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index bd9fa400e5b34..bc455a922d154 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -226,6 +226,12 @@ class DataFrameFunctionsSuite extends QueryTest {
       })
   }
 
+  test("Levenshtein distance") {
+    val df = Seq(("kitten", "sitting"), ("frog", "fog")).toDF("l", "r")
+    checkAnswer(df.select(levenshtein("l", "r")), Seq(Row(3), Row(1)))
+    checkAnswer(df.selectExpr("levenshtein(l, r)"), Seq(Row(3), Row(1)))
+  }
+
   test("string ascii function") {
     val df = Seq(("abc", "")).toDF("a", "b")
     checkAnswer(

From 48f7aed686afde70a6f0802c6cb37b0cad0509f1 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 4 Jul 2015 01:11:35 -0700
Subject: [PATCH 002/549] Fixed minor style issue with the previous merge.

---
 sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index e4109da08e0a4..abcfc0b65020c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1581,7 +1581,7 @@ object functions {
 
   /**
    * Computes the length of a given string value.
-   * 
+   *
    * @group string_funcs
    * @since 1.5.0
    */
@@ -1589,7 +1589,7 @@ object functions {
 
   /**
    * Computes the length of a given string column.
-   * 
+   *
    * @group string_funcs
    * @since 1.5.0
    */

From 347cab85cd924ffd326f3d1367b3b156ee08052d Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 4 Jul 2015 11:55:04 -0700
Subject: [PATCH 003/549] [SQL] More unit tests for implicit type cast & add
 simpleString to AbstractDataType.

Author: Reynold Xin <rxin@databricks.com>

Closes #7221 from rxin/implicit-cast-tests and squashes the following commits:

64b13bd [Reynold Xin] Fixed a bug ..
489b732 [Reynold Xin] [SQL] More unit tests for implicit type cast & add simpleString to AbstractDataType.
---
 .../sql/catalyst/analysis/CheckAnalysis.scala |  6 ++---
 .../spark/sql/types/AbstractDataType.scala    |  7 ++++++
 .../apache/spark/sql/types/ArrayType.scala    |  2 ++
 .../apache/spark/sql/types/DecimalType.scala  |  2 ++
 .../org/apache/spark/sql/types/MapType.scala  |  2 ++
 .../apache/spark/sql/types/StructType.scala   |  2 ++
 .../analysis/HiveTypeCoercionSuite.scala      | 25 ++++++++++++++++++-
 7 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 583338da57117..476ac2b7cb474 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -40,7 +40,7 @@ trait CheckAnalysis {
   def containsMultipleGenerators(exprs: Seq[Expression]): Boolean = {
     exprs.flatMap(_.collect {
       case e: Generator => true
-    }).length >= 1
+    }).nonEmpty
   }
 
   def checkAnalysis(plan: LogicalPlan): Unit = {
@@ -85,12 +85,12 @@ trait CheckAnalysis {
           case Aggregate(groupingExprs, aggregateExprs, child) =>
             def checkValidAggregateExpression(expr: Expression): Unit = expr match {
               case _: AggregateExpression => // OK
-              case e: Attribute if groupingExprs.find(_ semanticEquals e).isEmpty =>
+              case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) =>
                 failAnalysis(
                   s"expression '${e.prettyString}' is neither present in the group by, " +
                     s"nor is it an aggregate function. " +
                     "Add to group by or wrap in first() if you don't care which value you get.")
-              case e if groupingExprs.find(_ semanticEquals e).isDefined => // OK
+              case e if groupingExprs.exists(_.semanticEquals(e)) => // OK
               case e if e.references.isEmpty => // OK
               case e => e.children.foreach(checkValidAggregateExpression)
             }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
index e5dc99fb625d8..ffefb0e7837e9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
@@ -37,6 +37,9 @@ private[sql] abstract class AbstractDataType {
    * Returns true if this data type is a parent of the `childCandidate`.
    */
   private[sql] def isParentOf(childCandidate: DataType): Boolean
+
+  /** Readable string representation for the type. */
+  private[sql] def simpleString: String
 }
 
 
@@ -56,6 +59,10 @@ private[sql] class TypeCollection(private val types: Seq[DataType]) extends Abst
   private[sql] override def defaultConcreteType: DataType = types.head
 
   private[sql] override def isParentOf(childCandidate: DataType): Boolean = false
+
+  private[sql] override def simpleString: String = {
+    types.map(_.simpleString).mkString("(", " or ", ")")
+  }
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
index 8ea6cb14c360e..43413ec761e6b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
@@ -31,6 +31,8 @@ object ArrayType extends AbstractDataType {
   private[sql] override def isParentOf(childCandidate: DataType): Boolean = {
     childCandidate.isInstanceOf[ArrayType]
   }
+
+  private[sql] override def simpleString: String = "array"
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
index 434fc037aad4f..127b16ff85bed 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
@@ -90,6 +90,8 @@ object DecimalType extends AbstractDataType {
     childCandidate.isInstanceOf[DecimalType]
   }
 
+  private[sql] override def simpleString: String = "decimal"
+
   val Unlimited: DecimalType = DecimalType(None)
 
   private[sql] object Fixed {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
index 2b25617ec6655..868dea13d971e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
@@ -75,6 +75,8 @@ object MapType extends AbstractDataType {
     childCandidate.isInstanceOf[MapType]
   }
 
+  private[sql] override def simpleString: String = "map"
+
   /**
    * Construct a [[MapType]] object with the given key type and value type.
    * The `valueContainsNull` is true.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index 7e77b77e73940..3b17566d54d9b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -309,6 +309,8 @@ object StructType extends AbstractDataType {
     childCandidate.isInstanceOf[StructType]
   }
 
+  private[sql] override def simpleString: String = "struct"
+
   def apply(fields: Seq[StructField]): StructType = StructType(fields.toArray)
 
   def apply(fields: java.util.List[StructField]): StructType = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 60e727c6c7d4d..67d05ab536b7f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.types._
 
 class HiveTypeCoercionSuite extends PlanTest {
 
-  test("implicit type cast") {
+  test("eligible implicit type cast") {
     def shouldCast(from: DataType, to: AbstractDataType, expected: DataType): Unit = {
       val got = HiveTypeCoercion.ImplicitTypeCasts.implicitCast(Literal.create(null, from), to)
       assert(got.map(_.dataType) == Option(expected),
@@ -68,6 +68,29 @@ class HiveTypeCoercionSuite extends PlanTest {
     shouldCast(IntegerType, TypeCollection(BinaryType, IntegerType), IntegerType)
     shouldCast(BinaryType, TypeCollection(BinaryType, IntegerType), BinaryType)
     shouldCast(BinaryType, TypeCollection(IntegerType, BinaryType), BinaryType)
+
+    shouldCast(IntegerType, TypeCollection(StringType, BinaryType), StringType)
+    shouldCast(IntegerType, TypeCollection(BinaryType, StringType), StringType)
+  }
+
+  test("ineligible implicit type cast") {
+    def shouldNotCast(from: DataType, to: AbstractDataType): Unit = {
+      val got = HiveTypeCoercion.ImplicitTypeCasts.implicitCast(Literal.create(null, from), to)
+      assert(got.isEmpty, s"Should not be able to cast $from to $to, but got $got")
+    }
+
+    shouldNotCast(IntegerType, DateType)
+    shouldNotCast(IntegerType, TimestampType)
+    shouldNotCast(LongType, DateType)
+    shouldNotCast(LongType, TimestampType)
+    shouldNotCast(DecimalType.Unlimited, DateType)
+    shouldNotCast(DecimalType.Unlimited, TimestampType)
+
+    shouldNotCast(IntegerType, TypeCollection(DateType, TimestampType))
+
+    shouldNotCast(IntegerType, ArrayType)
+    shouldNotCast(IntegerType, MapType)
+    shouldNotCast(IntegerType, StructType)
   }
 
   test("tightest common bound for types") {

From c991ef5abbb501933b2a68eea1987cf8d88794a5 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 4 Jul 2015 11:55:20 -0700
Subject: [PATCH 004/549] [SPARK-8822][SQL] clean up type checking in
 math.scala.

Author: Reynold Xin <rxin@databricks.com>

Closes #7220 from rxin/SPARK-8822 and squashes the following commits:

0cda076 [Reynold Xin] Test cases.
22d0463 [Reynold Xin] Fixed type precedence.
beb2a97 [Reynold Xin] [SPARK-8822][SQL] clean up type checking in math.scala.
---
 .../spark/sql/catalyst/expressions/math.scala | 260 +++++++-----------
 .../expressions/MathFunctionsSuite.scala      |  31 ++-
 2 files changed, 123 insertions(+), 168 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 0fc320fb08876..45b7e4d3405c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -17,10 +17,8 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import java.lang.{Long => JLong}
-import java.util.Arrays
+import java.{lang => jl}
 
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -206,7 +204,7 @@ case class Factorial(child: Expression) extends UnaryExpression with ExpectsInpu
     if (evalE == null) {
       null
     } else {
-      val input = evalE.asInstanceOf[Integer]
+      val input = evalE.asInstanceOf[jl.Integer]
       if (input > 20 || input < 0) {
         null
       } else {
@@ -290,7 +288,7 @@ case class Bin(child: Expression)
     if (evalE == null) {
       null
     } else {
-      UTF8String.fromString(JLong.toBinaryString(evalE.asInstanceOf[Long]))
+      UTF8String.fromString(jl.Long.toBinaryString(evalE.asInstanceOf[Long]))
     }
   }
 
@@ -300,27 +298,18 @@ case class Bin(child: Expression)
   }
 }
 
-
 /**
  * If the argument is an INT or binary, hex returns the number as a STRING in hexadecimal format.
  * Otherwise if the number is a STRING, it converts each character into its hex representation
  * and returns the resulting STRING. Negative numbers would be treated as two's complement.
  */
-case class Hex(child: Expression) extends UnaryExpression with Serializable  {
+case class Hex(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+  // TODO: Create code-gen version.
 
-  override def dataType: DataType = StringType
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(TypeCollection(LongType, StringType, BinaryType))
 
-  override def checkInputDataTypes(): TypeCheckResult = {
-    if (child.dataType.isInstanceOf[StringType]
-      || child.dataType.isInstanceOf[IntegerType]
-      || child.dataType.isInstanceOf[LongType]
-      || child.dataType.isInstanceOf[BinaryType]
-      || child.dataType == NullType) {
-      TypeCheckResult.TypeCheckSuccess
-    } else {
-      TypeCheckResult.TypeCheckFailure(s"hex doesn't accepts ${child.dataType} type")
-    }
-  }
+  override def dataType: DataType = StringType
 
   override def eval(input: InternalRow): Any = {
     val num = child.eval(input)
@@ -329,7 +318,6 @@ case class Hex(child: Expression) extends UnaryExpression with Serializable  {
     } else {
       child.dataType match {
         case LongType => hex(num.asInstanceOf[Long])
-        case IntegerType => hex(num.asInstanceOf[Integer].toLong)
         case BinaryType => hex(num.asInstanceOf[Array[Byte]])
         case StringType => hex(num.asInstanceOf[UTF8String])
       }
@@ -371,7 +359,55 @@ case class Hex(child: Expression) extends UnaryExpression with Serializable  {
         Character.toUpperCase(Character.forDigit((numBuf & 0xF).toInt, 16)).toByte
       numBuf >>>= 4
     } while (numBuf != 0)
-    UTF8String.fromBytes(Arrays.copyOfRange(value, value.length - len, value.length))
+    UTF8String.fromBytes(java.util.Arrays.copyOfRange(value, value.length - len, value.length))
+  }
+}
+
+
+/**
+ * Performs the inverse operation of HEX.
+ * Resulting characters are returned as a byte array.
+ */
+case class UnHex(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+  // TODO: Create code-gen version.
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  override def dataType: DataType = BinaryType
+
+  override def eval(input: InternalRow): Any = {
+    val num = child.eval(input)
+    if (num == null) {
+      null
+    } else {
+      unhex(num.asInstanceOf[UTF8String].getBytes)
+    }
+  }
+
+  private val unhexDigits = {
+    val array = Array.fill[Byte](128)(-1)
+    (0 to 9).foreach(i => array('0' + i) = i.toByte)
+    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
+    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
+    array
+  }
+
+  private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
+    var bytes = inputBytes
+    if ((bytes.length & 0x01) != 0) {
+      bytes = '0'.toByte +: bytes
+    }
+    val out = new Array[Byte](bytes.length >> 1)
+    // two characters form the hex value.
+    var i = 0
+    while (i < bytes.length) {
+      val first = unhexDigits(bytes(i))
+      val second = unhexDigits(bytes(i + 1))
+      if (first == -1 || second == -1) { return null}
+      out(i / 2) = (((first << 4) | second) & 0xFF).toByte
+      i += 2
+    }
+    out
   }
 }
 
@@ -423,22 +459,19 @@ case class Pow(left: Expression, right: Expression)
   }
 }
 
-case class ShiftLeft(left: Expression, right: Expression) extends BinaryExpression {
 
-  override def checkInputDataTypes(): TypeCheckResult = {
-    (left.dataType, right.dataType) match {
-      case (NullType, _) | (_, NullType) => return TypeCheckResult.TypeCheckSuccess
-      case (_, IntegerType) => left.dataType match {
-        case LongType | IntegerType | ShortType | ByteType =>
-          return TypeCheckResult.TypeCheckSuccess
-        case _ => // failed
-      }
-      case _ => // failed
-    }
-    TypeCheckResult.TypeCheckFailure(
-        s"ShiftLeft expects long, integer, short or byte value as first argument and an " +
-          s"integer value as second argument, not (${left.dataType}, ${right.dataType})")
-  }
+/**
+ * Bitwise unsigned left shift.
+ * @param left the base number to shift.
+ * @param right number of bits to left shift.
+ */
+case class ShiftLeft(left: Expression, right: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(TypeCollection(IntegerType, LongType), IntegerType)
+
+  override def dataType: DataType = left.dataType
 
   override def eval(input: InternalRow): Any = {
     val valueLeft = left.eval(input)
@@ -446,10 +479,8 @@ case class ShiftLeft(left: Expression, right: Expression) extends BinaryExpressi
       val valueRight = right.eval(input)
       if (valueRight != null) {
         valueLeft match {
-          case l: Long => l << valueRight.asInstanceOf[Integer]
-          case i: Integer => i << valueRight.asInstanceOf[Integer]
-          case s: Short => s << valueRight.asInstanceOf[Integer]
-          case b: Byte => b << valueRight.asInstanceOf[Integer]
+          case l: jl.Long => l << valueRight.asInstanceOf[jl.Integer]
+          case i: jl.Integer => i << valueRight.asInstanceOf[jl.Integer]
         }
       } else {
         null
@@ -459,35 +490,24 @@ case class ShiftLeft(left: Expression, right: Expression) extends BinaryExpressi
     }
   }
 
-  override def dataType: DataType = {
-    left.dataType match {
-      case LongType => LongType
-      case IntegerType | ShortType | ByteType => IntegerType
-      case _ => NullType
-    }
-  }
-
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     nullSafeCodeGen(ctx, ev, (result, left, right) => s"$result = $left << $right;")
   }
 }
 
-case class ShiftRight(left: Expression, right: Expression) extends BinaryExpression {
 
-  override def checkInputDataTypes(): TypeCheckResult = {
-    (left.dataType, right.dataType) match {
-      case (NullType, _) | (_, NullType) => return TypeCheckResult.TypeCheckSuccess
-      case (_, IntegerType) => left.dataType match {
-        case LongType | IntegerType | ShortType | ByteType =>
-          return TypeCheckResult.TypeCheckSuccess
-        case _ => // failed
-      }
-      case _ => // failed
-    }
-    TypeCheckResult.TypeCheckFailure(
-          s"ShiftRight expects long, integer, short or byte value as first argument and an " +
-            s"integer value as second argument, not (${left.dataType}, ${right.dataType})")
-  }
+/**
+ * Bitwise unsigned left shift.
+ * @param left the base number to shift.
+ * @param right number of bits to left shift.
+ */
+case class ShiftRight(left: Expression, right: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(TypeCollection(IntegerType, LongType), IntegerType)
+
+  override def dataType: DataType = left.dataType
 
   override def eval(input: InternalRow): Any = {
     val valueLeft = left.eval(input)
@@ -495,10 +515,8 @@ case class ShiftRight(left: Expression, right: Expression) extends BinaryExpress
       val valueRight = right.eval(input)
       if (valueRight != null) {
         valueLeft match {
-          case l: Long => l >> valueRight.asInstanceOf[Integer]
-          case i: Integer => i >> valueRight.asInstanceOf[Integer]
-          case s: Short => s >> valueRight.asInstanceOf[Integer]
-          case b: Byte => b >> valueRight.asInstanceOf[Integer]
+          case l: jl.Long => l >> valueRight.asInstanceOf[jl.Integer]
+          case i: jl.Integer => i >> valueRight.asInstanceOf[jl.Integer]
         }
       } else {
         null
@@ -508,35 +526,24 @@ case class ShiftRight(left: Expression, right: Expression) extends BinaryExpress
     }
   }
 
-  override def dataType: DataType = {
-    left.dataType match {
-      case LongType => LongType
-      case IntegerType | ShortType | ByteType => IntegerType
-      case _ => NullType
-    }
-  }
-
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     nullSafeCodeGen(ctx, ev, (result, left, right) => s"$result = $left >> $right;")
   }
 }
 
-case class ShiftRightUnsigned(left: Expression, right: Expression) extends BinaryExpression {
 
-  override def checkInputDataTypes(): TypeCheckResult = {
-    (left.dataType, right.dataType) match {
-      case (NullType, _) | (_, NullType) => return TypeCheckResult.TypeCheckSuccess
-      case (_, IntegerType) => left.dataType match {
-        case LongType | IntegerType | ShortType | ByteType =>
-          return TypeCheckResult.TypeCheckSuccess
-        case _ => // failed
-      }
-      case _ => // failed
-    }
-    TypeCheckResult.TypeCheckFailure(
-      s"ShiftRightUnsigned expects long, integer, short or byte value as first argument and an " +
-        s"integer value as second argument, not (${left.dataType}, ${right.dataType})")
-  }
+/**
+ * Bitwise unsigned right shift, for integer and long data type.
+ * @param left the base number.
+ * @param right the number of bits to right shift.
+ */
+case class ShiftRightUnsigned(left: Expression, right: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(TypeCollection(IntegerType, LongType), IntegerType)
+
+  override def dataType: DataType = left.dataType
 
   override def eval(input: InternalRow): Any = {
     val valueLeft = left.eval(input)
@@ -544,10 +551,8 @@ case class ShiftRightUnsigned(left: Expression, right: Expression) extends Binar
       val valueRight = right.eval(input)
       if (valueRight != null) {
         valueLeft match {
-          case l: Long => l >>> valueRight.asInstanceOf[Integer]
-          case i: Integer => i >>> valueRight.asInstanceOf[Integer]
-          case s: Short => s >>> valueRight.asInstanceOf[Integer]
-          case b: Byte => b >>> valueRight.asInstanceOf[Integer]
+          case l: jl.Long => l >>> valueRight.asInstanceOf[jl.Integer]
+          case i: jl.Integer => i >>> valueRight.asInstanceOf[jl.Integer]
         }
       } else {
         null
@@ -557,74 +562,21 @@ case class ShiftRightUnsigned(left: Expression, right: Expression) extends Binar
     }
   }
 
-  override def dataType: DataType = {
-    left.dataType match {
-      case LongType => LongType
-      case IntegerType | ShortType | ByteType => IntegerType
-      case _ => NullType
-    }
-  }
-
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     nullSafeCodeGen(ctx, ev, (result, left, right) => s"$result = $left >>> $right;")
   }
 }
 
-/**
- * Performs the inverse operation of HEX.
- * Resulting characters are returned as a byte array.
- */
-case class UnHex(child: Expression) extends UnaryExpression with Serializable {
-
-  override def dataType: DataType = BinaryType
-
-  override def checkInputDataTypes(): TypeCheckResult = {
-    if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) {
-      TypeCheckResult.TypeCheckSuccess
-    } else {
-      TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}")
-    }
-  }
-
-  override def eval(input: InternalRow): Any = {
-    val num = child.eval(input)
-    if (num == null) {
-      null
-    } else {
-      unhex(num.asInstanceOf[UTF8String].getBytes)
-    }
-  }
-
-  private val unhexDigits = {
-    val array = Array.fill[Byte](128)(-1)
-    (0 to 9).foreach(i => array('0' + i) = i.toByte)
-    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
-    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
-    array
-  }
-
-  private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
-    var bytes = inputBytes
-    if ((bytes.length & 0x01) != 0) {
-      bytes = '0'.toByte +: bytes
-    }
-    val out = new Array[Byte](bytes.length >> 1)
-    // two characters form the hex value.
-    var i = 0
-    while (i < bytes.length) {
-        val first = unhexDigits(bytes(i))
-        val second = unhexDigits(bytes(i + 1))
-        if (first == -1 || second == -1) { return null}
-        out(i / 2) = (((first << 4) | second) & 0xFF).toByte
-        i += 2
-    }
-    out
-  }
-}
 
 case class Hypot(left: Expression, right: Expression)
   extends BinaryMathExpression(math.hypot, "HYPOT")
 
+
+/**
+ * Computes the logarithm of a number.
+ * @param left the logarithm base, default to e.
+ * @param right the number to compute the logarithm of.
+ */
 case class Logarithm(left: Expression, right: Expression)
   extends BinaryMathExpression((c1, c2) => math.log(c2) / math.log(c1), "LOG") {
 
@@ -642,7 +594,7 @@ case class Logarithm(left: Expression, right: Expression)
       defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.log($c2) / java.lang.Math.log($c1)")
     }
     logCode + s"""
-      if (Double.valueOf(${ev.primitive}).isNaN()) {
+      if (Double.isNaN(${ev.primitive})) {
         ${ev.isNull} = true;
       }
     """
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 20839c83d4fd0..03d8400cf356b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -161,11 +161,10 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("factorial") {
-    val dataLong = (0 to 20)
-    dataLong.foreach { value =>
+    (0 to 20).foreach { value =>
       checkEvaluation(Factorial(Literal(value)), LongMath.factorial(value), EmptyRow)
     }
-    checkEvaluation((Literal.create(null, IntegerType)), null, create_row(null))
+    checkEvaluation(Literal.create(null, IntegerType), null, create_row(null))
     checkEvaluation(Factorial(Literal(20)), 2432902008176640000L, EmptyRow)
     checkEvaluation(Factorial(Literal(21)), null, EmptyRow)
   }
@@ -244,10 +243,8 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       ShiftLeft(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null)
     checkEvaluation(ShiftLeft(Literal(21), Literal(1)), 42)
-    checkEvaluation(ShiftLeft(Literal(21.toByte), Literal(1)), 42)
-    checkEvaluation(ShiftLeft(Literal(21.toShort), Literal(1)), 42)
-    checkEvaluation(ShiftLeft(Literal(21.toLong), Literal(1)), 42.toLong)
 
+    checkEvaluation(ShiftLeft(Literal(21.toLong), Literal(1)), 42.toLong)
     checkEvaluation(ShiftLeft(Literal(-21.toLong), Literal(1)), -42.toLong)
   }
 
@@ -257,10 +254,8 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       ShiftRight(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null)
     checkEvaluation(ShiftRight(Literal(42), Literal(1)), 21)
-    checkEvaluation(ShiftRight(Literal(42.toByte), Literal(1)), 21)
-    checkEvaluation(ShiftRight(Literal(42.toShort), Literal(1)), 21)
-    checkEvaluation(ShiftRight(Literal(42.toLong), Literal(1)), 21.toLong)
 
+    checkEvaluation(ShiftRight(Literal(42.toLong), Literal(1)), 21.toLong)
     checkEvaluation(ShiftRight(Literal(-42.toLong), Literal(1)), -21.toLong)
   }
 
@@ -270,16 +265,12 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       ShiftRight(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null)
     checkEvaluation(ShiftRightUnsigned(Literal(42), Literal(1)), 21)
-    checkEvaluation(ShiftRightUnsigned(Literal(42.toByte), Literal(1)), 21)
-    checkEvaluation(ShiftRightUnsigned(Literal(42.toShort), Literal(1)), 21)
-    checkEvaluation(ShiftRightUnsigned(Literal(42.toLong), Literal(1)), 21.toLong)
 
+    checkEvaluation(ShiftRightUnsigned(Literal(42.toLong), Literal(1)), 21.toLong)
     checkEvaluation(ShiftRightUnsigned(Literal(-42.toLong), Literal(1)), 9223372036854775787L)
   }
 
   test("hex") {
-    checkEvaluation(Hex(Literal(28)), "1C")
-    checkEvaluation(Hex(Literal(-28)), "FFFFFFFFFFFFFFE4")
     checkEvaluation(Hex(Literal(100800200404L)), "177828FED4")
     checkEvaluation(Hex(Literal(-100800200404L)), "FFFFFFE887D7012C")
     checkEvaluation(Hex(Literal("helloHex")), "68656C6C6F486578")
@@ -313,6 +304,8 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       checkEvaluation(Logarithm(Literal(v2), Literal(v1)), f(v2 + 0.0, v1 + 0.0), EmptyRow)
       checkEvaluation(new Logarithm(Literal(v1)), f(math.E, v1 + 0.0), EmptyRow)
     }
+
+    // null input should yield null output
     checkEvaluation(
       Logarithm(Literal.create(null, DoubleType), Literal(1.0)),
       null,
@@ -321,5 +314,15 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       Logarithm(Literal(1.0), Literal.create(null, DoubleType)),
       null,
       create_row(null))
+
+    // negative input should yield null output
+    checkEvaluation(
+      Logarithm(Literal(-1.0), Literal(1.0)),
+      null,
+      create_row(null))
+    checkEvaluation(
+      Logarithm(Literal(1.0), Literal(-1.0)),
+      null,
+      create_row(null))
   }
 }

From 2b820f2a4bf9b154762e7516a5b0485322799da9 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Sat, 4 Jul 2015 22:52:50 -0700
Subject: [PATCH 005/549] [MINOR] [SQL] Minor fix for CatalystSchemaConverter

ping liancheng

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #7224 from viirya/few_fix_catalystschema and squashes the following commits:

d994330 [Liang-Chi Hsieh] Minor fix for CatalystSchemaConverter.
---
 .../main/scala/org/apache/spark/sql/SQLConf.scala    |  2 +-
 .../spark/sql/parquet/CatalystSchemaConverter.scala  | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 2c258b6ee399c..6005d35f015a9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -273,7 +273,7 @@ private[spark] object SQLConf {
   val PARQUET_FOLLOW_PARQUET_FORMAT_SPEC = booleanConf(
     key = "spark.sql.parquet.followParquetFormatSpec",
     defaultValue = Some(false),
-    doc = "Wether to stick to Parquet format specification when converting Parquet schema to " +
+    doc = "Whether to stick to Parquet format specification when converting Parquet schema to " +
       "Spark SQL schema and vice versa.  Sticks to the specification if set to true; falls back " +
       "to compatible mode if set to false.",
     isPublic = false)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index 2be7c64612cd2..4ab274ec17a02 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -142,7 +142,7 @@ private[parquet] class CatalystSchemaConverter(
       DecimalType(precision, scale)
     }
 
-    field.getPrimitiveTypeName match {
+    typeName match {
       case BOOLEAN => BooleanType
 
       case FLOAT => FloatType
@@ -150,7 +150,7 @@ private[parquet] class CatalystSchemaConverter(
       case DOUBLE => DoubleType
 
       case INT32 =>
-        field.getOriginalType match {
+        originalType match {
           case INT_8 => ByteType
           case INT_16 => ShortType
           case INT_32 | null => IntegerType
@@ -161,7 +161,7 @@ private[parquet] class CatalystSchemaConverter(
         }
 
       case INT64 =>
-        field.getOriginalType match {
+        originalType match {
           case INT_64 | null => LongType
           case DECIMAL => makeDecimalType(maxPrecisionForBytes(8))
           case TIMESTAMP_MILLIS => typeNotImplemented()
@@ -176,7 +176,7 @@ private[parquet] class CatalystSchemaConverter(
         TimestampType
 
       case BINARY =>
-        field.getOriginalType match {
+        originalType match {
           case UTF8 | ENUM => StringType
           case null if assumeBinaryIsString => StringType
           case null => BinaryType
@@ -185,7 +185,7 @@ private[parquet] class CatalystSchemaConverter(
         }
 
       case FIXED_LEN_BYTE_ARRAY =>
-        field.getOriginalType match {
+        originalType match {
           case DECIMAL => makeDecimalType(maxPrecisionForBytes(field.getTypeLength))
           case INTERVAL => typeNotImplemented()
           case _ => illegalType()
@@ -261,7 +261,7 @@ private[parquet] class CatalystSchemaConverter(
   // Here we implement Parquet LIST backwards-compatibility rules.
   // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
   // scalastyle:on
-  private def isElementType(repeatedType: Type, parentName: String) = {
+  private def isElementType(repeatedType: Type, parentName: String): Boolean = {
     {
       // For legacy 2-level list types with primitive element type, e.g.:
       //

From f9c448dce8139e85ac564daa0f7e0325e778cffe Mon Sep 17 00:00:00 2001
From: Joshi <rekhajoshm@gmail.com>
Date: Sun, 5 Jul 2015 12:58:03 -0700
Subject: [PATCH 006/549] [SPARK-7137] [ML] Update SchemaUtils checkInputColumn
 to print more info if needed

Author: Joshi <rekhajoshm@gmail.com>
Author: Rekha Joshi <rekhajoshm@gmail.com>

Closes #5992 from rekhajoshm/fix/SPARK-7137 and squashes the following commits:

8c42b57 [Joshi] update checkInputColumn to print more info if needed
33ddd2e [Joshi] update checkInputColumn to print more info if needed
acf3e17 [Joshi] update checkInputColumn to print more info if needed
8993c0e [Joshi] SPARK-7137: Add checkInputColumn back to Params and print more info
e3677c9 [Rekha Joshi] Merge pull request #1 from apache/master
---
 .../scala/org/apache/spark/ml/util/SchemaUtils.scala     | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 7cd53c6d7ef79..76f651488aef9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -32,10 +32,15 @@ private[spark] object SchemaUtils {
    * @param colName  column name
    * @param dataType  required column data type
    */
-  def checkColumnType(schema: StructType, colName: String, dataType: DataType): Unit = {
+  def checkColumnType(
+      schema: StructType,
+      colName: String,
+      dataType: DataType,
+      msg: String = ""): Unit = {
     val actualDataType = schema(colName).dataType
+    val message = if (msg != null && msg.trim.length > 0) " " + msg else ""
     require(actualDataType.equals(dataType),
-      s"Column $colName must be of type $dataType but was actually $actualDataType.")
+      s"Column $colName must be of type $dataType but was actually $actualDataType.$message")
   }
 
   /**

From a0cb111b22cb093e86b0daeecb3dcc41d095df40 Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA <yuu.ishikawa@gmail.com>
Date: Sun, 5 Jul 2015 20:50:02 -0700
Subject: [PATCH 007/549] [SPARK-8549] [SPARKR] Fix the line length of SparkR

[[SPARK-8549] Fix the line length of SparkR - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-8549)

Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com>

Closes #7204 from yu-iskw/SPARK-8549 and squashes the following commits:

6fb131a [Yu ISHIKAWA] Fix the typo
1737598 [Yu ISHIKAWA] [SPARK-8549][SparkR] Fix the line length of SparkR
---
 R/pkg/R/generics.R                 |  3 ++-
 R/pkg/R/pairRDD.R                  | 12 ++++++------
 R/pkg/R/sparkR.R                   |  9 ++++++---
 R/pkg/R/utils.R                    | 31 +++++++++++++++++-------------
 R/pkg/inst/tests/test_includeJAR.R |  4 ++--
 R/pkg/inst/tests/test_rdd.R        | 12 ++++++++----
 R/pkg/inst/tests/test_sparkSQL.R   | 11 +++++++++--
 7 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 79055b7f18558..fad9d71158c51 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -20,7 +20,8 @@
 # @rdname aggregateRDD
 # @seealso reduce
 # @export
-setGeneric("aggregateRDD", function(x, zeroValue, seqOp, combOp) { standardGeneric("aggregateRDD") })
+setGeneric("aggregateRDD",
+           function(x, zeroValue, seqOp, combOp) { standardGeneric("aggregateRDD") })
 
 # @rdname cache-methods
 # @export
diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
index 7f902ba8e683e..0f1179e0aa51a 100644
--- a/R/pkg/R/pairRDD.R
+++ b/R/pkg/R/pairRDD.R
@@ -560,8 +560,8 @@ setMethod("join",
 # Left outer join two RDDs
 #
 # @description
-# \code{leftouterjoin} This function left-outer-joins two RDDs where every element is of the form list(K, V).
-# The key types of the two RDDs should be the same.
+# \code{leftouterjoin} This function left-outer-joins two RDDs where every element is of
+# the form list(K, V). The key types of the two RDDs should be the same.
 #
 # @param x An RDD to be joined. Should be an RDD where each element is
 #             list(K, V).
@@ -597,8 +597,8 @@ setMethod("leftOuterJoin",
 # Right outer join two RDDs
 #
 # @description
-# \code{rightouterjoin} This function right-outer-joins two RDDs where every element is of the form list(K, V).
-# The key types of the two RDDs should be the same.
+# \code{rightouterjoin} This function right-outer-joins two RDDs where every element is of
+# the form list(K, V). The key types of the two RDDs should be the same.
 #
 # @param x An RDD to be joined. Should be an RDD where each element is
 #             list(K, V).
@@ -634,8 +634,8 @@ setMethod("rightOuterJoin",
 # Full outer join two RDDs
 #
 # @description
-# \code{fullouterjoin} This function full-outer-joins two RDDs where every element is of the form list(K, V).
-# The key types of the two RDDs should be the same.
+# \code{fullouterjoin} This function full-outer-joins two RDDs where every element is of
+# the form list(K, V). The key types of the two RDDs should be the same.
 #
 # @param x An RDD to be joined. Should be an RDD where each element is
 #             list(K, V).
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 86233e01db365..048eb8ed541e4 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -105,7 +105,8 @@ sparkR.init <- function(
   sparkPackages = "") {
 
   if (exists(".sparkRjsc", envir = .sparkREnv)) {
-    cat("Re-using existing Spark Context. Please stop SparkR with sparkR.stop() or restart R to create a new Spark Context\n")
+    cat(paste("Re-using existing Spark Context.",
+              "Please stop SparkR with sparkR.stop() or restart R to create a new Spark Context\n"))
     return(get(".sparkRjsc", envir = .sparkREnv))
   }
 
@@ -180,14 +181,16 @@ sparkR.init <- function(
 
   sparkExecutorEnvMap <- new.env()
   if (!any(names(sparkExecutorEnv) == "LD_LIBRARY_PATH")) {
-    sparkExecutorEnvMap[["LD_LIBRARY_PATH"]] <- paste0("$LD_LIBRARY_PATH:",Sys.getenv("LD_LIBRARY_PATH"))
+    sparkExecutorEnvMap[["LD_LIBRARY_PATH"]] <-
+      paste0("$LD_LIBRARY_PATH:",Sys.getenv("LD_LIBRARY_PATH"))
   }
   for (varname in names(sparkExecutorEnv)) {
     sparkExecutorEnvMap[[varname]] <- sparkExecutorEnv[[varname]]
   }
 
   nonEmptyJars <- Filter(function(x) { x != "" }, jars)
-  localJarPaths <- sapply(nonEmptyJars, function(j) { utils::URLencode(paste("file:", uriSep, j, sep = "")) })
+  localJarPaths <- sapply(nonEmptyJars,
+                          function(j) { utils::URLencode(paste("file:", uriSep, j, sep = "")) })
 
   # Set the start time to identify jobjs
   # Seconds resolution is good enough for this purpose, so use ints
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 13cec0f712fb4..ea629a64f7158 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -334,18 +334,21 @@ getStorageLevel <- function(newLevel = c("DISK_ONLY",
                                          "MEMORY_ONLY_SER_2",
                                          "OFF_HEAP")) {
   match.arg(newLevel)
+  storageLevelClass <- "org.apache.spark.storage.StorageLevel"
   storageLevel <- switch(newLevel,
-                         "DISK_ONLY" = callJStatic("org.apache.spark.storage.StorageLevel", "DISK_ONLY"),
-                         "DISK_ONLY_2" = callJStatic("org.apache.spark.storage.StorageLevel", "DISK_ONLY_2"),
-                         "MEMORY_AND_DISK" = callJStatic("org.apache.spark.storage.StorageLevel", "MEMORY_AND_DISK"),
-                         "MEMORY_AND_DISK_2" = callJStatic("org.apache.spark.storage.StorageLevel", "MEMORY_AND_DISK_2"),
-                         "MEMORY_AND_DISK_SER" = callJStatic("org.apache.spark.storage.StorageLevel", "MEMORY_AND_DISK_SER"),
-                         "MEMORY_AND_DISK_SER_2" = callJStatic("org.apache.spark.storage.StorageLevel", "MEMORY_AND_DISK_SER_2"),
-                         "MEMORY_ONLY" = callJStatic("org.apache.spark.storage.StorageLevel", "MEMORY_ONLY"),
-                         "MEMORY_ONLY_2" = callJStatic("org.apache.spark.storage.StorageLevel", "MEMORY_ONLY_2"),
-                         "MEMORY_ONLY_SER" = callJStatic("org.apache.spark.storage.StorageLevel", "MEMORY_ONLY_SER"),
-                         "MEMORY_ONLY_SER_2" = callJStatic("org.apache.spark.storage.StorageLevel", "MEMORY_ONLY_SER_2"),
-                         "OFF_HEAP" = callJStatic("org.apache.spark.storage.StorageLevel", "OFF_HEAP"))
+                         "DISK_ONLY" = callJStatic(storageLevelClass, "DISK_ONLY"),
+                         "DISK_ONLY_2" = callJStatic(storageLevelClass, "DISK_ONLY_2"),
+                         "MEMORY_AND_DISK" = callJStatic(storageLevelClass, "MEMORY_AND_DISK"),
+                         "MEMORY_AND_DISK_2" = callJStatic(storageLevelClass, "MEMORY_AND_DISK_2"),
+                         "MEMORY_AND_DISK_SER" = callJStatic(storageLevelClass,
+                                                             "MEMORY_AND_DISK_SER"),
+                         "MEMORY_AND_DISK_SER_2" = callJStatic(storageLevelClass,
+                                                               "MEMORY_AND_DISK_SER_2"),
+                         "MEMORY_ONLY" = callJStatic(storageLevelClass, "MEMORY_ONLY"),
+                         "MEMORY_ONLY_2" = callJStatic(storageLevelClass, "MEMORY_ONLY_2"),
+                         "MEMORY_ONLY_SER" = callJStatic(storageLevelClass, "MEMORY_ONLY_SER"),
+                         "MEMORY_ONLY_SER_2" = callJStatic(storageLevelClass, "MEMORY_ONLY_SER_2"),
+                         "OFF_HEAP" = callJStatic(storageLevelClass, "OFF_HEAP"))
 }
 
 # Utility function for functions where an argument needs to be integer but we want to allow
@@ -545,9 +548,11 @@ mergePartitions <- function(rdd, zip) {
         lengthOfKeys <- part[[len - lengthOfValues]]
         stopifnot(len == lengthOfKeys + lengthOfValues)
 
-        # For zip operation, check if corresponding partitions of both RDDs have the same number of elements.
+        # For zip operation, check if corresponding partitions
+        # of both RDDs have the same number of elements.
         if (zip && lengthOfKeys != lengthOfValues) {
-          stop("Can only zip RDDs with same number of elements in each pair of corresponding partitions.")
+          stop(paste("Can only zip RDDs with same number of elements",
+                     "in each pair of corresponding partitions."))
         }
 
         if (lengthOfKeys > 1) {
diff --git a/R/pkg/inst/tests/test_includeJAR.R b/R/pkg/inst/tests/test_includeJAR.R
index 844d86f3cc97f..cc1faeabffe30 100644
--- a/R/pkg/inst/tests/test_includeJAR.R
+++ b/R/pkg/inst/tests/test_includeJAR.R
@@ -18,8 +18,8 @@ context("include an external JAR in SparkContext")
 
 runScript <- function() {
   sparkHome <- Sys.getenv("SPARK_HOME")
-  jarPath <- paste("--jars",
-                   shQuote(file.path(sparkHome, "R/lib/SparkR/test_support/sparktestjar_2.10-1.0.jar")))
+  sparkTestJarPath <- "R/lib/SparkR/test_support/sparktestjar_2.10-1.0.jar"
+  jarPath <- paste("--jars", shQuote(file.path(sparkHome, sparkTestJarPath)))
   scriptPath <- file.path(sparkHome, "R/lib/SparkR/tests/jarTest.R")
   submitPath <- file.path(sparkHome, "bin/spark-submit")
   res <- system2(command = submitPath,
diff --git a/R/pkg/inst/tests/test_rdd.R b/R/pkg/inst/tests/test_rdd.R
index fc3c01d837de4..b79692873cec3 100644
--- a/R/pkg/inst/tests/test_rdd.R
+++ b/R/pkg/inst/tests/test_rdd.R
@@ -669,13 +669,15 @@ test_that("fullOuterJoin() on pairwise RDDs", {
   rdd1 <- parallelize(sc, list(list(1,2), list(1,3), list(3,3)))
   rdd2 <- parallelize(sc, list(list(1,1), list(2,4)))
   actual <- collect(fullOuterJoin(rdd1, rdd2, 2L))
-  expected <- list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)), list(3, list(3, NULL)))
+  expected <- list(list(1, list(2, 1)), list(1, list(3, 1)),
+                   list(2, list(NULL, 4)), list(3, list(3, NULL)))
   expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
 
   rdd1 <- parallelize(sc, list(list("a",2), list("a",3), list("c", 1)))
   rdd2 <- parallelize(sc, list(list("a",1), list("b",4)))
   actual <- collect(fullOuterJoin(rdd1, rdd2, 2L))
-  expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)), list("a", list(3, 1)), list("c", list(1, NULL)))
+  expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)),
+                   list("a", list(3, 1)), list("c", list(1, NULL)))
   expect_equal(sortKeyValueList(actual),
                sortKeyValueList(expected))
 
@@ -683,13 +685,15 @@ test_that("fullOuterJoin() on pairwise RDDs", {
   rdd2 <- parallelize(sc, list(list(3,3), list(4,4)))
   actual <- collect(fullOuterJoin(rdd1, rdd2, 2L))
   expect_equal(sortKeyValueList(actual),
-               sortKeyValueList(list(list(1, list(1, NULL)), list(2, list(2, NULL)), list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
+               sortKeyValueList(list(list(1, list(1, NULL)), list(2, list(2, NULL)),
+                                     list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
 
   rdd1 <- parallelize(sc, list(list("a",1), list("b",2)))
   rdd2 <- parallelize(sc, list(list("c",3), list("d",4)))
   actual <- collect(fullOuterJoin(rdd1, rdd2, 2L))
   expect_equal(sortKeyValueList(actual),
-               sortKeyValueList(list(list("a", list(1, NULL)), list("b", list(2, NULL)), list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
+               sortKeyValueList(list(list("a", list(1, NULL)), list("b", list(2, NULL)),
+                                     list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
 })
 
 test_that("sortByKey() on pairwise RDDs", {
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 0e4235ea8b4b3..b0ea38854304e 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -391,7 +391,7 @@ test_that("collect() and take() on a DataFrame return the same number of rows an
   expect_equal(ncol(collect(df)), ncol(take(df, 10)))
 })
 
-test_that("multiple pipeline transformations starting with a DataFrame result in an RDD with the correct values", {
+test_that("multiple pipeline transformations result in an RDD with the correct values", {
   df <- jsonFile(sqlContext, jsonPath)
   first <- lapply(df, function(row) {
     row$age <- row$age + 5
@@ -756,7 +756,14 @@ test_that("toJSON() returns an RDD of the correct values", {
 test_that("showDF()", {
   df <- jsonFile(sqlContext, jsonPath)
   s <- capture.output(showDF(df))
-  expect_output(s , "+----+-------+\n| age|   name|\n+----+-------+\n|null|Michael|\n|  30|   Andy|\n|  19| Justin|\n+----+-------+\n")
+  expected <- paste("+----+-------+\n",
+                    "| age|   name|\n",
+                    "+----+-------+\n",
+                    "|null|Michael|\n",
+                    "|  30|   Andy|\n",
+                    "|  19| Justin|\n",
+                    "+----+-------+\n", sep="")
+  expect_output(s , expected)
 })
 
 test_that("isLocal()", {

From 6d0411b4f3a202cfb53f638ee5fd49072b42d3a6 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Sun, 5 Jul 2015 21:50:52 -0700
Subject: [PATCH 008/549] [SQL][Minor] Update the DataFrame API for
 encode/decode

This is a the follow up of #6843.

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7230 from chenghao-intel/str_funcs2_followup and squashes the following commits:

52cc553 [Cheng Hao] update the code as comment
---
 .../expressions/stringOperations.scala        | 21 ++++++++++---------
 .../org/apache/spark/sql/functions.scala      | 14 +++++++------
 .../spark/sql/DataFrameFunctionsSuite.scala   |  8 +++++--
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 6de40629ff27e..1a14a7a449342 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -392,12 +392,13 @@ case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInput
 /**
  * Decodes the first argument into a String using the provided character set
  * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
- * If either argument is null, the result will also be null. (As of Hive 0.12.0.).
+ * If either argument is null, the result will also be null.
  */
-case class Decode(bin: Expression, charset: Expression) extends Expression with ExpectsInputTypes {
-  override def children: Seq[Expression] = bin :: charset :: Nil
-  override def foldable: Boolean = bin.foldable && charset.foldable
-  override def nullable: Boolean = bin.nullable || charset.nullable
+case class Decode(bin: Expression, charset: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def left: Expression = bin
+  override def right: Expression = charset
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(BinaryType, StringType)
 
@@ -420,13 +421,13 @@ case class Decode(bin: Expression, charset: Expression) extends Expression with
 /**
  * Encodes the first argument into a BINARY using the provided character set
  * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
- * If either argument is null, the result will also be null. (As of Hive 0.12.0.)
+ * If either argument is null, the result will also be null.
 */
 case class Encode(value: Expression, charset: Expression)
-  extends Expression with ExpectsInputTypes {
-  override def children: Seq[Expression] = value :: charset :: Nil
-  override def foldable: Boolean = value.foldable && charset.foldable
-  override def nullable: Boolean = value.nullable || charset.nullable
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def left: Expression = value
+  override def right: Expression = charset
   override def dataType: DataType = BinaryType
   override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index abcfc0b65020c..f80291776f335 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1666,18 +1666,19 @@ object functions {
    * @group string_funcs
    * @since 1.5.0
    */
-  def encode(value: Column, charset: Column): Column = Encode(value.expr, charset.expr)
+  def encode(value: Column, charset: String): Column = Encode(value.expr, lit(charset).expr)
 
   /**
    * Computes the first argument into a binary from a string using the provided character set
    * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
    * If either argument is null, the result will also be null.
+   * NOTE: charset represents the string value of the character set, not the column name.
    *
    * @group string_funcs
    * @since 1.5.0
    */
-  def encode(columnName: String, charsetColumnName: String): Column =
-    encode(Column(columnName), Column(charsetColumnName))
+  def encode(columnName: String, charset: String): Column =
+    encode(Column(columnName), charset)
 
   /**
    * Computes the first argument into a string from a binary using the provided character set
@@ -1687,18 +1688,19 @@ object functions {
    * @group string_funcs
    * @since 1.5.0
    */
-  def decode(value: Column, charset: Column): Column = Decode(value.expr, charset.expr)
+  def decode(value: Column, charset: String): Column = Decode(value.expr, lit(charset).expr)
 
   /**
    * Computes the first argument into a string from a binary using the provided character set
    * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
    * If either argument is null, the result will also be null.
+   * NOTE: charset represents the string value of the character set, not the column name.
    *
    * @group string_funcs
    * @since 1.5.0
    */
-  def decode(columnName: String, charsetColumnName: String): Column =
-    decode(Column(columnName), Column(charsetColumnName))
+  def decode(columnName: String, charset: String): Column =
+    decode(Column(columnName), charset)
 
 
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index bc455a922d154..afba28515e032 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -261,11 +261,15 @@ class DataFrameFunctionsSuite extends QueryTest {
     // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("大千世界", "utf-8", bytes)).toDF("a", "b", "c")
     checkAnswer(
-      df.select(encode($"a", $"b"), encode("a", "b"), decode($"c", $"b"), decode("c", "b")),
+      df.select(
+        encode($"a", "utf-8"),
+        encode("a", "utf-8"),
+        decode($"c", "utf-8"),
+        decode("c", "utf-8")),
       Row(bytes, bytes, "大千世界", "大千世界"))
 
     checkAnswer(
-      df.selectExpr("encode(a, b)", "decode(c, b)"),
+      df.selectExpr("encode(a, 'utf-8')", "decode(c, 'utf-8')"),
       Row(bytes, "大千世界"))
     // scalastyle:on
   }

From 86768b7b3b0c2964e744bc491bc20a1d3140ce93 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 5 Jul 2015 23:54:25 -0700
Subject: [PATCH 009/549] [SPARK-8831][SQL] Support AbstractDataType in
 TypeCollection.

Otherwise it is impossible to declare an expression supporting DecimalType.

Author: Reynold Xin <rxin@databricks.com>

Closes #7232 from rxin/typecollection-adt and squashes the following commits:

934d3d1 [Reynold Xin] [SPARK-8831][SQL] Support AbstractDataType in TypeCollection.
---
 .../spark/sql/catalyst/analysis/HiveTypeCoercion.scala |  2 --
 .../org/apache/spark/sql/types/AbstractDataType.scala  | 10 ++++++----
 .../sql/catalyst/analysis/HiveTypeCoercionSuite.scala  |  6 ++++++
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 84acc0e7e90ec..5367b7f3308ee 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -708,8 +708,6 @@ object HiveTypeCoercion {
         case (NullType, target) => Cast(e, target.defaultConcreteType)
 
         // Implicit cast among numeric types
-        // If input is decimal, and we expect a decimal type, just use the input.
-        case (_: DecimalType, DecimalType) => e
         // If input is a numeric type but not decimal, and we expect a decimal type,
         // cast the input to unlimited precision decimal.
         case (_: NumericType, DecimalType) if !inType.isInstanceOf[DecimalType] =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
index ffefb0e7837e9..fb1b47e946214 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
@@ -53,10 +53,12 @@ private[sql] abstract class AbstractDataType {
  *
  * This means that we prefer StringType over BinaryType if it is possible to cast to StringType.
  */
-private[sql] class TypeCollection(private val types: Seq[DataType]) extends AbstractDataType {
+private[sql] class TypeCollection(private val types: Seq[AbstractDataType])
+  extends AbstractDataType {
+
   require(types.nonEmpty, s"TypeCollection ($types) cannot be empty")
 
-  private[sql] override def defaultConcreteType: DataType = types.head
+  private[sql] override def defaultConcreteType: DataType = types.head.defaultConcreteType
 
   private[sql] override def isParentOf(childCandidate: DataType): Boolean = false
 
@@ -68,9 +70,9 @@ private[sql] class TypeCollection(private val types: Seq[DataType]) extends Abst
 
 private[sql] object TypeCollection {
 
-  def apply(types: DataType*): TypeCollection = new TypeCollection(types)
+  def apply(types: AbstractDataType*): TypeCollection = new TypeCollection(types)
 
-  def unapply(typ: AbstractDataType): Option[Seq[DataType]] = typ match {
+  def unapply(typ: AbstractDataType): Option[Seq[AbstractDataType]] = typ match {
     case typ: TypeCollection => Some(typ.types)
     case _ => None
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 67d05ab536b7f..b56426617789e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -71,6 +71,12 @@ class HiveTypeCoercionSuite extends PlanTest {
 
     shouldCast(IntegerType, TypeCollection(StringType, BinaryType), StringType)
     shouldCast(IntegerType, TypeCollection(BinaryType, StringType), StringType)
+
+    shouldCast(
+      DecimalType.Unlimited, TypeCollection(IntegerType, DecimalType), DecimalType.Unlimited)
+    shouldCast(DecimalType(10, 2), TypeCollection(IntegerType, DecimalType), DecimalType(10, 2))
+    shouldCast(DecimalType(10, 2), TypeCollection(DecimalType, IntegerType), DecimalType(10, 2))
+    shouldCast(IntegerType, TypeCollection(DecimalType(10, 2), StringType), DecimalType(10, 2))
   }
 
   test("ineligible implicit type cast") {

From 39e4e7e4d89077a637c4cad3a986e0e3447d1ae7 Mon Sep 17 00:00:00 2001
From: Steve Lindemann <steve.lindemann@engineersgatelp.com>
Date: Mon, 6 Jul 2015 10:17:05 -0700
Subject: [PATCH 010/549] [SPARK-8841] [SQL] Fix partition pruning percentage
 log message

When pruning partitions for a query plan, a message is logged indicating what how many partitions were selected based on predicate criteria, and what percent were pruned.

The current release erroneously uses `1 - total/selected` to compute this quantity, leading to nonsense messages like "pruned -1000% partitions". The fix is simple and obvious.

Author: Steve Lindemann <steve.lindemann@engineersgatelp.com>

Closes #7227 from srlindemann/master and squashes the following commits:

c788061 [Steve Lindemann] fix percentPruned log message
---
 .../scala/org/apache/spark/sql/sources/DataSourceStrategy.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index ce16e050c56ed..66f7ba90140b5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -65,7 +65,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       logInfo {
         val total = t.partitionSpec.partitions.length
         val selected = selectedPartitions.length
-        val percentPruned = (1 - total.toDouble / selected.toDouble) * 100
+        val percentPruned = (1 - selected.toDouble / total.toDouble) * 100
         s"Selected $selected partitions out of $total, pruned $percentPruned% partitions."
       }
 

From 293225e0cd9318ad368dde30ac6a17725d33ebb6 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Mon, 6 Jul 2015 10:36:02 -0700
Subject: [PATCH 011/549] [SPARK-8124] [SPARKR] Created more examples on SparkR
 DataFrames

Here are more examples on SparkR DataFrames including creating a Spark Contect and a SQL
context, loading data and simple data manipulation.

Author: Daniel Emaasit (PhD Student) <daniel.emaasit@gmail.com>

Closes #6668 from Emaasit/dan-dev and squashes the following commits:

3a97867 [Daniel Emaasit (PhD Student)] Used fewer rows for createDataFrame
f7227f9 [Daniel Emaasit (PhD Student)] Using command line arguments
a550f70 [Daniel Emaasit (PhD Student)] Used base R functions
33f9882 [Daniel Emaasit (PhD Student)] Renamed file
b6603e3 [Daniel Emaasit (PhD Student)] changed "Describe" function to "describe"
90565dd [Daniel Emaasit (PhD Student)] Deleted the getting-started file
b95a103 [Daniel Emaasit (PhD Student)] Deleted this file
cc55cd8 [Daniel Emaasit (PhD Student)] combined all the code into one .R file
c6933af [Daniel Emaasit (PhD Student)] changed variable name to SQLContext
8e0fe14 [Daniel Emaasit (PhD Student)] provided two options for creating DataFrames
2653573 [Daniel Emaasit (PhD Student)] Updates to a comment and variable name
275b787 [Daniel Emaasit (PhD Student)] Added the Apache License at the top of the file
2e8f724 [Daniel Emaasit (PhD Student)] Added the Apache License at the top of the file
486f44e [Daniel Emaasit (PhD Student)] Added the Apache License at the file
d705112 [Daniel Emaasit (PhD Student)] Created more examples on SparkR DataFrames
---
 examples/src/main/r/data-manipulation.R | 107 ++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 examples/src/main/r/data-manipulation.R

diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
new file mode 100644
index 0000000000000..aa2336e300a91
--- /dev/null
+++ b/examples/src/main/r/data-manipulation.R
@@ -0,0 +1,107 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# For this example, we shall use the "flights" dataset
+# The dataset consists of every flight departing Houston in 2011.
+# The data set is made up of 227,496 rows x 14 columns. 
+
+# To run this example use
+# ./bin/sparkR --packages com.databricks:spark-csv_2.10:1.0.3
+#     examples/src/main/r/data-manipulation.R <path_to_csv>
+
+# Load SparkR library into your R session
+library(SparkR)
+
+args <- commandArgs(trailing = TRUE)
+
+if (length(args) != 1) {
+  print("Usage: data-manipulation.R <path-to-flights.csv")
+  print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv ")
+  q("no")
+}
+
+## Initialize SparkContext
+sc <- sparkR.init(appName = "SparkR-data-manipulation-example")
+
+## Initialize SQLContext
+sqlContext <- sparkRSQL.init(sc)
+
+flightsCsvPath <- args[[1]]
+
+# Create a local R dataframe
+flights_df <- read.csv(flightsCsvPath, header = TRUE)
+flights_df$date <- as.Date(flights_df$date)
+
+## Filter flights whose destination is San Francisco and write to a local data frame
+SFO_df <- flights_df[flights_df$dest == "SFO", ] 
+
+# Convert the local data frame into a SparkR DataFrame
+SFO_DF <- createDataFrame(sqlContext, SFO_df)
+
+#  Directly create a SparkR DataFrame from the source data
+flightsDF <- read.df(sqlContext, flightsCsvPath, source = "com.databricks.spark.csv", header = "true")
+
+# Print the schema of this Spark DataFrame
+printSchema(flightsDF)
+
+# Cache the DataFrame
+cache(flightsDF)
+
+# Print the first 6 rows of the DataFrame
+showDF(flightsDF, numRows = 6) ## Or
+head(flightsDF)
+
+# Show the column names in the DataFrame
+columns(flightsDF)
+
+# Show the number of rows in the DataFrame
+count(flightsDF)
+
+# Select specific columns
+destDF <- select(flightsDF, "dest", "cancelled")
+
+# Using SQL to select columns of data
+# First, register the flights DataFrame as a table
+registerTempTable(flightsDF, "flightsTable")
+destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable")
+
+# Use collect to create a local R data frame
+local_df <- collect(destDF)
+
+# Print the newly created local data frame
+head(local_df)
+
+# Filter flights whose destination is JFK
+jfkDF <- filter(flightsDF, "dest = \"JFK\"") ##OR
+jfkDF <- filter(flightsDF, flightsDF$dest == "JFK")
+
+# If the magrittr library is available, we can use it to
+# chain data frame operations
+if("magrittr" %in% rownames(installed.packages())) {
+  library(magrittr)
+
+  # Group the flights by date and then find the average daily delay
+  # Write the result into a DataFrame
+  groupBy(flightsDF, flightsDF$date) %>%
+    summarize(avg(flightsDF$dep_delay), avg(flightsDF$arr_delay)) -> dailyDelayDF
+
+  # Print the computed data frame
+  head(dailyDelayDF)
+}
+
+# Stop the SparkContext now
+sparkR.stop()

From 0e194645f42be0d6ac9b5a712f8fc1798418736d Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 6 Jul 2015 13:26:46 -0700
Subject: [PATCH 012/549] [SPARK-8837][SPARK-7114][SQL] support using keyword
 in column name

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7237 from cloud-fan/parser and squashes the following commits:

e7b49bb [Wenchen Fan] support using keyword in column name
---
 .../apache/spark/sql/catalyst/SqlParser.scala | 28 ++++++++++++-------
 .../org/apache/spark/sql/SQLQuerySuite.scala  |  9 ++++++
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index 8d02fbf4f92c4..e8e9b9802e94b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -287,15 +287,18 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
           throw new AnalysisException(s"invalid function approximate($floatLit) $udfName")
         }
       }
-    | CASE ~> expression.? ~ rep1(WHEN ~> expression ~ (THEN ~> expression)) ~
-        (ELSE ~> expression).? <~ END ^^ {
-          case casePart ~ altPart ~ elsePart =>
-            val branches = altPart.flatMap { case whenExpr ~ thenExpr =>
-              Seq(whenExpr, thenExpr)
-            } ++ elsePart
-            casePart.map(CaseKeyWhen(_, branches)).getOrElse(CaseWhen(branches))
-        }
-      )
+    | CASE ~> whenThenElse ^^ CaseWhen
+    | CASE ~> expression ~ whenThenElse ^^
+      { case keyPart ~ branches => CaseKeyWhen(keyPart, branches) }
+    )
+
+  protected lazy val whenThenElse: Parser[List[Expression]] =
+    rep1(WHEN ~> expression ~ (THEN ~> expression)) ~ (ELSE ~> expression).? <~ END ^^ {
+      case altPart ~ elsePart =>
+        altPart.flatMap { case whenExpr ~ thenExpr =>
+          Seq(whenExpr, thenExpr)
+        } ++ elsePart
+    }
 
   protected lazy val cast: Parser[Expression] =
     CAST ~ "(" ~> expression ~ (AS ~> dataType) <~ ")" ^^ {
@@ -354,6 +357,11 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
   protected lazy val signedPrimary: Parser[Expression] =
     sign ~ primary ^^ { case s ~ e => if (s == "-") UnaryMinus(e) else e}
 
+  protected lazy val attributeName: Parser[String] = acceptMatch("attribute name", {
+    case lexical.Identifier(str) => str
+    case lexical.Keyword(str) if !lexical.delimiters.contains(str) => str
+  })
+
   protected lazy val primary: PackratParser[Expression] =
     ( literal
     | expression ~ ("[" ~> expression <~ "]") ^^
@@ -364,9 +372,9 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     | "(" ~> expression <~ ")"
     | function
     | dotExpressionHeader
-    | ident ^^ {case i => UnresolvedAttribute.quoted(i)}
     | signedPrimary
     | "~" ~> expression ^^ BitwiseNot
+    | attributeName ^^ UnresolvedAttribute.quoted
     )
 
   protected lazy val dotExpressionHeader: Parser[Expression] =
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index cc6af1ccc1cce..12ad019e8b473 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -1458,4 +1458,13 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       checkAnswer(sql("SELECT * FROM t ORDER BY NULL"), Seq(Row(1, 2), Row(1, 2)))
     }
   }
+
+  test("SPARK-8837: use keyword in column name") {
+    withTempTable("t") {
+      val df = Seq(1 -> "a").toDF("count", "sort")
+      checkAnswer(df.filter("count > 0"), Row(1, "a"))
+      df.registerTempTable("t")
+      checkAnswer(sql("select count, sort from t"), Row(1, "a"))
+    }
+  }
 }

From 57c72fcce75907c08a1ae53a0d85447176fc3c69 Mon Sep 17 00:00:00 2001
From: Dirceu Semighini Filho <dirceu.semighini@gmail.com>
Date: Mon, 6 Jul 2015 13:28:07 -0700
Subject: [PATCH 013/549] Small update in the readme file

Just change the attribute from -PsparkR to -Psparkr

Author: Dirceu Semighini Filho <dirceu.semighini@gmail.com>

Closes #7242 from dirceusemighini/patch-1 and squashes the following commits:

fad5991 [Dirceu Semighini Filho] Small update in the readme file
---
 R/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/README.md b/R/README.md
index d7d65b4f0eca5..005f56da1670c 100644
--- a/R/README.md
+++ b/R/README.md
@@ -6,7 +6,7 @@ SparkR is an R package that provides a light-weight frontend to use Spark from R
 
 #### Build Spark
 
-Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-PsparkR` profile to build the R package. For example to use the default Hadoop versions you can run
+Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
 ```
   build/mvn -DskipTests -Psparkr package
 ```

From 37e4d92142a6309e2df7d36883e0c7892c3d792d Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 6 Jul 2015 13:31:31 -0700
Subject: [PATCH 014/549] [SPARK-8784] [SQL] Add Python API for hex and unhex

Add Python API for hex/unhex,  also cleanup Hex/Unhex

Author: Davies Liu <davies@databricks.com>

Closes #7223 from davies/hex and squashes the following commits:

6f1249d [Davies Liu] no explicit rule to cast string into binary
711a6ed [Davies Liu] fix test
f9fe5a3 [Davies Liu] Merge branch 'master' of github.com:apache/spark into hex
f032fbb [Davies Liu] Merge branch 'hex' of github.com:davies/spark into hex
49e325f [Davies Liu] Merge branch 'master' of github.com:apache/spark into hex
b31fc9a [Davies Liu] Update math.scala
25156b7 [Davies Liu] address comments and fix test
c3af78c [Davies Liu] address commments
1a24082 [Davies Liu] Add Python API for hex and unhex
---
 python/pyspark/sql/functions.py               | 28 +++++++
 .../catalyst/analysis/FunctionRegistry.scala  |  2 +-
 .../spark/sql/catalyst/expressions/math.scala | 83 ++++++++++---------
 .../expressions/MathFunctionsSuite.scala      | 25 ++++--
 .../org/apache/spark/sql/functions.scala      |  2 +-
 5 files changed, 93 insertions(+), 47 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 49dd0332afe74..dca39fa833435 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -395,6 +395,34 @@ def randn(seed=None):
     return Column(jc)
 
 
+@ignore_unicode_prefix
+@since(1.5)
+def hex(col):
+    """Computes hex value of the given column, which could be StringType,
+    BinaryType, IntegerType or LongType.
+
+    >>> sqlContext.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect()
+    [Row(hex(a)=u'414243', hex(b)=u'3')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.hex(_to_java_column(col))
+    return Column(jc)
+
+
+@ignore_unicode_prefix
+@since(1.5)
+def unhex(col):
+    """Inverse of hex. Interprets each pair of characters as a hexadecimal number
+    and converts to the byte representation of number.
+
+    >>> sqlContext.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect()
+    [Row(unhex(a)=bytearray(b'ABC'))]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.unhex(_to_java_column(col))
+    return Column(jc)
+
+
 @ignore_unicode_prefix
 @since(1.5)
 def sha1(col):
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 92a50e7092317..fef276353022c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -168,7 +168,7 @@ object FunctionRegistry {
     expression[Substring]("substring"),
     expression[UnBase64]("unbase64"),
     expression[Upper]("ucase"),
-    expression[UnHex]("unhex"),
+    expression[Unhex]("unhex"),
     expression[Upper]("upper"),
 
     // datetime functions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 45b7e4d3405c8..92500453980f6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -298,6 +298,21 @@ case class Bin(child: Expression)
   }
 }
 
+object Hex {
+  val hexDigits = Array[Char](
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
+  ).map(_.toByte)
+
+  // lookup table to translate '0' -> 0 ... 'F'/'f' -> 15
+  val unhexDigits = {
+    val array = Array.fill[Byte](128)(-1)
+    (0 to 9).foreach(i => array('0' + i) = i.toByte)
+    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
+    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
+    array
+  }
+}
+
 /**
  * If the argument is an INT or binary, hex returns the number as a STRING in hexadecimal format.
  * Otherwise if the number is a STRING, it converts each character into its hex representation
@@ -307,7 +322,7 @@ case class Hex(child: Expression) extends UnaryExpression with ExpectsInputTypes
   // TODO: Create code-gen version.
 
   override def inputTypes: Seq[AbstractDataType] =
-    Seq(TypeCollection(LongType, StringType, BinaryType))
+    Seq(TypeCollection(LongType, BinaryType, StringType))
 
   override def dataType: DataType = StringType
 
@@ -319,30 +334,18 @@ case class Hex(child: Expression) extends UnaryExpression with ExpectsInputTypes
       child.dataType match {
         case LongType => hex(num.asInstanceOf[Long])
         case BinaryType => hex(num.asInstanceOf[Array[Byte]])
-        case StringType => hex(num.asInstanceOf[UTF8String])
+        case StringType => hex(num.asInstanceOf[UTF8String].getBytes)
       }
     }
   }
 
-  /**
-   * Converts every character in s to two hex digits.
-   */
-  private def hex(str: UTF8String): UTF8String = {
-    hex(str.getBytes)
-  }
-
-  private def hex(bytes: Array[Byte]): UTF8String = {
-    doHex(bytes, bytes.length)
-  }
-
-  private def doHex(bytes: Array[Byte], length: Int): UTF8String = {
+  private[this] def hex(bytes: Array[Byte]): UTF8String = {
+    val length = bytes.length
     val value = new Array[Byte](length * 2)
     var i = 0
     while (i < length) {
-      value(i * 2) = Character.toUpperCase(Character.forDigit(
-        (bytes(i) & 0xF0) >>> 4, 16)).toByte
-      value(i * 2 + 1) = Character.toUpperCase(Character.forDigit(
-        bytes(i) & 0x0F, 16)).toByte
+      value(i * 2) = Hex.hexDigits((bytes(i) & 0xF0) >> 4)
+      value(i * 2 + 1) = Hex.hexDigits(bytes(i) & 0x0F)
       i += 1
     }
     UTF8String.fromBytes(value)
@@ -355,24 +358,23 @@ case class Hex(child: Expression) extends UnaryExpression with ExpectsInputTypes
     var len = 0
     do {
       len += 1
-      value(value.length - len) =
-        Character.toUpperCase(Character.forDigit((numBuf & 0xF).toInt, 16)).toByte
+      value(value.length - len) = Hex.hexDigits((numBuf & 0xF).toInt)
       numBuf >>>= 4
     } while (numBuf != 0)
     UTF8String.fromBytes(java.util.Arrays.copyOfRange(value, value.length - len, value.length))
   }
 }
 
-
 /**
  * Performs the inverse operation of HEX.
  * Resulting characters are returned as a byte array.
  */
-case class UnHex(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Unhex(child: Expression) extends UnaryExpression with ExpectsInputTypes {
   // TODO: Create code-gen version.
 
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
 
+  override def nullable: Boolean = true
   override def dataType: DataType = BinaryType
 
   override def eval(input: InternalRow): Any = {
@@ -384,26 +386,31 @@ case class UnHex(child: Expression) extends UnaryExpression with ExpectsInputTyp
     }
   }
 
-  private val unhexDigits = {
-    val array = Array.fill[Byte](128)(-1)
-    (0 to 9).foreach(i => array('0' + i) = i.toByte)
-    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
-    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
-    array
-  }
-
-  private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
-    var bytes = inputBytes
+  private[this] def unhex(bytes: Array[Byte]): Array[Byte] = {
+    val out = new Array[Byte]((bytes.length + 1) >> 1)
+    var i = 0
     if ((bytes.length & 0x01) != 0) {
-      bytes = '0'.toByte +: bytes
+      // padding with '0'
+      if (bytes(0) < 0) {
+        return null
+      }
+      val v = Hex.unhexDigits(bytes(0))
+      if (v == -1) {
+        return null
+      }
+      out(0) = v
+      i += 1
     }
-    val out = new Array[Byte](bytes.length >> 1)
     // two characters form the hex value.
-    var i = 0
     while (i < bytes.length) {
-      val first = unhexDigits(bytes(i))
-      val second = unhexDigits(bytes(i + 1))
-      if (first == -1 || second == -1) { return null}
+      if (bytes(i) < 0 || bytes(i + 1) < 0) {
+        return null
+      }
+      val first = Hex.unhexDigits(bytes(i))
+      val second = Hex.unhexDigits(bytes(i + 1))
+      if (first == -1 || second == -1) {
+        return null
+      }
       out(i / 2) = (((first << 4) | second) & 0xFF).toByte
       i += 2
     }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 03d8400cf356b..7ca9e30b2bcd5 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -21,8 +21,7 @@ import com.google.common.math.LongMath
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.{DataType, LongType}
-import org.apache.spark.sql.types.{IntegerType, DoubleType}
+import org.apache.spark.sql.types._
 
 class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
@@ -271,20 +270,32 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("hex") {
+    checkEvaluation(Hex(Literal.create(null, LongType)), null)
+    checkEvaluation(Hex(Literal(28L)), "1C")
+    checkEvaluation(Hex(Literal(-28L)), "FFFFFFFFFFFFFFE4")
     checkEvaluation(Hex(Literal(100800200404L)), "177828FED4")
     checkEvaluation(Hex(Literal(-100800200404L)), "FFFFFFE887D7012C")
-    checkEvaluation(Hex(Literal("helloHex")), "68656C6C6F486578")
+    checkEvaluation(Hex(Literal.create(null, BinaryType)), null)
     checkEvaluation(Hex(Literal("helloHex".getBytes())), "68656C6C6F486578")
     // scalastyle:off
     // Turn off scala style for non-ascii chars
-    checkEvaluation(Hex(Literal("三重的")), "E4B889E9878DE79A84")
+    checkEvaluation(Hex(Literal("三重的".getBytes("UTF8"))), "E4B889E9878DE79A84")
     // scalastyle:on
   }
 
   test("unhex") {
-    checkEvaluation(UnHex(Literal("737472696E67")), "string".getBytes)
-    checkEvaluation(UnHex(Literal("")), new Array[Byte](0))
-    checkEvaluation(UnHex(Literal("0")), Array[Byte](0))
+    checkEvaluation(Unhex(Literal.create(null, StringType)), null)
+    checkEvaluation(Unhex(Literal("737472696E67")), "string".getBytes)
+    checkEvaluation(Unhex(Literal("")), new Array[Byte](0))
+    checkEvaluation(Unhex(Literal("F")), Array[Byte](15))
+    checkEvaluation(Unhex(Literal("ff")), Array[Byte](-1))
+    checkEvaluation(Unhex(Literal("GG")), null)
+    // scalastyle:off
+    // Turn off scala style for non-ascii chars
+    checkEvaluation(Unhex(Literal("E4B889E9878DE79A84")), "三重的".getBytes("UTF-8"))
+    checkEvaluation(Unhex(Literal("三重的")), null)
+
+    // scalastyle:on
   }
 
   test("hypot") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index f80291776f335..4da9ffc495e17 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1095,7 +1095,7 @@ object functions {
    * @group math_funcs
    * @since 1.5.0
    */
-  def unhex(column: Column): Column = UnHex(column.expr)
+  def unhex(column: Column): Column = Unhex(column.expr)
 
   /**
    * Inverse of hex. Interprets each pair of characters as a hexadecimal number

From 2471c0bf7f463bb144b44a2e51c0f363e71e099d Mon Sep 17 00:00:00 2001
From: kai <kaizeng@eecs.berkeley.edu>
Date: Mon, 6 Jul 2015 14:33:30 -0700
Subject: [PATCH 015/549] [SPARK-4485] [SQL] 1) Add broadcast hash outer join,
 (2) Fix SparkPlanTest

This pull request
(1) extracts common functions used by hash outer joins and put it in interface HashOuterJoin
(2) adds ShuffledHashOuterJoin and BroadcastHashOuterJoin
(3) adds test cases for shuffled and broadcast hash outer join
(3) makes SparkPlanTest to support binary or more complex operators, and fixes bugs in plan composition in SparkPlanTest

Author: kai <kaizeng@eecs.berkeley.edu>

Closes #7162 from kai-zeng/outer and squashes the following commits:

3742359 [kai] Fix not-serializable exception for code-generated keys in broadcasted relations
14e4bf8 [kai] Use CanBroadcast in broadcast outer join planning
dc5127e [kai] code style fixes
b5a4efa [kai] (1) Add broadcast hash outer join, (2) Fix SparkPlanTest
---
 .../spark/sql/execution/SparkStrategies.scala |  12 +-
 .../joins/BroadcastHashOuterJoin.scala        | 121 ++++++++++++++++++
 .../sql/execution/joins/HashOuterJoin.scala   |  95 ++++----------
 .../joins/ShuffledHashOuterJoin.scala         |  85 ++++++++++++
 .../org/apache/spark/sql/JoinSuite.scala      |  40 +++++-
 .../spark/sql/execution/SparkPlanTest.scala   |  99 +++++++++++---
 .../sql/execution/joins/OuterJoinSuite.scala  |  88 +++++++++++++
 7 files changed, 441 insertions(+), 99 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 5daf86d817586..32044989044a6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -117,8 +117,18 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
           leftKeys, rightKeys, buildSide, planLater(left), planLater(right))
         condition.map(Filter(_, hashJoin)).getOrElse(hashJoin) :: Nil
 
+      case ExtractEquiJoinKeys(
+             LeftOuter, leftKeys, rightKeys, condition, left, CanBroadcast(right)) =>
+        joins.BroadcastHashOuterJoin(
+          leftKeys, rightKeys, LeftOuter, condition, planLater(left), planLater(right)) :: Nil
+
+      case ExtractEquiJoinKeys(
+             RightOuter, leftKeys, rightKeys, condition, CanBroadcast(left), right) =>
+        joins.BroadcastHashOuterJoin(
+          leftKeys, rightKeys, RightOuter, condition, planLater(left), planLater(right)) :: Nil
+
       case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right) =>
-        joins.HashOuterJoin(
+        joins.ShuffledHashOuterJoin(
           leftKeys, rightKeys, joinType, condition, planLater(left), planLater(right)) :: Nil
 
       case _ => Nil
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
new file mode 100644
index 0000000000000..5da04c78744d9
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.joins
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.physical.{Distribution, UnspecifiedDistribution}
+import org.apache.spark.sql.catalyst.plans.{JoinType, LeftOuter, RightOuter}
+import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
+import org.apache.spark.util.ThreadUtils
+
+import scala.collection.JavaConversions._
+import scala.concurrent._
+import scala.concurrent.duration._
+
+/**
+ * :: DeveloperApi ::
+ * Performs a outer hash join for two child relations.  When the output RDD of this operator is
+ * being constructed, a Spark job is asynchronously started to calculate the values for the
+ * broadcasted relation.  This data is then placed in a Spark broadcast variable.  The streamed
+ * relation is not shuffled.
+ */
+@DeveloperApi
+case class BroadcastHashOuterJoin(
+    leftKeys: Seq[Expression],
+    rightKeys: Seq[Expression],
+    joinType: JoinType,
+    condition: Option[Expression],
+    left: SparkPlan,
+    right: SparkPlan) extends BinaryNode with HashOuterJoin {
+
+  val timeout = {
+    val timeoutValue = sqlContext.conf.broadcastTimeout
+    if (timeoutValue < 0) {
+      Duration.Inf
+    } else {
+      timeoutValue.seconds
+    }
+  }
+
+  override def requiredChildDistribution: Seq[Distribution] =
+    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil
+
+  private[this] lazy val (buildPlan, streamedPlan) = joinType match {
+    case RightOuter => (left, right)
+    case LeftOuter => (right, left)
+    case x =>
+      throw new IllegalArgumentException(
+        s"BroadcastHashOuterJoin should not take $x as the JoinType")
+  }
+
+  private[this] lazy val (buildKeys, streamedKeys) = joinType match {
+    case RightOuter => (leftKeys, rightKeys)
+    case LeftOuter => (rightKeys, leftKeys)
+    case x =>
+      throw new IllegalArgumentException(
+        s"BroadcastHashOuterJoin should not take $x as the JoinType")
+  }
+
+  @transient
+  private val broadcastFuture = future {
+    // Note that we use .execute().collect() because we don't want to convert data to Scala types
+    val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect()
+    // buildHashTable uses code-generated rows as keys, which are not serializable
+    val hashed =
+      buildHashTable(input.iterator, new InterpretedProjection(buildKeys, buildPlan.output))
+    sparkContext.broadcast(hashed)
+  }(BroadcastHashOuterJoin.broadcastHashOuterJoinExecutionContext)
+
+  override def doExecute(): RDD[InternalRow] = {
+    val broadcastRelation = Await.result(broadcastFuture, timeout)
+
+    streamedPlan.execute().mapPartitions { streamedIter =>
+      val joinedRow = new JoinedRow()
+      val hashTable = broadcastRelation.value
+      val keyGenerator = newProjection(streamedKeys, streamedPlan.output)
+
+      joinType match {
+        case LeftOuter =>
+          streamedIter.flatMap(currentRow => {
+            val rowKey = keyGenerator(currentRow)
+            joinedRow.withLeft(currentRow)
+            leftOuterIterator(rowKey, joinedRow, hashTable.getOrElse(rowKey, EMPTY_LIST))
+          })
+
+        case RightOuter =>
+          streamedIter.flatMap(currentRow => {
+            val rowKey = keyGenerator(currentRow)
+            joinedRow.withRight(currentRow)
+            rightOuterIterator(rowKey, hashTable.getOrElse(rowKey, EMPTY_LIST), joinedRow)
+          })
+
+        case x =>
+          throw new IllegalArgumentException(
+            s"BroadcastHashOuterJoin should not take $x as the JoinType")
+      }
+    }
+  }
+}
+
+object BroadcastHashOuterJoin {
+
+  private val broadcastHashOuterJoinExecutionContext = ExecutionContext.fromExecutorService(
+    ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-outer-join", 128))
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
index e41538ec1fc1a..886b5fa0c5103 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
@@ -19,32 +19,25 @@ package org.apache.spark.sql.execution.joins
 
 import java.util.{HashMap => JavaHashMap}
 
-import org.apache.spark.rdd.RDD
-
-import scala.collection.JavaConversions._
-
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning, UnknownPartitioning}
+import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
 import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter}
-import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
+import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.util.collection.CompactBuffer
 
-/**
- * :: DeveloperApi ::
- * Performs a hash based outer join for two child relations by shuffling the data using
- * the join keys. This operator requires loading the associated partition in both side into memory.
- */
 @DeveloperApi
-case class HashOuterJoin(
-    leftKeys: Seq[Expression],
-    rightKeys: Seq[Expression],
-    joinType: JoinType,
-    condition: Option[Expression],
-    left: SparkPlan,
-    right: SparkPlan) extends BinaryNode {
-
-  override def outputPartitioning: Partitioning = joinType match {
+trait HashOuterJoin {
+  self: SparkPlan =>
+
+  val leftKeys: Seq[Expression]
+  val rightKeys: Seq[Expression]
+  val joinType: JoinType
+  val condition: Option[Expression]
+  val left: SparkPlan
+  val right: SparkPlan
+
+override def outputPartitioning: Partitioning = joinType match {
     case LeftOuter => left.outputPartitioning
     case RightOuter => right.outputPartitioning
     case FullOuter => UnknownPartitioning(left.outputPartitioning.numPartitions)
@@ -52,9 +45,6 @@ case class HashOuterJoin(
       throw new IllegalArgumentException(s"HashOuterJoin should not take $x as the JoinType")
   }
 
-  override def requiredChildDistribution: Seq[ClusteredDistribution] =
-    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil
-
   override def output: Seq[Attribute] = {
     joinType match {
       case LeftOuter =>
@@ -68,8 +58,8 @@ case class HashOuterJoin(
     }
   }
 
-  @transient private[this] lazy val DUMMY_LIST = Seq[InternalRow](null)
-  @transient private[this] lazy val EMPTY_LIST = Seq.empty[InternalRow]
+  @transient private[this] lazy val DUMMY_LIST = CompactBuffer[InternalRow](null)
+  @transient protected[this] lazy val EMPTY_LIST = CompactBuffer[InternalRow]()
 
   @transient private[this] lazy val leftNullRow = new GenericInternalRow(left.output.length)
   @transient private[this] lazy val rightNullRow = new GenericInternalRow(right.output.length)
@@ -80,7 +70,7 @@ case class HashOuterJoin(
   // TODO we need to rewrite all of the iterators with our own implementation instead of the Scala
   // iterator for performance purpose.
 
-  private[this] def leftOuterIterator(
+  protected[this] def leftOuterIterator(
       key: InternalRow,
       joinedRow: JoinedRow,
       rightIter: Iterable[InternalRow]): Iterator[InternalRow] = {
@@ -89,7 +79,7 @@ case class HashOuterJoin(
         val temp = rightIter.collect {
           case r if boundCondition(joinedRow.withRight(r)) => joinedRow.copy()
         }
-        if (temp.size == 0) {
+        if (temp.isEmpty) {
           joinedRow.withRight(rightNullRow).copy :: Nil
         } else {
           temp
@@ -101,18 +91,17 @@ case class HashOuterJoin(
     ret.iterator
   }
 
-  private[this] def rightOuterIterator(
+  protected[this] def rightOuterIterator(
       key: InternalRow,
       leftIter: Iterable[InternalRow],
       joinedRow: JoinedRow): Iterator[InternalRow] = {
-
     val ret: Iterable[InternalRow] = {
       if (!key.anyNull) {
         val temp = leftIter.collect {
           case l if boundCondition(joinedRow.withLeft(l)) =>
-            joinedRow.copy
+            joinedRow.copy()
         }
-        if (temp.size == 0) {
+        if (temp.isEmpty) {
           joinedRow.withLeft(leftNullRow).copy :: Nil
         } else {
           temp
@@ -124,10 +113,9 @@ case class HashOuterJoin(
     ret.iterator
   }
 
-  private[this] def fullOuterIterator(
+  protected[this] def fullOuterIterator(
       key: InternalRow, leftIter: Iterable[InternalRow], rightIter: Iterable[InternalRow],
       joinedRow: JoinedRow): Iterator[InternalRow] = {
-
     if (!key.anyNull) {
       // Store the positions of records in right, if one of its associated row satisfy
       // the join condition.
@@ -171,7 +159,7 @@ case class HashOuterJoin(
     }
   }
 
-  private[this] def buildHashTable(
+  protected[this] def buildHashTable(
       iter: Iterator[InternalRow],
       keyGenerator: Projection): JavaHashMap[InternalRow, CompactBuffer[InternalRow]] = {
     val hashTable = new JavaHashMap[InternalRow, CompactBuffer[InternalRow]]()
@@ -190,43 +178,4 @@ case class HashOuterJoin(
 
     hashTable
   }
-
-  protected override def doExecute(): RDD[InternalRow] = {
-    val joinedRow = new JoinedRow()
-    left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) =>
-      // TODO this probably can be replaced by external sort (sort merged join?)
-
-      joinType match {
-        case LeftOuter =>
-          val rightHashTable = buildHashTable(rightIter, newProjection(rightKeys, right.output))
-          val keyGenerator = newProjection(leftKeys, left.output)
-          leftIter.flatMap( currentRow => {
-            val rowKey = keyGenerator(currentRow)
-            joinedRow.withLeft(currentRow)
-            leftOuterIterator(rowKey, joinedRow, rightHashTable.getOrElse(rowKey, EMPTY_LIST))
-          })
-
-        case RightOuter =>
-          val leftHashTable = buildHashTable(leftIter, newProjection(leftKeys, left.output))
-          val keyGenerator = newProjection(rightKeys, right.output)
-          rightIter.flatMap ( currentRow => {
-            val rowKey = keyGenerator(currentRow)
-            joinedRow.withRight(currentRow)
-            rightOuterIterator(rowKey, leftHashTable.getOrElse(rowKey, EMPTY_LIST), joinedRow)
-          })
-
-        case FullOuter =>
-          val leftHashTable = buildHashTable(leftIter, newProjection(leftKeys, left.output))
-          val rightHashTable = buildHashTable(rightIter, newProjection(rightKeys, right.output))
-          (leftHashTable.keySet ++ rightHashTable.keySet).iterator.flatMap { key =>
-            fullOuterIterator(key,
-              leftHashTable.getOrElse(key, EMPTY_LIST),
-              rightHashTable.getOrElse(key, EMPTY_LIST), joinedRow)
-          }
-
-        case x =>
-          throw new IllegalArgumentException(s"HashOuterJoin should not take $x as the JoinType")
-      }
-    }
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
new file mode 100644
index 0000000000000..cfc9c14aaa363
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.joins
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.physical.{Distribution, ClusteredDistribution}
+import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter}
+import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
+
+import scala.collection.JavaConversions._
+
+/**
+ * :: DeveloperApi ::
+ * Performs a hash based outer join for two child relations by shuffling the data using
+ * the join keys. This operator requires loading the associated partition in both side into memory.
+ */
+@DeveloperApi
+case class ShuffledHashOuterJoin(
+    leftKeys: Seq[Expression],
+    rightKeys: Seq[Expression],
+    joinType: JoinType,
+    condition: Option[Expression],
+    left: SparkPlan,
+    right: SparkPlan) extends BinaryNode with HashOuterJoin {
+
+  override def requiredChildDistribution: Seq[Distribution] =
+    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil
+
+  protected override def doExecute(): RDD[InternalRow] = {
+    val joinedRow = new JoinedRow()
+    left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) =>
+      // TODO this probably can be replaced by external sort (sort merged join?)
+      joinType match {
+        case LeftOuter =>
+          val rightHashTable = buildHashTable(rightIter, newProjection(rightKeys, right.output))
+          val keyGenerator = newProjection(leftKeys, left.output)
+          leftIter.flatMap( currentRow => {
+            val rowKey = keyGenerator(currentRow)
+            joinedRow.withLeft(currentRow)
+            leftOuterIterator(rowKey, joinedRow, rightHashTable.getOrElse(rowKey, EMPTY_LIST))
+          })
+
+        case RightOuter =>
+          val leftHashTable = buildHashTable(leftIter, newProjection(leftKeys, left.output))
+          val keyGenerator = newProjection(rightKeys, right.output)
+          rightIter.flatMap ( currentRow => {
+            val rowKey = keyGenerator(currentRow)
+            joinedRow.withRight(currentRow)
+            rightOuterIterator(rowKey, leftHashTable.getOrElse(rowKey, EMPTY_LIST), joinedRow)
+          })
+
+        case FullOuter =>
+          val leftHashTable = buildHashTable(leftIter, newProjection(leftKeys, left.output))
+          val rightHashTable = buildHashTable(rightIter, newProjection(rightKeys, right.output))
+          (leftHashTable.keySet ++ rightHashTable.keySet).iterator.flatMap { key =>
+            fullOuterIterator(key,
+              leftHashTable.getOrElse(key, EMPTY_LIST),
+              rightHashTable.getOrElse(key, EMPTY_LIST),
+              joinedRow)
+          }
+
+        case x =>
+          throw new IllegalArgumentException(
+            s"ShuffledHashOuterJoin should not take $x as the JoinType")
+      }
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index 20390a5544304..8953889d1fae9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -45,9 +45,10 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
     val physical = df.queryExecution.sparkPlan
     val operators = physical.collect {
       case j: ShuffledHashJoin => j
-      case j: HashOuterJoin => j
+      case j: ShuffledHashOuterJoin => j
       case j: LeftSemiJoinHash => j
       case j: BroadcastHashJoin => j
+      case j: BroadcastHashOuterJoin => j
       case j: LeftSemiJoinBNL => j
       case j: CartesianProduct => j
       case j: BroadcastNestedLoopJoin => j
@@ -81,12 +82,13 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
       ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[ShuffledHashJoin]),
       ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[ShuffledHashJoin]),
       ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[ShuffledHashJoin]),
-      ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[HashOuterJoin]),
+      ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]),
       ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2",
-        classOf[HashOuterJoin]),
+        classOf[ShuffledHashOuterJoin]),
       ("SELECT * FROM testData right join testData2 ON key = a and key = 2",
-        classOf[HashOuterJoin]),
-      ("SELECT * FROM testData full outer join testData2 ON key = a", classOf[HashOuterJoin]),
+        classOf[ShuffledHashOuterJoin]),
+      ("SELECT * FROM testData full outer join testData2 ON key = a",
+        classOf[ShuffledHashOuterJoin]),
       ("SELECT * FROM testData left JOIN testData2 ON (key * a != key + a)",
         classOf[BroadcastNestedLoopJoin]),
       ("SELECT * FROM testData right JOIN testData2 ON (key * a != key + a)",
@@ -133,6 +135,34 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
     ctx.sql("UNCACHE TABLE testData")
   }
 
+  test("broadcasted hash outer join operator selection") {
+    ctx.cacheManager.clearCache()
+    ctx.sql("CACHE TABLE testData")
+
+    val SORTMERGEJOIN_ENABLED: Boolean = ctx.conf.sortMergeJoinEnabled
+    Seq(
+      ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]),
+      ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2",
+        classOf[BroadcastHashOuterJoin]),
+      ("SELECT * FROM testData right join testData2 ON key = a and key = 2",
+        classOf[BroadcastHashOuterJoin])
+    ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
+    try {
+      ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, true)
+      Seq(
+        ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]),
+        ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2",
+          classOf[BroadcastHashOuterJoin]),
+        ("SELECT * FROM testData right join testData2 ON key = a and key = 2",
+          classOf[BroadcastHashOuterJoin])
+      ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
+    } finally {
+      ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, SORTMERGEJOIN_ENABLED)
+    }
+
+    ctx.sql("UNCACHE TABLE testData")
+  }
+
   test("multiple-key equi-join is hash-join") {
     val x = testData2.as("x")
     val y = testData2.as("y")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
index 13f3be8ca28d6..108b1122f7bff 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
@@ -54,6 +54,37 @@ class SparkPlanTest extends SparkFunSuite {
       input: DataFrame,
       planFunction: SparkPlan => SparkPlan,
       expectedAnswer: Seq[Row]): Unit = {
+    checkAnswer(input :: Nil, (plans: Seq[SparkPlan]) => planFunction(plans.head), expectedAnswer)
+  }
+
+  /**
+   * Runs the plan and makes sure the answer matches the expected result.
+   * @param left the left input data to be used.
+   * @param right the right input data to be used.
+   * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
+   *                     the physical operator that's being tested.
+   * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
+   */
+  protected def checkAnswer(
+      left: DataFrame,
+      right: DataFrame,
+      planFunction: (SparkPlan, SparkPlan) => SparkPlan,
+      expectedAnswer: Seq[Row]): Unit = {
+    checkAnswer(left :: right :: Nil,
+      (plans: Seq[SparkPlan]) => planFunction(plans(0), plans(1)), expectedAnswer)
+  }
+
+  /**
+   * Runs the plan and makes sure the answer matches the expected result.
+   * @param input the input data to be used.
+   * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
+   *                     the physical operator that's being tested.
+   * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
+   */
+  protected def checkAnswer(
+      input: Seq[DataFrame],
+      planFunction: Seq[SparkPlan] => SparkPlan,
+      expectedAnswer: Seq[Row]): Unit = {
     SparkPlanTest.checkAnswer(input, planFunction, expectedAnswer) match {
       case Some(errorMessage) => fail(errorMessage)
       case None =>
@@ -72,11 +103,41 @@ class SparkPlanTest extends SparkFunSuite {
       planFunction: SparkPlan => SparkPlan,
       expectedAnswer: Seq[A]): Unit = {
     val expectedRows = expectedAnswer.map(Row.fromTuple)
-    SparkPlanTest.checkAnswer(input, planFunction, expectedRows) match {
-      case Some(errorMessage) => fail(errorMessage)
-      case None =>
-    }
+    checkAnswer(input, planFunction, expectedRows)
+  }
+
+  /**
+   * Runs the plan and makes sure the answer matches the expected result.
+   * @param left the left input data to be used.
+   * @param right the right input data to be used.
+   * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
+   *                     the physical operator that's being tested.
+   * @param expectedAnswer the expected result in a [[Seq]] of [[Product]]s.
+   */
+  protected def checkAnswer[A <: Product : TypeTag](
+      left: DataFrame,
+      right: DataFrame,
+      planFunction: (SparkPlan, SparkPlan) => SparkPlan,
+      expectedAnswer: Seq[A]): Unit = {
+    val expectedRows = expectedAnswer.map(Row.fromTuple)
+    checkAnswer(left, right, planFunction, expectedRows)
+  }
+
+  /**
+   * Runs the plan and makes sure the answer matches the expected result.
+   * @param input the input data to be used.
+   * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
+   *                     the physical operator that's being tested.
+   * @param expectedAnswer the expected result in a [[Seq]] of [[Product]]s.
+   */
+  protected def checkAnswer[A <: Product : TypeTag](
+      input: Seq[DataFrame],
+      planFunction: Seq[SparkPlan] => SparkPlan,
+      expectedAnswer: Seq[A]): Unit = {
+    val expectedRows = expectedAnswer.map(Row.fromTuple)
+    checkAnswer(input, planFunction, expectedRows)
   }
+
 }
 
 /**
@@ -92,27 +153,25 @@ object SparkPlanTest {
    * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
    */
   def checkAnswer(
-      input: DataFrame,
-      planFunction: SparkPlan => SparkPlan,
+      input: Seq[DataFrame],
+      planFunction: Seq[SparkPlan] => SparkPlan,
       expectedAnswer: Seq[Row]): Option[String] = {
 
-    val outputPlan = planFunction(input.queryExecution.sparkPlan)
+    val outputPlan = planFunction(input.map(_.queryExecution.sparkPlan))
 
     // A very simple resolver to make writing tests easier. In contrast to the real resolver
     // this is always case sensitive and does not try to handle scoping or complex type resolution.
-    val resolvedPlan = outputPlan transform {
-      case plan: SparkPlan =>
-        val inputMap = plan.children.flatMap(_.output).zipWithIndex.map {
-          case (a, i) =>
-            (a.name, BoundReference(i, a.dataType, a.nullable))
-        }.toMap
-
-        plan.transformExpressions {
-          case UnresolvedAttribute(Seq(u)) =>
-            inputMap.getOrElse(u,
-              sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap"))
-        }
-    }
+    val resolvedPlan = TestSQLContext.prepareForExecution.execute(
+      outputPlan transform {
+        case plan: SparkPlan =>
+          val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap
+          plan.transformExpressions {
+            case UnresolvedAttribute(Seq(u)) =>
+              inputMap.getOrElse(u,
+                sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap"))
+          }
+      }
+    )
 
     def prepareAnswer(answer: Seq[Row]): Seq[Row] = {
       // Converts data to types that we can do equality comparison using Scala collections.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala
new file mode 100644
index 0000000000000..5707d2fb300ae
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.joins
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions.{Expression, LessThan}
+import org.apache.spark.sql.catalyst.plans.{FullOuter, LeftOuter, RightOuter}
+import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest}
+
+class OuterJoinSuite extends SparkPlanTest {
+
+  val left = Seq(
+    (1, 2.0),
+    (2, 1.0),
+    (3, 3.0)
+  ).toDF("a", "b")
+
+  val right = Seq(
+    (2, 3.0),
+    (3, 2.0),
+    (4, 1.0)
+  ).toDF("c", "d")
+
+  val leftKeys: List[Expression] = 'a :: Nil
+  val rightKeys: List[Expression] = 'c :: Nil
+  val condition = Some(LessThan('b, 'd))
+
+  test("shuffled hash outer join") {
+    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+      ShuffledHashOuterJoin(leftKeys, rightKeys, LeftOuter, condition, left, right),
+      Seq(
+        (1, 2.0, null, null),
+        (2, 1.0, 2, 3.0),
+        (3, 3.0, null, null)
+      ))
+
+    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+      ShuffledHashOuterJoin(leftKeys, rightKeys, RightOuter, condition, left, right),
+      Seq(
+        (2, 1.0, 2, 3.0),
+        (null, null, 3, 2.0),
+        (null, null, 4, 1.0)
+      ))
+
+    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+      ShuffledHashOuterJoin(leftKeys, rightKeys, FullOuter, condition, left, right),
+      Seq(
+        (1, 2.0, null, null),
+        (2, 1.0, 2, 3.0),
+        (3, 3.0, null, null),
+        (null, null, 3, 2.0),
+        (null, null, 4, 1.0)
+      ))
+  }
+
+  test("broadcast hash outer join") {
+    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+      BroadcastHashOuterJoin(leftKeys, rightKeys, LeftOuter, condition, left, right),
+      Seq(
+        (1, 2.0, null, null),
+        (2, 1.0, 2, 3.0),
+        (3, 3.0, null, null)
+      ))
+
+    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+      BroadcastHashOuterJoin(leftKeys, rightKeys, RightOuter, condition, left, right),
+      Seq(
+        (2, 1.0, 2, 3.0),
+        (null, null, 3, 2.0),
+        (null, null, 4, 1.0)
+      ))
+  }
+}

From 132e7fca129be8f00ba429a51bcef60abb2eed6d Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Mon, 6 Jul 2015 15:54:43 -0700
Subject: [PATCH 016/549] [MINOR] [SQL] remove unused code in Exchange

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #7234 from adrian-wang/exchangeclean and squashes the following commits:

b093ec9 [Daoyuan Wang] remove unused code
---
 .../org/apache/spark/sql/execution/Exchange.scala  | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index edc64a03335d6..e054c1d144e34 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -117,20 +117,6 @@ case class Exchange(
     }
   }
 
-  private val keyOrdering = {
-    if (newOrdering.nonEmpty) {
-      val key = newPartitioning.keyExpressions
-      val boundOrdering = newOrdering.map { o =>
-        val ordinal = key.indexOf(o.child)
-        if (ordinal == -1) sys.error(s"Invalid ordering on $o requested for $newPartitioning")
-        o.copy(child = BoundReference(ordinal, o.child.dataType, o.child.nullable))
-      }
-      new RowOrdering(boundOrdering)
-    } else {
-      null // Ordering will not be used
-    }
-  }
-
   @transient private lazy val sparkConf = child.sqlContext.sparkContext.getConf
 
   private def getSerializer(

From 9ff203346ca4decf2999e33bfb8c400ec75313e6 Mon Sep 17 00:00:00 2001
From: Wisely Chen <wiselychen@appier.com>
Date: Mon, 6 Jul 2015 16:04:01 -0700
Subject: [PATCH 017/549] [SPARK-8656] [WEBUI] Fix the webUI and JSON API
 number is not synced

Spark standalone master web UI show "Alive Workers" total core, total used cores and "Alive workers" total memory, memory used.
But the JSON API page "http://MASTERURL:8088/json" shows "ALL workers"  core, memory number.
This webUI data is not sync with the JSON API.
The proper way is to sync the number with webUI and JSON API.

Author: Wisely Chen <wiselychen@appier.com>

Closes #7038 from thegiive/SPARK-8656 and squashes the following commits:

9e54bf0 [Wisely Chen] Change variable name to camel case
2c8ea89 [Wisely Chen] Change some styling and add local variable
431d2b0 [Wisely Chen] Worker List should contain DEAD node also
8b3b8e8 [Wisely Chen] [SPARK-8656] Fix the webUI and JSON API number is not synced
---
 .../scala/org/apache/spark/deploy/JsonProtocol.scala     | 9 +++++----
 .../org/apache/spark/deploy/master/WorkerInfo.scala      | 2 ++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala
index 2954f932b4f41..ccffb36652988 100644
--- a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala
@@ -76,12 +76,13 @@ private[deploy] object JsonProtocol {
   }
 
   def writeMasterState(obj: MasterStateResponse): JObject = {
+    val aliveWorkers = obj.workers.filter(_.isAlive())
     ("url" -> obj.uri) ~
     ("workers" -> obj.workers.toList.map(writeWorkerInfo)) ~
-    ("cores" -> obj.workers.map(_.cores).sum) ~
-    ("coresused" -> obj.workers.map(_.coresUsed).sum) ~
-    ("memory" -> obj.workers.map(_.memory).sum) ~
-    ("memoryused" -> obj.workers.map(_.memoryUsed).sum) ~
+    ("cores" -> aliveWorkers.map(_.cores).sum) ~
+    ("coresused" -> aliveWorkers.map(_.coresUsed).sum) ~
+    ("memory" -> aliveWorkers.map(_.memory).sum) ~
+    ("memoryused" -> aliveWorkers.map(_.memoryUsed).sum) ~
     ("activeapps" -> obj.activeApps.toList.map(writeApplicationInfo)) ~
     ("completedapps" -> obj.completedApps.toList.map(writeApplicationInfo)) ~
     ("activedrivers" -> obj.activeDrivers.toList.map(writeDriverInfo)) ~
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala
index 471811037e5e2..f751966605206 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala
@@ -105,4 +105,6 @@ private[spark] class WorkerInfo(
   def setState(state: WorkerState.Value): Unit = {
     this.state = state
   }
+
+  def isAlive(): Boolean = this.state == WorkerState.ALIVE
 }

From 1165b17d24cdf1dbebb2faca14308dfe5c2a652c Mon Sep 17 00:00:00 2001
From: Ankur Chauhan <achauhan@brightcove.com>
Date: Mon, 6 Jul 2015 16:04:57 -0700
Subject: [PATCH 018/549] [SPARK-6707] [CORE] [MESOS] Mesos Scheduler should
 allow the user to specify constraints based on slave attributes

Currently, the mesos scheduler only looks at the 'cpu' and 'mem' resources when trying to determine the usablility of a resource offer from a mesos slave node. It may be preferable for the user to be able to ensure that the spark jobs are only started on a certain set of nodes (based on attributes).

For example, If the user sets a property, let's say `spark.mesos.constraints` is set to `tachyon=true;us-east-1=false`, then the resource offers will be checked to see if they meet both these constraints and only then will be accepted to start new executors.

Author: Ankur Chauhan <achauhan@brightcove.com>

Closes #5563 from ankurcha/mesos_attribs and squashes the following commits:

902535b [Ankur Chauhan] Fix line length
d83801c [Ankur Chauhan] Update code as per code review comments
8b73f2d [Ankur Chauhan] Fix imports
c3523e7 [Ankur Chauhan] Added docs
1a24d0b [Ankur Chauhan] Expand scope of attributes matching to include all data types
482fd71 [Ankur Chauhan] Update access modifier to private[this] for offer constraints
5ccc32d [Ankur Chauhan] Fix nit pick whitespace
1bce782 [Ankur Chauhan] Fix nit pick whitespace
c0cbc75 [Ankur Chauhan] Use offer id value for debug message
7fee0ea [Ankur Chauhan] Add debug statements
fc7eb5b [Ankur Chauhan] Fix import codestyle
00be252 [Ankur Chauhan] Style changes as per code review comments
662535f [Ankur Chauhan] Incorporate code review comments + use SparkFunSuite
fdc0937 [Ankur Chauhan] Decline offers that did not meet criteria
67b58a0 [Ankur Chauhan] Add documentation for spark.mesos.constraints
63f53f4 [Ankur Chauhan] Update codestyle - uniform style for config values
02031e4 [Ankur Chauhan] Fix scalastyle warnings in tests
c09ed84 [Ankur Chauhan] Fixed the access modifier on offerConstraints val to private[mesos]
0c64df6 [Ankur Chauhan] Rename overhead fractions to memory_*, fix spacing
8cc1e8f [Ankur Chauhan] Make exception message more explicit about the source of the error
addedba [Ankur Chauhan] Added test case for malformed constraint string
ec9d9a6 [Ankur Chauhan] Add tests for parse constraint string
72fe88a [Ankur Chauhan] Fix up tests + remove redundant method override, combine utility class into new mesos scheduler util trait
92b47fd [Ankur Chauhan] Add attributes based constraints support to MesosScheduler
---
 .../mesos/CoarseMesosSchedulerBackend.scala   |  43 +++--
 .../scheduler/cluster/mesos/MemoryUtils.scala |  31 ----
 .../cluster/mesos/MesosClusterScheduler.scala |   1 +
 .../cluster/mesos/MesosSchedulerBackend.scala |  62 ++++---
 .../cluster/mesos/MesosSchedulerUtils.scala   | 153 +++++++++++++++++-
 .../cluster/mesos/MemoryUtilsSuite.scala      |  46 ------
 .../mesos/MesosSchedulerBackendSuite.scala    |   6 +-
 .../mesos/MesosSchedulerUtilsSuite.scala      | 140 ++++++++++++++++
 docs/running-on-mesos.md                      |  22 +++
 9 files changed, 376 insertions(+), 128 deletions(-)
 delete mode 100644 core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtils.scala
 delete mode 100644 core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala
 create mode 100644 core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala

diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
index 6b8edca5aa485..b68f8c7685eba 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
@@ -18,18 +18,18 @@
 package org.apache.spark.scheduler.cluster.mesos
 
 import java.io.File
-import java.util.{Collections, List => JList}
+import java.util.{List => JList}
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable.{HashMap, HashSet}
 
-import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, _}
 import org.apache.mesos.{Scheduler => MScheduler, _}
+import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, _}
+import org.apache.spark.{SparkContext, SparkEnv, SparkException, TaskState}
 import org.apache.spark.rpc.RpcAddress
 import org.apache.spark.scheduler.TaskSchedulerImpl
 import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
 import org.apache.spark.util.Utils
-import org.apache.spark.{SparkContext, SparkEnv, SparkException, TaskState}
 
 /**
  * A SchedulerBackend that runs tasks on Mesos, but uses "coarse-grained" tasks, where it holds
@@ -66,6 +66,10 @@ private[spark] class CoarseMesosSchedulerBackend(
 
   val extraCoresPerSlave = conf.getInt("spark.mesos.extra.cores", 0)
 
+  // Offer constraints
+  private val slaveOfferConstraints =
+    parseConstraintString(sc.conf.get("spark.mesos.constraints", ""))
+
   var nextMesosTaskId = 0
 
   @volatile var appId: String = _
@@ -170,13 +174,16 @@ private[spark] class CoarseMesosSchedulerBackend(
   override def resourceOffers(d: SchedulerDriver, offers: JList[Offer]) {
     synchronized {
       val filters = Filters.newBuilder().setRefuseSeconds(5).build()
-
       for (offer <- offers) {
+        val offerAttributes = toAttributeMap(offer.getAttributesList)
+        val meetsConstraints = matchesAttributeRequirements(slaveOfferConstraints, offerAttributes)
         val slaveId = offer.getSlaveId.toString
         val mem = getResource(offer.getResourcesList, "mem")
         val cpus = getResource(offer.getResourcesList, "cpus").toInt
-        if (totalCoresAcquired < maxCores &&
-            mem >= MemoryUtils.calculateTotalMemory(sc) &&
+        val id = offer.getId.getValue
+        if (meetsConstraints &&
+            totalCoresAcquired < maxCores &&
+            mem >= calculateTotalMemory(sc) &&
             cpus >= 1 &&
             failuresBySlaveId.getOrElse(slaveId, 0) < MAX_SLAVE_FAILURES &&
             !slaveIdsWithExecutors.contains(slaveId)) {
@@ -193,33 +200,25 @@ private[spark] class CoarseMesosSchedulerBackend(
             .setCommand(createCommand(offer, cpusToUse + extraCoresPerSlave))
             .setName("Task " + taskId)
             .addResources(createResource("cpus", cpusToUse))
-            .addResources(createResource("mem",
-              MemoryUtils.calculateTotalMemory(sc)))
+            .addResources(createResource("mem", calculateTotalMemory(sc)))
 
           sc.conf.getOption("spark.mesos.executor.docker.image").foreach { image =>
             MesosSchedulerBackendUtil
-              .setupContainerBuilderDockerInfo(image, sc.conf, task.getContainerBuilder())
+              .setupContainerBuilderDockerInfo(image, sc.conf, task.getContainerBuilder)
           }
 
-          d.launchTasks(
-            Collections.singleton(offer.getId), Collections.singletonList(task.build()), filters)
+          // accept the offer and launch the task
+          logDebug(s"Accepting offer: $id with attributes: $offerAttributes mem: $mem cpu: $cpus")
+          d.launchTasks(List(offer.getId), List(task.build()), filters)
         } else {
-          // Filter it out
-          d.launchTasks(
-            Collections.singleton(offer.getId), Collections.emptyList[MesosTaskInfo](), filters)
+          // Decline the offer
+          logDebug(s"Declining offer: $id with attributes: $offerAttributes mem: $mem cpu: $cpus")
+          d.declineOffer(offer.getId)
         }
       }
     }
   }
 
-  /** Build a Mesos resource protobuf object */
-  private def createResource(resourceName: String, quantity: Double): Protos.Resource = {
-    Resource.newBuilder()
-      .setName(resourceName)
-      .setType(Value.Type.SCALAR)
-      .setScalar(Value.Scalar.newBuilder().setValue(quantity).build())
-      .build()
-  }
 
   override def statusUpdate(d: SchedulerDriver, status: TaskStatus) {
     val taskId = status.getTaskId.getValue.toInt
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtils.scala
deleted file mode 100644
index 8df4f3b554c41..0000000000000
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtils.scala
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import org.apache.spark.SparkContext
-
-private[spark] object MemoryUtils {
-  // These defaults copied from YARN
-  val OVERHEAD_FRACTION = 0.10
-  val OVERHEAD_MINIMUM = 384
-
-  def calculateTotalMemory(sc: SparkContext): Int = {
-    sc.conf.getInt("spark.mesos.executor.memoryOverhead",
-      math.max(OVERHEAD_FRACTION * sc.executorMemory, OVERHEAD_MINIMUM).toInt) + sc.executorMemory
-  }
-}
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
index 1067a7f1caf4c..d3a20f822176e 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
@@ -29,6 +29,7 @@ import org.apache.mesos.Protos.Environment.Variable
 import org.apache.mesos.Protos.TaskStatus.Reason
 import org.apache.mesos.Protos.{TaskState => MesosTaskState, _}
 import org.apache.mesos.{Scheduler, SchedulerDriver}
+
 import org.apache.spark.deploy.mesos.MesosDriverDescription
 import org.apache.spark.deploy.rest.{CreateSubmissionResponse, KillSubmissionResponse, SubmissionStatusResponse}
 import org.apache.spark.metrics.MetricsSystem
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
index 49de85ef48ada..d72e2af456e15 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
@@ -23,14 +23,14 @@ import java.util.{ArrayList => JArrayList, Collections, List => JList}
 import scala.collection.JavaConversions._
 import scala.collection.mutable.{HashMap, HashSet}
 
+import org.apache.mesos.{Scheduler => MScheduler, _}
 import org.apache.mesos.Protos.{ExecutorInfo => MesosExecutorInfo, TaskInfo => MesosTaskInfo, _}
 import org.apache.mesos.protobuf.ByteString
-import org.apache.mesos.{Scheduler => MScheduler, _}
+import org.apache.spark.{SparkContext, SparkException, TaskState}
 import org.apache.spark.executor.MesosExecutorBackend
 import org.apache.spark.scheduler._
 import org.apache.spark.scheduler.cluster.ExecutorInfo
 import org.apache.spark.util.Utils
-import org.apache.spark.{SparkContext, SparkException, TaskState}
 
 /**
  * A SchedulerBackend for running fine-grained tasks on Mesos. Each Spark task is mapped to a
@@ -59,6 +59,10 @@ private[spark] class MesosSchedulerBackend(
 
   private[mesos] val mesosExecutorCores = sc.conf.getDouble("spark.mesos.mesosExecutor.cores", 1)
 
+  // Offer constraints
+  private[this] val slaveOfferConstraints =
+    parseConstraintString(sc.conf.get("spark.mesos.constraints", ""))
+
   @volatile var appId: String = _
 
   override def start() {
@@ -71,8 +75,8 @@ private[spark] class MesosSchedulerBackend(
     val executorSparkHome = sc.conf.getOption("spark.mesos.executor.home")
       .orElse(sc.getSparkHome()) // Fall back to driver Spark home for backward compatibility
       .getOrElse {
-        throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!")
-      }
+      throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!")
+    }
     val environment = Environment.newBuilder()
     sc.conf.getOption("spark.executor.extraClassPath").foreach { cp =>
       environment.addVariables(
@@ -115,14 +119,14 @@ private[spark] class MesosSchedulerBackend(
       .setName("cpus")
       .setType(Value.Type.SCALAR)
       .setScalar(Value.Scalar.newBuilder()
-        .setValue(mesosExecutorCores).build())
+      .setValue(mesosExecutorCores).build())
       .build()
     val memory = Resource.newBuilder()
       .setName("mem")
       .setType(Value.Type.SCALAR)
       .setScalar(
         Value.Scalar.newBuilder()
-          .setValue(MemoryUtils.calculateTotalMemory(sc)).build())
+          .setValue(calculateTotalMemory(sc)).build())
       .build()
     val executorInfo = MesosExecutorInfo.newBuilder()
       .setExecutorId(ExecutorID.newBuilder().setValue(execId).build())
@@ -191,13 +195,31 @@ private[spark] class MesosSchedulerBackend(
         val mem = getResource(o.getResourcesList, "mem")
         val cpus = getResource(o.getResourcesList, "cpus")
         val slaveId = o.getSlaveId.getValue
-        (mem >= MemoryUtils.calculateTotalMemory(sc) &&
-          // need at least 1 for executor, 1 for task
-          cpus >= (mesosExecutorCores + scheduler.CPUS_PER_TASK)) ||
-          (slaveIdsWithExecutors.contains(slaveId) &&
-            cpus >= scheduler.CPUS_PER_TASK)
+        val offerAttributes = toAttributeMap(o.getAttributesList)
+
+        // check if all constraints are satisfield
+        //  1. Attribute constraints
+        //  2. Memory requirements
+        //  3. CPU requirements - need at least 1 for executor, 1 for task
+        val meetsConstraints = matchesAttributeRequirements(slaveOfferConstraints, offerAttributes)
+        val meetsMemoryRequirements = mem >= calculateTotalMemory(sc)
+        val meetsCPURequirements = cpus >= (mesosExecutorCores + scheduler.CPUS_PER_TASK)
+
+        val meetsRequirements =
+          (meetsConstraints && meetsMemoryRequirements && meetsCPURequirements) ||
+          (slaveIdsWithExecutors.contains(slaveId) && cpus >= scheduler.CPUS_PER_TASK)
+
+        // add some debug messaging
+        val debugstr = if (meetsRequirements) "Accepting" else "Declining"
+        val id = o.getId.getValue
+        logDebug(s"$debugstr offer: $id with attributes: $offerAttributes mem: $mem cpu: $cpus")
+
+        meetsRequirements
       }
 
+      // Decline offers we ruled out immediately
+      unUsableOffers.foreach(o => d.declineOffer(o.getId))
+
       val workerOffers = usableOffers.map { o =>
         val cpus = if (slaveIdsWithExecutors.contains(o.getSlaveId.getValue)) {
           getResource(o.getResourcesList, "cpus").toInt
@@ -223,15 +245,15 @@ private[spark] class MesosSchedulerBackend(
       val acceptedOffers = scheduler.resourceOffers(workerOffers).filter(!_.isEmpty)
       acceptedOffers
         .foreach { offer =>
-          offer.foreach { taskDesc =>
-            val slaveId = taskDesc.executorId
-            slaveIdsWithExecutors += slaveId
-            slavesIdsOfAcceptedOffers += slaveId
-            taskIdToSlaveId(taskDesc.taskId) = slaveId
-            mesosTasks.getOrElseUpdate(slaveId, new JArrayList[MesosTaskInfo])
-              .add(createMesosTask(taskDesc, slaveId))
-          }
+        offer.foreach { taskDesc =>
+          val slaveId = taskDesc.executorId
+          slaveIdsWithExecutors += slaveId
+          slavesIdsOfAcceptedOffers += slaveId
+          taskIdToSlaveId(taskDesc.taskId) = slaveId
+          mesosTasks.getOrElseUpdate(slaveId, new JArrayList[MesosTaskInfo])
+            .add(createMesosTask(taskDesc, slaveId))
         }
+      }
 
       // Reply to the offers
       val filters = Filters.newBuilder().setRefuseSeconds(1).build() // TODO: lower timeout?
@@ -251,8 +273,6 @@ private[spark] class MesosSchedulerBackend(
         d.declineOffer(o.getId)
       }
 
-      // Decline offers we ruled out immediately
-      unUsableOffers.foreach(o => d.declineOffer(o.getId))
     }
   }
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
index d11228f3d016a..d8a8c848bb4d1 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
@@ -17,14 +17,17 @@
 
 package org.apache.spark.scheduler.cluster.mesos
 
-import java.util.List
+import java.util.{List => JList}
 import java.util.concurrent.CountDownLatch
 
 import scala.collection.JavaConversions._
+import scala.util.control.NonFatal
 
-import org.apache.mesos.Protos.{FrameworkInfo, Resource, Status}
-import org.apache.mesos.{MesosSchedulerDriver, Scheduler}
-import org.apache.spark.Logging
+import com.google.common.base.Splitter
+import org.apache.mesos.{MesosSchedulerDriver, Protos, Scheduler}
+import org.apache.mesos.Protos._
+import org.apache.mesos.protobuf.GeneratedMessage
+import org.apache.spark.{Logging, SparkContext}
 import org.apache.spark.util.Utils
 
 /**
@@ -86,10 +89,150 @@ private[mesos] trait MesosSchedulerUtils extends Logging {
   /**
    * Get the amount of resources for the specified type from the resource list
    */
-  protected def getResource(res: List[Resource], name: String): Double = {
+  protected def getResource(res: JList[Resource], name: String): Double = {
     for (r <- res if r.getName == name) {
       return r.getScalar.getValue
     }
     0.0
   }
+
+  /** Helper method to get the key,value-set pair for a Mesos Attribute protobuf */
+  protected def getAttribute(attr: Attribute): (String, Set[String]) = {
+    (attr.getName, attr.getText.getValue.split(',').toSet)
+  }
+
+
+  /** Build a Mesos resource protobuf object */
+  protected def createResource(resourceName: String, quantity: Double): Protos.Resource = {
+    Resource.newBuilder()
+      .setName(resourceName)
+      .setType(Value.Type.SCALAR)
+      .setScalar(Value.Scalar.newBuilder().setValue(quantity).build())
+      .build()
+  }
+
+  /**
+   * Converts the attributes from the resource offer into a Map of name -> Attribute Value
+   * The attribute values are the mesos attribute types and they are
+   * @param offerAttributes
+   * @return
+   */
+  protected def toAttributeMap(offerAttributes: JList[Attribute]): Map[String, GeneratedMessage] = {
+    offerAttributes.map(attr => {
+      val attrValue = attr.getType match {
+        case Value.Type.SCALAR => attr.getScalar
+        case Value.Type.RANGES => attr.getRanges
+        case Value.Type.SET => attr.getSet
+        case Value.Type.TEXT => attr.getText
+      }
+      (attr.getName, attrValue)
+    }).toMap
+  }
+
+
+  /**
+   * Match the requirements (if any) to the offer attributes.
+   * if attribute requirements are not specified - return true
+   * else if attribute is defined and no values are given, simple attribute presence is performed
+   * else if attribute name and value is specified, subset match is performed on slave attributes
+   */
+  def matchesAttributeRequirements(
+      slaveOfferConstraints: Map[String, Set[String]],
+      offerAttributes: Map[String, GeneratedMessage]): Boolean = {
+    slaveOfferConstraints.forall {
+      // offer has the required attribute and subsumes the required values for that attribute
+      case (name, requiredValues) =>
+        offerAttributes.get(name) match {
+          case None => false
+          case Some(_) if requiredValues.isEmpty => true // empty value matches presence
+          case Some(scalarValue: Value.Scalar) =>
+            // check if provided values is less than equal to the offered values
+            requiredValues.map(_.toDouble).exists(_ <= scalarValue.getValue)
+          case Some(rangeValue: Value.Range) =>
+            val offerRange = rangeValue.getBegin to rangeValue.getEnd
+            // Check if there is some required value that is between the ranges specified
+            // Note: We only support the ability to specify discrete values, in the future
+            // we may expand it to subsume ranges specified with a XX..YY value or something
+            // similar to that.
+            requiredValues.map(_.toLong).exists(offerRange.contains(_))
+          case Some(offeredValue: Value.Set) =>
+            // check if the specified required values is a subset of offered set
+            requiredValues.subsetOf(offeredValue.getItemList.toSet)
+          case Some(textValue: Value.Text) =>
+            // check if the specified value is equal, if multiple values are specified
+            // we succeed if any of them match.
+            requiredValues.contains(textValue.getValue)
+        }
+    }
+  }
+
+  /**
+   * Parses the attributes constraints provided to spark and build a matching data struct:
+   *  Map[<attribute-name>, Set[values-to-match]]
+   *  The constraints are specified as ';' separated key-value pairs where keys and values
+   *  are separated by ':'. The ':' implies equality (for singular values) and "is one of" for
+   *  multiple values (comma separated). For example:
+   *  {{{
+   *  parseConstraintString("tachyon:true;zone:us-east-1a,us-east-1b")
+   *  // would result in
+   *  <code>
+   *  Map(
+   *    "tachyon" -> Set("true"),
+   *    "zone":   -> Set("us-east-1a", "us-east-1b")
+   *  )
+   *  }}}
+   *
+   *  Mesos documentation: http://mesos.apache.org/documentation/attributes-resources/
+   *                       https://github.com/apache/mesos/blob/master/src/common/values.cpp
+   *                       https://github.com/apache/mesos/blob/master/src/common/attributes.cpp
+   *
+   * @param constraintsVal constaints string consisting of ';' separated key-value pairs (separated
+   *                       by ':')
+   * @return  Map of constraints to match resources offers.
+   */
+  def parseConstraintString(constraintsVal: String): Map[String, Set[String]] = {
+    /*
+      Based on mesos docs:
+      attributes : attribute ( ";" attribute )*
+      attribute : labelString ":" ( labelString | "," )+
+      labelString : [a-zA-Z0-9_/.-]
+    */
+    val splitter = Splitter.on(';').trimResults().withKeyValueSeparator(':')
+    // kv splitter
+    if (constraintsVal.isEmpty) {
+      Map()
+    } else {
+      try {
+        Map() ++ mapAsScalaMap(splitter.split(constraintsVal)).map {
+          case (k, v) =>
+            if (v == null || v.isEmpty) {
+              (k, Set[String]())
+            } else {
+              (k, v.split(',').toSet)
+            }
+        }
+      } catch {
+        case NonFatal(e) =>
+          throw new IllegalArgumentException(s"Bad constraint string: $constraintsVal", e)
+      }
+    }
+  }
+
+  // These defaults copied from YARN
+  private val MEMORY_OVERHEAD_FRACTION = 0.10
+  private val MEMORY_OVERHEAD_MINIMUM = 384
+
+  /**
+   * Return the amount of memory to allocate to each executor, taking into account
+   * container overheads.
+   * @param sc SparkContext to use to get `spark.mesos.executor.memoryOverhead` value
+   * @return memory requirement as (0.1 * <memoryOverhead>) or MEMORY_OVERHEAD_MINIMUM
+   *         (whichever is larger)
+   */
+  def calculateTotalMemory(sc: SparkContext): Int = {
+    sc.conf.getInt("spark.mesos.executor.memoryOverhead",
+      math.max(MEMORY_OVERHEAD_FRACTION * sc.executorMemory, MEMORY_OVERHEAD_MINIMUM).toInt) +
+      sc.executorMemory
+  }
+
 }
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala
deleted file mode 100644
index e72285d03d3ee..0000000000000
--- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MemoryUtilsSuite.scala
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import org.mockito.Mockito._
-import org.scalatest.mock.MockitoSugar
-
-import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
-
-class MemoryUtilsSuite extends SparkFunSuite with MockitoSugar {
-  test("MesosMemoryUtils should always override memoryOverhead when it's set") {
-    val sparkConf = new SparkConf
-
-    val sc = mock[SparkContext]
-    when(sc.conf).thenReturn(sparkConf)
-
-    // 384 > sc.executorMemory * 0.1 => 512 + 384 = 896
-    when(sc.executorMemory).thenReturn(512)
-    assert(MemoryUtils.calculateTotalMemory(sc) === 896)
-
-    // 384 < sc.executorMemory * 0.1 => 4096 + (4096 * 0.1) = 4505.6
-    when(sc.executorMemory).thenReturn(4096)
-    assert(MemoryUtils.calculateTotalMemory(sc) === 4505)
-
-    // set memoryOverhead
-    sparkConf.set("spark.mesos.executor.memoryOverhead", "100")
-    assert(MemoryUtils.calculateTotalMemory(sc) === 4196)
-    sparkConf.set("spark.mesos.executor.memoryOverhead", "400")
-    assert(MemoryUtils.calculateTotalMemory(sc) === 4496)
-  }
-}
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
index 68df46a41ddc8..d01837fe78957 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
@@ -149,7 +149,9 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi
     when(sc.conf).thenReturn(new SparkConf)
     when(sc.listenerBus).thenReturn(listenerBus)
 
-    val minMem = MemoryUtils.calculateTotalMemory(sc).toInt
+    val backend = new MesosSchedulerBackend(taskScheduler, sc, "master")
+
+    val minMem = backend.calculateTotalMemory(sc)
     val minCpu = 4
 
     val mesosOffers = new java.util.ArrayList[Offer]
@@ -157,8 +159,6 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi
     mesosOffers.add(createOffer(2, minMem - 1, minCpu))
     mesosOffers.add(createOffer(3, minMem, minCpu))
 
-    val backend = new MesosSchedulerBackend(taskScheduler, sc, "master")
-
     val expectedWorkerOffers = new ArrayBuffer[WorkerOffer](2)
     expectedWorkerOffers.append(new WorkerOffer(
       mesosOffers.get(0).getSlaveId.getValue,
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala
new file mode 100644
index 0000000000000..b354914b6ffd0
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import org.apache.mesos.Protos.Value
+import org.mockito.Mockito._
+import org.scalatest._
+import org.scalatest.mock.MockitoSugar
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
+
+class MesosSchedulerUtilsSuite extends SparkFunSuite with Matchers with MockitoSugar {
+
+  // scalastyle:off structural.type
+  // this is the documented way of generating fixtures in scalatest
+  def fixture: Object {val sc: SparkContext; val sparkConf: SparkConf} = new {
+    val sparkConf = new SparkConf
+    val sc = mock[SparkContext]
+    when(sc.conf).thenReturn(sparkConf)
+  }
+  val utils = new MesosSchedulerUtils { }
+  // scalastyle:on structural.type
+
+  test("use at-least minimum overhead") {
+    val f = fixture
+    when(f.sc.executorMemory).thenReturn(512)
+    utils.calculateTotalMemory(f.sc) shouldBe 896
+  }
+
+  test("use overhead if it is greater than minimum value") {
+    val f = fixture
+    when(f.sc.executorMemory).thenReturn(4096)
+    utils.calculateTotalMemory(f.sc) shouldBe 4505
+  }
+
+  test("use spark.mesos.executor.memoryOverhead (if set)") {
+    val f = fixture
+    when(f.sc.executorMemory).thenReturn(1024)
+    f.sparkConf.set("spark.mesos.executor.memoryOverhead", "512")
+    utils.calculateTotalMemory(f.sc) shouldBe 1536
+  }
+
+  test("parse a non-empty constraint string correctly") {
+    val expectedMap = Map(
+      "tachyon" -> Set("true"),
+      "zone" -> Set("us-east-1a", "us-east-1b")
+    )
+    utils.parseConstraintString("tachyon:true;zone:us-east-1a,us-east-1b") should be (expectedMap)
+  }
+
+  test("parse an empty constraint string correctly") {
+    utils.parseConstraintString("") shouldBe Map()
+  }
+
+  test("throw an exception when the input is malformed") {
+    an[IllegalArgumentException] should be thrownBy
+      utils.parseConstraintString("tachyon;zone:us-east")
+  }
+
+  test("empty values for attributes' constraints matches all values") {
+    val constraintsStr = "tachyon:"
+    val parsedConstraints = utils.parseConstraintString(constraintsStr)
+
+    parsedConstraints shouldBe Map("tachyon" -> Set())
+
+    val zoneSet = Value.Set.newBuilder().addItem("us-east-1a").addItem("us-east-1b").build()
+    val noTachyonOffer = Map("zone" -> zoneSet)
+    val tachyonTrueOffer = Map("tachyon" -> Value.Text.newBuilder().setValue("true").build())
+    val tachyonFalseOffer = Map("tachyon" -> Value.Text.newBuilder().setValue("false").build())
+
+    utils.matchesAttributeRequirements(parsedConstraints, noTachyonOffer) shouldBe false
+    utils.matchesAttributeRequirements(parsedConstraints, tachyonTrueOffer) shouldBe true
+    utils.matchesAttributeRequirements(parsedConstraints, tachyonFalseOffer) shouldBe true
+  }
+
+  test("subset match is performed for set attributes") {
+    val supersetConstraint = Map(
+      "tachyon" -> Value.Text.newBuilder().setValue("true").build(),
+      "zone" -> Value.Set.newBuilder()
+        .addItem("us-east-1a")
+        .addItem("us-east-1b")
+        .addItem("us-east-1c")
+        .build())
+
+    val zoneConstraintStr = "tachyon:;zone:us-east-1a,us-east-1c"
+    val parsedConstraints = utils.parseConstraintString(zoneConstraintStr)
+
+    utils.matchesAttributeRequirements(parsedConstraints, supersetConstraint) shouldBe true
+  }
+
+  test("less than equal match is performed on scalar attributes") {
+    val offerAttribs = Map("gpus" -> Value.Scalar.newBuilder().setValue(3).build())
+
+    val ltConstraint = utils.parseConstraintString("gpus:2")
+    val eqConstraint = utils.parseConstraintString("gpus:3")
+    val gtConstraint = utils.parseConstraintString("gpus:4")
+
+    utils.matchesAttributeRequirements(ltConstraint, offerAttribs) shouldBe true
+    utils.matchesAttributeRequirements(eqConstraint, offerAttribs) shouldBe true
+    utils.matchesAttributeRequirements(gtConstraint, offerAttribs) shouldBe false
+  }
+
+  test("contains match is performed for range attributes") {
+    val offerAttribs = Map("ports" -> Value.Range.newBuilder().setBegin(7000).setEnd(8000).build())
+    val ltConstraint = utils.parseConstraintString("ports:6000")
+    val eqConstraint = utils.parseConstraintString("ports:7500")
+    val gtConstraint = utils.parseConstraintString("ports:8002")
+    val multiConstraint = utils.parseConstraintString("ports:5000,7500,8300")
+
+    utils.matchesAttributeRequirements(ltConstraint, offerAttribs) shouldBe false
+    utils.matchesAttributeRequirements(eqConstraint, offerAttribs) shouldBe true
+    utils.matchesAttributeRequirements(gtConstraint, offerAttribs) shouldBe false
+    utils.matchesAttributeRequirements(multiConstraint, offerAttribs) shouldBe true
+  }
+
+  test("equality match is performed for text attributes") {
+    val offerAttribs = Map("tachyon" -> Value.Text.newBuilder().setValue("true").build())
+
+    val trueConstraint = utils.parseConstraintString("tachyon:true")
+    val falseConstraint = utils.parseConstraintString("tachyon:false")
+
+    utils.matchesAttributeRequirements(trueConstraint, offerAttribs) shouldBe true
+    utils.matchesAttributeRequirements(falseConstraint, offerAttribs) shouldBe false
+  }
+
+}
diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md
index 5f1d6daeb27f0..1f915d8ea1d73 100644
--- a/docs/running-on-mesos.md
+++ b/docs/running-on-mesos.md
@@ -184,6 +184,14 @@ acquire. By default, it will acquire *all* cores in the cluster (that get offere
 only makes sense if you run just one application at a time. You can cap the maximum number of cores
 using `conf.set("spark.cores.max", "10")` (for example).
 
+You may also make use of `spark.mesos.constraints` to set attribute based constraints on mesos resource offers. By default, all resource offers will be accepted.
+
+{% highlight scala %}
+conf.set("spark.mesos.constraints", "tachyon=true;us-east-1=false")
+{% endhighlight %}
+
+For example, Let's say `spark.mesos.constraints` is set to `tachyon=true;us-east-1=false`, then the resource offers will be checked to see if they meet both these constraints and only then will be accepted to start new executors.
+
 # Mesos Docker Support
 
 Spark can make use of a Mesos Docker containerizer by setting the property `spark.mesos.executor.docker.image`
@@ -298,6 +306,20 @@ See the [configuration page](configuration.html) for information on Spark config
     the final overhead will be this value.
   </td>
 </tr>
+<tr>
+  <td><code>spark.mesos.constraints</code></td>
+  <td>Attribute based constraints to be matched against when accepting resource offers.</td>
+  <td>
+    Attribute based constraints on mesos resource offers. By default, all resource offers will be accepted. Refer to <a href="http://mesos.apache.org/documentation/attributes-resources/">Mesos Attributes & Resources</a> for more information on attributes.
+    <ul>
+      <li>Scalar constraints are matched with "less than equal" semantics i.e. value in the constraint must be less than or equal to the value in the resource offer.</li>
+      <li>Range constraints are matched with "contains" semantics i.e. value in the constraint must be within the resource offer's value.</li>
+      <li>Set constraints are matched with "subset of" semantics i.e. value in the constraint must be a subset of the resource offer's value.</li>
+      <li>Text constraints are metched with "equality" semantics i.e. value in the constraint must be exactly equal to the resource offer's value.</li>
+      <li>In case there is no value present as a part of the constraint any offer with the corresponding attribute will be accepted (without value check).</li>
+    </ul>
+  </td>
+</tr>
 </table>
 
 # Troubleshooting and Debugging

From 96c5eeec3970e8b1ebc6ddf5c97a7acc47f539dc Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Mon, 6 Jul 2015 16:11:22 -0700
Subject: [PATCH 019/549] Revert "[SPARK-7212] [MLLIB] Add sequence learning
 flag"

This reverts commit 25f574eb9a3cb9b93b7d9194a8ec16e00ce2c036. After speaking to some users and developers, we realized that FP-growth doesn't meet the requirement for frequent sequence mining. PrefixSpan (SPARK-6487) would be the correct algorithm for it. feynmanliang

Author: Xiangrui Meng <meng@databricks.com>

Closes #7240 from mengxr/SPARK-7212.revert and squashes the following commits:

2b3d66b [Xiangrui Meng] Revert "[SPARK-7212] [MLLIB] Add sequence learning flag"
---
 .../org/apache/spark/mllib/fpm/FPGrowth.scala | 38 +++-----------
 .../spark/mllib/fpm/FPGrowthSuite.scala       | 52 +------------------
 python/pyspark/mllib/fpm.py                   |  4 +-
 3 files changed, 12 insertions(+), 82 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index abac08022ea47..efa8459d3cdba 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -36,7 +36,7 @@ import org.apache.spark.storage.StorageLevel
  * :: Experimental ::
  *
  * Model trained by [[FPGrowth]], which holds frequent itemsets.
- * @param freqItemsets frequent itemsets, which is an RDD of [[FreqItemset]]
+ * @param freqItemsets frequent itemset, which is an RDD of [[FreqItemset]]
  * @tparam Item item type
  */
 @Experimental
@@ -62,14 +62,13 @@ class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) ex
 @Experimental
 class FPGrowth private (
     private var minSupport: Double,
-    private var numPartitions: Int,
-    private var ordered: Boolean) extends Logging with Serializable {
+    private var numPartitions: Int) extends Logging with Serializable {
 
   /**
    * Constructs a default instance with default parameters {minSupport: `0.3`, numPartitions: same
-   * as the input data, ordered: `false`}.
+   * as the input data}.
    */
-  def this() = this(0.3, -1, false)
+  def this() = this(0.3, -1)
 
   /**
    * Sets the minimal support level (default: `0.3`).
@@ -87,15 +86,6 @@ class FPGrowth private (
     this
   }
 
-  /**
-   * Indicates whether to mine itemsets (unordered) or sequences (ordered) (default: false, mine
-   * itemsets).
-   */
-  def setOrdered(ordered: Boolean): this.type = {
-    this.ordered = ordered
-    this
-  }
-
   /**
    * Computes an FP-Growth model that contains frequent itemsets.
    * @param data input data set, each element contains a transaction
@@ -165,7 +155,7 @@ class FPGrowth private (
     .flatMap { case (part, tree) =>
       tree.extract(minCount, x => partitioner.getPartition(x) == part)
     }.map { case (ranks, count) =>
-      new FreqItemset(ranks.map(i => freqItems(i)).reverse.toArray, count, ordered)
+      new FreqItemset(ranks.map(i => freqItems(i)).toArray, count)
     }
   }
 
@@ -181,12 +171,9 @@ class FPGrowth private (
       itemToRank: Map[Item, Int],
       partitioner: Partitioner): mutable.Map[Int, Array[Int]] = {
     val output = mutable.Map.empty[Int, Array[Int]]
-    // Filter the basket by frequent items pattern
+    // Filter the basket by frequent items pattern and sort their ranks.
     val filtered = transaction.flatMap(itemToRank.get)
-    if (!this.ordered) {
-      ju.Arrays.sort(filtered)
-    }
-    // Generate conditional transactions
+    ju.Arrays.sort(filtered)
     val n = filtered.length
     var i = n - 1
     while (i >= 0) {
@@ -211,18 +198,9 @@ object FPGrowth {
    * Frequent itemset.
    * @param items items in this itemset. Java users should call [[FreqItemset#javaItems]] instead.
    * @param freq frequency
-   * @param ordered indicates if items represents an itemset (false) or sequence (true)
    * @tparam Item item type
    */
-  class FreqItemset[Item](val items: Array[Item], val freq: Long, val ordered: Boolean)
-    extends Serializable {
-
-    /**
-     * Auxillary constructor, assumes unordered by default.
-     */
-    def this(items: Array[Item], freq: Long) {
-      this(items, freq, false)
-    }
+  class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable {
 
     /**
      * Returns items in a Java List.
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
index 1a8a1e79f2810..66ae3543ecc4e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
 class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
 
 
-  test("FP-Growth frequent itemsets using String type") {
+  test("FP-Growth using String type") {
     val transactions = Seq(
       "r z h k p",
       "z y x w v u t s",
@@ -38,14 +38,12 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
     val model6 = fpg
       .setMinSupport(0.9)
       .setNumPartitions(1)
-      .setOrdered(false)
       .run(rdd)
     assert(model6.freqItemsets.count() === 0)
 
     val model3 = fpg
       .setMinSupport(0.5)
       .setNumPartitions(2)
-      .setOrdered(false)
       .run(rdd)
     val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
       (itemset.items.toSet, itemset.freq)
@@ -63,59 +61,17 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
     val model2 = fpg
       .setMinSupport(0.3)
       .setNumPartitions(4)
-      .setOrdered(false)
       .run(rdd)
     assert(model2.freqItemsets.count() === 54)
 
     val model1 = fpg
       .setMinSupport(0.1)
       .setNumPartitions(8)
-      .setOrdered(false)
       .run(rdd)
     assert(model1.freqItemsets.count() === 625)
   }
 
-  test("FP-Growth frequent sequences using String type"){
-    val transactions = Seq(
-      "r z h k p",
-      "z y x w v u t s",
-      "s x o n r",
-      "x z y m t s q e",
-      "z",
-      "x z y r q t p")
-      .map(_.split(" "))
-    val rdd = sc.parallelize(transactions, 2).cache()
-
-    val fpg = new FPGrowth()
-
-    val model1 = fpg
-      .setMinSupport(0.5)
-      .setNumPartitions(2)
-      .setOrdered(true)
-      .run(rdd)
-
-    /*
-      Use the following R code to verify association rules using arulesSequences package.
-
-      data = read_baskets("path", info = c("sequenceID","eventID","SIZE"))
-      freqItemSeq = cspade(data, parameter = list(support = 0.5))
-      resSeq = as(freqItemSeq, "data.frame")
-      resSeq$support = resSeq$support * length(transactions)
-      names(resSeq)[names(resSeq) == "support"] = "freq"
-      resSeq
-     */
-    val expected = Set(
-      (Seq("r"), 3L), (Seq("s"), 3L), (Seq("t"), 3L), (Seq("x"), 4L), (Seq("y"), 3L),
-      (Seq("z"), 5L), (Seq("z", "y"), 3L), (Seq("x", "t"), 3L), (Seq("y", "t"), 3L),
-      (Seq("z", "t"), 3L), (Seq("z", "y", "t"), 3L)
-    )
-    val freqItemseqs1 = model1.freqItemsets.collect().map { itemset =>
-      (itemset.items.toSeq, itemset.freq)
-    }.toSet
-    assert(freqItemseqs1 == expected)
-  }
-
-  test("FP-Growth frequent itemsets using Int type") {
+  test("FP-Growth using Int type") {
     val transactions = Seq(
       "1 2 3",
       "1 2 3 4",
@@ -132,14 +88,12 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
     val model6 = fpg
       .setMinSupport(0.9)
       .setNumPartitions(1)
-      .setOrdered(false)
       .run(rdd)
     assert(model6.freqItemsets.count() === 0)
 
     val model3 = fpg
       .setMinSupport(0.5)
       .setNumPartitions(2)
-      .setOrdered(false)
       .run(rdd)
     assert(model3.freqItemsets.first().items.getClass === Array(1).getClass,
       "frequent itemsets should use primitive arrays")
@@ -155,14 +109,12 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
     val model2 = fpg
       .setMinSupport(0.3)
       .setNumPartitions(4)
-      .setOrdered(false)
       .run(rdd)
     assert(model2.freqItemsets.count() === 15)
 
     val model1 = fpg
       .setMinSupport(0.1)
       .setNumPartitions(8)
-      .setOrdered(false)
       .run(rdd)
     assert(model1.freqItemsets.count() === 65)
   }
diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py
index b7f00d60069e6..bdc4a132b1b18 100644
--- a/python/pyspark/mllib/fpm.py
+++ b/python/pyspark/mllib/fpm.py
@@ -39,8 +39,8 @@ class FPGrowthModel(JavaModelWrapper):
     >>> data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
     >>> rdd = sc.parallelize(data, 2)
     >>> model = FPGrowth.train(rdd, 0.6, 2)
-    >>> sorted(model.freqItemsets().collect(), key=lambda x: x.items)
-    [FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'a', u'c'], freq=3), ...
+    >>> sorted(model.freqItemsets().collect())
+    [FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'c'], freq=3), ...
     """
 
     def freqItemsets(self):

From 0effe180f4c2cf37af1012b33b43912bdecaf756 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Mon, 6 Jul 2015 16:15:12 -0700
Subject: [PATCH 020/549] [SPARK-8765] [MLLIB] Fix PySpark
 PowerIterationClustering test issue

PySpark PowerIterationClustering test failure due to bad demo data.
If the data is small,  PowerIterationClustering will behavior indeterministic.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7177 from yanboliang/spark-8765 and squashes the following commits:

392ae54 [Yanbo Liang] fix model.assignments output
5ec3f1e [Yanbo Liang] fix PySpark PowerIterationClustering test issue
---
 python/pyspark/mllib/clustering.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index a3eab635282f6..ed4d78a2c6788 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -282,18 +282,30 @@ class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader):
 
     Model produced by [[PowerIterationClustering]].
 
-    >>> data = [(0, 1, 1.0), (0, 2, 1.0), (1, 3, 1.0), (2, 3, 1.0),
-    ...     (0, 3, 1.0), (1, 2, 1.0), (0, 4, 0.1)]
+    >>> data = [(0, 1, 1.0), (0, 2, 1.0), (0, 3, 1.0), (1, 2, 1.0), (1, 3, 1.0),
+    ... (2, 3, 1.0), (3, 4, 0.1), (4, 5, 1.0), (4, 15, 1.0), (5, 6, 1.0),
+    ... (6, 7, 1.0), (7, 8, 1.0), (8, 9, 1.0), (9, 10, 1.0), (10, 11, 1.0),
+    ... (11, 12, 1.0), (12, 13, 1.0), (13, 14, 1.0), (14, 15, 1.0)]
     >>> rdd = sc.parallelize(data, 2)
     >>> model = PowerIterationClustering.train(rdd, 2, 100)
     >>> model.k
     2
+    >>> result = sorted(model.assignments().collect(), key=lambda x: x.id)
+    >>> result[0].cluster == result[1].cluster == result[2].cluster == result[3].cluster
+    True
+    >>> result[4].cluster == result[5].cluster == result[6].cluster == result[7].cluster
+    True
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> model.save(sc, path)
     >>> sameModel = PowerIterationClusteringModel.load(sc, path)
     >>> sameModel.k
     2
+    >>> result = sorted(model.assignments().collect(), key=lambda x: x.id)
+    >>> result[0].cluster == result[1].cluster == result[2].cluster == result[3].cluster
+    True
+    >>> result[4].cluster == result[5].cluster == result[6].cluster == result[7].cluster
+    True
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)

From 7b467cc9348fa910e445ad08914a72f8ed4fc249 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Mon, 6 Jul 2015 16:26:31 -0700
Subject: [PATCH 021/549] [SPARK-8588] [SQL] Regression test

This PR adds regression test for https://issues.apache.org/jira/browse/SPARK-8588 (fixed by https://github.com/apache/spark/commit/457d07eaa023b44b75344110508f629925eb6247).

Author: Yin Huai <yhuai@databricks.com>

This patch had conflicts when merged, resolved by
Committer: Michael Armbrust <michael@databricks.com>

Closes #7103 from yhuai/SPARK-8588-test and squashes the following commits:

eb5f418 [Yin Huai] Add a query test.
c61a173 [Yin Huai] Regression test for SPARK-8588.
---
 .../analysis/HiveTypeCoercionSuite.scala      | 21 +++++++++++++++++++
 .../sql/hive/execution/SQLQuerySuite.scala    | 16 ++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index b56426617789e..93db33d44eb25 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -271,4 +271,25 @@ class HiveTypeCoercionSuite extends PlanTest {
       Literal(true)
     )
   }
+
+  /**
+   * There are rules that need to not fire before child expressions get resolved.
+   * We use this test to make sure those rules do not fire early.
+   */
+  test("make sure rules do not fire early") {
+    // InConversion
+    val inConversion = HiveTypeCoercion.InConversion
+    ruleTest(inConversion,
+      In(UnresolvedAttribute("a"), Seq(Literal(1))),
+      In(UnresolvedAttribute("a"), Seq(Literal(1)))
+    )
+    ruleTest(inConversion,
+      In(Literal("test"), Seq(UnresolvedAttribute("a"), Literal(1))),
+      In(Literal("test"), Seq(UnresolvedAttribute("a"), Literal(1)))
+    )
+    ruleTest(inConversion,
+      In(Literal("a"), Seq(Literal(1), Literal("b"))),
+      In(Literal("a"), Seq(Cast(Literal(1), StringType), Cast(Literal("b"), StringType)))
+    )
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 6d645393a6da1..bf9f2ecd51793 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -990,5 +990,21 @@ class SQLQuerySuite extends QueryTest {
         Timestamp.valueOf("1969-12-31 16:00:00"),
         String.valueOf("1969-12-31 16:00:00"),
         Timestamp.valueOf("1970-01-01 00:00:00")))
+
+  }
+
+  test("SPARK-8588 HiveTypeCoercion.inConversion fires too early") {
+    val df =
+      TestHive.createDataFrame(Seq((1, "2014-01-01"), (2, "2015-01-01"), (3, "2016-01-01")))
+    df.toDF("id", "date").registerTempTable("test_SPARK8588")
+    checkAnswer(
+      TestHive.sql(
+        """
+          |select id, concat(year(date))
+          |from test_SPARK8588 where concat(year(date), ' year') in ('2015 year', '2014 year')
+        """.stripMargin),
+      Row(1, "2014") :: Row(2, "2015") :: Nil
+    )
+    TestHive.dropTempTable("test_SPARK8588")
   }
 }

From 09a06418debc25da0191d98798f7c5016d39be91 Mon Sep 17 00:00:00 2001
From: animesh <animesh@apache.spark>
Date: Mon, 6 Jul 2015 16:39:49 -0700
Subject: [PATCH 022/549] [SPARK-8072] [SQL] Better AnalysisException for
 writing DataFrame with identically named columns

Adding a function checkConstraints which will check for the constraints to be applied on the dataframe / dataframe schema. Function called before storing the dataframe to an external storage. Function added in the corresponding datasource API.
cc rxin marmbrus

Author: animesh <animesh@apache.spark>

This patch had conflicts when merged, resolved by
Committer: Michael Armbrust <michael@databricks.com>

Closes #7013 from animeshbaranawal/8072 and squashes the following commits:

f70dd0e [animesh] Change IO exception to Analysis Exception
fd45e1b [animesh] 8072: Fix Style Issues
a8a964f [animesh] 8072: Improving on previous commits
3cc4d2c [animesh] Fix Style Issues
1a89115 [animesh] Fix Style Issues
98b4399 [animesh] 8072 : Moved the exception handling to ResolvedDataSource specific to parquet format
7c3d928 [animesh] 8072: Adding check to DataFrameWriter.scala
---
 .../apache/spark/sql/json/JSONRelation.scala  | 31 +++++++++++++++++++
 .../apache/spark/sql/parquet/newParquet.scala | 19 +++++++++++-
 .../org/apache/spark/sql/DataFrameSuite.scala | 24 ++++++++++++++
 3 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
index 69bf13e1e5a6a..2361d3bf52d2b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
@@ -22,6 +22,7 @@ import java.io.IOException
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
@@ -37,6 +38,17 @@ private[sql] class DefaultSource
     parameters.getOrElse("path", sys.error("'path' must be specified for json data."))
   }
 
+  /** Constraints to be imposed on dataframe to be stored. */
+  private def checkConstraints(data: DataFrame): Unit = {
+    if (data.schema.fieldNames.length != data.schema.fieldNames.distinct.length) {
+      val duplicateColumns = data.schema.fieldNames.groupBy(identity).collect {
+        case (x, ys) if ys.length > 1 => "\"" + x + "\""
+      }.mkString(", ")
+      throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " +
+        s"cannot save to JSON format")
+    }
+  }
+
   /** Returns a new base relation with the parameters. */
   override def createRelation(
       sqlContext: SQLContext,
@@ -63,6 +75,10 @@ private[sql] class DefaultSource
       mode: SaveMode,
       parameters: Map[String, String],
       data: DataFrame): BaseRelation = {
+    // check if dataframe satisfies the constraints
+    // before moving forward
+    checkConstraints(data)
+
     val path = checkPath(parameters)
     val filesystemPath = new Path(path)
     val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
@@ -130,6 +146,17 @@ private[sql] class JSONRelation(
       samplingRatio,
       userSpecifiedSchema)(sqlContext)
 
+  /** Constraints to be imposed on dataframe to be stored. */
+  private def checkConstraints(data: DataFrame): Unit = {
+    if (data.schema.fieldNames.length != data.schema.fieldNames.distinct.length) {
+      val duplicateColumns = data.schema.fieldNames.groupBy(identity).collect {
+        case (x, ys) if ys.length > 1 => "\"" + x + "\""
+      }.mkString(", ")
+      throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " +
+        s"cannot save to JSON format")
+    }
+  }
+
   private val useJacksonStreamingAPI: Boolean = sqlContext.conf.useJacksonStreamingAPI
 
   override val needConversion: Boolean = false
@@ -178,6 +205,10 @@ private[sql] class JSONRelation(
   }
 
   override def insert(data: DataFrame, overwrite: Boolean): Unit = {
+    // check if dataframe satisfies constraints
+    // before moving forward
+    checkConstraints(data)
+
     val filesystemPath = path match {
       case Some(p) => new Path(p)
       case None =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 5ac3e9a44e6fe..6bc69c6ad0847 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -164,7 +164,24 @@ private[sql] class ParquetRelation2(
     }
   }
 
-  override def dataSchema: StructType = maybeDataSchema.getOrElse(metadataCache.dataSchema)
+  /** Constraints on schema of dataframe to be stored. */
+  private def checkConstraints(schema: StructType): Unit = {
+    if (schema.fieldNames.length != schema.fieldNames.distinct.length) {
+      val duplicateColumns = schema.fieldNames.groupBy(identity).collect {
+        case (x, ys) if ys.length > 1 => "\"" + x + "\""
+      }.mkString(", ")
+      throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " +
+        s"cannot save to parquet format")
+    }
+  }
+
+  override def dataSchema: StructType = {
+    val schema = maybeDataSchema.getOrElse(metadataCache.dataSchema)
+    // check if schema satisfies the constraints
+    // before moving forward
+    checkConstraints(schema)
+    schema
+  }
 
   override private[sql] def refresh(): Unit = {
     super.refresh()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index afb1cf5f8d1cb..f592a9934d0e6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -737,4 +737,28 @@ class DataFrameSuite extends QueryTest {
     df.col("")
     df.col("t.``")
   }
+
+  test("SPARK-8072: Better Exception for Duplicate Columns") {
+    // only one duplicate column present
+    val e = intercept[org.apache.spark.sql.AnalysisException] {
+      val df1 = Seq((1, 2, 3), (2, 3, 4), (3, 4, 5)).toDF("column1", "column2", "column1")
+                .write.format("parquet").save("temp")
+    }
+    assert(e.getMessage.contains("Duplicate column(s)"))
+    assert(e.getMessage.contains("parquet"))
+    assert(e.getMessage.contains("column1"))
+    assert(!e.getMessage.contains("column2"))
+
+    // multiple duplicate columns present
+    val f = intercept[org.apache.spark.sql.AnalysisException] {
+      val df2 = Seq((1, 2, 3, 4, 5), (2, 3, 4, 5, 6), (3, 4, 5, 6, 7))
+                .toDF("column1", "column2", "column3", "column1", "column3")
+                .write.format("json").save("temp")
+    }
+    assert(f.getMessage.contains("Duplicate column(s)"))
+    assert(f.getMessage.contains("JSON"))
+    assert(f.getMessage.contains("column1"))
+    assert(f.getMessage.contains("column3"))
+    assert(!f.getMessage.contains("column2"))
+  }
 }

From d4d6d31db5cc5c69ac369f754b7489f444c9ba2f Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Mon, 6 Jul 2015 17:16:44 -0700
Subject: [PATCH 023/549] [SPARK-8463][SQL] Use DriverRegistry to load jdbc
 driver at writing path

JIRA: https://issues.apache.org/jira/browse/SPARK-8463

Currently, at the reading path, `DriverRegistry` is used to load needed jdbc driver at executors. However, at the writing path, we also need `DriverRegistry` to load jdbc driver.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6900 from viirya/jdbc_write_driver and squashes the following commits:

16cd04b [Liang-Chi Hsieh] Use DriverRegistry to load jdbc driver at writing path.
---
 .../main/scala/org/apache/spark/sql/jdbc/jdbc.scala   | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
index dd8aaf6474895..f7ea852fe7f58 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
@@ -58,13 +58,12 @@ package object jdbc {
      * are used.
      */
     def savePartition(
-        url: String,
+        getConnection: () => Connection,
         table: String,
         iterator: Iterator[Row],
         rddSchema: StructType,
-        nullTypes: Array[Int],
-        properties: Properties): Iterator[Byte] = {
-      val conn = DriverManager.getConnection(url, properties)
+        nullTypes: Array[Int]): Iterator[Byte] = {
+      val conn = getConnection()
       var committed = false
       try {
         conn.setAutoCommit(false) // Everything in the same db transaction.
@@ -185,8 +184,10 @@ package object jdbc {
       }
 
       val rddSchema = df.schema
+      val driver: String = DriverRegistry.getDriverClassName(url)
+      val getConnection: () => Connection = JDBCRDD.getConnector(driver, url, properties)
       df.foreachPartition { iterator =>
-        JDBCWriteDetails.savePartition(url, table, iterator, rddSchema, nullTypes, properties)
+        JDBCWriteDetails.savePartition(getConnection, table, iterator, rddSchema, nullTypes)
       }
     }
 

From 9eae5fa642317dd11fc783d832d4cbb7e62db471 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Mon, 6 Jul 2015 19:22:30 -0700
Subject: [PATCH 024/549] [SPARK-8819] Fix build for maven 3.3.x

This is a workaround for MSHADE-148, which leads to an infinite loop when building Spark with maven 3.3.x. This was originally caused by #6441, which added a bunch of test dependencies on the spark-core test module. Recently, it was revealed by #7193.

This patch adds a `-Prelease` profile. If present, it will set `createDependencyReducedPom` to true. The consequences are:
- If you are releasing Spark with this profile, you are fine as long as you use maven 3.2.x or before.
- If you are releasing Spark without this profile, you will run into SPARK-8781.
- If you are not releasing Spark but you are using this profile, you may run into SPARK-8819.
- If you are not releasing Spark and you did not include this profile, you are fine.

This is all documented in `pom.xml` and tested locally with both versions of maven.

Author: Andrew Or <andrew@databricks.com>

Closes #7219 from andrewor14/fix-maven-build and squashes the following commits:

1d37e87 [Andrew Or] Merge branch 'master' of github.com:apache/spark into fix-maven-build
3574ae4 [Andrew Or] Review comments
f39199c [Andrew Or] Create a -Prelease profile that flags `createDependencyReducedPom`
---
 dev/create-release/create-release.sh |  4 ++--
 pom.xml                              | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh
index 54274a83f6d66..cfe2cd4752b3f 100755
--- a/dev/create-release/create-release.sh
+++ b/dev/create-release/create-release.sh
@@ -118,13 +118,13 @@ if [[ ! "$@" =~ --skip-publish ]]; then
 
   rm -rf $SPARK_REPO
 
-  build/mvn -DskipTests -Pyarn -Phive \
+  build/mvn -DskipTests -Pyarn -Phive -Prelease-profile\
     -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
     clean install
 
   ./dev/change-version-to-2.11.sh
   
-  build/mvn -DskipTests -Pyarn -Phive \
+  build/mvn -DskipTests -Pyarn -Phive -Prelease-profile\
     -Dscala-2.11 -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
     clean install
 
diff --git a/pom.xml b/pom.xml
index ffa96128a3d61..fbcc9152765cf 100644
--- a/pom.xml
+++ b/pom.xml
@@ -161,6 +161,8 @@
     <fasterxml.jackson.version>2.4.4</fasterxml.jackson.version>
     <snappy.version>1.1.1.7</snappy.version>
     <netlib.java.version>1.1.2</netlib.java.version>
+    <!-- For maven shade plugin (see SPARK-8819) -->
+    <create.dependency.reduced.pom>false</create.dependency.reduced.pom>
 
     <test.java.home>${java.home}</test.java.home>
 
@@ -1440,6 +1442,8 @@
         <version>2.3</version>
         <configuration>
           <shadedArtifactAttached>false</shadedArtifactAttached>
+          <!-- Work around MSHADE-148. See SPARK-8819. -->
+          <createDependencyReducedPom>${create.dependency.reduced.pom}</createDependencyReducedPom>
           <artifactSet>
             <includes>
               <!-- At a minimum we must include this to force effective pom generation -->
@@ -1826,6 +1830,26 @@
       </properties>
     </profile>
 
+    <profile>
+      <!--
+          Use this profile only for making Spark releases. Note that due to SPARK-8819,
+          you must use maven version 3.2.x or before to avoid running into MSHADE-148.
+      -->
+      <id>release-profile</id>
+      <properties>
+        <!--
+            The maven shade plugin has a bug where enabling the `createDependencyReducedPom`
+            property causes maven to go into an infinite loop (MSHADE-148). This is only an
+            issue for the Spark build if the maven version is 3.3.x or newer (SPARK-8819).
+
+            However, since disabling this property has the side effect of not resolving
+            variables in the released pom files (SPARK-8781), we need to enable this during
+            releases.
+        -->
+        <create.dependency.reduced.pom>true</create.dependency.reduced.pom>
+      </properties>
+    </profile>
+
     <!--
       These empty profiles are available in some sub-modules. Declare them here so that
       maven does not complain when they're provided on the command line for a sub-module

From 929dfa24b13b8d1bd08aa8a3b256766a9765f985 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Mon, 6 Jul 2015 19:27:04 -0700
Subject: [PATCH 025/549] Revert "[SPARK-8781] Fix variables in published
 pom.xml are not resolved"

This reverts commit 82cf3315e690f4ac15b50edea6a3d673aa5be4c0.

Conflicts:
	pom.xml
---
 extras/kinesis-asl/pom.xml                    |  7 +++++++
 .../kinesis/KinesisReceiverSuite.scala        | 20 ++++++++-----------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml
index 5289073eb457a..c242e7a57b9ab 100644
--- a/extras/kinesis-asl/pom.xml
+++ b/extras/kinesis-asl/pom.xml
@@ -40,6 +40,13 @@
       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-streaming_${scala.binary.version}</artifactId>
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
index 6c262624833cd..2103dca6b766f 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
@@ -26,23 +26,18 @@ import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionIn
 import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason
 import com.amazonaws.services.kinesis.model.Record
 import org.mockito.Mockito._
-// scalastyle:off
-// To avoid introducing a dependency on Spark core tests, simply use scalatest's FunSuite
-// here instead of our own SparkFunSuite. Introducing the dependency has caused problems
-// in the past (SPARK-8781) that are complicated by bugs in the maven shade plugin (MSHADE-148).
-import org.scalatest.{BeforeAndAfter, FunSuite, Matchers}
+import org.scalatest.{BeforeAndAfter, Matchers}
 import org.scalatest.mock.MockitoSugar
 
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
+import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext, TestSuiteBase}
 import org.apache.spark.util.{Clock, ManualClock, Utils}
 
 /**
  * Suite of Kinesis streaming receiver tests focusing mostly on the KinesisRecordProcessor
  */
-class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter
-  with MockitoSugar {
-// scalastyle:on
+class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAfter
+    with MockitoSugar {
 
   val app = "TestKinesisReceiver"
   val stream = "mySparkStream"
@@ -62,7 +57,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter
   var checkpointStateMock: KinesisCheckpointState = _
   var currentClockMock: Clock = _
 
-  before {
+  override def beforeFunction(): Unit = {
     receiverMock = mock[KinesisReceiver]
     checkpointerMock = mock[IRecordProcessorCheckpointer]
     checkpointClockMock = mock[ManualClock]
@@ -70,7 +65,8 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter
     currentClockMock = mock[Clock]
   }
 
-  after {
+  override def afterFunction(): Unit = {
+    super.afterFunction()
     // Since this suite was originally written using EasyMock, add this to preserve the old
     // mocking semantics (see SPARK-5735 for more details)
     verifyNoMoreInteractions(receiverMock, checkpointerMock, checkpointClockMock,
@@ -78,7 +74,7 @@ class KinesisReceiverSuite extends FunSuite with Matchers with BeforeAndAfter
   }
 
   test("KinesisUtils API") {
-    val ssc = new StreamingContext("local[2]", getClass.getSimpleName, Seconds(1))
+    val ssc = new StreamingContext(master, framework, batchDuration)
     // Tests the API, does not actually test data receiving
     val kinesisStream1 = KinesisUtils.createStream(ssc, "mySparkStream",
       "https://kinesis.us-west-2.amazonaws.com", Seconds(2),

From 1821fc165808143e98b3d9626141b1a55bde90ac Mon Sep 17 00:00:00 2001
From: Takeshi YAMAMURO <linguin.m.s@gmail.com>
Date: Mon, 6 Jul 2015 19:44:31 -0700
Subject: [PATCH 026/549] [SPARK-6747] [SQL] Throw an AnalysisException when
 unsupported Java list types used in Hive UDF

The current implementation can't handle List<> as a return type in Hive UDF and
throws meaningless Match Error.
We assume an UDF below;
public class UDFToListString extends UDF {
public List<String> evaluate(Object o)
{ return Arrays.asList("xxx", "yyy", "zzz"); }
}
An exception of scala.MatchError is thrown as follows when the UDF used;
scala.MatchError: interface java.util.List (of class java.lang.Class)
at org.apache.spark.sql.hive.HiveInspectors$class.javaClassToDataType(HiveInspectors.scala:174)
at org.apache.spark.sql.hive.HiveSimpleUdf.javaClassToDataType(hiveUdfs.scala:76)
at org.apache.spark.sql.hive.HiveSimpleUdf.dataType$lzycompute(hiveUdfs.scala:106)
at org.apache.spark.sql.hive.HiveSimpleUdf.dataType(hiveUdfs.scala:106)
at org.apache.spark.sql.catalyst.expressions.Alias.toAttribute(namedExpressions.scala:131)
at org.apache.spark.sql.catalyst.planning.PhysicalOperation$$anonfun$collectAliases$1.applyOrElse(patterns.scala:95)
at org.apache.spark.sql.catalyst.planning.PhysicalOperation$$anonfun$collectAliases$1.applyOrElse(patterns.scala:94)
at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:33)
at scala.collection.TraversableLike$$anonfun$collect$1.apply(TraversableLike.scala:278)
...
To make udf developers more understood, we need to throw a more suitable exception.

Author: Takeshi YAMAMURO <linguin.m.s@gmail.com>

Closes #7248 from maropu/FixBugInHiveInspectors and squashes the following commits:

1c3df2a [Takeshi YAMAMURO] Fix comments
56305de [Takeshi YAMAMURO] Fix conflicts
92ed7a6 [Takeshi YAMAMURO] Throw an exception when java list type used
2844a8e [Takeshi YAMAMURO] Apply comments
7114a47 [Takeshi YAMAMURO] Add TODO comments in UDFToListString of HiveUdfSuite
fdb2ae4 [Takeshi YAMAMURO] Add StringToUtf8 to comvert String into UTF8String
af61f2e [Takeshi YAMAMURO] Remove a new type
7f812fd [Takeshi YAMAMURO] Fix code-style errors
6984bf4 [Takeshi YAMAMURO] Apply review comments
93e3d4e [Takeshi YAMAMURO] Add a blank line at the end of UDFToListString
ee232db [Takeshi YAMAMURO] Support List as a return type in Hive UDF
1e82316 [Takeshi YAMAMURO] Apply comments
21e8763 [Takeshi YAMAMURO] Add TODO comments in UDFToListString of HiveUdfSuite
a488712 [Takeshi YAMAMURO] Add StringToUtf8 to comvert String into UTF8String
1c7b9d1 [Takeshi YAMAMURO] Remove a new type
f965c34 [Takeshi YAMAMURO] Fix code-style errors
9406416 [Takeshi YAMAMURO] Apply review comments
e21ce7e [Takeshi YAMAMURO] Add a blank line at the end of UDFToListString
e553f10 [Takeshi YAMAMURO] Support List as a return type in Hive UDF
---
 .../spark/sql/hive/HiveInspectors.scala       | 10 +++++-
 .../sql/hive/execution/UDFToListInt.java      | 29 +++++++++++++++++
 .../sql/hive/execution/UDFToListString.java   | 29 +++++++++++++++++
 .../sql/hive/execution/HiveUDFSuite.scala     | 32 ++++++++++++++++++-
 4 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToListInt.java
 create mode 100644 sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToListString.java

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index a6b8ead577fb5..7423d8030f2b5 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -26,8 +26,8 @@ import org.apache.hadoop.{io => hadoopIo}
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.types
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.{AnalysisException, types}
 import org.apache.spark.unsafe.types.UTF8String
 
 /* Implicit conversions */
@@ -218,6 +218,14 @@ private[hive] trait HiveInspectors {
 
     // Hive seems to return this for struct types?
     case c: Class[_] if c == classOf[java.lang.Object] => NullType
+
+    // java list type unsupported
+    case c: Class[_] if c == classOf[java.util.List[_]] =>
+      throw new AnalysisException(
+        "List type in java is unsupported because " +
+        "JVM type erasure makes spark fail to catch a component type in List<>")
+
+    case c => throw new AnalysisException(s"Unsupported java type $c")
   }
 
   /**
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToListInt.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToListInt.java
new file mode 100644
index 0000000000000..67576a72f1980
--- /dev/null
+++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToListInt.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution;
+
+import org.apache.hadoop.hive.ql.exec.UDF;
+
+import java.util.Arrays;
+import java.util.List;
+
+public class UDFToListInt extends UDF {
+    public List<Integer> evaluate(Object o) {
+        return Arrays.asList(1, 2, 3);
+    }
+}
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToListString.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToListString.java
new file mode 100644
index 0000000000000..f02395cbba88b
--- /dev/null
+++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToListString.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution;
+
+import org.apache.hadoop.hive.ql.exec.UDF;
+
+import java.util.Arrays;
+import java.util.List;
+
+public class UDFToListString extends UDF {
+    public List<String> evaluate(Object o) {
+        return Arrays.asList("data1", "data2", "data3");
+    }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala
index 56b0bef1d0571..44686204c2af7 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala
@@ -28,7 +28,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
 import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorFactory}
 import org.apache.hadoop.hive.serde2.{AbstractSerDe, SerDeStats}
 import org.apache.hadoop.io.Writable
-import org.apache.spark.sql.{QueryTest, Row}
+import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.hive.test.TestHive
 
 import org.apache.spark.util.Utils
@@ -133,6 +133,36 @@ class HiveUDFSuite extends QueryTest {
     TestHive.reset()
   }
 
+  test("UDFToListString") {
+    val testData = TestHive.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF()
+    testData.registerTempTable("inputTable")
+
+    sql(s"CREATE TEMPORARY FUNCTION testUDFToListString AS '${classOf[UDFToListString].getName}'")
+    val errMsg = intercept[AnalysisException] {
+      sql("SELECT testUDFToListString(s) FROM inputTable")
+    }
+    assert(errMsg.getMessage === "List type in java is unsupported because " +
+      "JVM type erasure makes spark fail to catch a component type in List<>;")
+
+    sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToListString")
+    TestHive.reset()
+  }
+
+  test("UDFToListInt") {
+    val testData = TestHive.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF()
+    testData.registerTempTable("inputTable")
+
+    sql(s"CREATE TEMPORARY FUNCTION testUDFToListInt AS '${classOf[UDFToListInt].getName}'")
+    val errMsg = intercept[AnalysisException] {
+      sql("SELECT testUDFToListInt(s) FROM inputTable")
+    }
+    assert(errMsg.getMessage === "List type in java is unsupported because " +
+      "JVM type erasure makes spark fail to catch a component type in List<>;")
+
+    sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToListInt")
+    TestHive.reset()
+  }
+
   test("UDFListListInt") {
     val testData = TestHive.sparkContext.parallelize(
       ListListIntCaseClass(Nil) ::

From 6718c1eb671faaf5c1d865ad5d01dbf78dae9cd2 Mon Sep 17 00:00:00 2001
From: Alok  Singh <singhal@Aloks-MacBook-Pro.local>
Date: Mon, 6 Jul 2015 21:53:55 -0700
Subject: [PATCH 027/549] [SPARK-5562] [MLLIB] LDA should handle empty
 document.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See the jira https://issues.apache.org/jira/browse/SPARK-5562

Author: Alok  Singh <singhal@Aloks-MacBook-Pro.local>
Author: Alok  Singh <singhal@aloks-mbp.usca.ibm.com>
Author: Alok Singh <“singhal@us.ibm.com”>

Closes #7064 from aloknsingh/aloknsingh_SPARK-5562 and squashes the following commits:

259a0a7 [Alok Singh] change as per the comments by @jkbradley
be48491 [Alok  Singh] [SPARK-5562][MLlib] re-order import in alphabhetical order
c01311b [Alok  Singh] [SPARK-5562][MLlib] fix the newline typo
b271c8a [Alok  Singh] [SPARK-5562][Mllib] As per github discussion with jkbradley. We would like to simply things.
7c06251 [Alok  Singh] [SPARK-5562][MLlib] modified the JavaLDASuite for test passing
c710cb6 [Alok  Singh] fix the scala code style to have space after :
2572a08 [Alok  Singh] [SPARK-5562][MLlib] change the import xyz._ to the import xyz.{c1, c2} ..
ab55fbf [Alok  Singh] [SPARK-5562][MLlib] Change as per Sean Owen's comments https://github.com/apache/spark/pull/7064/files#diff-9236d23975e6f5a5608ffc81dfd79146
9f4f9ea [Alok  Singh] [SPARK-5562][MLlib] LDA should handle empty document.
---
 docs/mllib-clustering.md                            |  2 +-
 .../apache/spark/mllib/clustering/JavaLDASuite.java | 13 +++++++++++--
 .../apache/spark/mllib/clustering/LDASuite.scala    | 13 +++++++++++--
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index 3aad4149f99db..d72dc20a5ad6e 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -447,7 +447,7 @@ It supports different inference algorithms via `setOptimizer` function. EMLDAOpt
 on the likelihood function and yields comprehensive results, while OnlineLDAOptimizer uses iterative mini-batch sampling for [online variational inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) and is generally memory friendly. After fitting on the documents, LDA provides:
 
 * Topics: Inferred topics, each of which is a probability distribution over terms (words).
-* Topic distributions for documents: For each document in the training set, LDA gives a probability distribution over topics. (EM only)
+* Topic distributions for documents: For each non empty document in the training set, LDA gives a probability distribution over topics. (EM only). Note that for empty documents, we don't create the topic distributions. (EM only)
 
 LDA takes the following parameters:
 
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index 581c033f08ebe..b48f190f599a2 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -28,12 +28,13 @@
 import org.junit.Before;
 import org.junit.Test;
 
+import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.mllib.linalg.Matrix;
 import org.apache.spark.mllib.linalg.Vector;
-
+import org.apache.spark.mllib.linalg.Vectors;
 
 public class JavaLDASuite implements Serializable {
   private transient JavaSparkContext sc;
@@ -110,7 +111,15 @@ public void distributedLDAModel() {
 
     // Check: topic distributions
     JavaPairRDD<Long, Vector> topicDistributions = model.javaTopicDistributions();
-    assertEquals(topicDistributions.count(), corpus.count());
+    // SPARK-5562. since the topicDistribution returns the distribution of the non empty docs
+    // over topics. Compare it against nonEmptyCorpus instead of corpus
+    JavaPairRDD<Long, Vector> nonEmptyCorpus = corpus.filter(
+      new Function<Tuple2<Long, Vector>, Boolean>() {
+        public Boolean call(Tuple2<Long, Vector> tuple2) {
+          return Vectors.norm(tuple2._2(), 1.0) != 0.0;
+        }
+    });
+    assertEquals(topicDistributions.count(), nonEmptyCorpus.count());
   }
 
   @Test
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 406affa25539d..03a8a2538b464 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -99,9 +99,13 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
 
     // Check: per-doc topic distributions
     val topicDistributions = model.topicDistributions.collect()
+
     //  Ensure all documents are covered.
-    assert(topicDistributions.length === tinyCorpus.length)
-    assert(tinyCorpus.map(_._1).toSet === topicDistributions.map(_._1).toSet)
+    // SPARK-5562. since the topicDistribution returns the distribution of the non empty docs
+    // over topics. Compare it against nonEmptyTinyCorpus instead of tinyCorpus
+    val nonEmptyTinyCorpus = getNonEmptyDoc(tinyCorpus)
+    assert(topicDistributions.length === nonEmptyTinyCorpus.length)
+    assert(nonEmptyTinyCorpus.map(_._1).toSet === topicDistributions.map(_._1).toSet)
     //  Ensure we have proper distributions
     topicDistributions.foreach { case (docId, topicDistribution) =>
       assert(topicDistribution.size === tinyK)
@@ -232,12 +236,17 @@ private[clustering] object LDASuite {
   }
 
   def tinyCorpus: Array[(Long, Vector)] = Array(
+    Vectors.dense(0, 0, 0, 0, 0), // empty doc
     Vectors.dense(1, 3, 0, 2, 8),
     Vectors.dense(0, 2, 1, 0, 4),
     Vectors.dense(2, 3, 12, 3, 1),
+    Vectors.dense(0, 0, 0, 0, 0), // empty doc
     Vectors.dense(0, 3, 1, 9, 8),
     Vectors.dense(1, 1, 4, 2, 6)
   ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
   assert(tinyCorpus.forall(_._2.size == tinyVocabSize)) // sanity check for test data
 
+  def getNonEmptyDoc(corpus: Array[(Long, Vector)]): Array[(Long, Vector)] = corpus.filter {
+    case (_, wc: Vector) => Vectors.norm(wc, p = 1.0) != 0.0
+  }
 }

From c46aaf47f38163e9c7be671d7b8398512df34e62 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 6 Jul 2015 22:13:50 -0700
Subject: [PATCH 028/549] [SPARK-8759][SQL] add default eval to binary and
 unary expression according to default behavior of nullable

We have `nullSafeCodeGen` to provide default code generation for binary and unary expression, and we can do the same thing for `eval`.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7157 from cloud-fan/refactor and squashes the following commits:

f3987c6 [Wenchen Fan] refactor Expression
---
 .../spark/sql/catalyst/expressions/Cast.scala |   7 +-
 .../sql/catalyst/expressions/Expression.scala |  69 ++++++-
 .../catalyst/expressions/ExtractValue.scala   |  51 ++---
 .../sql/catalyst/expressions/arithmetic.scala |  97 ++++-----
 .../sql/catalyst/expressions/bitwise.scala    |   8 +-
 .../expressions/decimalFunctions.scala        |  33 +---
 .../spark/sql/catalyst/expressions/math.scala | 186 +++++-------------
 .../spark/sql/catalyst/expressions/misc.scala | 153 ++++++--------
 .../sql/catalyst/expressions/predicates.scala |  84 +++-----
 .../spark/sql/catalyst/expressions/sets.scala |   9 +-
 .../expressions/stringOperations.scala        | 138 +++----------
 11 files changed, 292 insertions(+), 543 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 2d99d1a3fe8dc..4f73ba40b5a18 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -114,8 +114,6 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     }
   }
 
-  override def foldable: Boolean = child.foldable
-
   override def nullable: Boolean = Cast.forceNullable(child.dataType, dataType) || child.nullable
 
   override def toString: String = s"CAST($child, $dataType)"
@@ -426,10 +424,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
 
   private[this] lazy val cast: Any => Any = cast(child.dataType, dataType)
 
-  override def eval(input: InternalRow): Any = {
-    val evaluated = child.eval(input)
-    if (evaluated == null) null else cast(evaluated)
-  }
+  protected override def nullSafeEval(input: Any): Any = cast(input)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     // TODO: Add support for more data types.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index cafbbafdca207..386feb95b4ac8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -183,6 +183,27 @@ abstract class UnaryExpression extends Expression with trees.UnaryNode[Expressio
   override def foldable: Boolean = child.foldable
   override def nullable: Boolean = child.nullable
 
+  /**
+   * Default behavior of evaluation according to the default nullability of UnaryExpression.
+   * If subclass of UnaryExpression override nullable, probably should also override this.
+   */
+  override def eval(input: InternalRow): Any = {
+    val value = child.eval(input)
+    if (value == null) {
+      null
+    } else {
+      nullSafeEval(value)
+    }
+  }
+
+  /**
+   * Called by default [[eval]] implementation.  If subclass of UnaryExpression keep the default
+   * nullability, they can override this method to save null-check code.  If we need full control
+   * of evaluation process, we should override [[eval]].
+   */
+  protected def nullSafeEval(input: Any): Any =
+    sys.error(s"UnaryExpressions must override either eval or nullSafeEval")
+
   /**
    * Called by unary expressions to generate a code block that returns null if its parent returns
    * null, and if not not null, use `f` to generate the expression.
@@ -198,21 +219,24 @@ abstract class UnaryExpression extends Expression with trees.UnaryNode[Expressio
       ctx: CodeGenContext,
       ev: GeneratedExpressionCode,
       f: String => String): String = {
-    nullSafeCodeGen(ctx, ev, (result, eval) => {
-      s"$result = ${f(eval)};"
+    nullSafeCodeGen(ctx, ev, eval => {
+      s"${ev.primitive} = ${f(eval)};"
     })
   }
 
   /**
    * Called by unary expressions to generate a code block that returns null if its parent returns
    * null, and if not not null, use `f` to generate the expression.
+   *
+   * @param f function that accepts the non-null evaluation result name of child and returns Java
+   *          code to compute the output.
    */
   protected def nullSafeCodeGen(
       ctx: CodeGenContext,
       ev: GeneratedExpressionCode,
-      f: (String, String) => String): String = {
+      f: String => String): String = {
     val eval = child.gen(ctx)
-    val resultCode = f(ev.primitive, eval.primitive)
+    val resultCode = f(eval.primitive)
     eval.code + s"""
       boolean ${ev.isNull} = ${eval.isNull};
       ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
@@ -235,6 +259,32 @@ abstract class BinaryExpression extends Expression with trees.BinaryNode[Express
 
   override def nullable: Boolean = left.nullable || right.nullable
 
+  /**
+   * Default behavior of evaluation according to the default nullability of BinaryExpression.
+   * If subclass of BinaryExpression override nullable, probably should also override this.
+   */
+  override def eval(input: InternalRow): Any = {
+    val value1 = left.eval(input)
+    if (value1 == null) {
+      null
+    } else {
+      val value2 = right.eval(input)
+      if (value2 == null) {
+        null
+      } else {
+        nullSafeEval(value1, value2)
+      }
+    }
+  }
+
+  /**
+   * Called by default [[eval]] implementation.  If subclass of BinaryExpression keep the default
+   * nullability, they can override this method to save null-check code.  If we need full control
+   * of evaluation process, we should override [[eval]].
+   */
+  protected def nullSafeEval(input1: Any, input2: Any): Any =
+    sys.error(s"BinaryExpressions must override either eval or nullSafeEval")
+
   /**
    * Short hand for generating binary evaluation code.
    * If either of the sub-expressions is null, the result of this computation
@@ -246,8 +296,8 @@ abstract class BinaryExpression extends Expression with trees.BinaryNode[Express
     ctx: CodeGenContext,
     ev: GeneratedExpressionCode,
     f: (String, String) => String): String = {
-    nullSafeCodeGen(ctx, ev, (result, eval1, eval2) => {
-      s"$result = ${f(eval1, eval2)};"
+    nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
+      s"${ev.primitive} = ${f(eval1, eval2)};"
     })
   }
 
@@ -255,14 +305,17 @@ abstract class BinaryExpression extends Expression with trees.BinaryNode[Express
    * Short hand for generating binary evaluation code.
    * If either of the sub-expressions is null, the result of this computation
    * is assumed to be null.
+   *
+   * @param f function that accepts the 2 non-null evaluation result names of children
+   *          and returns Java code to compute the output.
    */
   protected def nullSafeCodeGen(
     ctx: CodeGenContext,
     ev: GeneratedExpressionCode,
-    f: (String, String, String) => String): String = {
+    f: (String, String) => String): String = {
     val eval1 = left.gen(ctx)
     val eval2 = right.gen(ctx)
-    val resultCode = f(ev.primitive, eval1.primitive, eval2.primitive)
+    val resultCode = f(eval1.primitive, eval2.primitive)
     s"""
       ${eval1.code}
       boolean ${ev.isNull} = ${eval1.isNull};
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
index 3020e7fc967f2..e451c7ffbddc1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
@@ -122,18 +122,16 @@ case class GetStructField(child: Expression, field: StructField, ordinal: Int)
   override def dataType: DataType = field.dataType
   override def nullable: Boolean = child.nullable || field.nullable
 
-  override def eval(input: InternalRow): Any = {
-    val baseValue = child.eval(input).asInstanceOf[InternalRow]
-    if (baseValue == null) null else baseValue(ordinal)
-  }
+  protected override def nullSafeEval(input: Any): Any =
+    input.asInstanceOf[InternalRow](ordinal)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, (result, eval) => {
+    nullSafeCodeGen(ctx, ev, eval => {
       s"""
         if ($eval.isNullAt($ordinal)) {
           ${ev.isNull} = true;
         } else {
-          $result = ${ctx.getColumn(eval, dataType, ordinal)};
+          ${ev.primitive} = ${ctx.getColumn(eval, dataType, ordinal)};
         }
       """
     })
@@ -152,12 +150,9 @@ case class GetArrayStructFields(
   override def dataType: DataType = ArrayType(field.dataType, containsNull)
   override def nullable: Boolean = child.nullable || containsNull || field.nullable
 
-  override def eval(input: InternalRow): Any = {
-    val baseValue = child.eval(input).asInstanceOf[Seq[InternalRow]]
-    if (baseValue == null) null else {
-      baseValue.map { row =>
-        if (row == null) null else row(ordinal)
-      }
+  protected override def nullSafeEval(input: Any): Any = {
+    input.asInstanceOf[Seq[InternalRow]].map { row =>
+      if (row == null) null else row(ordinal)
     }
   }
 
@@ -165,7 +160,7 @@ case class GetArrayStructFields(
     val arraySeqClass = "scala.collection.mutable.ArraySeq"
     // TODO: consider using Array[_] for ArrayType child to avoid
     // boxing of primitives
-    nullSafeCodeGen(ctx, ev, (result, eval) => {
+    nullSafeCodeGen(ctx, ev, eval => {
       s"""
         final int n = $eval.size();
         final $arraySeqClass<Object> values = new $arraySeqClass<Object>(n);
@@ -175,7 +170,7 @@ case class GetArrayStructFields(
             values.update(j, ${ctx.getColumn("row", field.dataType, ordinal)});
           }
         }
-        $result = (${ctx.javaType(dataType)}) values;
+        ${ev.primitive} = (${ctx.javaType(dataType)}) values;
       """
     })
   }
@@ -193,22 +188,6 @@ abstract class ExtractValueWithOrdinal extends BinaryExpression with ExtractValu
   /** `Null` is returned for invalid ordinals. */
   override def nullable: Boolean = true
   override def toString: String = s"$child[$ordinal]"
-
-  override def eval(input: InternalRow): Any = {
-    val value = child.eval(input)
-    if (value == null) {
-      null
-    } else {
-      val o = ordinal.eval(input)
-      if (o == null) {
-        null
-      } else {
-        evalNotNull(value, o)
-      }
-    }
-  }
-
-  protected def evalNotNull(value: Any, ordinal: Any): Any
 }
 
 /**
@@ -219,7 +198,7 @@ case class GetArrayItem(child: Expression, ordinal: Expression)
 
   override def dataType: DataType = child.dataType.asInstanceOf[ArrayType].elementType
 
-  protected def evalNotNull(value: Any, ordinal: Any) = {
+  protected override def nullSafeEval(value: Any, ordinal: Any): Any = {
     // TODO: consider using Array[_] for ArrayType child to avoid
     // boxing of primitives
     val baseValue = value.asInstanceOf[Seq[_]]
@@ -232,13 +211,13 @@ case class GetArrayItem(child: Expression, ordinal: Expression)
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, (result, eval1, eval2) => {
+    nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
       s"""
         final int index = (int)$eval2;
         if (index >= $eval1.size() || index < 0) {
           ${ev.isNull} = true;
         } else {
-          $result = (${ctx.boxedType(dataType)})$eval1.apply(index);
+          ${ev.primitive} = (${ctx.boxedType(dataType)})$eval1.apply(index);
         }
       """
     })
@@ -253,16 +232,16 @@ case class GetMapValue(child: Expression, ordinal: Expression)
 
   override def dataType: DataType = child.dataType.asInstanceOf[MapType].valueType
 
-  protected def evalNotNull(value: Any, ordinal: Any) = {
+  protected override def nullSafeEval(value: Any, ordinal: Any): Any = {
     val baseValue = value.asInstanceOf[Map[Any, _]]
     baseValue.get(ordinal).orNull
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, (result, eval1, eval2) => {
+    nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
       s"""
         if ($eval1.contains($eval2)) {
-          $result = (${ctx.boxedType(dataType)})$eval1.apply($eval2);
+          ${ev.primitive} = (${ctx.boxedType(dataType)})$eval1.apply($eval2);
         } else {
           ${ev.isNull} = true;
         }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 4fbf4c87009c2..dca6642665741 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -26,18 +26,6 @@ abstract class UnaryArithmetic extends UnaryExpression {
   self: Product =>
 
   override def dataType: DataType = child.dataType
-
-  override def eval(input: InternalRow): Any = {
-    val evalE = child.eval(input)
-    if (evalE == null) {
-      null
-    } else {
-      evalInternal(evalE)
-    }
-  }
-
-  protected def evalInternal(evalE: Any): Any =
-    sys.error(s"UnaryArithmetics must override either eval or evalInternal")
 }
 
 case class UnaryMinus(child: Expression) extends UnaryArithmetic {
@@ -53,7 +41,7 @@ case class UnaryMinus(child: Expression) extends UnaryArithmetic {
     case dt: NumericType => defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})(-($c))")
   }
 
-  protected override def evalInternal(evalE: Any) = numeric.negate(evalE)
+  protected override def nullSafeEval(input: Any): Any = numeric.negate(input)
 }
 
 case class UnaryPositive(child: Expression) extends UnaryArithmetic {
@@ -62,7 +50,7 @@ case class UnaryPositive(child: Expression) extends UnaryArithmetic {
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
     defineCodeGen(ctx, ev, c => c)
 
-  protected override def evalInternal(evalE: Any) = evalE
+  protected override def nullSafeEval(input: Any): Any = input
 }
 
 /**
@@ -74,7 +62,7 @@ case class Abs(child: Expression) extends UnaryArithmetic {
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
-  protected override def evalInternal(evalE: Any) = numeric.abs(evalE)
+  protected override def nullSafeEval(input: Any): Any = numeric.abs(input)
 }
 
 abstract class BinaryArithmetic extends BinaryOperator {
@@ -94,20 +82,6 @@ abstract class BinaryArithmetic extends BinaryOperator {
 
   protected def checkTypesInternal(t: DataType): TypeCheckResult
 
-  override def eval(input: InternalRow): Any = {
-    val evalE1 = left.eval(input)
-    if(evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        evalInternal(evalE1, evalE2)
-      }
-    }
-  }
-
   /** Name of the function for this expression on a [[Decimal]] type. */
   def decimalMethod: String =
     sys.error("BinaryArithmetics must override either decimalMethod or genCode")
@@ -122,9 +96,6 @@ abstract class BinaryArithmetic extends BinaryOperator {
     case _ =>
       defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1 $symbol $eval2")
   }
-
-  protected def evalInternal(evalE1: Any, evalE2: Any): Any =
-    sys.error(s"BinaryArithmetics must override either eval or evalInternal")
 }
 
 private[sql] object BinaryArithmetic {
@@ -143,7 +114,7 @@ case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = numeric.plus(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = numeric.plus(input1, input2)
 }
 
 case class Subtract(left: Expression, right: Expression) extends BinaryArithmetic {
@@ -158,7 +129,7 @@ case class Subtract(left: Expression, right: Expression) extends BinaryArithmeti
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = numeric.minus(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = numeric.minus(input1, input2)
 }
 
 case class Multiply(left: Expression, right: Expression) extends BinaryArithmetic {
@@ -173,7 +144,7 @@ case class Multiply(left: Expression, right: Expression) extends BinaryArithmeti
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = numeric.times(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = numeric.times(input1, input2)
 }
 
 case class Divide(left: Expression, right: Expression) extends BinaryArithmetic {
@@ -194,15 +165,15 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
   }
 
   override def eval(input: InternalRow): Any = {
-    val evalE2 = right.eval(input)
-    if (evalE2 == null || evalE2 == 0) {
+    val input2 = right.eval(input)
+    if (input2 == null || input2 == 0) {
       null
     } else {
-      val evalE1 = left.eval(input)
-      if (evalE1 == null) {
+      val input1 = left.eval(input)
+      if (input1 == null) {
         null
       } else {
-        div(evalE1, evalE2)
+        div(input1, input2)
       }
     }
   }
@@ -260,15 +231,15 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
   }
 
   override def eval(input: InternalRow): Any = {
-    val evalE2 = right.eval(input)
-    if (evalE2 == null || evalE2 == 0) {
+    val input2 = right.eval(input)
+    if (input2 == null || input2 == 0) {
       null
     } else {
-      val evalE1 = left.eval(input)
-      if (evalE1 == null) {
+      val input1 = left.eval(input)
+      if (input1 == null) {
         null
       } else {
-        integral.rem(evalE1, evalE2)
+        integral.rem(input1, input2)
       }
     }
   }
@@ -317,17 +288,17 @@ case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
   override def eval(input: InternalRow): Any = {
-    val evalE1 = left.eval(input)
-    val evalE2 = right.eval(input)
-    if (evalE1 == null) {
-      evalE2
-    } else if (evalE2 == null) {
-      evalE1
+    val input1 = left.eval(input)
+    val input2 = right.eval(input)
+    if (input1 == null) {
+      input2
+    } else if (input2 == null) {
+      input1
     } else {
-      if (ordering.compare(evalE1, evalE2) < 0) {
-        evalE2
+      if (ordering.compare(input1, input2) < 0) {
+        input2
       } else {
-        evalE1
+        input1
       }
     }
   }
@@ -371,17 +342,17 @@ case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
   override def eval(input: InternalRow): Any = {
-    val evalE1 = left.eval(input)
-    val evalE2 = right.eval(input)
-    if (evalE1 == null) {
-      evalE2
-    } else if (evalE2 == null) {
-      evalE1
+    val input1 = left.eval(input)
+    val input2 = right.eval(input)
+    if (input1 == null) {
+      input2
+    } else if (input2 == null) {
+      input1
     } else {
-      if (ordering.compare(evalE1, evalE2) < 0) {
-        evalE1
+      if (ordering.compare(input1, input2) < 0) {
+        input1
       } else {
-        evalE2
+        input2
       }
     }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
index 9002dda7bf4d0..2d47124d247e7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
@@ -45,7 +45,7 @@ case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithme
       ((evalE1: Long, evalE2: Long) => evalE1 & evalE2).asInstanceOf[(Any, Any) => Any]
   }
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = and(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = and(input1, input2)
 }
 
 /**
@@ -70,7 +70,7 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet
       ((evalE1: Long, evalE2: Long) => evalE1 | evalE2).asInstanceOf[(Any, Any) => Any]
   }
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = or(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = or(input1, input2)
 }
 
 /**
@@ -95,7 +95,7 @@ case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithme
       ((evalE1: Long, evalE2: Long) => evalE1 ^ evalE2).asInstanceOf[(Any, Any) => Any]
   }
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any): Any = xor(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = xor(input1, input2)
 }
 
 /**
@@ -122,5 +122,5 @@ case class BitwiseNot(child: Expression) extends UnaryArithmetic {
     defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dataType)}) ~($c)")
   }
 
-  protected override def evalInternal(evalE: Any) = not(evalE)
+  protected override def nullSafeEval(input: Any): Any = not(input)
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
index f5c2dde191cf3..2fa74b4ffc5da 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
@@ -30,14 +30,8 @@ case class UnscaledValue(child: Expression) extends UnaryExpression {
   override def dataType: DataType = LongType
   override def toString: String = s"UnscaledValue($child)"
 
-  override def eval(input: InternalRow): Any = {
-    val childResult = child.eval(input)
-    if (childResult == null) {
-      null
-    } else {
-      childResult.asInstanceOf[Decimal].toUnscaledLong
-    }
-  }
+  protected override def nullSafeEval(input: Any): Any =
+    input.asInstanceOf[Decimal].toUnscaledLong
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c => s"$c.toUnscaledLong()")
@@ -54,26 +48,15 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un
   override def dataType: DataType = DecimalType(precision, scale)
   override def toString: String = s"MakeDecimal($child,$precision,$scale)"
 
-  override def eval(input: InternalRow): Decimal = {
-    val childResult = child.eval(input)
-    if (childResult == null) {
-      null
-    } else {
-      new Decimal().setOrNull(childResult.asInstanceOf[Long], precision, scale)
-    }
-  }
+  protected override def nullSafeEval(input: Any): Any =
+    Decimal(input.asInstanceOf[Long], precision, scale)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val eval = child.gen(ctx)
-    eval.code + s"""
-      boolean ${ev.isNull} = ${eval.isNull};
-      ${ctx.decimalType} ${ev.primitive} = null;
-
-      if (!${ev.isNull}) {
-        ${ev.primitive} = (new ${ctx.decimalType}()).setOrNull(
-          ${eval.primitive}, $precision, $scale);
+    nullSafeCodeGen(ctx, ev, eval => {
+      s"""
+        ${ev.primitive} = (new ${ctx.decimalType}()).setOrNull($eval, $precision, $scale);
         ${ev.isNull} = ${ev.primitive} == null;
-      }
       """
+    })
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 92500453980f6..9dca8513c0d9c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -61,21 +61,16 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
   override def nullable: Boolean = true
   override def toString: String = s"$name($child)"
 
-  override def eval(input: InternalRow): Any = {
-    val evalE = child.eval(input)
-    if (evalE == null) {
-      null
-    } else {
-      val result = f(evalE.asInstanceOf[Double])
-      if (result.isNaN) null else result
-    }
+  protected override def nullSafeEval(input: Any): Any = {
+    val result = f(input.asInstanceOf[Double])
+    if (result.isNaN) null else result
   }
 
   // name of function in java.lang.Math
   def funcName: String = name.toLowerCase
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, (result, eval) => {
+    nullSafeCodeGen(ctx, ev, eval => {
       s"""
         ${ev.primitive} = java.lang.Math.${funcName}($eval);
         if (Double.valueOf(${ev.primitive}).isNaN()) {
@@ -101,19 +96,9 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
 
   override def dataType: DataType = DoubleType
 
-  override def eval(input: InternalRow): Any = {
-    val evalE1 = left.eval(input)
-    if (evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        val result = f(evalE1.asInstanceOf[Double], evalE2.asInstanceOf[Double])
-        if (result.isNaN) null else result
-      }
-    }
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    val result = f(input1.asInstanceOf[Double], input2.asInstanceOf[Double])
+    if (result.isNaN) null else result
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
@@ -194,39 +179,29 @@ case class Factorial(child: Expression) extends UnaryExpression with ExpectsInpu
 
   override def dataType: DataType = LongType
 
-  override def foldable: Boolean = child.foldable
-
   // If the value not in the range of [0, 20], it still will be null, so set it to be true here.
   override def nullable: Boolean = true
 
-  override def eval(input: InternalRow): Any = {
-    val evalE = child.eval(input)
-    if (evalE == null) {
+  protected override def nullSafeEval(input: Any): Any = {
+    val value = input.asInstanceOf[jl.Integer]
+    if (value > 20 || value < 0) {
       null
     } else {
-      val input = evalE.asInstanceOf[jl.Integer]
-      if (input > 20 || input < 0) {
-        null
-      } else {
-        Factorial.factorial(input)
-      }
+      Factorial.factorial(value)
     }
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val eval = child.gen(ctx)
-    eval.code + s"""
-      boolean ${ev.isNull} = ${eval.isNull};
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${ev.isNull}) {
-        if (${eval.primitive} > 20 || ${eval.primitive} < 0) {
+    nullSafeCodeGen(ctx, ev, eval => {
+      s"""
+        if ($eval > 20 || $eval < 0) {
           ${ev.isNull} = true;
         } else {
           ${ev.primitive} =
-            org.apache.spark.sql.catalyst.expressions.Factorial.factorial(${eval.primitive});
+            org.apache.spark.sql.catalyst.expressions.Factorial.factorial($eval);
         }
-      }
-    """
+      """
+    })
   }
 }
 
@@ -235,17 +210,14 @@ case class Log(child: Expression) extends UnaryMathExpression(math.log, "LOG")
 case class Log2(child: Expression)
   extends UnaryMathExpression((x: Double) => math.log(x) / math.log(2), "LOG2") {
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val eval = child.gen(ctx)
-    eval.code + s"""
-      boolean ${ev.isNull} = ${eval.isNull};
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${ev.isNull}) {
-        ${ev.primitive} = java.lang.Math.log(${eval.primitive}) / java.lang.Math.log(2);
+    nullSafeCodeGen(ctx, ev, eval => {
+      s"""
+        ${ev.primitive} = java.lang.Math.log($eval) / java.lang.Math.log(2);
         if (Double.valueOf(${ev.primitive}).isNaN()) {
           ${ev.isNull} = true;
         }
-      }
-    """
+      """
+    })
   }
 }
 
@@ -283,14 +255,8 @@ case class Bin(child: Expression)
   override def inputTypes: Seq[DataType] = Seq(LongType)
   override def dataType: DataType = StringType
 
-  override def eval(input: InternalRow): Any = {
-    val evalE = child.eval(input)
-    if (evalE == null) {
-      null
-    } else {
-      UTF8String.fromString(jl.Long.toBinaryString(evalE.asInstanceOf[Long]))
-    }
-  }
+  protected override def nullSafeEval(input: Any): Any =
+    UTF8String.fromString(jl.Long.toBinaryString(input.asInstanceOf[Long]))
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, (c) =>
@@ -326,17 +292,10 @@ case class Hex(child: Expression) extends UnaryExpression with ExpectsInputTypes
 
   override def dataType: DataType = StringType
 
-  override def eval(input: InternalRow): Any = {
-    val num = child.eval(input)
-    if (num == null) {
-      null
-    } else {
-      child.dataType match {
-        case LongType => hex(num.asInstanceOf[Long])
-        case BinaryType => hex(num.asInstanceOf[Array[Byte]])
-        case StringType => hex(num.asInstanceOf[UTF8String].getBytes)
-      }
-    }
+  protected override def nullSafeEval(num: Any): Any = child.dataType match {
+    case LongType => hex(num.asInstanceOf[Long])
+    case BinaryType => hex(num.asInstanceOf[Array[Byte]])
+    case StringType => hex(num.asInstanceOf[UTF8String].getBytes)
   }
 
   private[this] def hex(bytes: Array[Byte]): UTF8String = {
@@ -377,14 +336,8 @@ case class Unhex(child: Expression) extends UnaryExpression with ExpectsInputTyp
   override def nullable: Boolean = true
   override def dataType: DataType = BinaryType
 
-  override def eval(input: InternalRow): Any = {
-    val num = child.eval(input)
-    if (num == null) {
-      null
-    } else {
-      unhex(num.asInstanceOf[UTF8String].getBytes)
-    }
-  }
+  protected override def nullSafeEval(num: Any): Any =
+    unhex(num.asInstanceOf[UTF8String].getBytes)
 
   private[this] def unhex(bytes: Array[Byte]): Array[Byte] = {
     val out = new Array[Byte]((bytes.length + 1) >> 1)
@@ -429,21 +382,10 @@ case class Unhex(child: Expression) extends UnaryExpression with ExpectsInputTyp
 case class Atan2(left: Expression, right: Expression)
   extends BinaryMathExpression(math.atan2, "ATAN2") {
 
-  override def eval(input: InternalRow): Any = {
-    val evalE1 = left.eval(input)
-    if (evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        // With codegen, the values returned by -0.0 and 0.0 are different. Handled with +0.0
-        val result = math.atan2(evalE1.asInstanceOf[Double] + 0.0,
-          evalE2.asInstanceOf[Double] + 0.0)
-        if (result.isNaN) null else result
-      }
-    }
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    // With codegen, the values returned by -0.0 and 0.0 are different. Handled with +0.0
+    val result = math.atan2(input1.asInstanceOf[Double] + 0.0, input2.asInstanceOf[Double] + 0.0)
+    if (result.isNaN) null else result
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
@@ -480,25 +422,15 @@ case class ShiftLeft(left: Expression, right: Expression)
 
   override def dataType: DataType = left.dataType
 
-  override def eval(input: InternalRow): Any = {
-    val valueLeft = left.eval(input)
-    if (valueLeft != null) {
-      val valueRight = right.eval(input)
-      if (valueRight != null) {
-        valueLeft match {
-          case l: jl.Long => l << valueRight.asInstanceOf[jl.Integer]
-          case i: jl.Integer => i << valueRight.asInstanceOf[jl.Integer]
-        }
-      } else {
-        null
-      }
-    } else {
-      null
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    input1 match {
+      case l: jl.Long => l << input2.asInstanceOf[jl.Integer]
+      case i: jl.Integer => i << input2.asInstanceOf[jl.Integer]
     }
   }
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, (result, left, right) => s"$result = $left << $right;")
+    defineCodeGen(ctx, ev, (left, right) => s"$left << $right")
   }
 }
 
@@ -516,25 +448,15 @@ case class ShiftRight(left: Expression, right: Expression)
 
   override def dataType: DataType = left.dataType
 
-  override def eval(input: InternalRow): Any = {
-    val valueLeft = left.eval(input)
-    if (valueLeft != null) {
-      val valueRight = right.eval(input)
-      if (valueRight != null) {
-        valueLeft match {
-          case l: jl.Long => l >> valueRight.asInstanceOf[jl.Integer]
-          case i: jl.Integer => i >> valueRight.asInstanceOf[jl.Integer]
-        }
-      } else {
-        null
-      }
-    } else {
-      null
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    input1 match {
+      case l: jl.Long => l >> input2.asInstanceOf[jl.Integer]
+      case i: jl.Integer => i >> input2.asInstanceOf[jl.Integer]
     }
   }
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, (result, left, right) => s"$result = $left >> $right;")
+    defineCodeGen(ctx, ev, (left, right) => s"$left >> $right")
   }
 }
 
@@ -552,25 +474,15 @@ case class ShiftRightUnsigned(left: Expression, right: Expression)
 
   override def dataType: DataType = left.dataType
 
-  override def eval(input: InternalRow): Any = {
-    val valueLeft = left.eval(input)
-    if (valueLeft != null) {
-      val valueRight = right.eval(input)
-      if (valueRight != null) {
-        valueLeft match {
-          case l: jl.Long => l >>> valueRight.asInstanceOf[jl.Integer]
-          case i: jl.Integer => i >>> valueRight.asInstanceOf[jl.Integer]
-        }
-      } else {
-        null
-      }
-    } else {
-      null
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    input1 match {
+      case l: jl.Long => l >>> input2.asInstanceOf[jl.Integer]
+      case i: jl.Integer => i >>> input2.asInstanceOf[jl.Integer]
     }
   }
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, (result, left, right) => s"$result = $left >>> $right;")
+    defineCodeGen(ctx, ev, (left, right) => s"$left >>> $right")
   }
 }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
index e008af3966941..3b59cd431b871 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
@@ -37,14 +37,8 @@ case class Md5(child: Expression) extends UnaryExpression with ExpectsInputTypes
 
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
-  override def eval(input: InternalRow): Any = {
-    val value = child.eval(input)
-    if (value == null) {
-      null
-    } else {
-      UTF8String.fromString(DigestUtils.md5Hex(value.asInstanceOf[Array[Byte]]))
-    }
-  }
+  protected override def nullSafeEval(input: Any): Any =
+    UTF8String.fromString(DigestUtils.md5Hex(input.asInstanceOf[Array[Byte]]))
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c =>
@@ -67,76 +61,56 @@ case class Sha2(left: Expression, right: Expression)
 
   override def inputTypes: Seq[DataType] = Seq(BinaryType, IntegerType)
 
-  override def eval(input: InternalRow): Any = {
-    val evalE1 = left.eval(input)
-    if (evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        val bitLength = evalE2.asInstanceOf[Int]
-        val input = evalE1.asInstanceOf[Array[Byte]]
-        bitLength match {
-          case 224 =>
-            // DigestUtils doesn't support SHA-224 now
-            try {
-              val md = MessageDigest.getInstance("SHA-224")
-              md.update(input)
-              UTF8String.fromBytes(md.digest())
-            } catch {
-              // SHA-224 is not supported on the system, return null
-              case noa: NoSuchAlgorithmException => null
-            }
-          case 256 | 0 =>
-            UTF8String.fromString(DigestUtils.sha256Hex(input))
-          case 384 =>
-            UTF8String.fromString(DigestUtils.sha384Hex(input))
-          case 512 =>
-            UTF8String.fromString(DigestUtils.sha512Hex(input))
-          case _ => null
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    val bitLength = input2.asInstanceOf[Int]
+    val input = input1.asInstanceOf[Array[Byte]]
+    bitLength match {
+      case 224 =>
+        // DigestUtils doesn't support SHA-224 now
+        try {
+          val md = MessageDigest.getInstance("SHA-224")
+          md.update(input)
+          UTF8String.fromBytes(md.digest())
+        } catch {
+          // SHA-224 is not supported on the system, return null
+          case noa: NoSuchAlgorithmException => null
         }
-      }
+      case 256 | 0 =>
+        UTF8String.fromString(DigestUtils.sha256Hex(input))
+      case 384 =>
+        UTF8String.fromString(DigestUtils.sha384Hex(input))
+      case 512 =>
+        UTF8String.fromString(DigestUtils.sha512Hex(input))
+      case _ => null
     }
   }
+
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val eval1 = left.gen(ctx)
-    val eval2 = right.gen(ctx)
     val digestUtils = "org.apache.commons.codec.digest.DigestUtils"
-
-    s"""
-      ${eval1.code}
-      boolean ${ev.isNull} = ${eval1.isNull};
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${ev.isNull}) {
-        ${eval2.code}
-        if (!${eval2.isNull}) {
-          if (${eval2.primitive} == 224) {
-            try {
-              java.security.MessageDigest md = java.security.MessageDigest.getInstance("SHA-224");
-              md.update(${eval1.primitive});
-              ${ev.primitive} = ${ctx.stringType}.fromBytes(md.digest());
-            } catch (java.security.NoSuchAlgorithmException e) {
-              ${ev.isNull} = true;
-            }
-          } else if (${eval2.primitive} == 256 || ${eval2.primitive} == 0) {
-            ${ev.primitive} =
-              ${ctx.stringType}.fromString(${digestUtils}.sha256Hex(${eval1.primitive}));
-          } else if (${eval2.primitive} == 384) {
-            ${ev.primitive} =
-              ${ctx.stringType}.fromString(${digestUtils}.sha384Hex(${eval1.primitive}));
-          } else if (${eval2.primitive} == 512) {
-            ${ev.primitive} =
-              ${ctx.stringType}.fromString(${digestUtils}.sha512Hex(${eval1.primitive}));
-          } else {
+    nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
+      s"""
+        if ($eval2 == 224) {
+          try {
+            java.security.MessageDigest md = java.security.MessageDigest.getInstance("SHA-224");
+            md.update($eval1);
+            ${ev.primitive} = ${ctx.stringType}.fromBytes(md.digest());
+          } catch (java.security.NoSuchAlgorithmException e) {
             ${ev.isNull} = true;
           }
+        } else if ($eval2 == 256 || $eval2 == 0) {
+          ${ev.primitive} =
+            ${ctx.stringType}.fromString($digestUtils.sha256Hex($eval1));
+        } else if ($eval2 == 384) {
+          ${ev.primitive} =
+            ${ctx.stringType}.fromString($digestUtils.sha384Hex($eval1));
+        } else if ($eval2 == 512) {
+          ${ev.primitive} =
+            ${ctx.stringType}.fromString($digestUtils.sha512Hex($eval1));
         } else {
           ${ev.isNull} = true;
         }
-      }
-    """
+      """
+    })
   }
 }
 
@@ -150,19 +124,12 @@ case class Sha1(child: Expression) extends UnaryExpression with ExpectsInputType
 
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
-  override def eval(input: InternalRow): Any = {
-    val value = child.eval(input)
-    if (value == null) {
-      null
-    } else {
-      UTF8String.fromString(DigestUtils.shaHex(value.asInstanceOf[Array[Byte]]))
-    }
-  }
+  protected override def nullSafeEval(input: Any): Any =
+    UTF8String.fromString(DigestUtils.shaHex(input.asInstanceOf[Array[Byte]]))
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c =>
-      "org.apache.spark.unsafe.types.UTF8String.fromString" +
-        s"(org.apache.commons.codec.digest.DigestUtils.shaHex($c))"
+      s"${ctx.stringType}.fromString(org.apache.commons.codec.digest.DigestUtils.shaHex($c))"
     )
   }
 }
@@ -177,30 +144,20 @@ case class Crc32(child: Expression) extends UnaryExpression with ExpectsInputTyp
 
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
-  override def eval(input: InternalRow): Any = {
-    val value = child.eval(input)
-    if (value == null) {
-      null
-    } else {
-      val checksum = new CRC32
-      checksum.update(value.asInstanceOf[Array[Byte]], 0, value.asInstanceOf[Array[Byte]].length)
-      checksum.getValue
-    }
+  protected override def nullSafeEval(input: Any): Any = {
+    val checksum = new CRC32
+    checksum.update(input.asInstanceOf[Array[Byte]], 0, input.asInstanceOf[Array[Byte]].length)
+    checksum.getValue
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val value = child.gen(ctx)
     val CRC32 = "java.util.zip.CRC32"
-    s"""
-      ${value.code}
-      boolean ${ev.isNull} = ${value.isNull};
-      long ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${ev.isNull}) {
-        ${CRC32} checksum = new ${CRC32}();
-        checksum.update(${value.primitive}, 0, ${value.primitive}.length);
+    nullSafeCodeGen(ctx, ev, value => {
+      s"""
+        $CRC32 checksum = new $CRC32();
+        checksum.update($value, 0, $value.length);
         ${ev.primitive} = checksum.getValue();
-      }
-    """
+      """
+    })
   }
-
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 0b479f466c63c..402a0aa232493 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -74,12 +74,7 @@ case class Not(child: Expression) extends UnaryExpression with Predicate with Ex
 
   override def inputTypes: Seq[DataType] = Seq(BooleanType)
 
-  override def eval(input: InternalRow): Any = {
-    child.eval(input) match {
-      case null => null
-      case b: Boolean => !b
-    }
-  }
+  protected override def nullSafeEval(input: Any): Any = !input.asInstanceOf[Boolean]
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c => s"!($c)")
@@ -105,17 +100,14 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate {
  * Optimized version of In clause, when all filter values of In clause are
  * static.
  */
-case class InSet(value: Expression, hset: Set[Any])
-  extends Predicate {
-
-  override def children: Seq[Expression] = value :: Nil
+case class InSet(child: Expression, hset: Set[Any])
+  extends UnaryExpression with Predicate {
 
-  override def foldable: Boolean = value.foldable
   override def nullable: Boolean = true // TODO: Figure out correct nullability semantics of IN.
-  override def toString: String = s"$value INSET ${hset.mkString("(", ",", ")")}"
+  override def toString: String = s"$child INSET ${hset.mkString("(", ",", ")")}"
 
   override def eval(input: InternalRow): Any = {
-    hset.contains(value.eval(input))
+    hset.contains(child.eval(input))
   }
 }
 
@@ -127,15 +119,15 @@ case class And(left: Expression, right: Expression)
   override def inputTypes: Seq[DataType] = Seq(BooleanType, BooleanType)
 
   override def eval(input: InternalRow): Any = {
-    val l = left.eval(input)
-    if (l == false) {
+    val input1 = left.eval(input)
+    if (input1 == false) {
        false
     } else {
-      val r = right.eval(input)
-      if (r == false) {
+      val input2 = right.eval(input)
+      if (input2 == false) {
         false
       } else {
-        if (l != null && r != null) {
+        if (input1 != null && input2 != null) {
           true
         } else {
           null
@@ -176,15 +168,15 @@ case class Or(left: Expression, right: Expression)
   override def inputTypes: Seq[DataType] = Seq(BooleanType, BooleanType)
 
   override def eval(input: InternalRow): Any = {
-    val l = left.eval(input)
-    if (l == true) {
+    val input1 = left.eval(input)
+    if (input1 == true) {
       true
     } else {
-      val r = right.eval(input)
-      if (r == true) {
+      val input2 = right.eval(input)
+      if (input2 == true) {
         true
       } else {
-        if (l != null && r != null) {
+        if (input1 != null && input2 != null) {
           false
         } else {
           null
@@ -232,20 +224,6 @@ abstract class BinaryComparison extends BinaryOperator with Predicate {
 
   protected def checkTypesInternal(t: DataType): TypeCheckResult
 
-  override def eval(input: InternalRow): Any = {
-    val evalE1 = left.eval(input)
-    if (evalE1 == null) {
-      null
-    } else {
-      val evalE2 = right.eval(input)
-      if (evalE2 == null) {
-        null
-      } else {
-        evalInternal(evalE1, evalE2)
-      }
-    }
-  }
-
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     if (ctx.isPrimitiveType(left.dataType)) {
       // faster version
@@ -254,9 +232,6 @@ abstract class BinaryComparison extends BinaryOperator with Predicate {
       defineCodeGen(ctx, ev, (c1, c2) => s"${ctx.genComp(left.dataType, c1, c2)} $symbol 0")
     }
   }
-
-  protected def evalInternal(evalE1: Any, evalE2: Any): Any =
-    sys.error(s"BinaryComparisons must override either eval or evalInternal")
 }
 
 private[sql] object BinaryComparison {
@@ -277,9 +252,9 @@ case class EqualTo(left: Expression, right: Expression) extends BinaryComparison
 
   override protected def checkTypesInternal(t: DataType) = TypeCheckResult.TypeCheckSuccess
 
-  protected override def evalInternal(l: Any, r: Any) = {
-    if (left.dataType != BinaryType) l == r
-    else java.util.Arrays.equals(l.asInstanceOf[Array[Byte]], r.asInstanceOf[Array[Byte]])
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    if (left.dataType != BinaryType) input1 == input2
+    else java.util.Arrays.equals(input1.asInstanceOf[Array[Byte]], input2.asInstanceOf[Array[Byte]])
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
@@ -295,15 +270,18 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
   override protected def checkTypesInternal(t: DataType) = TypeCheckResult.TypeCheckSuccess
 
   override def eval(input: InternalRow): Any = {
-    val l = left.eval(input)
-    val r = right.eval(input)
-    if (l == null && r == null) {
+    val input1 = left.eval(input)
+    val input2 = right.eval(input)
+    if (input1 == null && input2 == null) {
       true
-    } else if (l == null || r == null) {
+    } else if (input1 == null || input2 == null) {
       false
     } else {
-      if (left.dataType != BinaryType) l == r
-      else java.util.Arrays.equals(l.asInstanceOf[Array[Byte]], r.asInstanceOf[Array[Byte]])
+      if (left.dataType != BinaryType) {
+        input1 == input2
+      } else {
+        java.util.Arrays.equals(input1.asInstanceOf[Array[Byte]], input2.asInstanceOf[Array[Byte]])
+      }
     }
   }
 
@@ -327,7 +305,7 @@ case class LessThan(left: Expression, right: Expression) extends BinaryCompariso
 
   private lazy val ordering = TypeUtils.getOrdering(left.dataType)
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = ordering.lt(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.lt(input1, input2)
 }
 
 case class LessThanOrEqual(left: Expression, right: Expression) extends BinaryComparison {
@@ -338,7 +316,7 @@ case class LessThanOrEqual(left: Expression, right: Expression) extends BinaryCo
 
   private lazy val ordering = TypeUtils.getOrdering(left.dataType)
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = ordering.lteq(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.lteq(input1, input2)
 }
 
 case class GreaterThan(left: Expression, right: Expression) extends BinaryComparison {
@@ -349,7 +327,7 @@ case class GreaterThan(left: Expression, right: Expression) extends BinaryCompar
 
   private lazy val ordering = TypeUtils.getOrdering(left.dataType)
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = ordering.gt(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.gt(input1, input2)
 }
 
 case class GreaterThanOrEqual(left: Expression, right: Expression) extends BinaryComparison {
@@ -360,5 +338,5 @@ case class GreaterThanOrEqual(left: Expression, right: Expression) extends Binar
 
   private lazy val ordering = TypeUtils.getOrdering(left.dataType)
 
-  protected override def evalInternal(evalE1: Any, evalE2: Any) = ordering.gteq(evalE1, evalE2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.gteq(input1, input2)
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
index 5d51a4ca65332..9b44fb1ed5b3f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
@@ -135,6 +135,7 @@ case class AddItemToSet(item: Expression, set: Expression) extends Expression {
  */
 case class CombineSets(left: Expression, right: Expression) extends BinaryExpression {
 
+  override def nullable: Boolean = left.nullable
   override def dataType: DataType = left.dataType
 
   override def eval(input: InternalRow): Any = {
@@ -183,12 +184,8 @@ case class CountSet(child: Expression) extends UnaryExpression {
 
   override def dataType: DataType = LongType
 
-  override def eval(input: InternalRow): Any = {
-    val childEval = child.eval(input).asInstanceOf[OpenHashSet[Any]]
-    if (childEval != null) {
-      childEval.size.toLong
-    }
-  }
+  protected override def nullSafeEval(input: Any): Any =
+    input.asInstanceOf[OpenHashSet[Any]].size.toLong
 
   override def toString: String = s"$child.count()"
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 1a14a7a449342..6e6a7fb171214 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -31,7 +31,6 @@ trait StringRegexExpression extends ExpectsInputTypes {
   def escape(v: String): String
   def matches(regex: Pattern, str: String): Boolean
 
-  override def nullable: Boolean = left.nullable || right.nullable
   override def dataType: DataType = BooleanType
   override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
 
@@ -50,22 +49,12 @@ trait StringRegexExpression extends ExpectsInputTypes {
 
   protected def pattern(str: String) = if (cache == null) compile(str) else cache
 
-  override def eval(input: InternalRow): Any = {
-    val l = left.eval(input)
-    if (l == null) {
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    val regex = pattern(input2.asInstanceOf[UTF8String].toString())
+    if(regex == null) {
       null
     } else {
-      val r = right.eval(input)
-      if(r == null) {
-        null
-      } else {
-        val regex = pattern(r.asInstanceOf[UTF8String].toString())
-        if(regex == null) {
-          null
-        } else {
-          matches(regex, l.asInstanceOf[UTF8String].toString())
-        }
-      }
+      matches(regex, input1.asInstanceOf[UTF8String].toString())
     }
   }
 }
@@ -120,14 +109,8 @@ trait CaseConversionExpression extends ExpectsInputTypes {
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
-  override def eval(input: InternalRow): Any = {
-    val evaluated = child.eval(input)
-    if (evaluated == null) {
-      null
-    } else {
-      convert(evaluated.asInstanceOf[UTF8String])
-    }
-  }
+  protected override def nullSafeEval(input: Any): Any =
+    convert(input.asInstanceOf[UTF8String])
 }
 
 /**
@@ -160,20 +143,10 @@ trait StringComparison extends ExpectsInputTypes {
 
   def compare(l: UTF8String, r: UTF8String): Boolean
 
-  override def nullable: Boolean = left.nullable || right.nullable
-
   override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
 
-  override def eval(input: InternalRow): Any = {
-    val leftEval = left.eval(input)
-    if(leftEval == null) {
-      null
-    } else {
-      val rightEval = right.eval(input)
-      if (rightEval == null) null
-      else compare(leftEval.asInstanceOf[UTF8String], rightEval.asInstanceOf[UTF8String])
-    }
-  }
+  protected override def nullSafeEval(input1: Any, input2: Any): Any =
+    compare(input1.asInstanceOf[UTF8String], input2.asInstanceOf[UTF8String])
 
   override def toString: String = s"$nodeName($left, $right)"
 }
@@ -288,10 +261,8 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
-  override def eval(input: InternalRow): Any = {
-    val string = child.eval(input)
-    if (string == null) null else string.asInstanceOf[UTF8String].length
-  }
+  protected override def nullSafeEval(string: Any): Any =
+    string.asInstanceOf[UTF8String].length
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c => s"($c).length()")
@@ -310,24 +281,13 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
 
   override def dataType: DataType = IntegerType
 
-  override def eval(input: InternalRow): Any = {
-    val leftValue = left.eval(input)
-    if (leftValue == null) {
-      null
-    } else {
-      val rightValue = right.eval(input)
-      if(rightValue == null) {
-        null
-      } else {
-        StringUtils.getLevenshteinDistance(leftValue.toString, rightValue.toString)
-      }
-    }
-  }
+  protected override def nullSafeEval(input1: Any, input2: Any): Any =
+    StringUtils.getLevenshteinDistance(input1.toString, input2.toString)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val stringUtils = classOf[StringUtils].getName
-    nullSafeCodeGen(ctx, ev, (res, left, right) =>
-      s"$res = $stringUtils.getLevenshteinDistance($left.toString(), $right.toString());")
+    defineCodeGen(ctx, ev, (left, right) =>
+      s"$stringUtils.getLevenshteinDistance($left.toString(), $right.toString())")
   }
 }
 
@@ -338,17 +298,12 @@ case class Ascii(child: Expression) extends UnaryExpression with ExpectsInputTyp
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
-  override def eval(input: InternalRow): Any = {
-    val string = child.eval(input)
-    if (string == null) {
-      null
+  protected override def nullSafeEval(string: Any): Any = {
+    val bytes = string.asInstanceOf[UTF8String].getBytes
+    if (bytes.length > 0) {
+      bytes(0).asInstanceOf[Int]
     } else {
-      val bytes = string.asInstanceOf[UTF8String].getBytes
-      if (bytes.length > 0) {
-        bytes(0).asInstanceOf[Int]
-      } else {
-        0
-      }
+      0
     }
   }
 }
@@ -360,15 +315,10 @@ case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTy
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
-  override def eval(input: InternalRow): Any = {
-    val bytes = child.eval(input)
-    if (bytes == null) {
-      null
-    } else {
-      UTF8String.fromBytes(
-        org.apache.commons.codec.binary.Base64.encodeBase64(
-          bytes.asInstanceOf[Array[Byte]]))
-    }
+  protected override def nullSafeEval(bytes: Any): Any = {
+    UTF8String.fromBytes(
+      org.apache.commons.codec.binary.Base64.encodeBase64(
+        bytes.asInstanceOf[Array[Byte]]))
   }
 }
 
@@ -379,14 +329,8 @@ case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInput
   override def dataType: DataType = BinaryType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
-  override def eval(input: InternalRow): Any = {
-    val string = child.eval(input)
-    if (string == null) {
-      null
-    } else {
-      org.apache.commons.codec.binary.Base64.decodeBase64(string.asInstanceOf[UTF8String].toString)
-    }
-  }
+  protected override def nullSafeEval(string: Any): Any =
+    org.apache.commons.codec.binary.Base64.decodeBase64(string.asInstanceOf[UTF8String].toString)
 }
 
 /**
@@ -402,19 +346,9 @@ case class Decode(bin: Expression, charset: Expression)
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(BinaryType, StringType)
 
-  override def eval(input: InternalRow): Any = {
-    val l = bin.eval(input)
-    if (l == null) {
-      null
-    } else {
-      val r = charset.eval(input)
-      if (r == null) {
-        null
-      } else {
-        val fromCharset = r.asInstanceOf[UTF8String].toString
-        UTF8String.fromString(new String(l.asInstanceOf[Array[Byte]], fromCharset))
-      }
-    }
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    val fromCharset = input2.asInstanceOf[UTF8String].toString
+    UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]], fromCharset))
   }
 }
 
@@ -431,19 +365,9 @@ case class Encode(value: Expression, charset: Expression)
   override def dataType: DataType = BinaryType
   override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
 
-  override def eval(input: InternalRow): Any = {
-    val l = value.eval(input)
-    if (l == null) {
-      null
-    } else {
-      val r = charset.eval(input)
-      if (r == null) {
-        null
-      } else {
-        val toCharset = r.asInstanceOf[UTF8String].toString
-        l.asInstanceOf[UTF8String].toString.getBytes(toCharset)
-      }
-    }
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    val toCharset = input2.asInstanceOf[UTF8String].toString
+    input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
   }
 }
 

From 1cb2629f1aa466f92246828c562ea6f35c89ab87 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Mon, 6 Jul 2015 22:14:24 -0700
Subject: [PATCH 029/549] [HOTFIX] Rename release-profile to release

when publishing releases. We named it as 'release-profile' because that is
the Maven convention. However, it turns out this special name causes several
other things to kick-in when we are creating releases that are not desirable.
For instance, it triggers the javadoc plugin to run, which actually fails
in our current build set-up.

The fix is just to rename this to a different profile to have no
collateral damage associated with its use.
---
 dev/create-release/create-release.sh | 4 ++--
 pom.xml                              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh
index cfe2cd4752b3f..30190dcd41ec5 100755
--- a/dev/create-release/create-release.sh
+++ b/dev/create-release/create-release.sh
@@ -118,13 +118,13 @@ if [[ ! "$@" =~ --skip-publish ]]; then
 
   rm -rf $SPARK_REPO
 
-  build/mvn -DskipTests -Pyarn -Phive -Prelease-profile\
+  build/mvn -DskipTests -Pyarn -Phive -Prelease\
     -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
     clean install
 
   ./dev/change-version-to-2.11.sh
   
-  build/mvn -DskipTests -Pyarn -Phive -Prelease-profile\
+  build/mvn -DskipTests -Pyarn -Phive -Prelease\
     -Dscala-2.11 -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
     clean install
 
diff --git a/pom.xml b/pom.xml
index fbcc9152765cf..27dd4f750a3f6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1835,7 +1835,7 @@
           Use this profile only for making Spark releases. Note that due to SPARK-8819,
           you must use maven version 3.2.x or before to avoid running into MSHADE-148.
       -->
-      <id>release-profile</id>
+      <id>release</id>
       <properties>
         <!--
             The maven shade plugin has a bug where enabling the `createDependencyReducedPom`

From dcbd85b70f026fbc0b7e77fcc364513581007c8d Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Tue, 7 Jul 2015 08:09:56 -0700
Subject: [PATCH 030/549] [SPARK-6731] [CORE] Addendum: Upgrade Apache
 commons-math3 to 3.4.1

(This finishes the job by removing the version overridden by Hadoop profiles.)

See discussion at https://github.com/apache/spark/pull/6994#issuecomment-119113167

Author: Sean Owen <sowen@cloudera.com>

Closes #7261 from srowen/SPARK-6731.2 and squashes the following commits:

5a3f59e [Sean Owen] Finish updating Commons Math3 to 3.4.1 from 3.1.1
---
 pom.xml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pom.xml b/pom.xml
index 27dd4f750a3f6..bece526a2a3e5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1717,7 +1717,6 @@
       <properties>
         <hadoop.version>2.3.0</hadoop.version>
         <jets3t.version>0.9.3</jets3t.version>
-        <commons.math3.version>3.1.1</commons.math3.version>
       </properties>
     </profile>
 
@@ -1726,7 +1725,6 @@
       <properties>
         <hadoop.version>2.4.0</hadoop.version>
         <jets3t.version>0.9.3</jets3t.version>
-        <commons.math3.version>3.1.1</commons.math3.version>
       </properties>
     </profile>
 
@@ -1735,7 +1733,6 @@
       <properties>
         <hadoop.version>2.6.0</hadoop.version>
         <jets3t.version>0.9.3</jets3t.version>
-        <commons.math3.version>3.1.1</commons.math3.version>
         <zookeeper.version>3.4.6</zookeeper.version>
         <curator.version>2.6.0</curator.version>
       </properties>

From d73bc08d98e803889ef6215eab81d7bb0049e941 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 7 Jul 2015 08:19:17 -0700
Subject: [PATCH 031/549] [SPARK-8788] [ML] Add Java unit test for PCA
 transformer

Add Java unit test for PCA transformer

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7184 from yanboliang/spark-8788 and squashes the following commits:

9d1a2af [Yanbo Liang] address comments
b34451f [Yanbo Liang] Add Java unit test for PCA transformer
---
 .../apache/spark/ml/feature/JavaPCASuite.java | 114 ++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java

diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
new file mode 100644
index 0000000000000..5cf43fec6f29e
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature;
+
+import java.io.Serializable;
+import java.util.List;
+
+import scala.Tuple2;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.distributed.RowMatrix;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
+
+public class JavaPCASuite implements Serializable {
+  private transient JavaSparkContext jsc;
+  private transient SQLContext sqlContext;
+
+  @Before
+  public void setUp() {
+    jsc = new JavaSparkContext("local", "JavaPCASuite");
+    sqlContext = new SQLContext(jsc);
+  }
+
+  @After
+  public void tearDown() {
+    jsc.stop();
+    jsc = null;
+  }
+
+  public static class VectorPair implements Serializable {
+    private Vector features = Vectors.dense(0.0);
+    private Vector expected = Vectors.dense(0.0);
+
+    public void setFeatures(Vector features) {
+      this.features = features;
+    }
+
+    public Vector getFeatures() {
+      return this.features;
+    }
+
+    public void setExpected(Vector expected) {
+      this.expected = expected;
+    }
+
+    public Vector getExpected() {
+      return this.expected;
+    }
+  }
+
+  @Test
+  public void testPCA() {
+    List<Vector> points = Lists.newArrayList(
+      Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0}),
+      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
+    );
+    JavaRDD<Vector> dataRDD = jsc.parallelize(points, 2);
+
+    RowMatrix mat = new RowMatrix(dataRDD.rdd());
+    Matrix pc = mat.computePrincipalComponents(3);
+    JavaRDD<Vector> expected = mat.multiply(pc).rows().toJavaRDD();
+
+    JavaRDD<VectorPair> featuresExpected = dataRDD.zip(expected).map(
+      new Function<Tuple2<Vector, Vector>, VectorPair>() {
+        public VectorPair call(Tuple2<Vector, Vector> pair) {
+          VectorPair featuresExpected = new VectorPair();
+          featuresExpected.setFeatures(pair._1());
+          featuresExpected.setExpected(pair._2());
+          return featuresExpected;
+        }
+      }
+    );
+
+    DataFrame df = sqlContext.createDataFrame(featuresExpected, VectorPair.class);
+    PCAModel pca = new PCA()
+      .setInputCol("features")
+      .setOutputCol("pca_features")
+      .setK(3)
+      .fit(df);
+    List<Row> result = pca.transform(df).select("pca_features", "expected").toJavaRDD().collect();
+    for (Row r : result) {
+      Assert.assertEquals(r.get(1), r.get(0));
+    }
+  }
+}

From 0a63d7ab8a58d3e48d01740729a7832f1834efe8 Mon Sep 17 00:00:00 2001
From: Mike Dusenberry <mwdusenb@us.ibm.com>
Date: Tue, 7 Jul 2015 08:24:52 -0700
Subject: [PATCH 032/549] [SPARK-8570] [MLLIB] [DOCS] Improve MLlib Local
 Matrix Documentation.

Updated MLlib Data Types Local Matrix section to include information on sparse matrices, added sparse matrix examples to the Scala and Java examples, and added Python examples for both dense and sparse matrices.

Author: Mike Dusenberry <mwdusenb@us.ibm.com>

Closes #6958 from dusenberrymw/Improve_MLlib_Local_Matrix_Documentation and squashes the following commits:

ceae407 [Mike Dusenberry] Updated MLlib Data Types Local Matrix section to include information on sparse matrices, added sparse matrix examples to the Scala and Java examples, and added Python examples for both dense and sparse matrices.
---
 docs/mllib-data-types.md | 44 +++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md
index d824dab1d7f7b..3aa040046fca5 100644
--- a/docs/mllib-data-types.md
+++ b/docs/mllib-data-types.md
@@ -226,7 +226,8 @@ examples = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 
 A local matrix has integer-typed row and column indices and double-typed values, stored on a single
 machine.  MLlib supports dense matrices, whose entry values are stored in a single double array in
-column major.  For example, the following matrix `\[ \begin{pmatrix}
+column-major order, and sparse matrices, whose non-zero entry values are stored in the Compressed Sparse
+Column (CSC) format in column-major order.  For example, the following dense matrix `\[ \begin{pmatrix}
 1.0 & 2.0 \\
 3.0 & 4.0 \\
 5.0 & 6.0
@@ -238,28 +239,33 @@ is stored in a one-dimensional array `[1.0, 3.0, 5.0, 2.0, 4.0, 6.0]` with the m
 <div data-lang="scala" markdown="1">
 
 The base class of local matrices is
-[`Matrix`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrix), and we provide one
-implementation: [`DenseMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseMatrix).
+[`Matrix`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrix), and we provide two
+implementations: [`DenseMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.DenseMatrix),
+and [`SparseMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.SparseMatrix).
 We recommend using the factory methods implemented
 in [`Matrices`](api/scala/index.html#org.apache.spark.mllib.linalg.Matrices$) to create local
-matrices.
+matrices. Remember, local matrices in MLlib are stored in column-major order.
 
 {% highlight scala %}
 import org.apache.spark.mllib.linalg.{Matrix, Matrices}
 
 // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
 val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
+
+// Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
+val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9, 6, 8))
 {% endhighlight %}
 </div>
 
 <div data-lang="java" markdown="1">
 
 The base class of local matrices is
-[`Matrix`](api/java/org/apache/spark/mllib/linalg/Matrix.html), and we provide one
-implementation: [`DenseMatrix`](api/java/org/apache/spark/mllib/linalg/DenseMatrix.html).
+[`Matrix`](api/java/org/apache/spark/mllib/linalg/Matrix.html), and we provide two
+implementations: [`DenseMatrix`](api/java/org/apache/spark/mllib/linalg/DenseMatrix.html),
+and [`SparseMatrix`](api/java/org/apache/spark/mllib/linalg/SparseMatrix.html).
 We recommend using the factory methods implemented
 in [`Matrices`](api/java/org/apache/spark/mllib/linalg/Matrices.html) to create local
-matrices.
+matrices. Remember, local matrices in MLlib are stored in column-major order.
 
 {% highlight java %}
 import org.apache.spark.mllib.linalg.Matrix;
@@ -267,6 +273,30 @@ import org.apache.spark.mllib.linalg.Matrices;
 
 // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
 Matrix dm = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
+
+// Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
+Matrix sm = Matrices.sparse(3, 2, new int[] {0, 1, 3}, new int[] {0, 2, 1}, new double[] {9, 6, 8});
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+The base class of local matrices is
+[`Matrix`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrix), and we provide two
+implementations: [`DenseMatrix`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.DenseMatrix),
+and [`SparseMatrix`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseMatrix).
+We recommend using the factory methods implemented
+in [`Matrices`](api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrices) to create local
+matrices. Remember, local matrices in MLlib are stored in column-major order.
+
+{% highlight python %}
+import org.apache.spark.mllib.linalg.{Matrix, Matrices}
+
+// Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
+dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
+
+// Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
+sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
 {% endhighlight %}
 </div>
 

From 1dbc4a155f3697a3973909806be42a1be6017d12 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 7 Jul 2015 08:58:08 -0700
Subject: [PATCH 033/549] [SPARK-8711] [ML] Add additional methods to PySpark
 ML tree models

Add numNodes and depth to treeModels, add treeWeights to ensemble Models.
Add __repr__ to all models.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7095 from MechCoder/missing_methods_tree and squashes the following commits:

23b08be [MechCoder] private [spark]
38a0860 [MechCoder] rename pyTreeWeights to javaTreeWeights
6d16ad8 [MechCoder] Fix Python 3 Error
47d7023 [MechCoder] Use np.allclose and treeEnsembleModel -> TreeEnsembleMethods
819098c [MechCoder] [SPARK-8711] [ML] Add additional methods ot PySpark ML tree models
---
 .../org/apache/spark/ml/tree/treeModels.scala |  5 ++
 python/pyspark/ml/classification.py           | 20 ++++++--
 python/pyspark/ml/regression.py               | 46 +++++++++++++++++--
 3 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
index 1929f9d02156e..22873909c33fa 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.ml.tree
 
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
 
 /**
  * Abstraction for Decision Tree models.
@@ -70,6 +71,10 @@ private[ml] trait TreeEnsembleModel {
   /** Weights for each tree, zippable with [[trees]] */
   def treeWeights: Array[Double]
 
+  /** Weights used by the python wrappers. */
+  // Note: An array cannot be returned directly due to serialization problems.
+  private[spark] def javaTreeWeights: Vector = Vectors.dense(treeWeights)
+
   /** Summary of the model */
   override def toString: String = {
     // Implementing classes should generally override this method to be more descriptive.
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 7abbde8b260eb..89117e492846b 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -18,7 +18,8 @@
 from pyspark.ml.util import keyword_only
 from pyspark.ml.wrapper import JavaEstimator, JavaModel
 from pyspark.ml.param.shared import *
-from pyspark.ml.regression import RandomForestParams
+from pyspark.ml.regression import (
+    RandomForestParams, DecisionTreeModel, TreeEnsembleModels)
 from pyspark.mllib.common import inherit_doc
 
 
@@ -202,6 +203,10 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
     >>> td = si_model.transform(df)
     >>> dt = DecisionTreeClassifier(maxDepth=2, labelCol="indexed")
     >>> model = dt.fit(td)
+    >>> model.numNodes
+    3
+    >>> model.depth
+    1
     >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
     >>> model.transform(test0).head().prediction
     0.0
@@ -269,7 +274,8 @@ def getImpurity(self):
         return self.getOrDefault(self.impurity)
 
 
-class DecisionTreeClassificationModel(JavaModel):
+@inherit_doc
+class DecisionTreeClassificationModel(DecisionTreeModel):
     """
     Model fitted by DecisionTreeClassifier.
     """
@@ -284,6 +290,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
     It supports both binary and multiclass labels, as well as both continuous and categorical
     features.
 
+    >>> from numpy import allclose
     >>> from pyspark.mllib.linalg import Vectors
     >>> from pyspark.ml.feature import StringIndexer
     >>> df = sqlContext.createDataFrame([
@@ -294,6 +301,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
     >>> td = si_model.transform(df)
     >>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42)
     >>> model = rf.fit(td)
+    >>> allclose(model.treeWeights, [1.0, 1.0])
+    True
     >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
     >>> model.transform(test0).head().prediction
     0.0
@@ -423,7 +432,7 @@ def getFeatureSubsetStrategy(self):
         return self.getOrDefault(self.featureSubsetStrategy)
 
 
-class RandomForestClassificationModel(JavaModel):
+class RandomForestClassificationModel(TreeEnsembleModels):
     """
     Model fitted by RandomForestClassifier.
     """
@@ -438,6 +447,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
     It supports binary labels, as well as both continuous and categorical features.
     Note: Multiclass labels are not currently supported.
 
+    >>> from numpy import allclose
     >>> from pyspark.mllib.linalg import Vectors
     >>> from pyspark.ml.feature import StringIndexer
     >>> df = sqlContext.createDataFrame([
@@ -448,6 +458,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
     >>> td = si_model.transform(df)
     >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed")
     >>> model = gbt.fit(td)
+    >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1])
+    True
     >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
     >>> model.transform(test0).head().prediction
     0.0
@@ -558,7 +570,7 @@ def getStepSize(self):
         return self.getOrDefault(self.stepSize)
 
 
-class GBTClassificationModel(JavaModel):
+class GBTClassificationModel(TreeEnsembleModels):
     """
     Model fitted by GBTClassifier.
     """
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index b139e27372d80..44f60a769566d 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -172,6 +172,10 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
     >>> dt = DecisionTreeRegressor(maxDepth=2)
     >>> model = dt.fit(df)
+    >>> model.depth
+    1
+    >>> model.numNodes
+    3
     >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
     >>> model.transform(test0).head().prediction
     0.0
@@ -239,7 +243,37 @@ def getImpurity(self):
         return self.getOrDefault(self.impurity)
 
 
-class DecisionTreeRegressionModel(JavaModel):
+@inherit_doc
+class DecisionTreeModel(JavaModel):
+
+    @property
+    def numNodes(self):
+        """Return number of nodes of the decision tree."""
+        return self._call_java("numNodes")
+
+    @property
+    def depth(self):
+        """Return depth of the decision tree."""
+        return self._call_java("depth")
+
+    def __repr__(self):
+        return self._call_java("toString")
+
+
+@inherit_doc
+class TreeEnsembleModels(JavaModel):
+
+    @property
+    def treeWeights(self):
+        """Return the weights for each tree"""
+        return list(self._call_java("javaTreeWeights"))
+
+    def __repr__(self):
+        return self._call_java("toString")
+
+
+@inherit_doc
+class DecisionTreeRegressionModel(DecisionTreeModel):
     """
     Model fitted by DecisionTreeRegressor.
     """
@@ -253,12 +287,15 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     learning algorithm for regression.
     It supports both continuous and categorical features.
 
+    >>> from numpy import allclose
     >>> from pyspark.mllib.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
     ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
     >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
     >>> model = rf.fit(df)
+    >>> allclose(model.treeWeights, [1.0, 1.0])
+    True
     >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
     >>> model.transform(test0).head().prediction
     0.0
@@ -389,7 +426,7 @@ def getFeatureSubsetStrategy(self):
         return self.getOrDefault(self.featureSubsetStrategy)
 
 
-class RandomForestRegressionModel(JavaModel):
+class RandomForestRegressionModel(TreeEnsembleModels):
     """
     Model fitted by RandomForestRegressor.
     """
@@ -403,12 +440,15 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
     learning algorithm for regression.
     It supports both continuous and categorical features.
 
+    >>> from numpy import allclose
     >>> from pyspark.mllib.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
     ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
     >>> gbt = GBTRegressor(maxIter=5, maxDepth=2)
     >>> model = gbt.fit(df)
+    >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1])
+    True
     >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
     >>> model.transform(test0).head().prediction
     0.0
@@ -518,7 +558,7 @@ def getStepSize(self):
         return self.getOrDefault(self.stepSize)
 
 
-class GBTRegressionModel(JavaModel):
+class GBTRegressionModel(TreeEnsembleModels):
     """
     Model fitted by GBTRegressor.
     """

From 738c10748b49eb8a475d1fd26c6a271ca36497cf Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 7 Jul 2015 08:59:52 -0700
Subject: [PATCH 034/549] [SPARK-8823] [MLLIB] [PYSPARK] Optimizations for
 SparseVector dot products

Follow up for https://github.com/apache/spark/pull/5946

Currently we iterate over indices and values in SparseVector and can be vectorized.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7222 from MechCoder/sparse_optim and squashes the following commits:

dcb51d3 [MechCoder] [SPARK-8823] [MLlib] [PySpark] Optimizations for SparseVector dot product
---
 python/pyspark/mllib/linalg.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 9959a01cce7e0..12d8dbbb92c56 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -590,18 +590,14 @@ def dot(self, other):
             return np.dot(other.array[self.indices], self.values)
 
         elif isinstance(other, SparseVector):
-            result = 0.0
-            i, j = 0, 0
-            while i < len(self.indices) and j < len(other.indices):
-                if self.indices[i] == other.indices[j]:
-                    result += self.values[i] * other.values[j]
-                    i += 1
-                    j += 1
-                elif self.indices[i] < other.indices[j]:
-                    i += 1
-                else:
-                    j += 1
-            return result
+            # Find out common indices.
+            self_cmind = np.in1d(self.indices, other.indices, assume_unique=True)
+            self_values = self.values[self_cmind]
+            if self_values.size == 0:
+                return 0.0
+            else:
+                other_cmind = np.in1d(other.indices, self.indices, assume_unique=True)
+                return np.dot(self_values, other.values[other_cmind])
 
         else:
             return self.dot(_convert_to_vector(other))

From 70beb808e13f6371968ac87f7cf625ed110375e6 Mon Sep 17 00:00:00 2001
From: Simon Hafner <hafnersimon@gmail.com>
Date: Tue, 7 Jul 2015 09:42:59 -0700
Subject: [PATCH 035/549] [SPARK-8821] [EC2] Switched to binary mode for file
 reading

Otherwise the script will crash with

    - Downloading boto...
    Traceback (most recent call last):
      File "ec2/spark_ec2.py", line 148, in <module>
        setup_external_libs(external_libs)
      File "ec2/spark_ec2.py", line 128, in setup_external_libs
        if hashlib.md5(tar.read()).hexdigest() != lib["md5"]:
      File "/usr/lib/python3.4/codecs.py", line 319, in decode
        (result, consumed) = self._buffer_decode(data, self.errors, final)
    UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

In case of an utf8 env setting.

Author: Simon Hafner <hafnersimon@gmail.com>

Closes #7215 from reactormonk/branch-1.4 and squashes the following commits:

e86957a [Simon Hafner] [SPARK-8821] [EC2] Switched to binary mode

(cherry picked from commit 83a621a5a8f8a2991c4cfa687279589e5c623d46)
Signed-off-by: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
---
 ec2/spark_ec2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 18ccbc0a3edd0..8582d434a8c55 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -125,7 +125,7 @@ def setup_external_libs(libs):
             )
             with open(tgz_file_path, "wb") as tgz_file:
                 tgz_file.write(download_stream.read())
-            with open(tgz_file_path) as tar:
+            with open(tgz_file_path, "rb") as tar:
                 if hashlib.md5(tar.read()).hexdigest() != lib["md5"]:
                     print("ERROR: Got wrong md5sum for {lib}.".format(lib=lib["name"]), file=stderr)
                     sys.exit(1)

From 3336c7b148ad543d1f9b64ca2b559ea04930f5be Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Tue, 7 Jul 2015 11:34:30 -0700
Subject: [PATCH 036/549] [SPARK-8559] [MLLIB] Support Association Rule
 Generation

Distributed generation of single-consequent association rules from a RDD of frequent itemsets. Tests referenced against `R`'s implementation of A Priori in [arules](http://cran.r-project.org/web/packages/arules/index.html).

Author: Feynman Liang <fliang@databricks.com>

Closes #7005 from feynmanliang/fp-association-rules-distributed and squashes the following commits:

466ced0 [Feynman Liang] Refactor AR generation impl
73c1cff [Feynman Liang] Make rule attributes public, remove numTransactions from FreqItemset
80f63ff [Feynman Liang] Change default confidence and optimize imports
04cf5b5 [Feynman Liang] Code review with @mengxr, add R to tests
0cc1a6a [Feynman Liang] Java compatibility test
f3c14b5 [Feynman Liang] Fix MiMa test
764375e [Feynman Liang] Fix tests
1187307 [Feynman Liang] Almost working tests
b20779b [Feynman Liang] Working implementation
5395c4e [Feynman Liang] Fix imports
2d34405 [Feynman Liang] Partial implementation of distributed ar
83ace4b [Feynman Liang] Local rule generation without pruning complete
69c2c87 [Feynman Liang] Working local implementation, now to parallelize../..
4e1ec9a [Feynman Liang] Pull FreqItemsets out, refactor type param, tests
69ccedc [Feynman Liang] First implementation of association rule generation
---
 .../spark/mllib/fpm/AssociationRules.scala    | 108 ++++++++++++++++++
 .../org/apache/spark/mllib/fpm/FPGrowth.scala |   2 +-
 .../mllib/fpm/JavaAssociationRulesSuite.java  |  58 ++++++++++
 .../spark/mllib/fpm/JavaFPGrowthSuite.java    |   5 +-
 .../mllib/fpm/AssociationRulesSuite.scala     |  89 +++++++++++++++
 5 files changed, 258 insertions(+), 4 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
 create mode 100644 mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java
 create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
new file mode 100644
index 0000000000000..4a0f842f3338d
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.mllib.fpm
+
+import scala.reflect.ClassTag
+
+import org.apache.spark.Logging
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.api.java.JavaRDD
+import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
+import org.apache.spark.mllib.fpm.AssociationRules.Rule
+import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset
+import org.apache.spark.rdd.RDD
+
+/**
+ * :: Experimental ::
+ *
+ * Generates association rules from a [[RDD[FreqItemset[Item]]]. This method only generates
+ * association rules which have a single item as the consequent.
+ */
+@Experimental
+class AssociationRules private (
+    private var minConfidence: Double) extends Logging with Serializable {
+
+  /**
+   * Constructs a default instance with default parameters {minConfidence = 0.8}.
+   */
+  def this() = this(0.8)
+
+  /**
+   * Sets the minimal confidence (default: `0.8`).
+   */
+  def setMinConfidence(minConfidence: Double): this.type = {
+    this.minConfidence = minConfidence
+    this
+  }
+
+  /**
+   * Computes the association rules with confidence above [[minConfidence]].
+   * @param freqItemsets frequent itemset model obtained from [[FPGrowth]]
+   * @return a [[Set[Rule[Item]]] containing the assocation rules.
+   */
+  def run[Item: ClassTag](freqItemsets: RDD[FreqItemset[Item]]): RDD[Rule[Item]] = {
+    // For candidate rule X => Y, generate (X, (Y, freq(X union Y)))
+    val candidates = freqItemsets.flatMap { itemset =>
+      val items = itemset.items
+      items.flatMap { item =>
+        items.partition(_ == item) match {
+          case (consequent, antecedent) if !antecedent.isEmpty =>
+            Some((antecedent.toSeq, (consequent.toSeq, itemset.freq)))
+          case _ => None
+        }
+      }
+    }
+
+    // Join to get (X, ((Y, freq(X union Y)), freq(X))), generate rules, and filter by confidence
+    candidates.join(freqItemsets.map(x => (x.items.toSeq, x.freq)))
+      .map { case (antecendent, ((consequent, freqUnion), freqAntecedent)) =>
+      new Rule(antecendent.toArray, consequent.toArray, freqUnion, freqAntecedent)
+    }.filter(_.confidence >= minConfidence)
+  }
+
+  def run[Item](freqItemsets: JavaRDD[FreqItemset[Item]]): JavaRDD[Rule[Item]] = {
+    val tag = fakeClassTag[Item]
+    run(freqItemsets.rdd)(tag)
+  }
+}
+
+object AssociationRules {
+
+  /**
+   * :: Experimental ::
+   *
+   * An association rule between sets of items.
+   * @param antecedent hypotheses of the rule
+   * @param consequent conclusion of the rule
+   * @tparam Item item type
+   */
+  @Experimental
+  class Rule[Item] private[mllib] (
+      val antecedent: Array[Item],
+      val consequent: Array[Item],
+      freqUnion: Double,
+      freqAntecedent: Double) extends Serializable {
+
+    def confidence: Double = freqUnion.toDouble / freqAntecedent
+
+    require(antecedent.toSet.intersect(consequent.toSet).isEmpty, {
+      val sharedItems = antecedent.toSet.intersect(consequent.toSet)
+      s"A valid association rule must have disjoint antecedent and " +
+        s"consequent but ${sharedItems} is present in both."
+    })
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index efa8459d3cdba..0da59e812d5f9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -28,7 +28,7 @@ import org.apache.spark.{HashPartitioner, Logging, Partitioner, SparkException}
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
-import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset
+import org.apache.spark.mllib.fpm.FPGrowth._
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java
new file mode 100644
index 0000000000000..b3815ae6039c0
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.mllib.fpm;
+
+import java.io.Serializable;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset;
+
+
+public class JavaAssociationRulesSuite implements Serializable {
+  private transient JavaSparkContext sc;
+
+  @Before
+  public void setUp() {
+    sc = new JavaSparkContext("local", "JavaFPGrowth");
+  }
+
+  @After
+  public void tearDown() {
+    sc.stop();
+    sc = null;
+  }
+
+  @Test
+  public void runAssociationRules() {
+
+    @SuppressWarnings("unchecked")
+    JavaRDD<FPGrowth.FreqItemset<String>> freqItemsets = sc.parallelize(Lists.newArrayList(
+      new FreqItemset<String>(new String[] {"a"}, 15L),
+      new FreqItemset<String>(new String[] {"b"}, 35L),
+      new FreqItemset<String>(new String[] {"a", "b"}, 18L)
+    ));
+
+    JavaRDD<AssociationRules.Rule<String>> results = (new AssociationRules()).run(freqItemsets);
+  }
+}
+
diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java
index bd0edf2b9ea62..9ce2c52dca8b6 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java
@@ -29,7 +29,6 @@
 
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset;
 
 public class JavaFPGrowthSuite implements Serializable {
   private transient JavaSparkContext sc;
@@ -62,10 +61,10 @@ public void runFPGrowth() {
       .setNumPartitions(2)
       .run(rdd);
 
-    List<FreqItemset<String>> freqItemsets = model.freqItemsets().toJavaRDD().collect();
+    List<FPGrowth.FreqItemset<String>> freqItemsets = model.freqItemsets().toJavaRDD().collect();
     assertEquals(18, freqItemsets.size());
 
-    for (FreqItemset<String> itemset: freqItemsets) {
+    for (FPGrowth.FreqItemset<String> itemset: freqItemsets) {
       // Test return types.
       List<String> items = itemset.javaItems();
       long freq = itemset.freq();
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala
new file mode 100644
index 0000000000000..77a2773c36f56
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.mllib.fpm
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+
+class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  test("association rules using String type") {
+    val freqItemsets = sc.parallelize(Seq(
+      (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
+      (Set("r"), 3L),
+      (Set("x", "z"), 3L), (Set("t", "y"), 3L), (Set("t", "x"), 3L), (Set("s", "x"), 3L),
+      (Set("y", "x"), 3L), (Set("y", "z"), 3L), (Set("t", "z"), 3L),
+      (Set("y", "x", "z"), 3L), (Set("t", "x", "z"), 3L), (Set("t", "y", "z"), 3L),
+      (Set("t", "y", "x"), 3L),
+      (Set("t", "y", "x", "z"), 3L)
+    ).map {
+      case (items, freq) => new FPGrowth.FreqItemset(items.toArray, freq)
+    })
+
+    val ar = new AssociationRules()
+
+    val results1 = ar
+      .setMinConfidence(0.9)
+      .run(freqItemsets)
+      .collect()
+
+    /* Verify results using the `R` code:
+       transactions = as(sapply(
+         list("r z h k p",
+              "z y x w v u t s",
+              "s x o n r",
+              "x z y m t s q e",
+              "z",
+              "x z y r q t p"),
+         FUN=function(x) strsplit(x," ",fixed=TRUE)),
+         "transactions")
+       ars = apriori(transactions,
+                     parameter = list(support = 0.0, confidence = 0.5, target="rules", minlen=2))
+       arsDF = as(ars, "data.frame")
+       arsDF$support = arsDF$support * length(transactions)
+       names(arsDF)[names(arsDF) == "support"] = "freq"
+       > nrow(arsDF)
+       [1] 23
+       > sum(arsDF$confidence == 1)
+       [1] 23
+     */
+    assert(results1.size === 23)
+    assert(results1.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
+
+    val results2 = ar
+      .setMinConfidence(0)
+      .run(freqItemsets)
+      .collect()
+
+    /* Verify results using the `R` code:
+       ars = apriori(transactions,
+                  parameter = list(support = 0.5, confidence = 0.5, target="rules", minlen=2))
+       arsDF = as(ars, "data.frame")
+       arsDF$support = arsDF$support * length(transactions)
+       names(arsDF)[names(arsDF) == "support"] = "freq"
+       nrow(arsDF)
+       sum(arsDF$confidence == 1)
+       > nrow(arsDF)
+       [1] 30
+       > sum(arsDF$confidence == 1)
+       [1] 23
+     */
+    assert(results2.size === 30)
+    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
+  }
+}
+

From 35d781e71b68eb6da7f49fdae40fa6c4f8e27060 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 7 Jul 2015 12:35:40 -0700
Subject: [PATCH 037/549] [SPARK-8704] [ML] [PySpark] Add missing methods in
 StandardScaler

Add std, mean to StandardScalerModel
getVectors, findSynonyms to Word2Vec Model
setFeatures and getFeatures to hashingTF

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7086 from MechCoder/missing_model_methods and squashes the following commits:

9fbae90 [MechCoder] Add type
6e3d6b2 [MechCoder] [SPARK-8704] Add missing methods in StandardScaler (ML and PySpark)
---
 .../spark/ml/feature/StandardScaler.scala      |  6 ++++++
 python/pyspark/ml/feature.py                   | 18 ++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index ca3c1cfb56b7f..72b545e5db3e4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -106,6 +106,12 @@ class StandardScalerModel private[ml] (
     scaler: feature.StandardScalerModel)
   extends Model[StandardScalerModel] with StandardScalerParams {
 
+  /** Standard deviation of the StandardScalerModel */
+  val std: Vector = scaler.std
+
+  /** Mean of the StandardScalerModel */
+  val mean: Vector = scaler.mean
+
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8804dace849b3..9bca7cc000aa5 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -627,6 +627,10 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
     >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
     >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")
     >>> model = standardScaler.fit(df)
+    >>> model.mean
+    DenseVector([1.0])
+    >>> model.std
+    DenseVector([1.4142])
     >>> model.transform(df).collect()[1].scaled
     DenseVector([1.4142])
     """
@@ -692,6 +696,20 @@ class StandardScalerModel(JavaModel):
     Model fitted by StandardScaler.
     """
 
+    @property
+    def std(self):
+        """
+        Standard deviation of the StandardScalerModel.
+        """
+        return self._call_java("std")
+
+    @property
+    def mean(self):
+        """
+        Mean of the StandardScalerModel.
+        """
+        return self._call_java("mean")
+
 
 @inherit_doc
 class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol):

From 3bf20c27ff3cb3a32bfc3a44e08a57865957c117 Mon Sep 17 00:00:00 2001
From: DB Tsai <dbt@netflix.com>
Date: Tue, 7 Jul 2015 15:46:44 -0700
Subject: [PATCH 038/549] [SPARK-8845] [ML] ML use of Breeze optimization: use
 adjustedValue  instead of value

In LinearRegression and LogisticRegression, we use Breeze's optimizers (LBFGS and OWLQN). We check the State.value to see the current objective. However, Breeze's documentation makes it sound like value and adjustedValue differ for some optimizers, possibly including OWLQN: https://github.com/scalanlp/breeze/blob/26faf622862e8d7a42a401aef601347aac655f2b/math/src/main/scala/breeze/optimize/FirstOrderMinimizer.scala#L36
If that is the case, then we should use adjustedValue instead of value. This is relevant to SPARK-8538 and SPARK-8539, where we will provide the objective trace to the user.

Author: DB Tsai <dbt@netflix.com>

Closes #7245 from dbtsai/SPARK-8845 and squashes the following commits:

fa4c91e [DB Tsai] address feedback
e6caac1 [DB Tsai] java style multiline comment
b10c574 [DB Tsai] address feedback
c9ff81e [DB Tsai] first commit
---
 .../classification/LogisticRegression.scala   | 83 ++++++++++---------
 .../ml/regression/LinearRegression.scala      | 57 ++++++++-----
 2 files changed, 80 insertions(+), 60 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 2e6eedd45ab07..3967151f76c7c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -116,7 +116,7 @@ class LogisticRegression(override val uid: String)
           case ((summarizer: MultivariateOnlineSummarizer, labelSummarizer: MultiClassSummarizer),
           (label: Double, features: Vector)) =>
             (summarizer.add(features), labelSummarizer.add(label))
-      },
+        },
         combOp = (c1, c2) => (c1, c2) match {
           case ((summarizer1: MultivariateOnlineSummarizer,
           classSummarizer1: MultiClassSummarizer), (summarizer2: MultivariateOnlineSummarizer,
@@ -166,18 +166,18 @@ class LogisticRegression(override val uid: String)
       Vectors.zeros(if ($(fitIntercept)) numFeatures + 1 else numFeatures)
 
     if ($(fitIntercept)) {
-      /**
-       * For binary logistic regression, when we initialize the weights as zeros,
-       * it will converge faster if we initialize the intercept such that
-       * it follows the distribution of the labels.
-       *
-       * {{{
-       * P(0) = 1 / (1 + \exp(b)), and
-       * P(1) = \exp(b) / (1 + \exp(b))
-       * }}}, hence
-       * {{{
-       * b = \log{P(1) / P(0)} = \log{count_1 / count_0}
-       * }}}
+      /*
+         For binary logistic regression, when we initialize the weights as zeros,
+         it will converge faster if we initialize the intercept such that
+         it follows the distribution of the labels.
+
+         {{{
+         P(0) = 1 / (1 + \exp(b)), and
+         P(1) = \exp(b) / (1 + \exp(b))
+         }}}, hence
+         {{{
+         b = \log{P(1) / P(0)} = \log{count_1 / count_0}
+         }}}
        */
       initialWeightsWithIntercept.toArray(numFeatures)
         = math.log(histogram(1).toDouble / histogram(0).toDouble)
@@ -186,39 +186,48 @@ class LogisticRegression(override val uid: String)
     val states = optimizer.iterations(new CachedDiffFunction(costFun),
       initialWeightsWithIntercept.toBreeze.toDenseVector)
 
-    var state = states.next()
-    val lossHistory = mutable.ArrayBuilder.make[Double]
+    val (weights, intercept, objectiveHistory) = {
+      /*
+         Note that in Logistic Regression, the objective history (loss + regularization)
+         is log-likelihood which is invariance under feature standardization. As a result,
+         the objective history from optimizer is the same as the one in the original space.
+       */
+      val arrayBuilder = mutable.ArrayBuilder.make[Double]
+      var state: optimizer.State = null
+      while (states.hasNext) {
+        state = states.next()
+        arrayBuilder += state.adjustedValue
+      }
 
-    while (states.hasNext) {
-      lossHistory += state.value
-      state = states.next()
-    }
-    lossHistory += state.value
+      if (state == null) {
+        val msg = s"${optimizer.getClass.getName} failed."
+        logError(msg)
+        throw new SparkException(msg)
+      }
 
-    // The weights are trained in the scaled space; we're converting them back to
-    // the original space.
-    val weightsWithIntercept = {
+      /*
+         The weights are trained in the scaled space; we're converting them back to
+         the original space.
+         Note that the intercept in scaled space and original space is the same;
+         as a result, no scaling is needed.
+       */
       val rawWeights = state.x.toArray.clone()
       var i = 0
-      // Note that the intercept in scaled space and original space is the same;
-      // as a result, no scaling is needed.
       while (i < numFeatures) {
         rawWeights(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 }
         i += 1
       }
-      Vectors.dense(rawWeights)
+
+      if ($(fitIntercept)) {
+        (Vectors.dense(rawWeights.dropRight(1)).compressed, rawWeights.last, arrayBuilder.result())
+      } else {
+        (Vectors.dense(rawWeights).compressed, 0.0, arrayBuilder.result())
+      }
     }
 
     if (handlePersistence) instances.unpersist()
 
-    val (weights, intercept) = if ($(fitIntercept)) {
-      (Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)),
-        weightsWithIntercept(weightsWithIntercept.size - 1))
-    } else {
-      (weightsWithIntercept, 0.0)
-    }
-
-    new LogisticRegressionModel(uid, weights.compressed, intercept)
+    copyValues(new LogisticRegressionModel(uid, weights, intercept))
   }
 
   override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra)
@@ -423,16 +432,12 @@ private class LogisticAggregator(
     require(dim == data.size, s"Dimensions mismatch when adding new sample." +
       s" Expecting $dim but got ${data.size}.")
 
-    val dataSize = data.size
-
     val localWeightsArray = weightsArray
     val localGradientSumArray = gradientSumArray
 
     numClasses match {
       case 2 =>
-        /**
-         * For Binary Logistic Regression.
-         */
+        // For Binary Logistic Regression.
         val margin = - {
           var sum = 0.0
           data.foreachActive { (index, value) =>
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 1b1d7299fb496..f672c96576a33 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -22,7 +22,7 @@ import scala.collection.mutable
 import breeze.linalg.{DenseVector => BDV, norm => brzNorm}
 import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
 
-import org.apache.spark.Logging
+import org.apache.spark.{SparkException, Logging}
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.ParamMap
@@ -132,7 +132,6 @@ class LinearRegression(override val uid: String)
     val numFeatures = summarizer.mean.size
     val yMean = statCounter.mean
     val yStd = math.sqrt(statCounter.variance)
-    // look at glmnet5.m L761 maaaybe that has info
 
     // If the yStd is zero, then the intercept is yMean with zero weights;
     // as a result, training is not needed.
@@ -162,21 +161,34 @@ class LinearRegression(override val uid: String)
     }
 
     val initialWeights = Vectors.zeros(numFeatures)
-    val states =
-      optimizer.iterations(new CachedDiffFunction(costFun), initialWeights.toBreeze.toDenseVector)
-
-    var state = states.next()
-    val lossHistory = mutable.ArrayBuilder.make[Double]
+    val states = optimizer.iterations(new CachedDiffFunction(costFun),
+      initialWeights.toBreeze.toDenseVector)
+
+    val (weights, objectiveHistory) = {
+      /*
+         Note that in Linear Regression, the objective history (loss + regularization) returned
+         from optimizer is computed in the scaled space given by the following formula.
+         {{{
+         L = 1/2n||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2 + regTerms
+         }}}
+       */
+      val arrayBuilder = mutable.ArrayBuilder.make[Double]
+      var state: optimizer.State = null
+      while (states.hasNext) {
+        state = states.next()
+        arrayBuilder += state.adjustedValue
+      }
 
-    while (states.hasNext) {
-      lossHistory += state.value
-      state = states.next()
-    }
-    lossHistory += state.value
+      if (state == null) {
+        val msg = s"${optimizer.getClass.getName} failed."
+        logError(msg)
+        throw new SparkException(msg)
+      }
 
-    // The weights are trained in the scaled space; we're converting them back to
-    // the original space.
-    val weights = {
+      /*
+         The weights are trained in the scaled space; we're converting them back to
+         the original space.
+       */
       val rawWeights = state.x.toArray.clone()
       var i = 0
       val len = rawWeights.length
@@ -184,17 +196,20 @@ class LinearRegression(override val uid: String)
         rawWeights(i) *= { if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 }
         i += 1
       }
-      Vectors.dense(rawWeights)
+
+      (Vectors.dense(rawWeights).compressed, arrayBuilder.result())
     }
 
-    // The intercept in R's GLMNET is computed using closed form after the coefficients are
-    // converged. See the following discussion for detail.
-    // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet
+    /*
+       The intercept in R's GLMNET is computed using closed form after the coefficients are
+       converged. See the following discussion for detail.
+       http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet
+     */
     val intercept = if ($(fitIntercept)) yMean - dot(weights, Vectors.dense(featuresMean)) else 0.0
+
     if (handlePersistence) instances.unpersist()
 
-    // TODO: Converts to sparse format based on the storage, but may base on the scoring speed.
-    copyValues(new LinearRegressionModel(uid, weights.compressed, intercept))
+    copyValues(new LinearRegressionModel(uid, weights, intercept))
   }
 
   override def copy(extra: ParamMap): LinearRegression = defaultCopy(extra)

From da56c4e728d6bdb6d5559ec7fd8a3f7ddd1e2319 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Tue, 7 Jul 2015 15:49:22 -0700
Subject: [PATCH 039/549] [SPARK-8794] [SQL] Make PrunedScan work for Sample

JIRA: https://issues.apache.org/jira/browse/SPARK-8794

Currently `PrunedScan` works only when followed by project or filter operations. However, even if there is a `Sample` between these operations and `PrunedScan`, `PrunedScan` should work too.

Author: Liang-Chi Hsieh <viirya@appier.com>
Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #7228 from viirya/sample_prunedscan and squashes the following commits:

ede7cd8 [Liang-Chi Hsieh] Keep PrunedScanSuite untouched.
6f05d30 [Liang-Chi Hsieh] Move unit test to FilterPushdownSuite.
5f32473 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into sample_prunedscan
7e4ba76 [Liang-Chi Hsieh] Use Optimzier for push down projection and filter.
0686830 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into sample_prunedscan
df82785 [Liang-Chi Hsieh] Make PrunedScan work on Sample.
---
 .../sql/catalyst/optimizer/Optimizer.scala     | 18 ++++++++++++++++++
 .../optimizer/FilterPushdownSuite.scala        | 16 ++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index bfd24287c9645..7d41ef9aafa44 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -41,6 +41,7 @@ object DefaultOptimizer extends Optimizer {
     Batch("Operator Optimizations", FixedPoint(100),
       // Operator push down
       UnionPushDown,
+      SamplePushDown,
       PushPredicateThroughJoin,
       PushPredicateThroughProject,
       PushPredicateThroughGenerate,
@@ -65,6 +66,23 @@ object DefaultOptimizer extends Optimizer {
       ConvertToLocalRelation) :: Nil
 }
 
+/**
+ * Pushes operations down into a Sample.
+ */
+object SamplePushDown extends Rule[LogicalPlan] {
+
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    // Push down filter into sample
+    case Filter(condition, s @ Sample(lb, up, replace, seed, child)) =>
+      Sample(lb, up, replace, seed,
+        Filter(condition, child))
+    // Push down projection into sample
+    case Project(projectList, s @ Sample(lb, up, replace, seed, child)) =>
+      Sample(lb, up, replace, seed,
+        Project(projectList, child))
+  }
+}
+
 /**
  * Pushes operations to either side of a Union.
  */
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
index ffdc673cdc455..dc28b3ffb59ee 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
@@ -34,6 +34,7 @@ class FilterPushdownSuite extends PlanTest {
       Batch("Subqueries", Once,
         EliminateSubQueries) ::
       Batch("Filter Pushdown", Once,
+        SamplePushDown,
         CombineFilters,
         PushPredicateThroughProject,
         BooleanSimplification,
@@ -593,4 +594,19 @@ class FilterPushdownSuite extends PlanTest {
 
     comparePlans(optimized1, analysis.EliminateSubQueries(correctAnswer1))
   }
+
+  test("push project and filter down into sample") {
+    val x = testRelation.subquery('x)
+    val originalQuery =
+      Sample(0.0, 0.6, false, 11L, x).select('a)
+
+    val originalQueryAnalyzed = EliminateSubQueries(analysis.SimpleAnalyzer.execute(originalQuery))
+
+    val optimized = Optimize.execute(originalQueryAnalyzed)
+
+    val correctAnswer =
+      Sample(0.0, 0.6, false, 11L, x.select('a))
+
+    comparePlans(optimized, correctAnswer.analyze)
+  }
 }

From 770ff1025e751ad139b6284c1d7cc3de778b1d87 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 7 Jul 2015 17:40:14 -0700
Subject: [PATCH 040/549] [SPARK-8876][SQL] Remove InternalRow type alias in
 expressions package.

The type alias was there because initially when I moved Row around, I didn't want to do massive changes to the expression code. But now it should be pretty easy to just remove it. One less concept to worry about.

Author: Reynold Xin <rxin@databricks.com>

Closes #7270 from rxin/internalrow and squashes the following commits:

72fc842 [Reynold Xin] [SPARK-8876][SQL] Remove InternalRow type alias in expressions package.
---
 .../spark/sql/catalyst/analysis/unresolved.scala   |  1 +
 .../sql/catalyst/expressions/BoundAttribute.scala  |  1 +
 .../spark/sql/catalyst/expressions/Cast.scala      |  1 +
 .../sql/catalyst/expressions/Expression.scala      |  1 +
 .../sql/catalyst/expressions/ExtractValue.scala    |  2 ++
 .../sql/catalyst/expressions/Projection.scala      |  2 ++
 .../spark/sql/catalyst/expressions/ScalaUDF.scala  |  1 +
 .../spark/sql/catalyst/expressions/SortOrder.scala |  1 +
 .../catalyst/expressions/SpecificMutableRow.scala  |  1 +
 .../catalyst/expressions/UnsafeRowConverter.scala  |  1 +
 .../sql/catalyst/expressions/aggregates.scala      |  1 +
 .../sql/catalyst/expressions/arithmetic.scala      |  1 +
 .../expressions/codegen/GenerateOrdering.scala     |  8 +++-----
 .../expressions/codegen/GeneratePredicate.scala    |  1 +
 .../catalyst/expressions/complexTypeCreator.scala  |  4 +---
 .../sql/catalyst/expressions/conditionals.scala    |  2 +-
 .../catalyst/expressions/datetimeFunctions.scala   |  6 +++++-
 .../sql/catalyst/expressions/generators.scala      |  1 +
 .../spark/sql/catalyst/expressions/literals.scala  |  2 +-
 .../spark/sql/catalyst/expressions/math.scala      |  1 +
 .../catalyst/expressions/namedExpressions.scala    |  1 +
 .../sql/catalyst/expressions/nullFunctions.scala   |  1 +
 .../spark/sql/catalyst/expressions/package.scala   |  4 ----
 .../sql/catalyst/expressions/predicates.scala      |  1 +
 .../spark/sql/catalyst/expressions/random.scala    |  1 +
 .../spark/sql/catalyst/expressions/rows.scala      |  1 +
 .../spark/sql/catalyst/expressions/sets.scala      |  1 +
 .../catalyst/expressions/stringOperations.scala    |  3 +++
 .../catalyst/expressions/windowExpressions.scala   |  1 +
 .../spark/sql/catalyst/expressions/CastSuite.scala |  1 +
 .../catalyst/expressions/ComplexTypeSuite.scala    |  4 +---
 .../expressions/ExpressionEvalHelper.scala         |  1 +
 .../UnsafeFixedWidthAggregationMapSuite.scala      |  1 +
 .../expressions/UnsafeRowConverterSuite.scala      |  1 +
 .../optimizer/ConvertToLocalRelationSuite.scala    |  2 +-
 .../spark/sql/catalyst/trees/TreeNodeSuite.scala   |  1 +
 .../scala/org/apache/spark/sql/DataFrame.scala     |  1 +
 .../sql/columnar/InMemoryColumnarTableScan.scala   |  1 +
 .../org/apache/spark/sql/execution/Aggregate.scala |  1 +
 .../org/apache/spark/sql/execution/Exchange.scala  |  1 +
 .../org/apache/spark/sql/execution/Expand.scala    |  1 +
 .../org/apache/spark/sql/execution/Generate.scala  |  3 ++-
 .../spark/sql/execution/GeneratedAggregate.scala   |  1 +
 .../org/apache/spark/sql/execution/SparkPlan.scala |  1 +
 .../spark/sql/execution/SparkStrategies.scala      |  1 +
 .../org/apache/spark/sql/execution/Window.scala    |  1 +
 .../spark/sql/execution/basicOperators.scala       |  1 +
 .../expressions/MonotonicallyIncreasingID.scala    |  3 ++-
 .../execution/expressions/SparkPartitionID.scala   |  3 ++-
 .../sql/execution/joins/BroadcastHashJoin.scala    |  3 ++-
 .../execution/joins/BroadcastHashOuterJoin.scala   |  1 +
 .../joins/BroadcastLeftSemiJoinHash.scala          |  1 +
 .../execution/joins/BroadcastNestedLoopJoin.scala  |  1 +
 .../spark/sql/execution/joins/HashJoin.scala       |  1 +
 .../spark/sql/execution/joins/HashOuterJoin.scala  |  1 +
 .../spark/sql/execution/joins/HashedRelation.scala |  3 ++-
 .../sql/execution/joins/LeftSemiJoinBNL.scala      |  1 +
 .../sql/execution/joins/LeftSemiJoinHash.scala     |  3 ++-
 .../execution/joins/ShuffledHashOuterJoin.scala    |  5 +++--
 .../spark/sql/execution/joins/SortMergeJoin.scala  |  1 +
 .../apache/spark/sql/execution/pythonUDFs.scala    |  1 +
 .../spark/sql/execution/stat/FrequentItems.scala   |  2 +-
 .../scala/org/apache/spark/sql/jdbc/JDBCRDD.scala  |  3 ++-
 .../org/apache/spark/sql/json/JacksonParser.scala  |  1 +
 .../scala/org/apache/spark/sql/json/JsonRDD.scala  |  1 +
 .../spark/sql/parquet/ParquetConverter.scala       |  1 +
 .../spark/sql/parquet/ParquetTableOperations.scala | 14 +++++---------
 .../spark/sql/parquet/ParquetTableSupport.scala    |  3 ++-
 .../spark/sql/sources/DataSourceStrategy.scala     |  1 +
 .../org/apache/spark/sql/sources/commands.scala    |  1 +
 .../scala/org/apache/spark/sql/sources/ddl.scala   |  3 ++-
 .../org/apache/spark/sql/sources/interfaces.scala  |  3 ++-
 .../spark/sql/columnar/ColumnStatsSuite.scala      |  2 +-
 .../sql/execution/joins/HashedRelationSuite.scala  |  3 ++-
 .../org/apache/spark/sql/hive/HiveInspectors.scala |  1 +
 .../spark/sql/hive/HiveMetastoreCatalog.scala      |  1 +
 .../org/apache/spark/sql/hive/HiveStrategies.scala |  3 ++-
 .../org/apache/spark/sql/hive/TableReader.scala    |  1 +
 .../spark/sql/hive/execution/HiveTableScan.scala   |  1 +
 .../sql/hive/execution/InsertIntoHiveTable.scala   |  3 ++-
 .../sql/hive/execution/ScriptTransformation.scala  |  2 +-
 .../scala/org/apache/spark/sql/hive/hiveUDFs.scala |  1 +
 .../apache/spark/sql/hive/orc/OrcRelation.scala    |  1 +
 .../apache/spark/sql/hive/HiveInspectorSuite.scala |  3 ++-
 .../sql/hive/orc/OrcPartitionDiscoverySuite.scala  |  2 +-
 85 files changed, 114 insertions(+), 48 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index ae3adbab05108..f2e579afe833a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.{errors, trees}
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index dc0b4ac5cd9bb..3f0d7b803125f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors.attachTree
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.trees
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 4f73ba40b5a18..cfe201bf83070 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -21,6 +21,7 @@ import java.math.{BigDecimal => JavaBigDecimal}
 import java.sql.{Date, Timestamp}
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 386feb95b4ac8..54ec10444c4f3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedAttribute}
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.trees
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
index e451c7ffbddc1..2b25ba03579ec 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
@@ -20,10 +20,12 @@ package org.apache.spark.sql.catalyst.expressions
 import scala.collection.Map
 
 import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types._
 
+
 object ExtractValue {
   /**
    * Returns the resolved `ExtractValue`. It will return one kind of concrete `ExtractValue`,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index fcfe83ceb863a..886a486bf5ee0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
+
 /**
  * A [[Projection]] that is calculated by calling the `eval` of each of the specified expressions.
  * @param expressions a sequence of expressions that determine the value of each column of the
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
index fc055c97a179f..6fb3343bb63f2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.types.DataType
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
index 4baae03b3a224..8ab4ef060b68c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.types.DataType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
index 3928c0f2ffdaf..efa24710a5a67 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
index b11fc245c4af9..1f395497a9839 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.ObjectPool
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.PlatformDependent
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index 64e07bd2a17db..c0e17f97e9b3c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import com.clearspring.analytics.stream.cardinality.HyperLogLog
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index dca6642665741..8476af4a5d8d6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.TypeUtils
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
index 97cb16045ae4a..d05dfc108e63a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.Logging
 import org.apache.spark.annotation.Private
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 
 /**
@@ -33,11 +33,9 @@ class BaseOrdering extends Ordering[InternalRow] {
 }
 
 /**
- * Generates bytecode for an [[Ordering]] of [[Row Rows]] for a given set of
- * [[Expression Expressions]].
+ * Generates bytecode for an [[Ordering]] of rows for a given set of expressions.
  */
-object GenerateOrdering
-    extends CodeGenerator[Seq[SortOrder], Ordering[InternalRow]] with Logging {
+object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalRow]] with Logging {
 
   protected def canonicalize(in: Seq[SortOrder]): Seq[SortOrder] =
     in.map(ExpressionCanonicalizer.execute(_).asInstanceOf[SortOrder])
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
index 3ebc2c147579b..274a42cb69087 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions.codegen
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 
 /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index fa70409353e79..65b9d7e608320 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -17,12 +17,10 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.util.TypeUtils
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * Returns an Array containing the evaluation of all children expressions.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index 1d7393d3d91f9..395e84f089e41 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types.{BooleanType, DataType}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index 13ba2f2e5d62d..a492b966a5e31 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -17,13 +17,15 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 
 /**
  * Returns the current date at the start of query evaluation.
  * All calls of current_date within the same query return the same value.
+ *
+ * There is no code generation since this expression should get constant folded by the optimizer.
  */
 case class CurrentDate() extends LeafExpression {
   override def foldable: Boolean = true
@@ -39,6 +41,8 @@ case class CurrentDate() extends LeafExpression {
 /**
  * Returns the current timestamp at the start of query evaluation.
  * All calls of current_timestamp within the same query return the same value.
+ *
+ * There is no code generation since this expression should get constant folded by the optimizer.
  */
 case class CurrentTimestamp() extends LeafExpression {
   override def foldable: Boolean = true
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index 7a42a1d310581..b68d30a26abd8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import scala.collection.Map
 
 import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, trees}
 import org.apache.spark.sql.types._
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 479224af5627a..3a7a7ae440036 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.sql.{Date, Timestamp}
 
-import org.apache.spark.sql.catalyst
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 9dca8513c0d9c..c31890e27fb54 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.{lang => jl}
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 81ebda3060c51..6181c60c0e453 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index 145d323a9f0bb..1522bcae08d17 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.TypeUtils
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
index d24d74e7b82ad..5155bfe41ba66 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
@@ -49,10 +49,6 @@ package org.apache.spark.sql.catalyst
  */
 package object expressions  {
 
-  type InternalRow = org.apache.spark.sql.catalyst.InternalRow
-
-  val InternalRow = org.apache.spark.sql.catalyst.InternalRow
-
   /**
    * Converts a [[InternalRow]] to another Row given a sequence of expression that define each
    * column of the new row. If the schema of the input row is specified, then the given expression
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 402a0aa232493..f74fd04619714 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index 45588bacd2e45..6cdc3000382e2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types.{DataType, DoubleType}
 import org.apache.spark.util.Utils
 import org.apache.spark.util.random.XORShiftRandom
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index dd5f2ed2d382e..1363884c201bc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types.{DataType, StructType, AtomicType}
 import org.apache.spark.unsafe.types.UTF8String
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
index 9b44fb1ed5b3f..49b2026364cd6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 6e6a7fb171214..2e51cbb28c10a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -20,11 +20,14 @@ package org.apache.spark.sql.catalyst.expressions
 import java.util.regex.Pattern
 
 import org.apache.commons.lang3.StringUtils
+
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
+
 trait StringRegexExpression extends ExpectsInputTypes {
   self: BinaryExpression =>
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
index 12023ad311dc8..344361685853f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.types.{DataType, NumericType}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index f3809be722a84..518961e38396f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import java.sql.{Timestamp, Date}
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index a09014e1ffc15..e3042143632aa 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -17,13 +17,11 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.scalatest.exceptions.TestFailedException
-
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
 
 
 class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 3171caf6ad77f..43392df4bec2e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -21,6 +21,7 @@ import org.scalactic.TripleEqualsSupport.Spread
 import org.scalatest.Matchers._
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateProjection, GenerateMutableProjection}
 import org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
index 6fafc2f86684c..c9667e90a0aaa 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
@@ -23,6 +23,7 @@ import scala.util.Random
 import org.scalatest.{BeforeAndAfterEach, Matchers}
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index 94c2f3242b122..96d4e64ea344a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -23,6 +23,7 @@ import java.util.Arrays
 import org.scalatest.Matchers
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.{ObjectPool, DateTimeUtils}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.PlatformDependent
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala
index 54e8c6462e962..049a19b86f7cd 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
index 86792f0217572..1bd7d4e5cdf0f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.trees
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types.{IntegerType, StringType, NullType}
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index caad2da80b1eb..60142291eaf30 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -32,6 +32,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.python.SerDeUtil
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, _}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
index cb1fd4947fdbc..38720968c1313 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
@@ -22,6 +22,7 @@ import java.nio.ByteBuffer
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
index 6e8a5ef18ab62..3cd60a2aa55ed 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
@@ -21,6 +21,7 @@ import java.util.HashMap
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index e054c1d144e34..74d933404551c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -24,6 +24,7 @@ import org.apache.spark.shuffle.hash.HashShuffleManager
 import org.apache.spark.shuffle.sort.SortShuffleManager
 import org.apache.spark.shuffle.unsafe.UnsafeShuffleManager
 import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors.attachTree
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala
index 42a0c1be4f694..d90cae1c4c060 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala
index c1665f78a960e..c3c0dc441c928 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 
 /**
@@ -35,7 +36,7 @@ private[execution] sealed case class LazyIterator(func: () => TraversableOnce[In
 
 /**
  * :: DeveloperApi ::
- * Applies a [[catalyst.expressions.Generator Generator]] to a stream of input rows, combining the
+ * Applies a [[Generator]] to a stream of input rows, combining the
  * output of each into a new stream of rows.  This operation is similar to a `flatMap` in functional
  * programming with one important additional feature, which allows the input rows to be joined with
  * their output.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 44930f82b53a0..c069da016f9f0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import org.apache.spark.TaskContext
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.trees._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index 7739a9f949c77..ca53186383237 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -21,6 +21,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.Logging
 import org.apache.spark.rdd.{RDD, RDDOperationScope}
 import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, trees}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.Row
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 32044989044a6..59b9b553a7ae5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
index fd6f1d7ae1255..6e127e548a120 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import java.util
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning}
 import org.apache.spark.util.collection.CompactBuffer
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index 647c4ab5cb651..de14e6ad79ad6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.{RDD, ShuffledRDD}
 import org.apache.spark.shuffle.sort.SortShuffleManager
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
index 68914cf85cb50..437d143e53f3f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
@@ -18,7 +18,8 @@
 package org.apache.spark.sql.execution.expressions
 
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.catalyst.expressions.{InternalRow, LeafExpression}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.LeafExpression
 import org.apache.spark.sql.types.{LongType, DataType}
 
 /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
index 12c2eed0d6b7c..822d3d8c9108d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
@@ -18,7 +18,8 @@
 package org.apache.spark.sql.execution.expressions
 
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.catalyst.expressions.{LeafExpression, InternalRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.LeafExpression
 import org.apache.spark.sql.types.{IntegerType, DataType}
 
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
index 2d2e1b92b86be..7ffdce60d2955 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
@@ -22,7 +22,8 @@ import scala.concurrent.duration._
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.{Expression, InternalRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning, UnspecifiedDistribution}
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 import org.apache.spark.util.ThreadUtils
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
index 5da04c78744d9..06c244f211aba 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.{Distribution, UnspecifiedDistribution}
 import org.apache.spark.sql.catalyst.plans.{JoinType, LeftOuter, RightOuter}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
index 412a3d4178e12..f7b46d6888d7d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
index 0b2cf8e12a6c4..60b4266fad8b1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
index 3a4196a90d14a..ff85ea3f6a410 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.joins
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.util.collection.CompactBuffer
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
index 886b5fa0c5103..333745187728e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.joins
 import java.util.{HashMap => JavaHashMap}
 
 import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
 import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index e18c817975134..de062c791f5fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.joins
 import java.io.{ObjectInput, ObjectOutput, Externalizable}
 import java.util.{HashMap => JavaHashMap}
 
-import org.apache.spark.sql.catalyst.expressions.{Projection, InternalRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Projection
 import org.apache.spark.sql.execution.SparkSqlSerializer
 import org.apache.spark.util.collection.CompactBuffer
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala
index 2a6d4d1ab08bb..db5be9f453674 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
index 20d74270afb48..611ba928a16ec 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
@@ -19,7 +19,8 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, InternalRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
 import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
index cfc9c14aaa363..ab0a6ad56acde 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
@@ -17,15 +17,16 @@
 
 package org.apache.spark.sql.execution.joins
 
+import scala.collection.JavaConversions._
+
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.{Distribution, ClusteredDistribution}
 import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter}
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 
-import scala.collection.JavaConversions._
-
 /**
  * :: DeveloperApi ::
  * Performs a hash based outer join for two child relations by shuffling the data using
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala
index 2abe65a71813d..981447eacad74 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala
@@ -21,6 +21,7 @@ import java.util.NoSuchElementException
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index 9e1cff06c7eea..6946e798b71b0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -29,6 +29,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.api.python.{PythonBroadcast, PythonRDD}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.plans.logical
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
index 4e2e2c210d5a9..ec5c6950f37ad 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.stat
 import scala.collection.mutable.{Map => MutableMap}
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
 import org.apache.spark.sql.types.{ArrayType, StructField, StructType}
 import org.apache.spark.sql.{Column, DataFrame}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 30c5f4ca3e1b2..7a27fba1780b9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -23,7 +23,8 @@ import java.util.Properties
 import org.apache.commons.lang3.StringUtils
 
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.expressions.{InternalRow, SpecificMutableRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
index 6222addc9aa3a..4b8ab63b5ab39 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
@@ -24,6 +24,7 @@ import scala.collection.Map
 import com.fasterxml.jackson.core._
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.json.JacksonUtils.nextUntil
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index 73d9520d6f53f..01ba05cbd14f1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -25,6 +25,7 @@ import com.fasterxml.jackson.databind.ObjectMapper
 
 import org.apache.spark.Logging
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion
 import org.apache.spark.sql.catalyst.expressions._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index ae7cbf0624dc8..86a77bf965daa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -26,6 +26,7 @@ import org.apache.parquet.column.Dictionary
 import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter}
 import org.apache.parquet.schema.MessageType
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.parquet.CatalystConverter.FieldType
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index b30fc171c0af1..9058b09375291 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -18,13 +18,11 @@
 package org.apache.spark.sql.parquet
 
 import java.io.IOException
-import java.lang.{Long => JLong}
 import java.text.{NumberFormat, SimpleDateFormat}
-import java.util.concurrent.{Callable, TimeUnit}
-import java.util.{Date, List => JList}
+import java.util.concurrent.TimeUnit
+import java.util.Date
 
 import scala.collection.JavaConversions._
-import scala.collection.mutable
 import scala.util.Try
 
 import com.google.common.cache.CacheBuilder
@@ -34,11 +32,8 @@ import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat}
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, FileOutputFormat => NewFileOutputFormat}
 import org.apache.parquet.hadoop._
-import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
-import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
-import org.apache.parquet.hadoop.metadata.GlobalMetaData
+import org.apache.parquet.hadoop.api.ReadSupport
 import org.apache.parquet.hadoop.util.ContextUtil
-import org.apache.parquet.io.ParquetDecodingException
 import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.annotation.DeveloperApi
@@ -46,7 +41,8 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SQLConf
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, InternalRow, _}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, _}
 import org.apache.spark.sql.execution.{LeafNode, SparkPlan, UnaryNode}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.{Logging, TaskContext}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index df2a96dfeb619..8402cd756140d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -29,7 +29,8 @@ import org.apache.parquet.io.api._
 import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst.expressions.{Attribute, InternalRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
index 66f7ba90140b5..70c9e06927582 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
@@ -21,6 +21,7 @@ import org.apache.spark.{Logging, TaskContext}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
 import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index 7214eb0b4169a..a97142d4a1142 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -28,6 +28,7 @@ import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceF
 import org.apache.spark._
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index b7095c8ead797..1f0b93e2850ba 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -26,7 +26,8 @@ import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, InternalRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.types._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 0b875304f9b0e..b13c5313b25c9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -27,8 +27,9 @@ import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.execution.RDDConversions
 import org.apache.spark.sql.{DataFrame, Row, SaveMode, SQLContext}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.RDDConversions
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
 import org.apache.spark.sql.types.StructType
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
index 9bd7b221e93f8..3333fee6711c0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.columnar
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.expressions.InternalRow
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
 
 class ColumnStatsSuite extends SparkFunSuite {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
index 71db6a2159857..9d9858b1c6151 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
@@ -18,7 +18,8 @@
 package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.expressions.{Projection, InternalRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Projection
 import org.apache.spark.util.collection.CompactBuffer
 
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 7423d8030f2b5..924e2ba3ba442 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -24,6 +24,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.{DecimalTypeInfo, TypeInfoFactory}
 import org.apache.hadoop.hive.serde2.{io => hiveIo}
 import org.apache.hadoop.{io => hadoopIo}
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index f35ae96ee0b50..4b7a782c805a0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.metadata._
 import org.apache.hadoop.hive.ql.plan.TableDesc
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{Catalog, MultiInstanceRelation, OverrideCatalog}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index 452b7f0bcc749..ed359620a5f7f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -22,9 +22,10 @@ import scala.collection.JavaConversions._
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate
-import org.apache.spark.sql.catalyst.expressions.{InternalRow, _}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index b251a9523bed6..d65d29daacf31 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -32,6 +32,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf}
 import org.apache.spark.Logging
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, RDD, UnionRDD}
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.unsafe.types.UTF8String
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
index f4c8c9a7e8a68..d33da8242cc1d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
@@ -27,6 +27,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.Object
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.hive._
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index 05f425f2b65f3..8202e553afbfe 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -31,7 +31,8 @@ import org.apache.hadoop.mapred.{FileOutputFormat, JobConf}
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.expressions.{Attribute, InternalRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.execution.{UnaryNode, SparkPlan}
 import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.hive._
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
index b967e191c5855..205e622195f09 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.hive.execution
 
 import java.io.{BufferedReader, DataInputStream, DataOutputStream, EOFException, InputStreamReader}
-import java.lang.ProcessBuilder.Redirect
 import java.util.Properties
 
 import scala.collection.JavaConversions._
@@ -28,6 +27,7 @@ import org.apache.hadoop.hive.serde2.AbstractSerDe
 import org.apache.hadoop.hive.serde2.objectinspector._
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.ScriptInputOutputSchema
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
index 4dea561ae5f60..1deef6b37eeac 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
@@ -33,6 +33,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils.ConversionHelper
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis
 import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 9dc9fbb78e01f..48d35a60a759b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -35,6 +35,7 @@ import org.apache.spark.Logging
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.rdd.{HadoopRDD, RDD}
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.hive.{HiveContext, HiveInspectors, HiveMetastoreTypes, HiveShim}
 import org.apache.spark.sql.sources.{Filter, _}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
index a93acb938d5fa..8bb498a06fc9e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
@@ -28,7 +28,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
 import org.apache.hadoop.io.LongWritable
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.expressions.{Literal, InternalRow}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Literal
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.Row
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
index 8707f9f936be6..3c2efe329bfd5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
@@ -21,7 +21,7 @@ import java.io.File
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.expressions.InternalRow
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._

From 4ca90935c5ff15ccca13452572e317b86814f238 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Tue, 7 Jul 2015 17:57:17 -0700
Subject: [PATCH 041/549] [SPARK-7190] [SPARK-8804] [SPARK-7815] [SQL] unsafe
 UTF8String

Let UTF8String work with binary buffer. Before we have better idea on manage the lifecycle of UTF8String in Row, we still do the copy when calling `UnsafeRow.get()` for StringType.

cc rxin JoshRosen

Author: Davies Liu <davies@databricks.com>

Closes #7197 from davies/unsafe_string and squashes the following commits:

51b0ea0 [Davies Liu] fix test
50c1ebf [Davies Liu] remove optimization for upper/lower case
315d491 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_string
93fce17 [Davies Liu] address comment
e9ff7ba [Davies Liu] clean up
67ec266 [Davies Liu] fix bug
7b74b1f [Davies Liu] fallback to String if local dependent
ab7857c [Davies Liu] address comments
7da92f5 [Davies Liu] handle local in toUpperCase/toLowerCase
59dbb23 [Davies Liu] revert python change
d1e0716 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_string
002e35f [Davies Liu] rollback hashCode change
a87b7a8 [Davies Liu] improve toLowerCase and toUpperCase
76e794a [Davies Liu] fix test
8b2d5ce [Davies Liu] fix tests
fd3f0a6 [Davies Liu] bug fix
c4e9c88 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_string
c45d921 [Davies Liu] address comments
175405f [Davies Liu] unsafe UTF8String
---
 .../sql/catalyst/expressions/UnsafeRow.java   |   1 +
 .../spark/sql/catalyst/expressions/Cast.scala |   2 +-
 .../expressions/stringOperations.scala        |   6 +-
 .../spark/unsafe/array/ByteArrayMethods.java  |  34 ++--
 .../spark/unsafe/map/BytesToBytesMap.java     |   2 +-
 .../apache/spark/unsafe/types/UTF8String.java | 155 ++++++++++++------
 .../map/AbstractBytesToBytesMapSuite.java     |   2 +-
 .../spark/unsafe/types/UTF8StringSuite.java   | 120 +++++++++-----
 8 files changed, 210 insertions(+), 112 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index f077064a02ec0..aeb64b045812f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -264,6 +264,7 @@ public Object get(int i) {
       int offset = (int) ((v >> OFFSET_BITS) & Integer.MAX_VALUE);
       int size = (int) (v & Integer.MAX_VALUE);
       final byte[] bytes = new byte[size];
+      // TODO(davies): Avoid the copy once we can manage the life cycle of Row well.
       PlatformDependent.copyMemory(
         baseObject,
         baseOffset + offset,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index cfe201bf83070..662ceeca7782d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -139,7 +139,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
   // UDFToBoolean
   private[this] def castToBoolean(from: DataType): Any => Any = from match {
     case StringType =>
-      buildCast[UTF8String](_, _.length() != 0)
+      buildCast[UTF8String](_, _.numBytes() != 0)
     case TimestampType =>
       buildCast[Long](_, t => t != 0)
     case DateType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 2e51cbb28c10a..47fc7cdaa826c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -250,7 +250,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
           val (st, end) = slicePos(start, length, () => ba.length)
           ba.slice(st, end)
         case s: UTF8String =>
-          val (st, end) = slicePos(start, length, () => s.length())
+          val (st, end) = slicePos(start, length, () => s.numChars())
           s.substring(st, end)
       }
     }
@@ -265,10 +265,10 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
   protected override def nullSafeEval(string: Any): Any =
-    string.asInstanceOf[UTF8String].length
+    string.asInstanceOf[UTF8String].numChars
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    defineCodeGen(ctx, ev, c => s"($c).length()")
+    defineCodeGen(ctx, ev, c => s"($c).numChars()")
   }
 
   override def prettyName: String = "length"
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java b/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
index 53eadf96a6b52..cf693d01a4f5b 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
@@ -17,7 +17,7 @@
 
 package org.apache.spark.unsafe.array;
 
-import org.apache.spark.unsafe.PlatformDependent;
+import static org.apache.spark.unsafe.PlatformDependent.*;
 
 public class ByteArrayMethods {
 
@@ -35,21 +35,27 @@ public static int roundNumberOfBytesToNearestWord(int numBytes) {
   }
 
   /**
-   * Optimized byte array equality check for 8-byte-word-aligned byte arrays.
+   * Optimized byte array equality check for byte arrays.
    * @return true if the arrays are equal, false otherwise
    */
-  public static boolean wordAlignedArrayEquals(
-      Object leftBaseObject,
-      long leftBaseOffset,
-      Object rightBaseObject,
-      long rightBaseOffset,
-      long arrayLengthInBytes) {
-    for (int i = 0; i < arrayLengthInBytes; i += 8) {
-      final long left =
-        PlatformDependent.UNSAFE.getLong(leftBaseObject, leftBaseOffset + i);
-      final long right =
-        PlatformDependent.UNSAFE.getLong(rightBaseObject, rightBaseOffset + i);
-      if (left != right) return false;
+  public static boolean arrayEquals(
+      Object leftBase,
+      long leftOffset,
+      Object rightBase,
+      long rightOffset,
+      final long length) {
+    int i = 0;
+    while (i <= length - 8) {
+      if (UNSAFE.getLong(leftBase, leftOffset + i) != UNSAFE.getLong(rightBase, rightOffset + i)) {
+        return false;
+      }
+      i += 8;
+    }
+    while (i < length) {
+      if (UNSAFE.getByte(leftBase, leftOffset + i) != UNSAFE.getByte(rightBase, rightOffset + i)) {
+        return false;
+      }
+      i += 1;
     }
     return true;
   }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
index 0b4d8d286f5f9..8d8c08de52b84 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
@@ -277,7 +277,7 @@ public Location lookup(
             final MemoryLocation keyAddress = loc.getKeyAddress();
             final Object storedKeyBaseObject = keyAddress.getBaseObject();
             final long storedKeyBaseOffset = keyAddress.getBaseOffset();
-            final boolean areEqual = ByteArrayMethods.wordAlignedArrayEquals(
+            final boolean areEqual = ByteArrayMethods.arrayEquals(
               keyBaseObject,
               keyBaseOffset,
               storedKeyBaseObject,
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 9302b472925ed..d2a25096a5e7a 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -20,9 +20,10 @@
 import javax.annotation.Nonnull;
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
-import java.util.Arrays;
 
-import org.apache.spark.unsafe.PlatformDependent;
+import org.apache.spark.unsafe.array.ByteArrayMethods;
+
+import static org.apache.spark.unsafe.PlatformDependent.*;
 
 /**
  * A UTF-8 String for internal Spark use.
@@ -35,7 +36,9 @@
 public final class UTF8String implements Comparable<UTF8String>, Serializable {
 
   @Nonnull
-  private byte[] bytes;
+  private final Object base;
+  private final long offset;
+  private final int numBytes;
 
   private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -44,60 +47,82 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
     5, 5, 5, 5,
     6, 6, 6, 6};
 
+  /**
+   * Creates an UTF8String from byte array, which should be encoded in UTF-8.
+   *
+   * Note: `bytes` will be hold by returned UTF8String.
+   */
   public static UTF8String fromBytes(byte[] bytes) {
-    return (bytes != null) ? new UTF8String().set(bytes) : null;
-  }
-
-  public static UTF8String fromString(String str) {
-    return (str != null) ? new UTF8String().set(str) : null;
+    if (bytes != null) {
+      return new UTF8String(bytes, BYTE_ARRAY_OFFSET, bytes.length);
+    } else {
+      return null;
+    }
   }
 
   /**
-   * Updates the UTF8String with String.
+   * Creates an UTF8String from String.
    */
-  protected UTF8String set(final String str) {
+  public static UTF8String fromString(String str) {
+    if (str == null) return null;
     try {
-      bytes = str.getBytes("utf-8");
+      return fromBytes(str.getBytes("utf-8"));
     } catch (UnsupportedEncodingException e) {
       // Turn the exception into unchecked so we can find out about it at runtime, but
       // don't need to add lots of boilerplate code everywhere.
-      PlatformDependent.throwException(e);
+      throwException(e);
+      return null;
     }
-    return this;
   }
 
-  /**
-   * Updates the UTF8String with byte[], which should be encoded in UTF-8.
-   */
-  protected UTF8String set(final byte[] bytes) {
-    this.bytes = bytes;
-    return this;
+  protected UTF8String(Object base, long offset, int size) {
+    this.base = base;
+    this.offset = offset;
+    this.numBytes = size;
   }
 
   /**
    * Returns the number of bytes for a code point with the first byte as `b`
    * @param b The first byte of a code point
    */
-  public int numBytes(final byte b) {
+  private static int numBytesForFirstByte(final byte b) {
     final int offset = (b & 0xFF) - 192;
     return (offset >= 0) ? bytesOfCodePointInUTF8[offset] : 1;
   }
 
+  /**
+   * Returns the number of bytes
+   */
+  public int numBytes() {
+    return numBytes;
+  }
+
   /**
    * Returns the number of code points in it.
    *
    * This is only used by Substring() when `start` is negative.
    */
-  public int length() {
+  public int numChars() {
     int len = 0;
-    for (int i = 0; i < bytes.length; i+= numBytes(bytes[i])) {
+    for (int i = 0; i < numBytes; i += numBytesForFirstByte(getByte(i))) {
       len += 1;
     }
     return len;
   }
 
+  /**
+   * Returns the underline bytes, will be a copy of it if it's part of another array.
+   */
   public byte[] getBytes() {
-    return bytes;
+    // avoid copy if `base` is `byte[]`
+    if (offset == BYTE_ARRAY_OFFSET && base instanceof byte[]
+      && ((byte[]) base).length == numBytes) {
+      return (byte[]) base;
+    } else {
+      byte[] bytes = new byte[numBytes];
+      copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, numBytes);
+      return bytes;
+    }
   }
 
   /**
@@ -106,92 +131,110 @@ public byte[] getBytes() {
    * @param until the position after last code point, exclusive.
    */
   public UTF8String substring(final int start, final int until) {
-    if (until <= start || start >= bytes.length) {
-      return UTF8String.fromBytes(new byte[0]);
+    if (until <= start || start >= numBytes) {
+      return fromBytes(new byte[0]);
     }
 
     int i = 0;
     int c = 0;
-    for (; i < bytes.length && c < start; i += numBytes(bytes[i])) {
+    while (i < numBytes && c < start) {
+      i += numBytesForFirstByte(getByte(i));
       c += 1;
     }
 
     int j = i;
-    for (; j < bytes.length && c < until; j += numBytes(bytes[i])) {
+    while (i < numBytes && c < until) {
+      i += numBytesForFirstByte(getByte(i));
       c += 1;
     }
 
-    return UTF8String.fromBytes(Arrays.copyOfRange(bytes, i, j));
+    byte[] bytes = new byte[i - j];
+    copyMemory(base, offset + j, bytes, BYTE_ARRAY_OFFSET, i - j);
+    return fromBytes(bytes);
   }
 
+  /**
+   * Returns whether this contains `substring` or not.
+   */
   public boolean contains(final UTF8String substring) {
-    final byte[] b = substring.getBytes();
-    if (b.length == 0) {
+    if (substring.numBytes == 0) {
       return true;
     }
 
-    for (int i = 0; i <= bytes.length - b.length; i++) {
-      if (bytes[i] == b[0] && startsWith(b, i)) {
+    byte first = substring.getByte(0);
+    for (int i = 0; i <= numBytes - substring.numBytes; i++) {
+      if (getByte(i) == first && matchAt(substring, i)) {
         return true;
       }
     }
     return false;
   }
 
-  private boolean startsWith(final byte[] prefix, int offsetInBytes) {
-    if (prefix.length + offsetInBytes > bytes.length || offsetInBytes < 0) {
+  /**
+   * Returns the byte at position `i`.
+   */
+  private byte getByte(int i) {
+    return UNSAFE.getByte(base, offset + i);
+  }
+
+  private boolean matchAt(final UTF8String s, int pos) {
+    if (s.numBytes + pos > numBytes || pos < 0) {
       return false;
     }
-    int i = 0;
-    while (i < prefix.length && prefix[i] == bytes[i + offsetInBytes]) {
-      i++;
-    }
-    return i == prefix.length;
+    return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
   }
 
   public boolean startsWith(final UTF8String prefix) {
-    return startsWith(prefix.getBytes(), 0);
+    return matchAt(prefix, 0);
   }
 
   public boolean endsWith(final UTF8String suffix) {
-    return startsWith(suffix.getBytes(), bytes.length - suffix.getBytes().length);
+    return matchAt(suffix, numBytes - suffix.numBytes);
   }
 
+  /**
+   * Returns the upper case of this string
+   */
   public UTF8String toUpperCase() {
-    return UTF8String.fromString(toString().toUpperCase());
+    return fromString(toString().toUpperCase());
   }
 
+  /**
+   * Returns the lower case of this string
+   */
   public UTF8String toLowerCase() {
-    return UTF8String.fromString(toString().toLowerCase());
+    return fromString(toString().toLowerCase());
   }
 
   @Override
   public String toString() {
     try {
-      return new String(bytes, "utf-8");
+      return new String(getBytes(), "utf-8");
     } catch (UnsupportedEncodingException e) {
       // Turn the exception into unchecked so we can find out about it at runtime, but
       // don't need to add lots of boilerplate code everywhere.
-      PlatformDependent.throwException(e);
+      throwException(e);
       return "unknown";  // we will never reach here.
     }
   }
 
   @Override
   public UTF8String clone() {
-    return new UTF8String().set(bytes);
+    return fromBytes(getBytes());
   }
 
   @Override
   public int compareTo(final UTF8String other) {
-    final byte[] b = other.getBytes();
-    for (int i = 0; i < bytes.length && i < b.length; i++) {
-      int res = bytes[i] - b[i];
+    int len = Math.min(numBytes, other.numBytes);
+    // TODO: compare 8 bytes as unsigned long
+    for (int i = 0; i < len; i ++) {
+      // In UTF-8, the byte should be unsigned, so we should compare them as unsigned int.
+      int res = (getByte(i) & 0xFF) - (other.getByte(i) & 0xFF);
       if (res != 0) {
         return res;
       }
     }
-    return bytes.length - b.length;
+    return numBytes - other.numBytes;
   }
 
   public int compare(final UTF8String other) {
@@ -201,7 +244,11 @@ public int compare(final UTF8String other) {
   @Override
   public boolean equals(final Object other) {
     if (other instanceof UTF8String) {
-      return Arrays.equals(bytes, ((UTF8String) other).getBytes());
+      UTF8String o = (UTF8String) other;
+      if (numBytes != o.numBytes){
+        return false;
+      }
+      return ByteArrayMethods.arrayEquals(base, offset, o.base, o.offset, numBytes);
     } else {
       return false;
     }
@@ -209,6 +256,10 @@ public boolean equals(final Object other) {
 
   @Override
   public int hashCode() {
-    return Arrays.hashCode(bytes);
+    int result = 1;
+    for (int i = 0; i < numBytes; i ++) {
+      result = 31 * result + getByte(i);
+    }
+    return result;
   }
 }
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
index 81315f7c94645..dae47e4bab0cb 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
@@ -99,7 +99,7 @@ private static boolean arrayEquals(
       byte[] expected,
       MemoryLocation actualAddr,
       long actualLengthBytes) {
-    return (actualLengthBytes == expected.length) && ByteArrayMethods.wordAlignedArrayEquals(
+    return (actualLengthBytes == expected.length) && ByteArrayMethods.arrayEquals(
       expected,
       BYTE_ARRAY_OFFSET,
       actualAddr.getBaseObject(),
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 796cdc9dbebdb..8ec69ebac8b37 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -19,73 +19,113 @@
 
 import java.io.UnsupportedEncodingException;
 
-import junit.framework.Assert;
 import org.junit.Test;
 
+import static junit.framework.Assert.*;
+
+import static org.apache.spark.unsafe.types.UTF8String.*;
+
 public class UTF8StringSuite {
 
   private void checkBasic(String str, int len) throws UnsupportedEncodingException {
-    Assert.assertEquals(UTF8String.fromString(str).length(), len);
-    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).length(), len);
+    UTF8String s1 = fromString(str);
+    UTF8String s2 = fromBytes(str.getBytes("utf8"));
+    assertEquals(s1.numChars(), len);
+    assertEquals(s2.numChars(), len);
+
+    assertEquals(s1.toString(), str);
+    assertEquals(s2.toString(), str);
+    assertEquals(s1, s2);
+
+    assertEquals(s1.hashCode(), s2.hashCode());
 
-    Assert.assertEquals(UTF8String.fromString(str).toString(), str);
-    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")).toString(), str);
-    Assert.assertEquals(UTF8String.fromBytes(str.getBytes("utf8")), UTF8String.fromString(str));
+    assertEquals(s1.compareTo(s2), 0);
 
-    Assert.assertEquals(UTF8String.fromString(str).hashCode(),
-      UTF8String.fromBytes(str.getBytes("utf8")).hashCode());
+    assertEquals(s1.contains(s2), true);
+    assertEquals(s2.contains(s1), true);
+    assertEquals(s1.startsWith(s1), true);
+    assertEquals(s1.endsWith(s1), true);
   }
 
   @Test
   public void basicTest() throws UnsupportedEncodingException {
+    checkBasic("", 0);
     checkBasic("hello", 5);
-    checkBasic("世 界", 3);
+    checkBasic("大 千 世 界", 7);
+  }
+
+  @Test
+  public void compareTo() {
+    assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
+    assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
+    assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);
+    assertTrue(fromString("aBcabcabc").compareTo(fromString("Abcabcabc")) > 0);
+    assertTrue(fromString("Abcabcabc").compareTo(fromString("abcabcabC")) < 0);
+    assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabC")) > 0);
+
+    assertTrue(fromString("abc").compareTo(fromString("世界")) < 0);
+    assertTrue(fromString("你好").compareTo(fromString("世界")) > 0);
+    assertTrue(fromString("你好123").compareTo(fromString("你好122")) > 0);
+  }
+
+  protected void testUpperandLower(String upper, String lower) {
+    UTF8String us = fromString(upper);
+    UTF8String ls = fromString(lower);
+    assertEquals(ls, us.toLowerCase());
+    assertEquals(us, ls.toUpperCase());
+    assertEquals(us, us.toUpperCase());
+    assertEquals(ls, ls.toLowerCase());
+  }
+
+  @Test
+  public void upperAndLower() {
+    testUpperandLower("", "");
+    testUpperandLower("0123456", "0123456");
+    testUpperandLower("ABCXYZ", "abcxyz");
+    testUpperandLower("ЀЁЂѺΏỀ", "ѐёђѻώề");
+    testUpperandLower("大千世界 数据砖头", "大千世界 数据砖头");
   }
 
   @Test
   public void contains() {
-    Assert.assertTrue(UTF8String.fromString("hello").contains(UTF8String.fromString("ello")));
-    Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("vello")));
-    Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("hellooo")));
-    Assert.assertTrue(UTF8String.fromString("大千世界").contains(UTF8String.fromString("千世")));
-    Assert.assertFalse(UTF8String.fromString("大千世界").contains(UTF8String.fromString("世千")));
-    Assert.assertFalse(
-      UTF8String.fromString("大千世界").contains(UTF8String.fromString("大千世界好")));
+    assertTrue(fromString("").contains(fromString("")));
+    assertTrue(fromString("hello").contains(fromString("ello")));
+    assertFalse(fromString("hello").contains(fromString("vello")));
+    assertFalse(fromString("hello").contains(fromString("hellooo")));
+    assertTrue(fromString("大千世界").contains(fromString("千世界")));
+    assertFalse(fromString("大千世界").contains(fromString("世千")));
+    assertFalse(fromString("大千世界").contains(fromString("大千世界好")));
   }
 
   @Test
   public void startsWith() {
-    Assert.assertTrue(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell")));
-    Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell")));
-    Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo")));
-    Assert.assertTrue(UTF8String.fromString("数据砖头").startsWith(UTF8String.fromString("数据")));
-    Assert.assertFalse(UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("千")));
-    Assert.assertFalse(
-      UTF8String.fromString("大千世界").startsWith(UTF8String.fromString("大千世界好")));
+    assertTrue(fromString("").startsWith(fromString("")));
+    assertTrue(fromString("hello").startsWith(fromString("hell")));
+    assertFalse(fromString("hello").startsWith(fromString("ell")));
+    assertFalse(fromString("hello").startsWith(fromString("hellooo")));
+    assertTrue(fromString("数据砖头").startsWith(fromString("数据")));
+    assertFalse(fromString("大千世界").startsWith(fromString("千")));
+    assertFalse(fromString("大千世界").startsWith(fromString("大千世界好")));
   }
 
   @Test
   public void endsWith() {
-    Assert.assertTrue(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello")));
-    Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov")));
-    Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("hhhello")));
-    Assert.assertTrue(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世界")));
-    Assert.assertFalse(UTF8String.fromString("大千世界").endsWith(UTF8String.fromString("世")));
-    Assert.assertFalse(
-      UTF8String.fromString("数据砖头").endsWith(UTF8String.fromString("我的数据砖头")));
+    assertTrue(fromString("").endsWith(fromString("")));
+    assertTrue(fromString("hello").endsWith(fromString("ello")));
+    assertFalse(fromString("hello").endsWith(fromString("ellov")));
+    assertFalse(fromString("hello").endsWith(fromString("hhhello")));
+    assertTrue(fromString("大千世界").endsWith(fromString("世界")));
+    assertFalse(fromString("大千世界").endsWith(fromString("世")));
+    assertFalse(fromString("数据砖头").endsWith(fromString("我的数据砖头")));
   }
 
   @Test
   public void substring() {
-    Assert.assertEquals(
-      UTF8String.fromString("hello").substring(0, 0), UTF8String.fromString(""));
-    Assert.assertEquals(
-      UTF8String.fromString("hello").substring(1, 3), UTF8String.fromString("el"));
-    Assert.assertEquals(
-      UTF8String.fromString("数据砖头").substring(0, 1), UTF8String.fromString("数"));
-    Assert.assertEquals(
-      UTF8String.fromString("数据砖头").substring(1, 3), UTF8String.fromString("据砖"));
-    Assert.assertEquals(
-      UTF8String.fromString("数据砖头").substring(3, 5), UTF8String.fromString("头"));
+    assertEquals(fromString("hello").substring(0, 0), fromString(""));
+    assertEquals(fromString("hello").substring(1, 3), fromString("el"));
+    assertEquals(fromString("数据砖头").substring(0, 1), fromString("数"));
+    assertEquals(fromString("数据砖头").substring(1, 3), fromString("据砖"));
+    assertEquals(fromString("数据砖头").substring(3, 5), fromString("头"));
+    assertEquals(fromString("ߵ梷").substring(0, 2), fromString("ߵ梷"));
   }
 }

From 68a4a169714e11d8c537ad9431ae9974f6b7e8d3 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Tue, 7 Jul 2015 18:36:35 -0700
Subject: [PATCH 042/549] [SPARK-8868] SqlSerializer2 can go into infinite loop
 when row consists only of NullType columns

https://issues.apache.org/jira/browse/SPARK-8868

Author: Yin Huai <yhuai@databricks.com>

Closes #7262 from yhuai/SPARK-8868 and squashes the following commits:

cb58780 [Yin Huai] Andrew's comment.
e456857 [Yin Huai] Josh's comments.
5122e65 [Yin Huai] If types of all columns are NullTypes, do not use serializer2.
---
 .../sql/execution/SparkSqlSerializer2.scala   | 25 +++++++++++++++----
 .../execution/SparkSqlSerializer2Suite.scala  | 20 ++++++++++++++-
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
index 056d435eecd23..6ed822dc70d68 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
@@ -179,23 +179,38 @@ private[sql] object SparkSqlSerializer2 {
 
   /**
    * Check if rows with the given schema can be serialized with ShuffleSerializer.
+   * Right now, we do not support a schema having complex types or UDTs, or all data types
+   * of fields are NullTypes.
    */
   def support(schema: Array[DataType]): Boolean = {
     if (schema == null) return true
 
+    var allNullTypes = true
     var i = 0
     while (i < schema.length) {
       schema(i) match {
-        case udt: UserDefinedType[_] => return false
-        case array: ArrayType => return false
-        case map: MapType => return false
-        case struct: StructType => return false
+        case NullType => // Do nothing
+        case udt: UserDefinedType[_] =>
+          allNullTypes = false
+          return false
+        case array: ArrayType =>
+          allNullTypes = false
+          return false
+        case map: MapType =>
+          allNullTypes = false
+          return false
+        case struct: StructType =>
+          allNullTypes = false
+          return false
         case _ =>
+          allNullTypes = false
       }
       i += 1
     }
 
-    return true
+    // If types of fields are all NullTypes, we return false.
+    // Otherwise, we return true.
+    return !allNullTypes
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
index 8631e247c6c05..71f6b26bcd01a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
@@ -42,7 +42,6 @@ class SparkSqlSerializer2DataTypeSuite extends SparkFunSuite {
   }
 
   checkSupported(null, isSupported = true)
-  checkSupported(NullType, isSupported = true)
   checkSupported(BooleanType, isSupported = true)
   checkSupported(ByteType, isSupported = true)
   checkSupported(ShortType, isSupported = true)
@@ -57,6 +56,8 @@ class SparkSqlSerializer2DataTypeSuite extends SparkFunSuite {
   checkSupported(DecimalType(10, 5), isSupported = true)
   checkSupported(DecimalType.Unlimited, isSupported = true)
 
+  // If NullType is the only data type in the schema, we do not support it.
+  checkSupported(NullType, isSupported = false)
   // For now, ArrayType, MapType, and StructType are not supported.
   checkSupported(ArrayType(DoubleType, true), isSupported = false)
   checkSupported(ArrayType(StringType, false), isSupported = false)
@@ -170,6 +171,23 @@ abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll
     val df = ctx.sql(s"SELECT 1 + 1 FROM shuffle")
     checkSerializer(df.queryExecution.executedPlan, classOf[SparkSqlSerializer])
   }
+
+  test("types of fields are all NullTypes") {
+    // Test range partitioning code path.
+    val nulls = ctx.sql(s"SELECT null as a, null as b, null as c")
+    val df = nulls.unionAll(nulls).sort("a")
+    checkSerializer(df.queryExecution.executedPlan, classOf[SparkSqlSerializer])
+    checkAnswer(
+      df,
+      Row(null, null, null) :: Row(null, null, null) :: Nil)
+
+    // Test hash partitioning code path.
+    val oneRow = ctx.sql(s"SELECT DISTINCT null, null, null FROM shuffle")
+    checkSerializer(oneRow.queryExecution.executedPlan, classOf[SparkSqlSerializer])
+    checkAnswer(
+      oneRow,
+      Row(null, null, null))
+  }
 }
 
 /** Tests SparkSqlSerializer2 with sort based shuffle without sort merge. */

From 5d603dfe497ded78d8f09711578f1e764e6e4e91 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 7 Jul 2015 19:12:40 -0700
Subject: [PATCH 043/549] [SPARK-8878][SQL] Improve unit test coverage for
 bitwise expressions.

Author: Reynold Xin <rxin@databricks.com>

Closes #7273 from rxin/bitwise-unittest and squashes the following commits:

60c5667 [Reynold Xin] [SPARK-8878][SQL] Improve unit test coverage for bitwise expressions.
---
 .../expressions/BitwiseFunctionsSuite.scala   | 108 ++++++++++--------
 1 file changed, 61 insertions(+), 47 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala
index c9bbc7a8b8c14..648fbf5a4c30b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala
@@ -18,63 +18,77 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types._
 
 
 class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
-  test("Bitwise operations") {
-    val row = create_row(1, 2, 3, null)
-    val c1 = 'a.int.at(0)
-    val c2 = 'a.int.at(1)
-    val c3 = 'a.int.at(2)
-    val c4 = 'a.int.at(3)
-
-    checkEvaluation(BitwiseAnd(c1, c4), null, row)
-    checkEvaluation(BitwiseAnd(c1, c2), 0, row)
-    checkEvaluation(BitwiseAnd(c1, Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(
-      BitwiseAnd(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(BitwiseOr(c1, c4), null, row)
-    checkEvaluation(BitwiseOr(c1, c2), 3, row)
-    checkEvaluation(BitwiseOr(c1, Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(
-      BitwiseOr(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(BitwiseXor(c1, c4), null, row)
-    checkEvaluation(BitwiseXor(c1, c2), 3, row)
-    checkEvaluation(BitwiseXor(c1, Literal.create(null, IntegerType)), null, row)
-    checkEvaluation(
-      BitwiseXor(Literal.create(null, IntegerType), Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(BitwiseNot(c4), null, row)
-    checkEvaluation(BitwiseNot(c1), -2, row)
-    checkEvaluation(BitwiseNot(Literal.create(null, IntegerType)), null, row)
-
-    checkEvaluation(c1 & c2, 0, row)
-    checkEvaluation(c1 | c2, 3, row)
-    checkEvaluation(c1 ^ c2, 3, row)
-    checkEvaluation(~c1, -2, row)
+  test("BitwiseNOT") {
+    def check(input: Any, expected: Any): Unit = {
+      val expr = BitwiseNot(Literal(input))
+      assert(expr.dataType === Literal(input).dataType)
+      checkEvaluation(expr, expected)
+    }
+
+    check(1.toByte, ~1.toByte)
+    check(1000.toShort, ~1000.toShort)
+    check(1000000, ~1000000)
+    check(123456789123L, ~123456789123L)
+
+    checkEvaluation(BitwiseNot(Literal.create(null, IntegerType)), null)
   }
 
-  test("unary BitwiseNOT") {
-    checkEvaluation(BitwiseNot(1), -2)
-    assert(BitwiseNot(1).dataType === IntegerType)
-    assert(BitwiseNot(1).eval(EmptyRow).isInstanceOf[Int])
+  test("BitwiseAnd") {
+    def check(input1: Any, input2: Any, expected: Any): Unit = {
+      val expr = BitwiseAnd(Literal(input1), Literal(input2))
+      assert(expr.dataType === Literal(input1).dataType)
+      checkEvaluation(expr, expected)
+    }
+
+    check(1.toByte, 2.toByte, 1.toByte & 2.toByte)
+    check(1000.toShort, 2.toShort, 1000.toShort & 2.toShort)
+    check(1000000, 4, 1000000 & 4)
+    check(123456789123L, 5L, 123456789123L & 5L)
+
+    val nullLit = Literal.create(null, IntegerType)
+    checkEvaluation(BitwiseAnd(nullLit, Literal(1)), null)
+    checkEvaluation(BitwiseAnd(Literal(1), nullLit), null)
+    checkEvaluation(BitwiseAnd(nullLit, nullLit), null)
+  }
 
-    checkEvaluation(BitwiseNot(1.toLong), -2.toLong)
-    assert(BitwiseNot(1.toLong).dataType === LongType)
-    assert(BitwiseNot(1.toLong).eval(EmptyRow).isInstanceOf[Long])
+  test("BitwiseOr") {
+    def check(input1: Any, input2: Any, expected: Any): Unit = {
+      val expr = BitwiseOr(Literal(input1), Literal(input2))
+      assert(expr.dataType === Literal(input1).dataType)
+      checkEvaluation(expr, expected)
+    }
 
-    checkEvaluation(BitwiseNot(1.toShort), -2.toShort)
-    assert(BitwiseNot(1.toShort).dataType === ShortType)
-    assert(BitwiseNot(1.toShort).eval(EmptyRow).isInstanceOf[Short])
+    check(1.toByte, 2.toByte, 1.toByte | 2.toByte)
+    check(1000.toShort, 2.toShort, 1000.toShort | 2.toShort)
+    check(1000000, 4, 1000000 | 4)
+    check(123456789123L, 5L, 123456789123L | 5L)
 
-    checkEvaluation(BitwiseNot(1.toByte), -2.toByte)
-    assert(BitwiseNot(1.toByte).dataType === ByteType)
-    assert(BitwiseNot(1.toByte).eval(EmptyRow).isInstanceOf[Byte])
+    val nullLit = Literal.create(null, IntegerType)
+    checkEvaluation(BitwiseOr(nullLit, Literal(1)), null)
+    checkEvaluation(BitwiseOr(Literal(1), nullLit), null)
+    checkEvaluation(BitwiseOr(nullLit, nullLit), null)
   }
 
+  test("BitwiseXor") {
+    def check(input1: Any, input2: Any, expected: Any): Unit = {
+      val expr = BitwiseXor(Literal(input1), Literal(input2))
+      assert(expr.dataType === Literal(input1).dataType)
+      checkEvaluation(expr, expected)
+    }
+
+    check(1.toByte, 2.toByte, 1.toByte ^ 2.toByte)
+    check(1000.toShort, 2.toShort, 1000.toShort ^ 2.toShort)
+    check(1000000, 4, 1000000 ^ 4)
+    check(123456789123L, 5L, 123456789123L ^ 5L)
+
+    val nullLit = Literal.create(null, IntegerType)
+    checkEvaluation(BitwiseXor(nullLit, Literal(1)), null)
+    checkEvaluation(BitwiseXor(Literal(1), nullLit), null)
+    checkEvaluation(BitwiseXor(nullLit, nullLit), null)
+  }
 }

From 61c3cf793d82f3e8da85bb918d36e95986244786 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 7 Jul 2015 22:12:46 -0700
Subject: [PATCH 044/549] [SPARK-8879][SQL] Remove EmptyRow class.

As a baby step towards no megamorphic InternalRow.

Author: Reynold Xin <rxin@databricks.com>

Closes #7277 from rxin/remove-empty-row and squashes the following commits:

594100e [Reynold Xin] [SPARK-8879][SQL] Remove EmptyRow class.
---
 .../expressions/complexTypeCreator.scala      |  2 ++
 .../sql/catalyst/expressions/package.scala    |  7 +++++++
 .../spark/sql/catalyst/expressions/rows.scala | 20 -------------------
 3 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 65b9d7e608320..d1e4c458864f1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -120,4 +120,6 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression {
   override def eval(input: InternalRow): Any = {
     InternalRow(valExprs.map(_.eval(input)): _*)
   }
+
+  override def prettyName: String = "named_struct"
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
index 5155bfe41ba66..30b7f8d3766a5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst
 
+import org.apache.spark.sql.catalyst.expressions._
+
 /**
  * A set of classes that can be used to represent trees of relational expressions.  A key goal of
  * the expression library is to hide the details of naming and scoping from developers who want to
@@ -49,6 +51,11 @@ package org.apache.spark.sql.catalyst
  */
 package object expressions  {
 
+  /**
+   * Used as input into expressions whose output does not depend on any input value.
+   */
+  val EmptyRow: InternalRow = null
+
   /**
    * Converts a [[InternalRow]] to another Row given a sequence of expression that define each
    * column of the new row. If the schema of the input row is specified, then the given expression
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index 1363884c201bc..8b472a529e5c9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -54,26 +54,6 @@ abstract class MutableRow extends InternalRow {
   }
 }
 
-/**
- * A row with no data.  Calling any methods will result in an error.  Can be used as a placeholder.
- */
-object EmptyRow extends InternalRow {
-  override def apply(i: Int): Any = throw new UnsupportedOperationException
-  override def toSeq: Seq[Any] = Seq.empty
-  override def length: Int = 0
-  override def isNullAt(i: Int): Boolean = throw new UnsupportedOperationException
-  override def getInt(i: Int): Int = throw new UnsupportedOperationException
-  override def getLong(i: Int): Long = throw new UnsupportedOperationException
-  override def getDouble(i: Int): Double = throw new UnsupportedOperationException
-  override def getFloat(i: Int): Float = throw new UnsupportedOperationException
-  override def getBoolean(i: Int): Boolean = throw new UnsupportedOperationException
-  override def getShort(i: Int): Short = throw new UnsupportedOperationException
-  override def getByte(i: Int): Byte = throw new UnsupportedOperationException
-  override def getString(i: Int): String = throw new UnsupportedOperationException
-  override def getAs[T](i: Int): T = throw new UnsupportedOperationException
-  override def copy(): InternalRow = this
-}
-
 /**
  * A row implementation that uses an array of objects as the underlying storage.
  */

From 08192a1b8a375f29fbd852bbaed3a50021737475 Mon Sep 17 00:00:00 2001
From: Tijo Thomas <tijoparacka@gmail.com>
Date: Tue, 7 Jul 2015 22:35:39 -0700
Subject: [PATCH 045/549] [SPARK-8886][Documentation]python Style update

Fixed comment given by rxin

Author: Tijo Thomas <tijoparacka@gmail.com>

Closes #7281 from tijoparacka/modification_for_python_style and squashes the following commits:

6334e21 [Tijo Thomas] removed space
3de4cd8 [Tijo Thomas] python Style update
---
 docs/sql-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 88c96a9a095b3..7c255789790ed 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1798,7 +1798,7 @@ DataFrame jdbcDF = sqlContext.read().format("jdbc"). options(options).load();
 
 {% highlight python %}
 
-df = sqlContext.read.format('jdbc').options(url = 'jdbc:postgresql:dbserver', dbtable='schema.tablename').load()
+df = sqlContext.read.format('jdbc').options(url='jdbc:postgresql:dbserver', dbtable='schema.tablename').load()
 
 {% endhighlight %}
 

From 351a36d0c54d2f995df956ffb0a4236e12f89aad Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Wed, 8 Jul 2015 00:10:24 -0700
Subject: [PATCH 046/549] [SPARK-8883][SQL]Remove the OverrideFunctionRegistry

Remove the `OverrideFunctionRegistry` from the Spark SQL, as the subclasses of `FunctionRegistry` have their own way to the delegate to the right underlying `FunctionRegistry`.

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7260 from chenghao-intel/override and squashes the following commits:

164d093 [Cheng Hao] enable the function registry
2ca8459 [Cheng Hao] remove the OverrideFunctionRegistry
---
 .../sql/catalyst/analysis/FunctionRegistry.scala    | 13 -------------
 .../scala/org/apache/spark/sql/SQLContext.scala     |  3 +--
 .../org/apache/spark/sql/hive/HiveContext.scala     |  2 +-
 .../scala/org/apache/spark/sql/hive/hiveUDFs.scala  |  2 +-
 4 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index fef276353022c..5c25181e1cf50 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -35,19 +35,6 @@ trait FunctionRegistry {
   def lookupFunction(name: String, children: Seq[Expression]): Expression
 }
 
-class OverrideFunctionRegistry(underlying: FunctionRegistry) extends FunctionRegistry {
-
-  private val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive = false)
-
-  override def registerFunction(name: String, builder: FunctionBuilder): Unit = {
-    functionBuilders.put(name, builder)
-  }
-
-  override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
-    functionBuilders.get(name).map(_(children)).getOrElse(underlying.lookupFunction(name, children))
-  }
-}
-
 class SimpleFunctionRegistry extends FunctionRegistry {
 
   private val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive = false)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index e81371e7b0e83..079f31ab8fe6d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -139,8 +139,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
   // TODO how to handle the temp function per user session?
   @transient
-  protected[sql] lazy val functionRegistry: FunctionRegistry =
-    new OverrideFunctionRegistry(FunctionRegistry.builtin)
+  protected[sql] lazy val functionRegistry: FunctionRegistry = FunctionRegistry.builtin
 
   @transient
   protected[sql] lazy val analyzer: Analyzer =
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index b91242af2d155..439d8cab5f257 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -371,7 +371,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
   // Note that HiveUDFs will be overridden by functions registered in this context.
   @transient
   override protected[sql] lazy val functionRegistry: FunctionRegistry =
-    new OverrideFunctionRegistry(new HiveFunctionRegistry(FunctionRegistry.builtin))
+    new HiveFunctionRegistry(FunctionRegistry.builtin)
 
   /* An analyzer that uses the Hive metastore. */
   @transient
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
index 1deef6b37eeac..0bc8adb16afc0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
@@ -77,7 +77,7 @@ private[hive] class HiveFunctionRegistry(underlying: analysis.FunctionRegistry)
   }
 
   override def registerFunction(name: String, builder: FunctionBuilder): Unit =
-    throw new UnsupportedOperationException
+    underlying.registerFunction(name, builder)
 }
 
 private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])

From 8a9d9cc1561cf157793c90db6700ffa6f1f00a69 Mon Sep 17 00:00:00 2001
From: jerryshao <saisai.shao@intel.com>
Date: Wed, 8 Jul 2015 12:23:32 +0100
Subject: [PATCH 047/549] [SPARK-7050] [BUILD] Fix Python Kafka test assembly
 jar not found issue under Maven build

 To fix Spark Streaming unit test with maven build. Previously the name and path of maven generated jar is different from sbt, which will lead to following exception. This fix keep the same behavior with both Maven and sbt build.

```
Failed to find Spark Streaming Kafka assembly jar in /home/xyz/spark/external/kafka-assembly
You need to build Spark with  'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or 'build/mvn package' before running this program
```

Author: jerryshao <saisai.shao@intel.com>

Closes #5632 from jerryshao/SPARK-7050 and squashes the following commits:

74b068d [jerryshao] Fix mvn build issue
---
 external/kafka-assembly/pom.xml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml
index 8059c443827ef..977514fa5a1ec 100644
--- a/external/kafka-assembly/pom.xml
+++ b/external/kafka-assembly/pom.xml
@@ -58,6 +58,7 @@
       <artifactId>maven-shade-plugin</artifactId>
       <configuration>
         <shadedArtifactAttached>false</shadedArtifactAttached>
+        <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kafka-assembly-${project.version}.jar</outputFile>
         <artifactSet>
           <includes>
             <include>*:*</include>

From 3bb217750ada18a49c40d974ac57050ef2abfd2c Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 8 Jul 2015 08:44:58 -0700
Subject: [PATCH 048/549] [SPARK-8872] [MLLIB] added verification results from
 R for FPGrowthSuite

Author: Kashif Rasul <kashif.rasul@gmail.com>

Closes #7269 from kashif/SPARK-8872 and squashes the following commits:

2d5457f [Kashif Rasul] added R code for FP Int type
3de6808 [Kashif Rasul] added verification results from R for FPGrowthSuite
---
 .../spark/mllib/fpm/FPGrowthSuite.scala       | 114 ++++++++++++++++++
 1 file changed, 114 insertions(+)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
index 66ae3543ecc4e..ddc296a428907 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
@@ -39,6 +39,22 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setMinSupport(0.9)
       .setNumPartitions(1)
       .run(rdd)
+
+    /* Verify results using the `R` code:
+       transactions = as(sapply(
+         list("r z h k p",
+              "z y x w v u t s",
+              "s x o n r",
+              "x z y m t s q e",
+              "z",
+              "x z y r q t p"),
+         FUN=function(x) strsplit(x," ",fixed=TRUE)),
+         "transactions")
+       > eclat(transactions, parameter = list(support = 0.9))
+       ...
+       eclat - zero frequent items
+       set of 0 itemsets
+     */
     assert(model6.freqItemsets.count() === 0)
 
     val model3 = fpg
@@ -48,6 +64,33 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
     val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
       (itemset.items.toSet, itemset.freq)
     }
+
+    /* Verify results using the `R` code:
+       fp = eclat(transactions, parameter = list(support = 0.5))
+       fpDF = as(sort(fp), "data.frame")
+       fpDF$support = fpDF$support * length(transactions)
+       names(fpDF)[names(fpDF) == "support"] = "freq"
+       > fpDF
+              items freq
+       13       {z}    5
+       14       {x}    4
+       1      {s,x}    3
+       2  {t,x,y,z}    3
+       3    {t,y,z}    3
+       4    {t,x,y}    3
+       5    {x,y,z}    3
+       6      {y,z}    3
+       7      {x,y}    3
+       8      {t,y}    3
+       9    {t,x,z}    3
+       10     {t,z}    3
+       11     {t,x}    3
+       12     {x,z}    3
+       15       {t}    3
+       16       {y}    3
+       17       {s}    3
+       18       {r}    3
+     */
     val expected = Set(
       (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
       (Set("r"), 3L),
@@ -62,12 +105,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setMinSupport(0.3)
       .setNumPartitions(4)
       .run(rdd)
+
+    /* Verify results using the `R` code:
+       fp = eclat(transactions, parameter = list(support = 0.3))
+       fpDF = as(fp, "data.frame")
+       fpDF$support = fpDF$support * length(transactions)
+       names(fpDF)[names(fpDF) == "support"] = "freq"
+       > nrow(fpDF)
+       [1] 54
+     */
     assert(model2.freqItemsets.count() === 54)
 
     val model1 = fpg
       .setMinSupport(0.1)
       .setNumPartitions(8)
       .run(rdd)
+
+    /* Verify results using the `R` code:
+       fp = eclat(transactions, parameter = list(support = 0.1))
+       fpDF = as(fp, "data.frame")
+       fpDF$support = fpDF$support * length(transactions)
+       names(fpDF)[names(fpDF) == "support"] = "freq"
+       > nrow(fpDF)
+       [1] 625
+     */
     assert(model1.freqItemsets.count() === 625)
   }
 
@@ -89,6 +150,23 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setMinSupport(0.9)
       .setNumPartitions(1)
       .run(rdd)
+
+    /* Verify results using the `R` code:
+       transactions = as(sapply(
+         list("1 2 3",
+              "1 2 3 4",
+              "5 4 3 2 1",
+              "6 5 4 3 2 1",
+              "2 4",
+              "1 3",
+              "1 7"),
+         FUN=function(x) strsplit(x," ",fixed=TRUE)),
+         "transactions")
+       > eclat(transactions, parameter = list(support = 0.9))
+       ...
+       eclat - zero frequent items
+       set of 0 itemsets
+     */
     assert(model6.freqItemsets.count() === 0)
 
     val model3 = fpg
@@ -100,6 +178,24 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
     val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
       (itemset.items.toSet, itemset.freq)
     }
+
+    /* Verify results using the `R` code:
+       fp = eclat(transactions, parameter = list(support = 0.5))
+       fpDF = as(sort(fp), "data.frame")
+       fpDF$support = fpDF$support * length(transactions)
+       names(fpDF)[names(fpDF) == "support"] = "freq"
+       > fpDF
+          items freq
+      6     {1}    6
+      3   {1,3}    5
+      7     {2}    5
+      8     {3}    5
+      1   {2,4}    4
+      2 {1,2,3}    4
+      4   {2,3}    4
+      5   {1,2}    4
+      9     {4}    4
+     */
     val expected = Set(
       (Set(1), 6L), (Set(2), 5L), (Set(3), 5L), (Set(4), 4L),
       (Set(1, 2), 4L), (Set(1, 3), 5L), (Set(2, 3), 4L),
@@ -110,12 +206,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setMinSupport(0.3)
       .setNumPartitions(4)
       .run(rdd)
+
+    /* Verify results using the `R` code:
+       fp = eclat(transactions, parameter = list(support = 0.3))
+       fpDF = as(fp, "data.frame")
+       fpDF$support = fpDF$support * length(transactions)
+       names(fpDF)[names(fpDF) == "support"] = "freq"
+       > nrow(fpDF)
+       [1] 15
+     */
     assert(model2.freqItemsets.count() === 15)
 
     val model1 = fpg
       .setMinSupport(0.1)
       .setNumPartitions(8)
       .run(rdd)
+
+    /* Verify results using the `R` code:
+       fp = eclat(transactions, parameter = list(support = 0.1))
+       fpDF = as(fp, "data.frame")
+       fpDF$support = fpDF$support * length(transactions)
+       names(fpDF)[names(fpDF) == "support"] = "freq"
+       > nrow(fpDF)
+       [1] 65
+     */
     assert(model1.freqItemsets.count() === 65)
   }
 }

From bf02e377168f39459d5c216e939097ae5705f573 Mon Sep 17 00:00:00 2001
From: Sun Rui <rui.sun@intel.com>
Date: Wed, 8 Jul 2015 09:48:16 -0700
Subject: [PATCH 049/549] [SPARK-8894] [SPARKR] [DOC] Example code errors in
 SparkR documentation.

Author: Sun Rui <rui.sun@intel.com>

Closes #7287 from sun-rui/SPARK-8894 and squashes the following commits:

da63898 [Sun Rui] [SPARK-8894][SPARKR][DOC] Example code errors in SparkR documentation.
---
 docs/sparkr.md                | 2 +-
 docs/sql-programming-guide.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/sparkr.md b/docs/sparkr.md
index 095ea4308cfeb..1197c0d601ebd 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -116,7 +116,7 @@ sql(hiveContext, "CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
 sql(hiveContext, "LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
 
 # Queries can be expressed in HiveQL.
-results <- hiveContext.sql("FROM src SELECT key, value")
+results <- sql(hiveContext, "FROM src SELECT key, value")
 
 # results is now a DataFrame
 head(results)
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 7c255789790ed..26f6ba84b91b2 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1637,7 +1637,7 @@ sql(sqlContext, "CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
 sql(sqlContext, "LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
 
 # Queries can be expressed in HiveQL.
-results = sqlContext.sql("FROM src SELECT key, value").collect()
+results <- collect(sql(sqlContext, "FROM src SELECT key, value"))
 
 {% endhighlight %}
 

From 6722aca809ddc28aa20abf3bbb2e0de8629a9903 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Wed, 8 Jul 2015 10:09:50 -0700
Subject: [PATCH 050/549] [SPARK-8785] [SQL] Improve Parquet schema merging

JIRA: https://issues.apache.org/jira/browse/SPARK-8785

Currently, the parquet schema merging (`ParquetRelation2.readSchema`) may spend much time to merge duplicate schema. We can select only non duplicate schema and merge them later.

Author: Liang-Chi Hsieh <viirya@gmail.com>
Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7182 from viirya/improve_parquet_merging and squashes the following commits:

5cf934f [Liang-Chi Hsieh] Refactor it to make it faster.
f3411ea [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into improve_parquet_merging
a63c3ff [Liang-Chi Hsieh] Improve Parquet schema merging.
---
 .../apache/spark/sql/parquet/newParquet.scala | 82 +++++++++++--------
 1 file changed, 48 insertions(+), 34 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 6bc69c6ad0847..ce456e7fbe17e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -21,6 +21,7 @@ import java.net.URI
 import java.util.{List => JList}
 
 import scala.collection.JavaConversions._
+import scala.collection.mutable
 import scala.util.Try
 
 import com.google.common.base.Objects
@@ -30,8 +31,9 @@ import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 import org.apache.parquet.filter2.predicate.FilterApi
 import org.apache.parquet.hadoop._
-import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.apache.parquet.hadoop.metadata.{FileMetaData, CompressionCodecName}
 import org.apache.parquet.hadoop.util.ContextUtil
+import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
@@ -508,44 +510,56 @@ private[sql] object ParquetRelation2 extends Logging {
 
   private[parquet] def readSchema(
       footers: Seq[Footer], sqlContext: SQLContext): Option[StructType] = {
-    footers.map { footer =>
+
+    def parseParquetSchema(schema: MessageType): StructType = {
+      StructType.fromAttributes(
+        // TODO Really no need to use `Attribute` here, we only need to know the data type.
+        ParquetTypesConverter.convertToAttributes(
+          schema,
+          sqlContext.conf.isParquetBinaryAsString,
+          sqlContext.conf.isParquetINT96AsTimestamp))
+    }
+
+    val seen = mutable.HashSet[String]()
+    val finalSchemas: Seq[StructType] = footers.flatMap { footer =>
       val metadata = footer.getParquetMetadata.getFileMetaData
-      val parquetSchema = metadata.getSchema
-      val maybeSparkSchema = metadata
+      val serializedSchema = metadata
         .getKeyValueMetaData
         .toMap
         .get(RowReadSupport.SPARK_METADATA_KEY)
-        .flatMap { serializedSchema =>
-          // Don't throw even if we failed to parse the serialized Spark schema. Just fallback to
-          // whatever is available.
-          Try(DataType.fromJson(serializedSchema))
-            .recover { case _: Throwable =>
-              logInfo(
-                s"Serialized Spark schema in Parquet key-value metadata is not in JSON format, " +
-                  "falling back to the deprecated DataType.fromCaseClassString parser.")
-              DataType.fromCaseClassString(serializedSchema)
-            }
-            .recover { case cause: Throwable =>
-              logWarning(
-                s"""Failed to parse serialized Spark schema in Parquet key-value metadata:
-                   |\t$serializedSchema
-                 """.stripMargin,
-                cause)
-            }
-            .map(_.asInstanceOf[StructType])
-            .toOption
-        }
-
-      maybeSparkSchema.getOrElse {
-        // Falls back to Parquet schema if Spark SQL schema is absent.
-        StructType.fromAttributes(
-          // TODO Really no need to use `Attribute` here, we only need to know the data type.
-          ParquetTypesConverter.convertToAttributes(
-            parquetSchema,
-            sqlContext.conf.isParquetBinaryAsString,
-            sqlContext.conf.isParquetINT96AsTimestamp))
+      if (serializedSchema == None) {
+        // Falls back to Parquet schema if no Spark SQL schema found.
+        Some(parseParquetSchema(metadata.getSchema))
+      } else if (!seen.contains(serializedSchema.get)) {
+        seen += serializedSchema.get
+
+        // Don't throw even if we failed to parse the serialized Spark schema. Just fallback to
+        // whatever is available.
+        Some(Try(DataType.fromJson(serializedSchema.get))
+          .recover { case _: Throwable =>
+            logInfo(
+              s"Serialized Spark schema in Parquet key-value metadata is not in JSON format, " +
+                "falling back to the deprecated DataType.fromCaseClassString parser.")
+            DataType.fromCaseClassString(serializedSchema.get)
+          }
+          .recover { case cause: Throwable =>
+            logWarning(
+              s"""Failed to parse serialized Spark schema in Parquet key-value metadata:
+                 |\t$serializedSchema
+               """.stripMargin,
+              cause)
+          }
+          .map(_.asInstanceOf[StructType])
+          .getOrElse {
+            // Falls back to Parquet schema if Spark SQL schema can't be parsed.
+            parseParquetSchema(metadata.getSchema)
+          })
+      } else {
+        None
       }
-    }.reduceOption { (left, right) =>
+    }
+
+    finalSchemas.reduceOption { (left, right) =>
       try left.merge(right) catch { case e: Throwable =>
         throw new SparkException(s"Failed to merge incompatible schemas $left and $right", e)
       }

From 3e831a26965a5e92210431f9ad6935f70aa01b48 Mon Sep 17 00:00:00 2001
From: Takeshi YAMAMURO <linguin.m.s@gmail.com>
Date: Wed, 8 Jul 2015 10:33:27 -0700
Subject: [PATCH 051/549] [SPARK-6912] [SQL] Throw an AnalysisException when
 unsupported Java Map<K,V> types used in Hive UDF

To make UDF developers understood, throw an exception when unsupported Map<K,V> types used in Hive UDF. This fix is the same with #7248.

Author: Takeshi YAMAMURO <linguin.m.s@gmail.com>

Closes #7257 from maropu/ThrowExceptionWhenMapUsed and squashes the following commits:

916099a [Takeshi YAMAMURO] Fix style errors
7886dcc [Takeshi YAMAMURO] Throw an exception when Map<> used in Hive UDF
---
 .../spark/sql/hive/HiveInspectors.scala       |  6 ++++
 .../sql/hive/execution/UDFToIntIntMap.java    | 35 +++++++++++++++++++
 .../sql/hive/execution/UDFToStringIntMap.java | 35 +++++++++++++++++++
 .../sql/hive/execution/HiveUDFSuite.scala     | 32 +++++++++++++++++
 4 files changed, 108 insertions(+)
 create mode 100644 sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToIntIntMap.java
 create mode 100644 sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToStringIntMap.java

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 924e2ba3ba442..4cba17524af6c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -226,6 +226,12 @@ private[hive] trait HiveInspectors {
         "List type in java is unsupported because " +
         "JVM type erasure makes spark fail to catch a component type in List<>")
 
+    // java map type unsupported
+    case c: Class[_] if c == classOf[java.util.Map[_, _]] =>
+      throw new AnalysisException(
+        "Map type in java is unsupported because " +
+        "JVM type erasure makes spark fail to catch key and value types in Map<>")
+
     case c => throw new AnalysisException(s"Unsupported java type $c")
   }
 
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToIntIntMap.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToIntIntMap.java
new file mode 100644
index 0000000000000..b3e8bcbbd8221
--- /dev/null
+++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToIntIntMap.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution;
+
+import org.apache.hadoop.hive.ql.exec.UDF;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class UDFToIntIntMap extends UDF {
+    public Map<Integer, Integer> evaluate(Object o) {
+        return new HashMap<Integer, Integer>() {
+            {
+                put(1, 1);
+                put(2, 1);
+                put(3, 1);
+            }
+        };
+    }
+}
diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToStringIntMap.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToStringIntMap.java
new file mode 100644
index 0000000000000..9eea5c9a881fd
--- /dev/null
+++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToStringIntMap.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution;
+
+import org.apache.hadoop.hive.ql.exec.UDF;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class UDFToStringIntMap extends UDF {
+    public Map<String, Integer> evaluate(Object o) {
+        return new HashMap<String, Integer>() {
+            {
+                put("key1", 1);
+                put("key2", 2);
+                put("key3", 3);
+            }
+        };
+    }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala
index 44686204c2af7..eaaa88e17002b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala
@@ -163,6 +163,38 @@ class HiveUDFSuite extends QueryTest {
     TestHive.reset()
   }
 
+  test("UDFToStringIntMap") {
+    val testData = TestHive.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF()
+    testData.registerTempTable("inputTable")
+
+    sql(s"CREATE TEMPORARY FUNCTION testUDFToStringIntMap " +
+      s"AS '${classOf[UDFToStringIntMap].getName}'")
+    val errMsg = intercept[AnalysisException] {
+      sql("SELECT testUDFToStringIntMap(s) FROM inputTable")
+    }
+    assert(errMsg.getMessage === "Map type in java is unsupported because " +
+      "JVM type erasure makes spark fail to catch key and value types in Map<>;")
+
+    sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToStringIntMap")
+    TestHive.reset()
+  }
+
+  test("UDFToIntIntMap") {
+    val testData = TestHive.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF()
+    testData.registerTempTable("inputTable")
+
+    sql(s"CREATE TEMPORARY FUNCTION testUDFToIntIntMap " +
+      s"AS '${classOf[UDFToIntIntMap].getName}'")
+    val errMsg = intercept[AnalysisException] {
+      sql("SELECT testUDFToIntIntMap(s) FROM inputTable")
+    }
+    assert(errMsg.getMessage === "Map type in java is unsupported because " +
+      "JVM type erasure makes spark fail to catch key and value types in Map<>;")
+
+    sql("DROP TEMPORARY FUNCTION IF EXISTS testUDFToIntIntMap")
+    TestHive.reset()
+  }
+
   test("UDFListListInt") {
     val testData = TestHive.sparkContext.parallelize(
       ListListIntCaseClass(Nil) ::

From 74335b31072951244967f878d8b766cd1bfc2ac6 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Wed, 8 Jul 2015 10:43:00 -0700
Subject: [PATCH 052/549] [SPARK-5707] [SQL] fix serialization of generated
 projection

Author: Davies Liu <davies@databricks.com>

Closes #7272 from davies/fix_projection and squashes the following commits:

075ef76 [Davies Liu] fix codegen with BroadcastHashJion
---
 .../spark/sql/execution/joins/BroadcastHashOuterJoin.scala     | 3 +--
 .../org/apache/spark/sql/execution/joins/HashOuterJoin.scala   | 2 +-
 .../org/apache/spark/sql/execution/joins/HashedRelation.scala  | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
index 06c244f211aba..ab757fc7de6cd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
@@ -79,8 +79,7 @@ case class BroadcastHashOuterJoin(
     // Note that we use .execute().collect() because we don't want to convert data to Scala types
     val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect()
     // buildHashTable uses code-generated rows as keys, which are not serializable
-    val hashed =
-      buildHashTable(input.iterator, new InterpretedProjection(buildKeys, buildPlan.output))
+    val hashed = buildHashTable(input.iterator, newProjection(buildKeys, buildPlan.output))
     sparkContext.broadcast(hashed)
   }(BroadcastHashOuterJoin.broadcastHashOuterJoinExecutionContext)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
index 333745187728e..0522ee85eeb8a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
@@ -171,7 +171,7 @@ override def outputPartitioning: Partitioning = joinType match {
       var existingMatchList = hashTable.get(rowKey)
       if (existingMatchList == null) {
         existingMatchList = new CompactBuffer[InternalRow]()
-        hashTable.put(rowKey, existingMatchList)
+        hashTable.put(rowKey.copy(), existingMatchList)
       }
 
       existingMatchList += currentRow.copy()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index de062c791f5fb..6b51f5d4151d3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -125,7 +125,7 @@ private[joins] object HashedRelation {
         val existingMatchList = hashTable.get(rowKey)
         val matchList = if (existingMatchList == null) {
           val newMatchList = new CompactBuffer[InternalRow]()
-          hashTable.put(rowKey, newMatchList)
+          hashTable.put(rowKey.copy(), newMatchList)
           newMatchList
         } else {
           keyIsUnique = false

From 0ba98c04c726a827df8cb19b0db17c352a647960 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Wed, 8 Jul 2015 10:51:32 -0700
Subject: [PATCH 053/549] [SPARK-8753][SQL] Create an IntervalType data type

We need a new data type to represent time intervals. Because we can't determine how many days in a month, so we need 2 values for interval: a int `months`, a long `microseconds`.

The interval literal syntax looks like:
`interval 3 years -4 month 4 weeks 3 second`

Because we use number of 100ns as value of `TimestampType`, so it may not makes sense to support nano second unit.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7226 from cloud-fan/interval and squashes the following commits:

632062d [Wenchen Fan] address comments
ac348c3 [Wenchen Fan] use case class
0342d2e [Wenchen Fan] use array byte
df9256c [Wenchen Fan] fix style
fd6f18a [Wenchen Fan] address comments
1856af3 [Wenchen Fan] support interval type
---
 .../org/apache/spark/sql/types/DataTypes.java |  5 ++
 .../apache/spark/sql/catalyst/SqlParser.scala | 86 +++++++++++++++----
 .../apache/spark/sql/types/IntervalType.scala | 37 ++++++++
 .../spark/sql/types/TimestampType.scala       |  2 +-
 .../org/apache/spark/sql/sources/ddl.scala    |  3 +
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 25 ++++++
 .../apache/spark/unsafe/types/Interval.java   | 47 ++++++++++
 7 files changed, 185 insertions(+), 20 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntervalType.scala
 create mode 100644 unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java b/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java
index e457542c647e7..d22ad6794d608 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java
@@ -49,6 +49,11 @@ public class DataTypes {
    */
   public static final DataType TimestampType = TimestampType$.MODULE$;
 
+  /**
+   * Gets the IntervalType object.
+   */
+  public static final DataType IntervalType = IntervalType$.MODULE$;
+
   /**
    * Gets the DoubleType object.
    */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index e8e9b9802e94b..dedd8c8fa3620 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.Interval
 
 /**
  * A very simple SQL parser.  Based loosely on:
@@ -72,6 +73,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
   protected val INNER = Keyword("INNER")
   protected val INSERT = Keyword("INSERT")
   protected val INTERSECT = Keyword("INTERSECT")
+  protected val INTERVAL = Keyword("INTERVAL")
   protected val INTO = Keyword("INTO")
   protected val IS = Keyword("IS")
   protected val JOIN = Keyword("JOIN")
@@ -279,12 +281,12 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
         throw new AnalysisException(s"invalid function approximate $udfName")
       }
     }
-    | APPROXIMATE ~> "(" ~> floatLit ~ ")" ~ ident ~ "(" ~ DISTINCT ~ expression <~ ")" ^^
+    | APPROXIMATE ~> "(" ~> unsignedFloat ~ ")" ~ ident ~ "(" ~ DISTINCT ~ expression <~ ")" ^^
       { case s ~ _ ~ udfName ~ _ ~ _ ~ exp =>
         if (lexical.normalizeKeyword(udfName) == "count") {
           ApproxCountDistinct(exp, s.toDouble)
         } else {
-          throw new AnalysisException(s"invalid function approximate($floatLit) $udfName")
+          throw new AnalysisException(s"invalid function approximate($s) $udfName")
         }
       }
     | CASE ~> whenThenElse ^^ CaseWhen
@@ -309,6 +311,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     ( numericLiteral
     | booleanLiteral
     | stringLit ^^ {case s => Literal.create(s, StringType) }
+    | intervalLiteral
     | NULL ^^^ Literal.create(null, NullType)
     )
 
@@ -318,21 +321,71 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     )
 
   protected lazy val numericLiteral: Parser[Literal] =
-    signedNumericLiteral | unsignedNumericLiteral
-
-  protected lazy val sign: Parser[String] =
-    "+" | "-"
-
-  protected lazy val signedNumericLiteral: Parser[Literal] =
-    ( sign ~ numericLit  ^^ { case s ~ l => Literal(toNarrowestIntegerType(s + l)) }
-    | sign ~ floatLit ^^ { case s ~ f => Literal((s + f).toDouble) }
+    ( integral  ^^ { case i => Literal(toNarrowestIntegerType(i)) }
+    | sign.? ~ unsignedFloat ^^ { case s ~ f => Literal((s.getOrElse("") + f).toDouble) }
     )
 
-  protected lazy val unsignedNumericLiteral: Parser[Literal] =
-    ( numericLit ^^ { n => Literal(toNarrowestIntegerType(n)) }
-    | floatLit ^^ { f => Literal(f.toDouble) }
+  protected lazy val unsignedFloat: Parser[String] =
+    ( "." ~> numericLit ^^ { u => "0." + u }
+    | elem("decimal", _.isInstanceOf[lexical.FloatLit]) ^^ (_.chars)
     )
 
+  protected lazy val sign: Parser[String] = ("+" | "-")
+
+  protected lazy val integral: Parser[String] =
+    sign.? ~ numericLit ^^ { case s ~ n => s.getOrElse("") + n }
+
+  private def intervalUnit(unitName: String) =
+    acceptIf {
+      case lexical.Identifier(str) =>
+        val normalized = lexical.normalizeKeyword(str)
+        normalized == unitName || normalized == unitName + "s"
+      case _ => false
+    } {_ => "wrong interval unit"}
+
+  protected lazy val month: Parser[Int] =
+    integral <~ intervalUnit("month") ^^ { case num => num.toInt }
+
+  protected lazy val year: Parser[Int] =
+    integral <~ intervalUnit("year") ^^ { case num => num.toInt * 12 }
+
+  protected lazy val microsecond: Parser[Long] =
+    integral <~ intervalUnit("microsecond") ^^ { case num => num.toLong }
+
+  protected lazy val millisecond: Parser[Long] =
+    integral <~ intervalUnit("millisecond") ^^ { case num => num.toLong * 1000 }
+
+  protected lazy val second: Parser[Long] =
+    integral <~ intervalUnit("second") ^^ { case num => num.toLong * 1000 * 1000 }
+
+  protected lazy val minute: Parser[Long] =
+    integral <~ intervalUnit("minute") ^^ { case num => num.toLong * 1000 * 1000 * 60 }
+
+  protected lazy val hour: Parser[Long] =
+    integral <~ intervalUnit("hour") ^^ { case num => num.toLong * 1000 * 1000 * 3600 }
+
+  protected lazy val day: Parser[Long] =
+    integral <~ intervalUnit("day") ^^ { case num => num.toLong * 1000 * 1000 * 3600 * 24 }
+
+  protected lazy val week: Parser[Long] =
+    integral <~ intervalUnit("week") ^^ { case num => num.toLong * 1000 * 1000 * 3600 * 24 * 7 }
+
+  protected lazy val intervalLiteral: Parser[Literal] =
+    INTERVAL ~> year.? ~ month.? ~ week.? ~ day.? ~ hour.? ~ minute.? ~ second.? ~
+      millisecond.? ~ microsecond.? ^^ {
+        case year ~ month ~ week ~ day ~ hour ~ minute ~ second ~
+          millisecond ~ microsecond =>
+          if (!Seq(year, month, week, day, hour, minute, second,
+            millisecond, microsecond).exists(_.isDefined)) {
+            throw new AnalysisException(
+              "at least one time unit should be given for interval literal")
+          }
+          val months = Seq(year, month).map(_.getOrElse(0)).sum
+          val microseconds = Seq(week, day, hour, minute, second, millisecond, microsecond)
+            .map(_.getOrElse(0L)).sum
+          Literal.create(new Interval(months, microseconds), IntervalType)
+      }
+
   private def toNarrowestIntegerType(value: String): Any = {
     val bigIntValue = BigDecimal(value)
 
@@ -343,11 +396,6 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     }
   }
 
-  protected lazy val floatLit: Parser[String] =
-    ( "." ~> unsignedNumericLiteral ^^ { u => "0." + u }
-    | elem("decimal", _.isInstanceOf[lexical.FloatLit]) ^^ (_.chars)
-    )
-
   protected lazy val baseExpression: Parser[Expression] =
     ( "*" ^^^ UnresolvedStar(None)
     | ident <~ "." ~ "*" ^^ { case tableName => UnresolvedStar(Option(tableName)) }
@@ -355,7 +403,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     )
 
   protected lazy val signedPrimary: Parser[Expression] =
-    sign ~ primary ^^ { case s ~ e => if (s == "-") UnaryMinus(e) else e}
+    sign ~ primary ^^ { case s ~ e => if (s == "-") UnaryMinus(e) else e }
 
   protected lazy val attributeName: Parser[String] = acceptMatch("attribute name", {
     case lexical.Identifier(str) => str
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntervalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntervalType.scala
new file mode 100644
index 0000000000000..87c6e9e6e5e2c
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntervalType.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.types
+
+import org.apache.spark.annotation.DeveloperApi
+
+
+/**
+ * :: DeveloperApi ::
+ * The data type representing time intervals.
+ *
+ * Please use the singleton [[DataTypes.IntervalType]].
+ */
+@DeveloperApi
+class IntervalType private() extends DataType {
+
+  override def defaultSize: Int = 4096
+
+  private[spark] override def asNullable: IntervalType = this
+}
+
+case object IntervalType extends IntervalType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala
index de4b511edccd9..2be9b2d76c9fe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/TimestampType.scala
@@ -41,7 +41,7 @@ class TimestampType private() extends AtomicType {
   private[sql] val ordering = implicitly[Ordering[InternalType]]
 
   /**
-   * The default size of a value of the TimestampType is 12 bytes.
+   * The default size of a value of the TimestampType is 8 bytes.
    */
   override def defaultSize: Int = 8
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index 1f0b93e2850ba..d7440c55bd4a6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -304,6 +304,9 @@ private[sql] object ResolvedDataSource {
       mode: SaveMode,
       options: Map[String, String],
       data: DataFrame): ResolvedDataSource = {
+    if (data.schema.map(_.dataType).exists(_.isInstanceOf[IntervalType])) {
+      throw new AnalysisException("Cannot save interval data type into external storage.")
+    }
     val clazz: Class[_] = lookupDataSource(provider)
     val relation = clazz.newInstance() match {
       case dataSource: CreatableRelationProvider =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 12ad019e8b473..231440892bf0b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -1467,4 +1467,29 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       checkAnswer(sql("select count, sort from t"), Row(1, "a"))
     }
   }
+
+  test("SPARK-8753: add interval type") {
+    import org.apache.spark.unsafe.types.Interval
+
+    val df = sql("select interval 3 years -3 month 7 week 123 microseconds")
+    checkAnswer(df, Row(new Interval(12 * 3 - 3, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123 )))
+    withTempPath(f => {
+      // Currently we don't yet support saving out values of interval data type.
+      val e = intercept[AnalysisException] {
+        df.write.json(f.getCanonicalPath)
+      }
+      e.message.contains("Cannot save interval data type into external storage")
+    })
+
+    def checkIntervalParseError(s: String): Unit = {
+      val e = intercept[AnalysisException] {
+        sql(s)
+      }
+      e.message.contains("at least one time unit should be given for interval literal")
+    }
+
+    checkIntervalParseError("select interval")
+    // Currently we don't yet support nanosecond
+    checkIntervalParseError("select interval 23 nanosecond")
+  }
 }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
new file mode 100644
index 0000000000000..3eb67ede062d9
--- /dev/null
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.types;
+
+import java.io.Serializable;
+
+/**
+ * The internal representation of interval type.
+ */
+public final class Interval implements Serializable {
+  public final int months;
+  public final long microseconds;
+
+  public Interval(int months, long microseconds) {
+    this.months = months;
+    this.microseconds = microseconds;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (this == other) return true;
+    if (other == null || !(other instanceof Interval)) return false;
+
+    Interval o = (Interval) other;
+    return this.months == o.months && this.microseconds == o.microseconds;
+  }
+
+  @Override
+  public int hashCode() {
+    return 31 * months + (int) microseconds;
+  }
+}

From f61c989b404808f79a58b6503cf3835cf602528a Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 8 Jul 2015 10:56:31 -0700
Subject: [PATCH 054/549] [SPARK-8888][SQL] Use java.util.HashMap in
 DynamicPartitionWriterContainer.

Just a baby step towards making it more efficient.

Author: Reynold Xin <rxin@databricks.com>

Closes #7282 from rxin/SPARK-8888 and squashes the following commits:

3da51ae [Reynold Xin] [SPARK-8888][SQL] Use java.util.HashMap in DynamicPartitionWriterContainer.
---
 .../apache/spark/sql/sources/commands.scala   | 36 ++++++++++++-------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index a97142d4a1142..ecbc889770625 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -19,8 +19,6 @@ package org.apache.spark.sql.sources
 
 import java.util.{Date, UUID}
 
-import scala.collection.mutable
-
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter, FileOutputFormat}
@@ -110,7 +108,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
         !exists
     }
     // If we are appending data to an existing dir.
-    val isAppend = (pathExists) && (mode == SaveMode.Append)
+    val isAppend = pathExists && (mode == SaveMode.Append)
 
     if (doInsertion) {
       val job = new Job(hadoopConf)
@@ -142,9 +140,12 @@ private[sql] case class InsertIntoHadoopFsRelation(
       }
     }
 
-    Seq.empty[InternalRow]
+    Seq.empty[Row]
   }
 
+  /**
+   * Inserts the content of the [[DataFrame]] into a table without any partitioning columns.
+   */
   private def insert(writerContainer: BaseWriterContainer, df: DataFrame): Unit = {
     // Uses local vals for serialization
     val needsConversion = relation.needConversion
@@ -188,6 +189,9 @@ private[sql] case class InsertIntoHadoopFsRelation(
     }
   }
 
+  /**
+   * Inserts the content of the [[DataFrame]] into a table with partitioning columns.
+   */
   private def insertWithDynamicPartitions(
       sqlContext: SQLContext,
       writerContainer: BaseWriterContainer,
@@ -497,13 +501,14 @@ private[sql] class DynamicPartitionWriterContainer(
   extends BaseWriterContainer(relation, job, isAppend) {
 
   // All output writers are created on executor side.
-  @transient protected var outputWriters: mutable.Map[String, OutputWriter] = _
+  @transient protected var outputWriters: java.util.HashMap[String, OutputWriter] = _
 
   override protected def initWriters(): Unit = {
-    outputWriters = mutable.Map.empty[String, OutputWriter]
+    outputWriters = new java.util.HashMap[String, OutputWriter]
   }
 
   override def outputWriterForRow(row: Row): OutputWriter = {
+    // TODO (SPARK-8888): zip and all the stuff happening here is very inefficient.
     val partitionPath = partitionColumns.zip(row.toSeq).map { case (col, rawValue) =>
       val string = if (rawValue == null) null else String.valueOf(rawValue)
       val valueString = if (string == null || string.isEmpty) {
@@ -514,18 +519,23 @@ private[sql] class DynamicPartitionWriterContainer(
       s"/$col=$valueString"
     }.mkString.stripPrefix(Path.SEPARATOR)
 
-    outputWriters.getOrElseUpdate(partitionPath, {
+    val writer = outputWriters.get(partitionPath)
+    if (writer.eq(null)) {
       val path = new Path(getWorkPath, partitionPath)
-      taskAttemptContext.getConfiguration.set(
-        "spark.sql.sources.output.path",
+      taskAttemptContext.getConfiguration.set("spark.sql.sources.output.path",
         new Path(outputPath, partitionPath).toString)
-      outputWriterFactory.newInstance(path.toString, dataSchema, taskAttemptContext)
-    })
+      val newWriter = outputWriterFactory.newInstance(path.toString, dataSchema, taskAttemptContext)
+      outputWriters.put(partitionPath, newWriter)
+      newWriter
+    } else {
+      writer
+    }
   }
 
   private def clearOutputWriters(): Unit = {
-    if (outputWriters.nonEmpty) {
-      outputWriters.values.foreach(_.close())
+    if (!outputWriters.isEmpty) {
+      val iter = scala.collection.JavaConversions.asScalaIterator(outputWriters.values().iterator())
+      iter.foreach(_.close())
       outputWriters.clear()
     }
   }

From 26d9b6b8cae9ac6593f78ab98dd45a25d03cf71c Mon Sep 17 00:00:00 2001
From: Tao Li <litao@sogou-inc.com>
Date: Wed, 8 Jul 2015 19:02:24 +0100
Subject: [PATCH 055/549] [SPARK-8657] [YARN] Fail to upload resource to viewfs

Fail to upload resource to viewfs in spark-1.4
JIRA Link: https://issues.apache.org/jira/browse/SPARK-8657

Author: Tao Li <litao@sogou-inc.com>

Closes #7125 from litao-buptsse/SPARK-8657-for-master and squashes the following commits:

65b13f4 [Tao Li] [SPARK-8657] [YARN] Fail to upload resource to viewfs
---
 yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 67a5c95400e53..4d52ae774ea00 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -321,8 +321,9 @@ private[spark] class Client(
           val linkname = targetDir.map(_ + "/").getOrElse("") +
             destName.orElse(Option(localURI.getFragment())).getOrElse(localPath.getName())
           val destPath = copyFileToRemote(dst, localPath, replication)
+          val destFs = FileSystem.get(destPath.toUri(), hadoopConf)
           distCacheMgr.addResource(
-            fs, hadoopConf, destPath, localResources, resType, linkname, statCache,
+            destFs, hadoopConf, destPath, localResources, resType, linkname, statCache,
             appMasterOnly = appMasterOnly)
           (false, linkname)
         } else {

From 374c8a8a4a8ac4171d312a6c31080a6724e55c60 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Wed, 8 Jul 2015 12:39:32 -0700
Subject: [PATCH 056/549] [SPARK-8900] [SPARKR] Fix sparkPackages in init
 documentation

cc pwendell

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #7293 from shivaram/sparkr-packages-doc and squashes the following commits:

c91471d [Shivaram Venkataraman] Fix sparkPackages in init documentation
---
 docs/sparkr.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sparkr.md b/docs/sparkr.md
index 1197c0d601ebd..4385a4eeacd5c 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -68,7 +68,7 @@ you can specify the packages with the `packages` argument.
 
 <div data-lang="r" markdown="1">
 {% highlight r %}
-sc <- sparkR.init(packages="com.databricks:spark-csv_2.11:1.0.3")
+sc <- sparkR.init(sparkPackages="com.databricks:spark-csv_2.11:1.0.3")
 sqlContext <- sparkRSQL.init(sc)
 {% endhighlight %}
 </div>

From 2b40365d76b7d9d382ad5077cdf979906bca17f2 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Wed, 8 Jul 2015 13:19:27 -0700
Subject: [PATCH 057/549] [SPARK-7785] [MLLIB] [PYSPARK] Add __str__ and
 __repr__ to Matrices

Adding __str__ and  __repr__ to DenseMatrix and SparseMatrix

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6342 from MechCoder/spark-7785 and squashes the following commits:

7b9a82c [MechCoder] Add tests for greater than 16 elements
b88e9dd [MechCoder] Increment limit to 16
1425a01 [MechCoder] Change tests
36bd166 [MechCoder] Change str and repr representation
97f0da9 [MechCoder] zip is same as izip in python3
94ca4b2 [MechCoder] Added doctests and iterate over values instead of colPtrs
b26fa89 [MechCoder] minor
394dde9 [MechCoder] [SPARK-7785] Add __str__ and __repr__ to Matrices
---
 python/pyspark/mllib/linalg.py | 127 +++++++++++++++++++++++++++++++++
 python/pyspark/mllib/tests.py  |  52 +++++++++++++-
 2 files changed, 178 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 12d8dbbb92c56..51ac198305711 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -31,6 +31,7 @@
     xrange = range
     import copyreg as copy_reg
 else:
+    from itertools import izip as zip
     import copy_reg
 
 import numpy as np
@@ -116,6 +117,10 @@ def _format_float(f, digits=4):
     return s
 
 
+def _format_float_list(l):
+    return [_format_float(x) for x in l]
+
+
 class VectorUDT(UserDefinedType):
     """
     SQL user-defined type (UDT) for Vector.
@@ -870,6 +875,50 @@ def __reduce__(self):
             self.numRows, self.numCols, self.values.tostring(),
             int(self.isTransposed))
 
+    def __str__(self):
+        """
+        Pretty printing of a DenseMatrix
+
+        >>> dm = DenseMatrix(2, 2, range(4))
+        >>> print(dm)
+        DenseMatrix([[ 0.,  2.],
+                     [ 1.,  3.]])
+        >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True)
+        >>> print(dm)
+        DenseMatrix([[ 0.,  1.],
+                     [ 2.,  3.]])
+        """
+        # Inspired by __repr__ in scipy matrices.
+        array_lines = repr(self.toArray()).splitlines()
+
+        # We need to adjust six spaces which is the difference in number
+        # of letters between "DenseMatrix" and "array"
+        x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]])
+        return array_lines[0].replace("array", "DenseMatrix") + "\n" + x
+
+    def __repr__(self):
+        """
+        Representation of a DenseMatrix
+
+        >>> dm = DenseMatrix(2, 2, range(4))
+        >>> dm
+        DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False)
+        """
+        # If the number of values are less than seventeen then return as it is.
+        # Else return first eight values and last eight values.
+        if len(self.values) < 17:
+            entries = _format_float_list(self.values)
+        else:
+            entries = (
+                _format_float_list(self.values[:8]) +
+                ["..."] +
+                _format_float_list(self.values[-8:])
+            )
+
+        entries = ", ".join(entries)
+        return "DenseMatrix({0}, {1}, [{2}], {3})".format(
+            self.numRows, self.numCols, entries, self.isTransposed)
+
     def toArray(self):
         """
         Return an numpy.ndarray
@@ -946,6 +995,84 @@ def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
             raise ValueError("Expected rowIndices of length %d, got %d."
                              % (self.rowIndices.size, self.values.size))
 
+    def __str__(self):
+        """
+        Pretty printing of a SparseMatrix
+
+        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
+        >>> print(sm1)
+        2 X 2 CSCMatrix
+        (0,0) 2.0
+        (1,0) 3.0
+        (1,1) 4.0
+        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
+        >>> print(sm1)
+        2 X 2 CSRMatrix
+        (0,0) 2.0
+        (0,1) 3.0
+        (1,1) 4.0
+        """
+        spstr = "{0} X {1} ".format(self.numRows, self.numCols)
+        if self.isTransposed:
+            spstr += "CSRMatrix\n"
+        else:
+            spstr += "CSCMatrix\n"
+
+        cur_col = 0
+        smlist = []
+
+        # Display first 16 values.
+        if len(self.values) <= 16:
+            zipindval = zip(self.rowIndices, self.values)
+        else:
+            zipindval = zip(self.rowIndices[:16], self.values[:16])
+        for i, (rowInd, value) in enumerate(zipindval):
+            if self.colPtrs[cur_col + 1] <= i:
+                cur_col += 1
+            if self.isTransposed:
+                smlist.append('({0},{1}) {2}'.format(
+                    cur_col, rowInd, _format_float(value)))
+            else:
+                smlist.append('({0},{1}) {2}'.format(
+                    rowInd, cur_col, _format_float(value)))
+        spstr += "\n".join(smlist)
+
+        if len(self.values) > 16:
+            spstr += "\n.." * 2
+        return spstr
+
+    def __repr__(self):
+        """
+        Representation of a SparseMatrix
+
+        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
+        >>> sm1
+        SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False)
+        """
+        rowIndices = list(self.rowIndices)
+        colPtrs = list(self.colPtrs)
+
+        if len(self.values) <= 16:
+            values = _format_float_list(self.values)
+
+        else:
+            values = (
+                _format_float_list(self.values[:8]) +
+                ["..."] +
+                _format_float_list(self.values[-8:])
+            )
+            rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:]
+
+        if len(self.colPtrs) > 16:
+            colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:]
+
+        values = ", ".join(values)
+        rowIndices = ", ".join([str(ind) for ind in rowIndices])
+        colPtrs = ", ".join([str(ptr) for ptr in colPtrs])
+        return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format(
+            self.numRows, self.numCols, colPtrs, rowIndices,
+            values, self.isTransposed)
+
     def __reduce__(self):
         return SparseMatrix, (
             self.numRows, self.numCols, self.colPtrs.tostring(),
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index d9f9874d50c1a..f2eab5b18f077 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -27,7 +27,7 @@
 from shutil import rmtree
 
 from numpy import (
-    array, array_equal, zeros, inf, random, exp, dot, all, mean, abs)
+    array, array_equal, zeros, inf, random, exp, dot, all, mean, abs, arange, tile, ones)
 from numpy import sum as array_sum
 
 from py4j.protocol import Py4JJavaError
@@ -189,6 +189,53 @@ def test_matrix_indexing(self):
             for j in range(2):
                 self.assertEquals(mat[i, j], expected[i][j])
 
+    def test_repr_dense_matrix(self):
+        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
+        self.assertTrue(
+            repr(mat),
+            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)')
+
+        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
+        self.assertTrue(
+            repr(mat),
+            'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)')
+
+        mat = DenseMatrix(6, 3, zeros(18))
+        self.assertTrue(
+            repr(mat),
+            'DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., \
+                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)')
+
+    def test_repr_sparse_matrix(self):
+        sm1t = SparseMatrix(
+            3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0],
+            isTransposed=True)
+        self.assertTrue(
+            repr(sm1t),
+            'SparseMatrix(3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], True)')
+
+        indices = tile(arange(6), 3)
+        values = ones(18)
+        sm = SparseMatrix(6, 3, [0, 6, 12, 18], indices, values)
+        self.assertTrue(
+            repr(sm), "SparseMatrix(6, 3, [0, 6, 12, 18], \
+                [0, 1, 2, 3, 4, 5, 0, 1, ..., 4, 5, 0, 1, 2, 3, 4, 5], \
+                [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..., \
+                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], False)")
+
+        self.assertTrue(
+            str(sm),
+            "6 X 3 CSCMatrix\n\
+            (0,0) 1.0\n(1,0) 1.0\n(2,0) 1.0\n(3,0) 1.0\n(4,0) 1.0\n(5,0) 1.0\n\
+            (0,1) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(3,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n\
+            (0,2) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(3,2) 1.0\n..\n..")
+
+        sm = SparseMatrix(1, 18, zeros(19), [], [])
+        self.assertTrue(
+            repr(sm),
+            'SparseMatrix(1, 18, \
+                [0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0], [], [], False)')
+
     def test_sparse_matrix(self):
         # Test sparse matrix creation.
         sm1 = SparseMatrix(
@@ -198,6 +245,9 @@ def test_sparse_matrix(self):
         self.assertEquals(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4])
         self.assertEquals(sm1.rowIndices.tolist(), [1, 2, 1, 2])
         self.assertEquals(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0])
+        self.assertTrue(
+            repr(sm1),
+            'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)')
 
         # Test indexing
         expected = [

From f031543782e8f0f5b6a4471ba1c1d5c53efbe5cd Mon Sep 17 00:00:00 2001
From: Keuntae Park <sirpkt@apache.org>
Date: Wed, 8 Jul 2015 14:29:52 -0700
Subject: [PATCH 058/549] [SPARK-8783] [SQL] CTAS with WITH clause does not
 work

Currently, CTESubstitution only handles the case that WITH is on the top of the plan.
I think it SHOULD handle the case that WITH is child of CTAS.
This patch simply changes 'match' to 'transform' for recursive search of WITH in the plan.

Author: Keuntae Park <sirpkt@apache.org>

Closes #7180 from sirpkt/SPARK-8783 and squashes the following commits:

e4428f0 [Keuntae Park] Merge remote-tracking branch 'upstream/master' into CTASwithWITH
1671c77 [Keuntae Park] WITH clause can be inside CTAS
---
 .../spark/sql/catalyst/analysis/Analyzer.scala |  2 +-
 .../sql/hive/execution/SQLQuerySuite.scala     | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 15e84e68b9881..3fdc6d62bc786 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -85,7 +85,7 @@ class Analyzer(
    */
   object CTESubstitution extends Rule[LogicalPlan] {
     // TODO allow subquery to define CTE
-    def apply(plan: LogicalPlan): LogicalPlan = plan match {
+    def apply(plan: LogicalPlan): LogicalPlan = plan transform  {
       case With(child, relations) => substituteCTE(child, relations)
       case other => other
     }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index bf9f2ecd51793..05a1f0094e5e1 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -159,6 +159,24 @@ class SQLQuerySuite extends QueryTest {
     checkAnswer(query, Row(1, 1) :: Nil)
   }
 
+  test("CTAS with WITH clause") {
+    val df = Seq((1, 1)).toDF("c1", "c2")
+    df.registerTempTable("table1")
+
+    sql(
+      """
+        |CREATE TABLE with_table1 AS
+        |WITH T AS (
+        |  SELECT *
+        |  FROM table1
+        |)
+        |SELECT *
+        |FROM T
+      """.stripMargin)
+    val query = sql("SELECT * FROM with_table1")
+    checkAnswer(query, Row(1, 1) :: Nil)
+  }
+
   test("explode nested Field") {
     Seq(NestedArray1(NestedArray2(Seq(1, 2, 3)))).toDF.registerTempTable("nestedArray")
     checkAnswer(

From c5532e2fe700978da4bdfdb54a522f5934c3db55 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Wed, 8 Jul 2015 14:49:52 -0700
Subject: [PATCH 059/549] [SPARK-8457] [ML] NGram Documentation

Add documentation for NGram feature transformer.

Author: Feynman Liang <fliang@databricks.com>

Closes #7244 from feynmanliang/SPARK-8457 and squashes the following commits:

5aface9 [Feynman Liang] Pretty print Scala output and add API doc to each codetab
60d5ac0 [Feynman Liang] Inline API doc and fix indentation
736ccbc [Feynman Liang] NGram feature transformer documentation
---
 docs/ml-features.md | 88 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index f88c0248c1a8a..54068debe2159 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -288,6 +288,94 @@ for words_label in wordsDataFrame.select("words", "label").take(3):
 </div>
 
 
+## $n$-gram
+
+An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (typically words) for some integer $n$. The `NGram` class can be used to transform input features into $n$-grams.
+
+`NGram` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer).  The parameter `n` is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words.  If the input sequence contains fewer than `n` strings, no output is produced.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+
+[`NGram`](api/scala/index.html#org.apache.spark.ml.feature.NGram) takes an input column name, an output column name, and an optional length parameter n (n=2 by default).
+
+{% highlight scala %}
+import org.apache.spark.ml.feature.NGram
+
+val wordDataFrame = sqlContext.createDataFrame(Seq(
+  (0, Array("Hi", "I", "heard", "about", "Spark")),
+  (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
+  (2, Array("Logistic", "regression", "models", "are", "neat"))
+)).toDF("label", "words")
+
+val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
+val ngramDataFrame = ngram.transform(wordDataFrame)
+ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+[`NGram`](api/java/org/apache/spark/ml/feature/NGram.html) takes an input column name, an output column name, and an optional length parameter n (n=2 by default).
+
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.NGram;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+  RowFactory.create(0D, Lists.newArrayList("Hi", "I", "heard", "about", "Spark")),
+  RowFactory.create(1D, Lists.newArrayList("I", "wish", "Java", "could", "use", "case", "classes")),
+  RowFactory.create(2D, Lists.newArrayList("Logistic", "regression", "models", "are", "neat"))
+));
+StructType schema = new StructType(new StructField[]{
+  new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+  new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+});
+DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema);
+NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
+DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame);
+for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
+  java.util.List<String> ngrams = r.getList(0);
+  for (String ngram : ngrams) System.out.print(ngram + " --- ");
+  System.out.println();
+}
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+[`NGram`](api/python/pyspark.ml.html#pyspark.ml.feature.NGram) takes an input column name, an output column name, and an optional length parameter n (n=2 by default).
+
+{% highlight python %}
+from pyspark.ml.feature import NGram
+
+wordDataFrame = sqlContext.createDataFrame([
+  (0, ["Hi", "I", "heard", "about", "Spark"]),
+  (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
+  (2, ["Logistic", "regression", "models", "are", "neat"])
+], ["label", "words"])
+ngram = NGram(inputCol="words", outputCol="ngrams")
+ngramDataFrame = ngram.transform(wordDataFrame)
+for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
+  print(ngrams_label)
+{% endhighlight %}
+</div>
+</div>
+
+
 ## Binarizer
 
 Binarization is the process of thresholding numerical features to binary features. As some probabilistic estimators make assumption that the input data is distributed according to [Bernoulli distribution](http://en.wikipedia.org/wiki/Bernoulli_distribution), a binarizer is useful for pre-processing the input data with continuous numerical features.

From 8f3cd93278337dc10b9dd3a344d6f7b51ba9960d Mon Sep 17 00:00:00 2001
From: Alok Singh <“singhal@us.ibm.com”>
Date: Wed, 8 Jul 2015 14:51:18 -0700
Subject: [PATCH 060/549] =?UTF-8?q?[SPARK-8909][Documentation]=20Change=20?=
 =?UTF-8?q?the=20scala=20example=20in=20sql-programmi=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ng-guide#Manually Specifying Options to be in sync with java,python, R version

Author: Alok Singh <“singhal@us.ibm.com”>

Closes #7299 from aloknsingh/aloknsingh_SPARK-8909 and squashes the following commits:

d3c20ba [Alok Singh] fix the file to .parquet from .json
d476140 [Alok Singh] [SPARK-8909][Documentation] Change the scala example in sql-programming-guide#Manually Specifying Options to be in sync with java,python, R version
---
 docs/sql-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 26f6ba84b91b2..5838bc172fe86 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -828,7 +828,7 @@ using this syntax.
 
 {% highlight scala %}
 val df = sqlContext.read.format("json").load("examples/src/main/resources/people.json")
-df.select("name", "age").write.format("json").save("namesAndAges.json")
+df.select("name", "age").write.format("parquet").save("namesAndAges.parquet")
 {% endhighlight %}
 
 </div>

From 00b265f12c0f0271b7036f831fee09b694908b29 Mon Sep 17 00:00:00 2001
From: Cheolsoo Park <cheolsoop@netflix.com>
Date: Wed, 8 Jul 2015 15:18:24 -0700
Subject: [PATCH 061/549] [SPARK-8908] [SQL] Add () to distinct definition in
 dataframe

Adding `()` to the definition of `distinct` in DataFrame allows distinct to be called with parentheses, which is consistent with `dropDuplicates`.

Author: Cheolsoo Park <cheolsoop@netflix.com>

Closes #7298 from piaozhexiu/SPARK-8908 and squashes the following commits:

7f0d923 [Cheolsoo Park] Add () to distinct definition in dataframe
---
 sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 60142291eaf30..f33e19a0cb7dd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1415,7 +1415,7 @@ class DataFrame private[sql](
    * @group dfops
    * @since 1.3.0
    */
-  override def distinct: DataFrame = dropDuplicates()
+  override def distinct(): DataFrame = dropDuplicates()
 
   /**
    * @group basic

From 57221934e0376e5bb8421dc35d4bf91db4deeca1 Mon Sep 17 00:00:00 2001
From: DB Tsai <dbt@netflix.com>
Date: Wed, 8 Jul 2015 15:21:58 -0700
Subject: [PATCH 062/549] [SPARK-8700][ML] Disable feature scaling in Logistic
 Regression

All compressed sensing applications, and some of the regression use-cases will have better result by turning the feature scaling off. However, if we implement this naively by training the dataset without doing any standardization, the rate of convergency will not be good. This can be implemented by still standardizing the training dataset but we penalize each component differently to get effectively the same objective function but a better numerical problem. As a result, for those columns with high variances, they will be penalized less, and vice versa. Without this, since all the features are standardized, so they will be penalized the same.

In R, there is an option for this.
`standardize`
Logical flag for x variable standardization, prior to fitting the model sequence. The coefficients are always returned on the original scale. Default is standardize=TRUE. If variables are in the same units already, you might not wish to standardize. See details below for y standardization with family="gaussian".

+cc holdenk mengxr jkbradley

Author: DB Tsai <dbt@netflix.com>

Closes #7080 from dbtsai/lors and squashes the following commits:

877e6c7 [DB Tsai] repahse the doc
7cf45f2 [DB Tsai] address feedback
78d75c9 [DB Tsai] small change
c2c9e60 [DB Tsai] style
6e1a8e0 [DB Tsai] first commit
---
 .../classification/LogisticRegression.scala   |  89 +++-
 .../ml/param/shared/SharedParamsCodeGen.scala |   3 +-
 .../spark/ml/param/shared/sharedParams.scala  |   4 +-
 .../LogisticRegressionSuite.scala             | 403 ++++++++++++++----
 project/MimaExcludes.scala                    |   2 +
 5 files changed, 384 insertions(+), 117 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 3967151f76c7c..8fc9199fb4602 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -19,7 +19,7 @@ package org.apache.spark.ml.classification
 
 import scala.collection.mutable
 
-import breeze.linalg.{DenseVector => BDV, norm => brzNorm}
+import breeze.linalg.{DenseVector => BDV}
 import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
 
 import org.apache.spark.{Logging, SparkException}
@@ -41,7 +41,7 @@ import org.apache.spark.storage.StorageLevel
  */
 private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams
   with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol
-  with HasThreshold
+  with HasThreshold with HasStandardization
 
 /**
  * :: Experimental ::
@@ -98,6 +98,18 @@ class LogisticRegression(override val uid: String)
   def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
   setDefault(fitIntercept -> true)
 
+  /**
+   * Whether to standardize the training features before fitting the model.
+   * The coefficients of models will be always returned on the original scale,
+   * so it will be transparent for users. Note that when no regularization,
+   * with or without standardization, the models should be always converged to
+   * the same solution.
+   * Default is true.
+   * @group setParam
+   * */
+  def setStandardization(value: Boolean): this.type = set(standardization, value)
+  setDefault(standardization -> true)
+
   /** @group setParam */
   def setThreshold(value: Double): this.type = set(threshold, value)
   setDefault(threshold -> 0.5)
@@ -149,15 +161,28 @@ class LogisticRegression(override val uid: String)
     val regParamL1 = $(elasticNetParam) * $(regParam)
     val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam)
 
-    val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept),
+    val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), $(standardization),
       featuresStd, featuresMean, regParamL2)
 
     val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) {
       new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol))
     } else {
-      // Remove the L1 penalization on the intercept
       def regParamL1Fun = (index: Int) => {
-        if (index == numFeatures) 0.0 else regParamL1
+        // Remove the L1 penalization on the intercept
+        if (index == numFeatures) {
+          0.0
+        } else {
+          if ($(standardization)) {
+            regParamL1
+          } else {
+            // If `standardization` is false, we still standardize the data
+            // to improve the rate of convergence; as a result, we have to
+            // perform this reverse standardization by penalizing each component
+            // differently to get effectively the same objective function when
+            // the training dataset is not standardized.
+            if (featuresStd(index) != 0.0) regParamL1 / featuresStd(index) else 0.0
+          }
+        }
       }
       new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol))
     }
@@ -523,11 +548,13 @@ private class LogisticCostFun(
     data: RDD[(Double, Vector)],
     numClasses: Int,
     fitIntercept: Boolean,
+    standardization: Boolean,
     featuresStd: Array[Double],
     featuresMean: Array[Double],
     regParamL2: Double) extends DiffFunction[BDV[Double]] {
 
   override def calculate(weights: BDV[Double]): (Double, BDV[Double]) = {
+    val numFeatures = featuresStd.length
     val w = Vectors.fromBreeze(weights)
 
     val logisticAggregator = data.treeAggregate(new LogisticAggregator(w, numClasses, fitIntercept,
@@ -539,27 +566,43 @@ private class LogisticCostFun(
           case (aggregator1, aggregator2) => aggregator1.merge(aggregator2)
         })
 
-    // regVal is the sum of weight squares for L2 regularization
-    val norm = if (regParamL2 == 0.0) {
-      0.0
-    } else if (fitIntercept) {
-      brzNorm(Vectors.dense(weights.toArray.slice(0, weights.size -1)).toBreeze, 2.0)
-    } else {
-      brzNorm(weights, 2.0)
-    }
-    val regVal = 0.5 * regParamL2 * norm * norm
+    val totalGradientArray = logisticAggregator.gradient.toArray
 
-    val loss = logisticAggregator.loss + regVal
-    val gradient = logisticAggregator.gradient
-
-    if (fitIntercept) {
-      val wArray = w.toArray.clone()
-      wArray(wArray.length - 1) = 0.0
-      axpy(regParamL2, Vectors.dense(wArray), gradient)
+    // regVal is the sum of weight squares excluding intercept for L2 regularization.
+    val regVal = if (regParamL2 == 0.0) {
+      0.0
     } else {
-      axpy(regParamL2, w, gradient)
+      var sum = 0.0
+      w.foreachActive { (index, value) =>
+        // If `fitIntercept` is true, the last term which is intercept doesn't
+        // contribute to the regularization.
+        if (index != numFeatures) {
+          // The following code will compute the loss of the regularization; also
+          // the gradient of the regularization, and add back to totalGradientArray.
+          sum += {
+            if (standardization) {
+              totalGradientArray(index) += regParamL2 * value
+              value * value
+            } else {
+              if (featuresStd(index) != 0.0) {
+                // If `standardization` is false, we still standardize the data
+                // to improve the rate of convergence; as a result, we have to
+                // perform this reverse standardization by penalizing each component
+                // differently to get effectively the same objective function when
+                // the training dataset is not standardized.
+                val temp = value / (featuresStd(index) * featuresStd(index))
+                totalGradientArray(index) += regParamL2 * temp
+                value * temp
+              } else {
+                0.0
+              }
+            }
+          }
+        }
+      }
+      0.5 * regParamL2 * sum
     }
 
-    (loss, gradient.toBreeze.asInstanceOf[BDV[Double]])
+    (logisticAggregator.loss + regVal, new BDV(totalGradientArray))
   }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
index b0a6af171c01f..66b751a1b02ee 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -54,8 +54,7 @@ private[shared] object SharedParamsCodeGen {
         isValid = "ParamValidators.gtEq(1)"),
       ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")),
       ParamDesc[Boolean]("standardization", "whether to standardize the training features" +
-        " prior to fitting the model sequence. Note that the coefficients of models are" +
-        " always returned on the original scale.", Some("true")),
+        " before fitting the model.", Some("true")),
       ParamDesc[Long]("seed", "random seed", Some("this.getClass.getName.hashCode.toLong")),
       ParamDesc[Double]("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]." +
         " For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.",
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index bbe08939b6d75..f81bd76c22376 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -239,10 +239,10 @@ private[ml] trait HasFitIntercept extends Params {
 private[ml] trait HasStandardization extends Params {
 
   /**
-   * Param for whether to standardize the training features prior to fitting the model sequence. Note that the coefficients of models are always returned on the original scale..
+   * Param for whether to standardize the training features before fitting the model..
    * @group param
    */
-  final val standardization: BooleanParam = new BooleanParam(this, "standardization", "whether to standardize the training features prior to fitting the model sequence. Note that the coefficients of models are always returned on the original scale.")
+  final val standardization: BooleanParam = new BooleanParam(this, "standardization", "whether to standardize the training features before fitting the model.")
 
   setDefault(standardization, true)
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index ba8fbee84197c..27253c1db2fff 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -77,6 +77,7 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(lr.getRawPredictionCol === "rawPrediction")
     assert(lr.getProbabilityCol === "probability")
     assert(lr.getFitIntercept)
+    assert(lr.getStandardization)
     val model = lr.fit(dataset)
     model.transform(dataset)
       .select("label", "probability", "prediction", "rawPrediction")
@@ -208,8 +209,11 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
   }
 
   test("binary logistic regression with intercept without regularization") {
-    val trainer = (new LogisticRegression).setFitIntercept(true)
-    val model = trainer.fit(binaryDataset)
+    val trainer1 = (new LogisticRegression).setFitIntercept(true).setStandardization(true)
+    val trainer2 = (new LogisticRegression).setFitIntercept(true).setStandardization(false)
+
+    val model1 = trainer1.fit(binaryDataset)
+    val model2 = trainer2.fit(binaryDataset)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -232,16 +236,26 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     val interceptR = 2.8366423
     val weightsR = Array(-0.5895848, 0.8931147, -0.3925051, -0.7996864)
 
-    assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
-    assert(model.weights(2) ~== weightsR(2) relTol 1E-3)
-    assert(model.weights(3) ~== weightsR(3) relTol 1E-3)
+    assert(model1.intercept ~== interceptR relTol 1E-3)
+    assert(model1.weights(0) ~== weightsR(0) relTol 1E-3)
+    assert(model1.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model1.weights(2) ~== weightsR(2) relTol 1E-3)
+    assert(model1.weights(3) ~== weightsR(3) relTol 1E-3)
+
+    // Without regularization, with or without standardization will converge to the same solution.
+    assert(model2.intercept ~== interceptR relTol 1E-3)
+    assert(model2.weights(0) ~== weightsR(0) relTol 1E-3)
+    assert(model2.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model2.weights(2) ~== weightsR(2) relTol 1E-3)
+    assert(model2.weights(3) ~== weightsR(3) relTol 1E-3)
   }
 
   test("binary logistic regression without intercept without regularization") {
-    val trainer = (new LogisticRegression).setFitIntercept(false)
-    val model = trainer.fit(binaryDataset)
+    val trainer1 = (new LogisticRegression).setFitIntercept(false).setStandardization(true)
+    val trainer2 = (new LogisticRegression).setFitIntercept(false).setStandardization(false)
+
+    val model1 = trainer1.fit(binaryDataset)
+    val model2 = trainer2.fit(binaryDataset)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -265,17 +279,28 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     val interceptR = 0.0
     val weightsR = Array(-0.3534996, 1.2964482, -0.3571741, -0.7407946)
 
-    assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-2)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-2)
-    assert(model.weights(2) ~== weightsR(2) relTol 1E-3)
-    assert(model.weights(3) ~== weightsR(3) relTol 1E-3)
+    assert(model1.intercept ~== interceptR relTol 1E-3)
+    assert(model1.weights(0) ~== weightsR(0) relTol 1E-2)
+    assert(model1.weights(1) ~== weightsR(1) relTol 1E-2)
+    assert(model1.weights(2) ~== weightsR(2) relTol 1E-3)
+    assert(model1.weights(3) ~== weightsR(3) relTol 1E-3)
+
+    // Without regularization, with or without standardization should converge to the same solution.
+    assert(model2.intercept ~== interceptR relTol 1E-3)
+    assert(model2.weights(0) ~== weightsR(0) relTol 1E-2)
+    assert(model2.weights(1) ~== weightsR(1) relTol 1E-2)
+    assert(model2.weights(2) ~== weightsR(2) relTol 1E-3)
+    assert(model2.weights(3) ~== weightsR(3) relTol 1E-3)
   }
 
   test("binary logistic regression with intercept with L1 regularization") {
-    val trainer = (new LogisticRegression).setFitIntercept(true)
-      .setElasticNetParam(1.0).setRegParam(0.12)
-    val model = trainer.fit(binaryDataset)
+    val trainer1 = (new LogisticRegression).setFitIntercept(true)
+      .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true)
+    val trainer2 = (new LogisticRegression).setFitIntercept(true)
+      .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false)
+
+    val model1 = trainer1.fit(binaryDataset)
+    val model2 = trainer2.fit(binaryDataset)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -295,20 +320,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V4     -0.04325749
        data.V5     -0.02481551
      */
-    val interceptR = -0.05627428
-    val weightsR = Array(0.0, 0.0, -0.04325749, -0.02481551)
-
-    assert(model.intercept ~== interceptR relTol 1E-2)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
-    assert(model.weights(2) ~== weightsR(2) relTol 1E-2)
-    assert(model.weights(3) ~== weightsR(3) relTol 2E-2)
+    val interceptR1 = -0.05627428
+    val weightsR1 = Array(0.0, 0.0, -0.04325749, -0.02481551)
+
+    assert(model1.intercept ~== interceptR1 relTol 1E-2)
+    assert(model1.weights(0) ~== weightsR1(0) absTol 1E-3)
+    assert(model1.weights(1) ~== weightsR1(1) absTol 1E-3)
+    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-2)
+    assert(model1.weights(3) ~== weightsR1(3) relTol 2E-2)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+           standardize=FALSE))
+       weights
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                           s0
+       (Intercept)  0.3722152
+       data.V2       .
+       data.V3       .
+       data.V4     -0.1665453
+       data.V5       .
+     */
+    val interceptR2 = 0.3722152
+    val weightsR2 = Array(0.0, 0.0, -0.1665453, 0.0)
+
+    assert(model2.intercept ~== interceptR2 relTol 1E-2)
+    assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3)
+    assert(model2.weights(1) ~== weightsR2(1) absTol 1E-3)
+    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-2)
+    assert(model2.weights(3) ~== weightsR2(3) absTol 1E-3)
   }
 
   test("binary logistic regression without intercept with L1 regularization") {
-    val trainer = (new LogisticRegression).setFitIntercept(false)
-      .setElasticNetParam(1.0).setRegParam(0.12)
-    val model = trainer.fit(binaryDataset)
+    val trainer1 = (new LogisticRegression).setFitIntercept(false)
+      .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true)
+    val trainer2 = (new LogisticRegression).setFitIntercept(false)
+      .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false)
+
+    val model1 = trainer1.fit(binaryDataset)
+    val model2 = trainer2.fit(binaryDataset)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -329,20 +386,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V4     -0.05189203
        data.V5     -0.03891782
      */
-    val interceptR = 0.0
-    val weightsR = Array(0.0, 0.0, -0.05189203, -0.03891782)
+    val interceptR1 = 0.0
+    val weightsR1 = Array(0.0, 0.0, -0.05189203, -0.03891782)
+
+    assert(model1.intercept ~== interceptR1 relTol 1E-3)
+    assert(model1.weights(0) ~== weightsR1(0) absTol 1E-3)
+    assert(model1.weights(1) ~== weightsR1(1) absTol 1E-3)
+    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-2)
+    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-2)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+           intercept=FALSE, standardize=FALSE))
+       weights
 
-    assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
-    assert(model.weights(2) ~== weightsR(2) relTol 1E-2)
-    assert(model.weights(3) ~== weightsR(3) relTol 1E-2)
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept)   .
+       data.V2       .
+       data.V3       .
+       data.V4     -0.08420782
+       data.V5       .
+     */
+    val interceptR2 = 0.0
+    val weightsR2 = Array(0.0, 0.0, -0.08420782, 0.0)
+
+    assert(model2.intercept ~== interceptR2 relTol 1E-3)
+    assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3)
+    assert(model2.weights(1) ~== weightsR2(1) absTol 1E-3)
+    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-2)
+    assert(model2.weights(3) ~== weightsR2(3) absTol 1E-3)
   }
 
   test("binary logistic regression with intercept with L2 regularization") {
-    val trainer = (new LogisticRegression).setFitIntercept(true)
-      .setElasticNetParam(0.0).setRegParam(1.37)
-    val model = trainer.fit(binaryDataset)
+    val trainer1 = (new LogisticRegression).setFitIntercept(true)
+      .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true)
+    val trainer2 = (new LogisticRegression).setFitIntercept(true)
+      .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false)
+
+    val model1 = trainer1.fit(binaryDataset)
+    val model2 = trainer2.fit(binaryDataset)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -362,20 +451,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V4     -0.04865309
        data.V5     -0.10062872
      */
-    val interceptR = 0.15021751
-    val weightsR = Array(-0.07251837, 0.10724191, -0.04865309, -0.10062872)
-
-    assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
-    assert(model.weights(2) ~== weightsR(2) relTol 1E-3)
-    assert(model.weights(3) ~== weightsR(3) relTol 1E-3)
+    val interceptR1 = 0.15021751
+    val weightsR1 = Array(-0.07251837, 0.10724191, -0.04865309, -0.10062872)
+
+    assert(model1.intercept ~== interceptR1 relTol 1E-3)
+    assert(model1.weights(0) ~== weightsR1(0) relTol 1E-3)
+    assert(model1.weights(1) ~== weightsR1(1) relTol 1E-3)
+    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-3)
+    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-3)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+           standardize=FALSE))
+       weights
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept)  0.48657516
+       data.V2     -0.05155371
+       data.V3      0.02301057
+       data.V4     -0.11482896
+       data.V5     -0.06266838
+     */
+    val interceptR2 = 0.48657516
+    val weightsR2 = Array(-0.05155371, 0.02301057, -0.11482896, -0.06266838)
+
+    assert(model2.intercept ~== interceptR2 relTol 1E-3)
+    assert(model2.weights(0) ~== weightsR2(0) relTol 1E-3)
+    assert(model2.weights(1) ~== weightsR2(1) relTol 1E-3)
+    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-3)
+    assert(model2.weights(3) ~== weightsR2(3) relTol 1E-3)
   }
 
   test("binary logistic regression without intercept with L2 regularization") {
-    val trainer = (new LogisticRegression).setFitIntercept(false)
-      .setElasticNetParam(0.0).setRegParam(1.37)
-    val model = trainer.fit(binaryDataset)
+    val trainer1 = (new LogisticRegression).setFitIntercept(false)
+      .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true)
+    val trainer2 = (new LogisticRegression).setFitIntercept(false)
+      .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false)
+
+    val model1 = trainer1.fit(binaryDataset)
+    val model2 = trainer2.fit(binaryDataset)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -396,20 +517,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V4     -0.04708770
        data.V5     -0.09799775
      */
-    val interceptR = 0.0
-    val weightsR = Array(-0.06099165, 0.12857058, -0.04708770, -0.09799775)
+    val interceptR1 = 0.0
+    val weightsR1 = Array(-0.06099165, 0.12857058, -0.04708770, -0.09799775)
+
+    assert(model1.intercept ~== interceptR1 relTol 1E-3)
+    assert(model1.weights(0) ~== weightsR1(0) relTol 1E-2)
+    assert(model1.weights(1) ~== weightsR1(1) relTol 1E-2)
+    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-3)
+    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-3)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+           intercept=FALSE, standardize=FALSE))
+       weights
 
-    assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-2)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-2)
-    assert(model.weights(2) ~== weightsR(2) relTol 1E-3)
-    assert(model.weights(3) ~== weightsR(3) relTol 1E-3)
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                             s0
+       (Intercept)   .
+       data.V2     -0.005679651
+       data.V3      0.048967094
+       data.V4     -0.093714016
+       data.V5     -0.053314311
+     */
+    val interceptR2 = 0.0
+    val weightsR2 = Array(-0.005679651, 0.048967094, -0.093714016, -0.053314311)
+
+    assert(model2.intercept ~== interceptR2 relTol 1E-3)
+    assert(model2.weights(0) ~== weightsR2(0) relTol 1E-2)
+    assert(model2.weights(1) ~== weightsR2(1) relTol 1E-2)
+    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-3)
+    assert(model2.weights(3) ~== weightsR2(3) relTol 1E-3)
   }
 
   test("binary logistic regression with intercept with ElasticNet regularization") {
-    val trainer = (new LogisticRegression).setFitIntercept(true)
-      .setElasticNetParam(0.38).setRegParam(0.21)
-    val model = trainer.fit(binaryDataset)
+    val trainer1 = (new LogisticRegression).setFitIntercept(true)
+      .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true)
+    val trainer2 = (new LogisticRegression).setFitIntercept(true)
+      .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false)
+
+    val model1 = trainer1.fit(binaryDataset)
+    val model2 = trainer2.fit(binaryDataset)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -429,20 +582,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V4     -0.08849250
        data.V5     -0.15458796
      */
-    val interceptR = 0.57734851
-    val weightsR = Array(-0.05310287, 0.0, -0.08849250, -0.15458796)
-
-    assert(model.intercept ~== interceptR relTol 6E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 5E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
-    assert(model.weights(2) ~== weightsR(2) relTol 5E-3)
-    assert(model.weights(3) ~== weightsR(3) relTol 1E-3)
+    val interceptR1 = 0.57734851
+    val weightsR1 = Array(-0.05310287, 0.0, -0.08849250, -0.15458796)
+
+    assert(model1.intercept ~== interceptR1 relTol 6E-3)
+    assert(model1.weights(0) ~== weightsR1(0) relTol 5E-3)
+    assert(model1.weights(1) ~== weightsR1(1) absTol 1E-3)
+    assert(model1.weights(2) ~== weightsR1(2) relTol 5E-3)
+    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-3)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
+           standardize=FALSE))
+       weights
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept)  0.51555993
+       data.V2       .
+       data.V3       .
+       data.V4     -0.18807395
+       data.V5     -0.05350074
+     */
+    val interceptR2 = 0.51555993
+    val weightsR2 = Array(0.0, 0.0, -0.18807395, -0.05350074)
+
+    assert(model2.intercept ~== interceptR2 relTol 6E-3)
+    assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3)
+    assert(model2.weights(1) ~== weightsR2(1) absTol 1E-3)
+    assert(model2.weights(2) ~== weightsR2(2) relTol 5E-3)
+    assert(model2.weights(3) ~== weightsR2(3) relTol 1E-2)
   }
 
   test("binary logistic regression without intercept with ElasticNet regularization") {
-    val trainer = (new LogisticRegression).setFitIntercept(false)
-      .setElasticNetParam(0.38).setRegParam(0.21)
-    val model = trainer.fit(binaryDataset)
+    val trainer1 = (new LogisticRegression).setFitIntercept(false)
+      .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true)
+    val trainer2 = (new LogisticRegression).setFitIntercept(false)
+      .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false)
+
+    val model1 = trainer1.fit(binaryDataset)
+    val model2 = trainer2.fit(binaryDataset)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -463,20 +648,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V4     -0.081203769
        data.V5     -0.142534158
      */
-    val interceptR = 0.0
-    val weightsR = Array(-0.001005743, 0.072577857, -0.081203769, -0.142534158)
+    val interceptR1 = 0.0
+    val weightsR1 = Array(-0.001005743, 0.072577857, -0.081203769, -0.142534158)
+
+    assert(model1.intercept ~== interceptR1 relTol 1E-3)
+    assert(model1.weights(0) ~== weightsR1(0) absTol 1E-2)
+    assert(model1.weights(1) ~== weightsR1(1) absTol 1E-2)
+    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-3)
+    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-2)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21,
+           intercept=FALSE, standardize=FALSE))
+       weights
 
-    assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) absTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) absTol 1E-2)
-    assert(model.weights(2) ~== weightsR(2) relTol 1E-3)
-    assert(model.weights(3) ~== weightsR(3) relTol 1E-2)
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept)   .
+       data.V2       .
+       data.V3      0.03345223
+       data.V4     -0.11304532
+       data.V5       .
+     */
+    val interceptR2 = 0.0
+    val weightsR2 = Array(0.0, 0.03345223, -0.11304532, 0.0)
+
+    assert(model2.intercept ~== interceptR2 relTol 1E-3)
+    assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3)
+    assert(model2.weights(1) ~== weightsR2(1) relTol 1E-2)
+    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-2)
+    assert(model2.weights(3) ~== weightsR2(3) absTol 1E-3)
   }
 
   test("binary logistic regression with intercept with strong L1 regularization") {
-    val trainer = (new LogisticRegression).setFitIntercept(true)
-      .setElasticNetParam(1.0).setRegParam(6.0)
-    val model = trainer.fit(binaryDataset)
+    val trainer1 = (new LogisticRegression).setFitIntercept(true)
+      .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(true)
+    val trainer2 = (new LogisticRegression).setFitIntercept(true)
+      .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(false)
+
+    val model1 = trainer1.fit(binaryDataset)
+    val model2 = trainer2.fit(binaryDataset)
 
     val histogram = binaryDataset.map { case Row(label: Double, features: Vector) => label }
       .treeAggregate(new MultiClassSummarizer)(
@@ -502,11 +719,17 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     val interceptTheory = math.log(histogram(1).toDouble / histogram(0).toDouble)
     val weightsTheory = Array(0.0, 0.0, 0.0, 0.0)
 
-    assert(model.intercept ~== interceptTheory relTol 1E-5)
-    assert(model.weights(0) ~== weightsTheory(0) absTol 1E-6)
-    assert(model.weights(1) ~== weightsTheory(1) absTol 1E-6)
-    assert(model.weights(2) ~== weightsTheory(2) absTol 1E-6)
-    assert(model.weights(3) ~== weightsTheory(3) absTol 1E-6)
+    assert(model1.intercept ~== interceptTheory relTol 1E-5)
+    assert(model1.weights(0) ~== weightsTheory(0) absTol 1E-6)
+    assert(model1.weights(1) ~== weightsTheory(1) absTol 1E-6)
+    assert(model1.weights(2) ~== weightsTheory(2) absTol 1E-6)
+    assert(model1.weights(3) ~== weightsTheory(3) absTol 1E-6)
+
+    assert(model2.intercept ~== interceptTheory relTol 1E-5)
+    assert(model2.weights(0) ~== weightsTheory(0) absTol 1E-6)
+    assert(model2.weights(1) ~== weightsTheory(1) absTol 1E-6)
+    assert(model2.weights(2) ~== weightsTheory(2) absTol 1E-6)
+    assert(model2.weights(3) ~== weightsTheory(3) absTol 1E-6)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -529,10 +752,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     val interceptR = -0.248065
     val weightsR = Array(0.0, 0.0, 0.0, 0.0)
 
-    assert(model.intercept ~== interceptR relTol 1E-5)
-    assert(model.weights(0) ~== weightsR(0) absTol 1E-6)
-    assert(model.weights(1) ~== weightsR(1) absTol 1E-6)
-    assert(model.weights(2) ~== weightsR(2) absTol 1E-6)
-    assert(model.weights(3) ~== weightsR(3) absTol 1E-6)
+    assert(model1.intercept ~== interceptR relTol 1E-5)
+    assert(model1.weights(0) ~== weightsR(0) absTol 1E-6)
+    assert(model1.weights(1) ~== weightsR(1) absTol 1E-6)
+    assert(model1.weights(2) ~== weightsR(2) absTol 1E-6)
+    assert(model1.weights(3) ~== weightsR(3) absTol 1E-6)
   }
 }
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 680b699e9e4a1..41e19fd9cc11e 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -58,6 +58,8 @@ object MimaExcludes {
               "org.apache.spark.ml.regression.LeastSquaresAggregator.this"),
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.ml.regression.LeastSquaresCostFun.this"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.ml.classification.LogisticCostFun.this"),
             // SQL execution is considered private.
             excludePackage("org.apache.spark.sql.execution"),
             // NanoTime and CatalystTimestampConverter is only used inside catalyst,

From 5687f76552369fa20b3a4385eab4810214653aa7 Mon Sep 17 00:00:00 2001
From: Daniel Darabos <darabos.daniel@gmail.com>
Date: Thu, 9 Jul 2015 07:34:02 +0900
Subject: [PATCH 063/549] [SPARK-8902] Correctly print hostname in error

With "+" the strings are separate expressions, and format() is called on the last string before concatenation. (So substitution does not happen.) Without "+" the string literals are merged first by the parser, so format() is called on the complete string.

Should I make a JIRA for this?

Author: Daniel Darabos <darabos.daniel@gmail.com>

Closes #7288 from darabos/patch-2 and squashes the following commits:

be0d3b7 [Daniel Darabos] Correctly print hostname in error
---
 ec2/spark_ec2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 8582d434a8c55..dd0c12d25980b 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -1153,8 +1153,8 @@ def ssh(host, opts, command):
                 # If this was an ssh failure, provide the user with hints.
                 if e.returncode == 255:
                     raise UsageError(
-                        "Failed to SSH to remote host {0}.\n" +
-                        "Please check that you have provided the correct --identity-file and " +
+                        "Failed to SSH to remote host {0}.\n"
+                        "Please check that you have provided the correct --identity-file and "
                         "--key-pair parameters and try again.".format(host))
                 else:
                     raise e

From 4ffc27caaf46ffac56c3c0b3e928f1aff227a184 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Wed, 8 Jul 2015 15:51:01 -0700
Subject: [PATCH 064/549] [SPARK-6123] [SPARK-6775] [SPARK-6776] [SQL]
 Refactors Parquet read path for interoperability and backwards-compatibility

This PR is a follow-up of #6617 and is part of [SPARK-6774] [2], which aims to ensure interoperability and backwards-compatibility for Spark SQL Parquet support.  And this one fixes the read path.  Now Spark SQL is expected to be able to read legacy Parquet data files generated by most (if not all) common libraries/tools like parquet-thrift, parquet-avro, and parquet-hive. However, we still need to refactor the write path to write standard Parquet LISTs and MAPs ([SPARK-8848] [4]).

### Major changes

1. `CatalystConverter` class hierarchy refactoring

   - Replaces `CatalystConverter` trait with a much simpler `ParentContainerUpdater`.

     Now instead of extending the original `CatalystConverter` trait, every converter class accepts an updater which is responsible for propagating the converted value to some parent container. For example, appending array elements to a parent array buffer, appending a key-value pairs to a parent mutable map, or setting a converted value to some specific field of a parent row. Root converter doesn't have a parent and thus uses a `NoopUpdater`.

     This simplifies the design since converters don't need to care about details of their parent converters anymore.

   - Unifies `CatalystRootConverter`, `CatalystGroupConverter` and `CatalystPrimitiveRowConverter` into `CatalystRowConverter`

     Specifically, now all row objects are represented by `SpecificMutableRow` during conversion.

   - Refactors `CatalystArrayConverter`, and removes `CatalystArrayContainsNullConverter` and `CatalystNativeArrayConverter`

     `CatalystNativeArrayConverter` was probably designed with the intention of avoiding boxing costs. However, the way it uses Scala generics actually doesn't achieve this goal.

     The new `CatalystArrayConverter` handles both nullable and non-nullable array elements in a consistent way.

   - Implements backwards-compatibility rules in `CatalystArrayConverter`

     When Parquet records are being converted, schema of Parquet files should have already been verified. So we only need to care about the structure rather than field names in the Parquet schema. Since all map objects represented in legacy systems have the same structure as the standard one (see [backwards-compatibility rules for MAP] [1]), we only need to deal with LIST (namely array) in `CatalystArrayConverter`.

2. Requested columns handling

   When specifying requested columns in `RowReadSupport`, we used to use a Parquet `MessageType` converted from a Catalyst `StructType` which contains all requested columns.  This is not preferable when taking compatibility and interoperability into consideration.  Because the actual Parquet file may have different physical structure from the converted schema.

   In this PR, the schema for requested columns is constructed using the following method:

   - For a column that exists in the target Parquet file, we extract the column type by name from the full file schema, and construct a single-field `MessageType` for that column.
   - For a column that doesn't exist in the target Parquet file, we create a single-field `StructType` and convert it to a `MessageType` using `CatalystSchemaConverter`.
   - Unions all single-field `MessageType`s into a full schema containing all requested fields

   With this change, we also fix [SPARK-6123] [3] by validating the global schema against each individual Parquet part-files.

### Testing

This PR also adds compatibility tests for parquet-avro, parquet-thrift, and parquet-hive. Please refer to `README.md` under `sql/core/src/test` for more information about these tests. To avoid build time code generation and adding extra complexity to the build system, Java code generated from testing Thrift schema and Avro IDL is also checked in.

[1]: https://github.com/apache/incubator-parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules-1
[2]: https://issues.apache.org/jira/browse/SPARK-6774
[3]: https://issues.apache.org/jira/browse/SPARK-6123
[4]: https://issues.apache.org/jira/browse/SPARK-8848

Author: Cheng Lian <lian@databricks.com>

Closes #7231 from liancheng/spark-6776 and squashes the following commits:

360fe18 [Cheng Lian] Adds ParquetHiveCompatibilitySuite
c6fbc06 [Cheng Lian] Removes WIP file committed by mistake
b8c1295 [Cheng Lian] Excludes the whole parquet package from MiMa
598c3e8 [Cheng Lian] Adds extra Maven repo for hadoop-lzo, which is a transitive dependency of parquet-thrift
926af87 [Cheng Lian] Simplifies Parquet compatibility test suites
7946ee1 [Cheng Lian] Fixes Scala styling issues
3d7ab36 [Cheng Lian] Fixes .rat-excludes
a8f13bb [Cheng Lian] Using Parquet writer API to do compatibility tests
f2208cd [Cheng Lian] Adds README.md for Thrift/Avro code generation
1d390aa [Cheng Lian] Adds parquet-thrift compatibility test
440f7b3 [Cheng Lian] Adds generated files to .rat-excludes
13b9121 [Cheng Lian] Adds ParquetAvroCompatibilitySuite
06cfe9d [Cheng Lian] Adds comments about TimestampType handling
a099d3e [Cheng Lian] More comments
0cc1b37 [Cheng Lian] Fixes MiMa checks
884d3e6 [Cheng Lian] Fixes styling issue and reverts unnecessary changes
802cbd7 [Cheng Lian] Fixes bugs related to schema merging and empty requested columns
38fe1e7 [Cheng Lian] Adds explicit return type
7fb21f1 [Cheng Lian] Reverts an unnecessary debugging change
1781dff [Cheng Lian] Adds test case for SPARK-8811
6437d4b [Cheng Lian] Assembles requested schema from Parquet file schema
bcac49f [Cheng Lian] Removes the 16-byte restriction of decimals
a74fb2c [Cheng Lian] More comments
0525346 [Cheng Lian] Removes old Parquet record converters
03c3bd9 [Cheng Lian] Refactors Parquet read path to implement backwards-compatibility rules
---
 .rat-excludes                                 |    2 +
 pom.xml                                       |   33 +
 project/MimaExcludes.scala                    |   17 +-
 .../org/apache/spark/sql/types/DataType.scala |    6 +-
 .../apache/spark/sql/types/StructType.scala   |    5 +
 sql/core/pom.xml                              |   36 +
 .../sql/parquet/CatalystRowConverter.scala    |  434 +++
 .../sql/parquet/CatalystSchemaConverter.scala |   21 +-
 .../spark/sql/parquet/ParquetConverter.scala  |  831 +----
 .../sql/parquet/ParquetTableSupport.scala     |  179 +-
 .../apache/spark/sql/parquet/newParquet.scala |   22 +-
 sql/core/src/test/README.md                   |   33 +
 sql/core/src/test/avro/parquet-compat.avdl    |   47 +
 sql/core/src/test/avro/parquet-compat.avpr    |   86 +
 .../parquet/test/avro/CompatibilityTest.java  |   17 +
 .../spark/sql/parquet/test/avro/Nested.java   |  196 ++
 .../parquet/test/avro/ParquetAvroCompat.java  | 1001 ++++++
 .../spark/sql/parquet/test/thrift/Nested.java |  541 ++++
 .../test/thrift/ParquetThriftCompat.java      | 2808 +++++++++++++++++
 .../spark/sql/parquet/test/thrift/Suit.java   |   51 +
 .../ParquetAvroCompatibilitySuite.scala       |  125 +
 .../parquet/ParquetCompatibilityTest.scala    |   56 +
 .../ParquetThriftCompatibilitySuite.scala     |  140 +
 sql/core/src/test/scripts/gen-code.sh         |   31 +
 .../src/test/thrift/parquet-compat.thrift     |   60 +
 .../hive/ParquetHiveCompatibilitySuite.scala  |   92 +
 .../apache/spark/sql/hive/parquetSuites.scala |   33 +-
 27 files changed, 5984 insertions(+), 919 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
 create mode 100644 sql/core/src/test/README.md
 create mode 100644 sql/core/src/test/avro/parquet-compat.avdl
 create mode 100644 sql/core/src/test/avro/parquet-compat.avpr
 create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java
 create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java
 create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java
 create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java
 create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java
 create mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala
 create mode 100755 sql/core/src/test/scripts/gen-code.sh
 create mode 100644 sql/core/src/test/thrift/parquet-compat.thrift
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala

diff --git a/.rat-excludes b/.rat-excludes
index 0240e81c45ea2..236c2db05367c 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -91,3 +91,5 @@ help/*
 html/*
 INDEX
 .lintr
+gen-java.*
+.*avpr
diff --git a/pom.xml b/pom.xml
index bece526a2a3e5..9cf2471b51304 100644
--- a/pom.xml
+++ b/pom.xml
@@ -161,6 +161,7 @@
     <fasterxml.jackson.version>2.4.4</fasterxml.jackson.version>
     <snappy.version>1.1.1.7</snappy.version>
     <netlib.java.version>1.1.2</netlib.java.version>
+    <thrift.version>0.9.2</thrift.version>
     <!-- For maven shade plugin (see SPARK-8819) -->
     <create.dependency.reduced.pom>false</create.dependency.reduced.pom>
 
@@ -179,6 +180,8 @@
     <hbase.deps.scope>compile</hbase.deps.scope>
     <hive.deps.scope>compile</hive.deps.scope>
     <parquet.deps.scope>compile</parquet.deps.scope>
+    <parquet.test.deps.scope>test</parquet.test.deps.scope>
+    <thrift.test.deps.scope>test</thrift.test.deps.scope>
 
     <!--
       Overridable test home. So that you can call individual pom files directly without
@@ -270,6 +273,18 @@
         <enabled>false</enabled>
       </snapshots>
     </repository>
+    <!-- For transitive dependencies brougt by parquet-thrift -->
+    <repository>
+      <id>twttr-repo</id>
+      <name>Twttr Repository</name>
+      <url>http://maven.twttr.com</url>
+      <releases>
+        <enabled>true</enabled>
+      </releases>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
     <!-- TODO: This can be deleted after Spark 1.4 is posted -->
     <repository>
       <id>spark-1.4-staging</id>
@@ -1101,6 +1116,24 @@
         <version>${parquet.version}</version>
         <scope>${parquet.deps.scope}</scope>
       </dependency>
+      <dependency>
+        <groupId>org.apache.parquet</groupId>
+        <artifactId>parquet-avro</artifactId>
+        <version>${parquet.version}</version>
+        <scope>${parquet.test.deps.scope}</scope>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.parquet</groupId>
+        <artifactId>parquet-thrift</artifactId>
+        <version>${parquet.version}</version>
+        <scope>${parquet.test.deps.scope}</scope>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.thrift</groupId>
+        <artifactId>libthrift</artifactId>
+        <version>${thrift.version}</version>
+        <scope>${thrift.test.deps.scope}</scope>
+      </dependency>
       <dependency>
         <groupId>org.apache.flume</groupId>
         <artifactId>flume-ng-core</artifactId>
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 41e19fd9cc11e..7346d804632bc 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -62,21 +62,8 @@ object MimaExcludes {
               "org.apache.spark.ml.classification.LogisticCostFun.this"),
             // SQL execution is considered private.
             excludePackage("org.apache.spark.sql.execution"),
-            // NanoTime and CatalystTimestampConverter is only used inside catalyst,
-            // not needed anymore
-            ProblemFilters.exclude[MissingClassProblem](
-              "org.apache.spark.sql.parquet.timestamp.NanoTime"),
-              ProblemFilters.exclude[MissingClassProblem](
-              "org.apache.spark.sql.parquet.timestamp.NanoTime$"),
-            ProblemFilters.exclude[MissingClassProblem](
-              "org.apache.spark.sql.parquet.CatalystTimestampConverter"),
-            ProblemFilters.exclude[MissingClassProblem](
-              "org.apache.spark.sql.parquet.CatalystTimestampConverter$"),
-            // SPARK-6777 Implements backwards compatibility rules in CatalystSchemaConverter
-            ProblemFilters.exclude[MissingClassProblem](
-              "org.apache.spark.sql.parquet.ParquetTypeInfo"),
-            ProblemFilters.exclude[MissingClassProblem](
-              "org.apache.spark.sql.parquet.ParquetTypeInfo$")
+            // Parquet support is considered private.
+            excludePackage("org.apache.spark.sql.parquet")
           ) ++ Seq(
             // SPARK-8479 Add numNonzeros and numActives to Matrix.
             ProblemFilters.exclude[MissingMethodProblem](
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 7d00047d08d74..a4c2da8e05f5d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -17,11 +17,12 @@
 
 package org.apache.spark.sql.types
 
+import scala.util.Try
 import scala.util.parsing.combinator.RegexParsers
 
-import org.json4s._
 import org.json4s.JsonAST.JValue
 import org.json4s.JsonDSL._
+import org.json4s._
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.annotation.DeveloperApi
@@ -82,6 +83,9 @@ abstract class DataType extends AbstractDataType {
 
 
 object DataType {
+  private[sql] def fromString(raw: String): DataType = {
+    Try(DataType.fromJson(raw)).getOrElse(DataType.fromCaseClassString(raw))
+  }
 
   def fromJson(json: String): DataType = parseDataType(parse(json))
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index 3b17566d54d9b..e2d3f53f7d978 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -311,6 +311,11 @@ object StructType extends AbstractDataType {
 
   private[sql] override def simpleString: String = "struct"
 
+  private[sql] def fromString(raw: String): StructType = DataType.fromString(raw) match {
+    case t: StructType => t
+    case _ => throw new RuntimeException(s"Failed parsing StructType: $raw")
+  }
+
   def apply(fields: Seq[StructField]): StructType = StructType(fields.toArray)
 
   def apply(fields: java.util.List[StructField]): StructType = {
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 8fc16928adbd9..f90099f22d4bd 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -101,9 +101,45 @@
       <version>9.3-1102-jdbc41</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-avro</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-thrift</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.thrift</groupId>
+      <artifactId>libthrift</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+    <plugins>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>build-helper-maven-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>add-scala-test-sources</id>
+            <phase>generate-test-sources</phase>
+            <goals>
+              <goal>add-test-source</goal>
+            </goals>
+            <configuration>
+              <sources>
+                <source>src/test/scala</source>
+                <source>src/test/gen-java</source>
+              </sources>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
   </build>
 </project>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
new file mode 100644
index 0000000000000..0c3d8fdab6bd2
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
@@ -0,0 +1,434 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.parquet
+
+import java.nio.ByteOrder
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.parquet.column.Dictionary
+import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter}
+import org.apache.parquet.schema.Type.Repetition
+import org.apache.parquet.schema.{GroupType, PrimitiveType, Type}
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+
+/**
+ * A [[ParentContainerUpdater]] is used by a Parquet converter to set converted values to some
+ * corresponding parent container. For example, a converter for a `StructType` field may set
+ * converted values to a [[MutableRow]]; or a converter for array elements may append converted
+ * values to an [[ArrayBuffer]].
+ */
+private[parquet] trait ParentContainerUpdater {
+  def set(value: Any): Unit = ()
+  def setBoolean(value: Boolean): Unit = set(value)
+  def setByte(value: Byte): Unit = set(value)
+  def setShort(value: Short): Unit = set(value)
+  def setInt(value: Int): Unit = set(value)
+  def setLong(value: Long): Unit = set(value)
+  def setFloat(value: Float): Unit = set(value)
+  def setDouble(value: Double): Unit = set(value)
+}
+
+/** A no-op updater used for root converter (who doesn't have a parent). */
+private[parquet] object NoopUpdater extends ParentContainerUpdater
+
+/**
+ * A [[CatalystRowConverter]] is used to convert Parquet "structs" into Spark SQL [[Row]]s.  Since
+ * any Parquet record is also a struct, this converter can also be used as root converter.
+ *
+ * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have
+ * any "parent" container.
+ *
+ * @param parquetType Parquet schema of Parquet records
+ * @param catalystType Spark SQL schema that corresponds to the Parquet record type
+ * @param updater An updater which propagates converted field values to the parent container
+ */
+private[parquet] class CatalystRowConverter(
+    parquetType: GroupType,
+    catalystType: StructType,
+    updater: ParentContainerUpdater)
+  extends GroupConverter {
+
+  /**
+   * Updater used together with field converters within a [[CatalystRowConverter]].  It propagates
+   * converted filed values to the `ordinal`-th cell in `currentRow`.
+   */
+  private final class RowUpdater(row: MutableRow, ordinal: Int) extends ParentContainerUpdater {
+    override def set(value: Any): Unit = row(ordinal) = value
+    override def setBoolean(value: Boolean): Unit = row.setBoolean(ordinal, value)
+    override def setByte(value: Byte): Unit = row.setByte(ordinal, value)
+    override def setShort(value: Short): Unit = row.setShort(ordinal, value)
+    override def setInt(value: Int): Unit = row.setInt(ordinal, value)
+    override def setLong(value: Long): Unit = row.setLong(ordinal, value)
+    override def setDouble(value: Double): Unit = row.setDouble(ordinal, value)
+    override def setFloat(value: Float): Unit = row.setFloat(ordinal, value)
+  }
+
+  /**
+   * Represents the converted row object once an entire Parquet record is converted.
+   *
+   * @todo Uses [[UnsafeRow]] for better performance.
+   */
+  val currentRow = new SpecificMutableRow(catalystType.map(_.dataType))
+
+  // Converters for each field.
+  private val fieldConverters: Array[Converter] = {
+    parquetType.getFields.zip(catalystType).zipWithIndex.map {
+      case ((parquetFieldType, catalystField), ordinal) =>
+        // Converted field value should be set to the `ordinal`-th cell of `currentRow`
+        newConverter(parquetFieldType, catalystField.dataType, new RowUpdater(currentRow, ordinal))
+    }.toArray
+  }
+
+  override def getConverter(fieldIndex: Int): Converter = fieldConverters(fieldIndex)
+
+  override def end(): Unit = updater.set(currentRow)
+
+  override def start(): Unit = {
+    var i = 0
+    while (i < currentRow.length) {
+      currentRow.setNullAt(i)
+      i += 1
+    }
+  }
+
+  /**
+   * Creates a converter for the given Parquet type `parquetType` and Spark SQL data type
+   * `catalystType`. Converted values are handled by `updater`.
+   */
+  private def newConverter(
+      parquetType: Type,
+      catalystType: DataType,
+      updater: ParentContainerUpdater): Converter = {
+
+    catalystType match {
+      case BooleanType | IntegerType | LongType | FloatType | DoubleType | BinaryType =>
+        new CatalystPrimitiveConverter(updater)
+
+      case ByteType =>
+        new PrimitiveConverter {
+          override def addInt(value: Int): Unit =
+            updater.setByte(value.asInstanceOf[ByteType#InternalType])
+        }
+
+      case ShortType =>
+        new PrimitiveConverter {
+          override def addInt(value: Int): Unit =
+            updater.setShort(value.asInstanceOf[ShortType#InternalType])
+        }
+
+      case t: DecimalType =>
+        new CatalystDecimalConverter(t, updater)
+
+      case StringType =>
+        new CatalystStringConverter(updater)
+
+      case TimestampType =>
+        // TODO Implements `TIMESTAMP_MICROS` once parquet-mr has that.
+        new PrimitiveConverter {
+          // Converts nanosecond timestamps stored as INT96
+          override def addBinary(value: Binary): Unit = {
+            assert(
+              value.length() == 12,
+              "Timestamps (with nanoseconds) are expected to be stored in 12-byte long binaries, " +
+              s"but got a ${value.length()}-byte binary.")
+
+            val buf = value.toByteBuffer.order(ByteOrder.LITTLE_ENDIAN)
+            val timeOfDayNanos = buf.getLong
+            val julianDay = buf.getInt
+            updater.setLong(DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos))
+          }
+        }
+
+      case DateType =>
+        new PrimitiveConverter {
+          override def addInt(value: Int): Unit = {
+            // DateType is not specialized in `SpecificMutableRow`, have to box it here.
+            updater.set(value.asInstanceOf[DateType#InternalType])
+          }
+        }
+
+      case t: ArrayType =>
+        new CatalystArrayConverter(parquetType.asGroupType(), t, updater)
+
+      case t: MapType =>
+        new CatalystMapConverter(parquetType.asGroupType(), t, updater)
+
+      case t: StructType =>
+        new CatalystRowConverter(parquetType.asGroupType(), t, new ParentContainerUpdater {
+          override def set(value: Any): Unit = updater.set(value.asInstanceOf[Row].copy())
+        })
+
+      case t: UserDefinedType[_] =>
+        val catalystTypeForUDT = t.sqlType
+        val nullable = parquetType.isRepetition(Repetition.OPTIONAL)
+        val field = StructField("udt", catalystTypeForUDT, nullable)
+        val parquetTypeForUDT = new CatalystSchemaConverter().convertField(field)
+        newConverter(parquetTypeForUDT, catalystTypeForUDT, updater)
+
+      case _ =>
+        throw new RuntimeException(
+          s"Unable to create Parquet converter for data type ${catalystType.json}")
+    }
+  }
+
+  /**
+   * Parquet converter for Parquet primitive types.  Note that not all Spark SQL atomic types
+   * are handled by this converter.  Parquet primitive types are only a subset of those of Spark
+   * SQL.  For example, BYTE, SHORT, and INT in Spark SQL are all covered by INT32 in Parquet.
+   */
+  private final class CatalystPrimitiveConverter(updater: ParentContainerUpdater)
+    extends PrimitiveConverter {
+
+    override def addBoolean(value: Boolean): Unit = updater.setBoolean(value)
+    override def addInt(value: Int): Unit = updater.setInt(value)
+    override def addLong(value: Long): Unit = updater.setLong(value)
+    override def addFloat(value: Float): Unit = updater.setFloat(value)
+    override def addDouble(value: Double): Unit = updater.setDouble(value)
+    override def addBinary(value: Binary): Unit = updater.set(value.getBytes)
+  }
+
+  /**
+   * Parquet converter for strings. A dictionary is used to minimize string decoding cost.
+   */
+  private final class CatalystStringConverter(updater: ParentContainerUpdater)
+    extends PrimitiveConverter {
+
+    private var expandedDictionary: Array[UTF8String] = null
+
+    override def hasDictionarySupport: Boolean = true
+
+    override def setDictionary(dictionary: Dictionary): Unit = {
+      this.expandedDictionary = Array.tabulate(dictionary.getMaxId + 1) { i =>
+        UTF8String.fromBytes(dictionary.decodeToBinary(i).getBytes)
+      }
+    }
+
+    override def addValueFromDictionary(dictionaryId: Int): Unit = {
+      updater.set(expandedDictionary(dictionaryId))
+    }
+
+    override def addBinary(value: Binary): Unit = {
+      updater.set(UTF8String.fromBytes(value.getBytes))
+    }
+  }
+
+  /**
+   * Parquet converter for fixed-precision decimals.
+   */
+  private final class CatalystDecimalConverter(
+      decimalType: DecimalType,
+      updater: ParentContainerUpdater)
+    extends PrimitiveConverter {
+
+    // Converts decimals stored as INT32
+    override def addInt(value: Int): Unit = {
+      addLong(value: Long)
+    }
+
+    // Converts decimals stored as INT64
+    override def addLong(value: Long): Unit = {
+      updater.set(Decimal(value, decimalType.precision, decimalType.scale))
+    }
+
+    // Converts decimals stored as either FIXED_LENGTH_BYTE_ARRAY or BINARY
+    override def addBinary(value: Binary): Unit = {
+      updater.set(toDecimal(value))
+    }
+
+    private def toDecimal(value: Binary): Decimal = {
+      val precision = decimalType.precision
+      val scale = decimalType.scale
+      val bytes = value.getBytes
+
+      var unscaled = 0L
+      var i = 0
+
+      while (i < bytes.length) {
+        unscaled = (unscaled << 8) | (bytes(i) & 0xff)
+        i += 1
+      }
+
+      val bits = 8 * bytes.length
+      unscaled = (unscaled << (64 - bits)) >> (64 - bits)
+      Decimal(unscaled, precision, scale)
+    }
+  }
+
+  /**
+   * Parquet converter for arrays.  Spark SQL arrays are represented as Parquet lists.  Standard
+   * Parquet lists are represented as a 3-level group annotated by `LIST`:
+   * {{{
+   *   <list-repetition> group <name> (LIST) {            <-- parquetSchema points here
+   *     repeated group list {
+   *       <element-repetition> <element-type> element;
+   *     }
+   *   }
+   * }}}
+   * The `parquetSchema` constructor argument points to the outermost group.
+   *
+   * However, before this representation is standardized, some Parquet libraries/tools also use some
+   * non-standard formats to represent list-like structures.  Backwards-compatibility rules for
+   * handling these cases are described in Parquet format spec.
+   *
+   * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
+   */
+  private final class CatalystArrayConverter(
+      parquetSchema: GroupType,
+      catalystSchema: ArrayType,
+      updater: ParentContainerUpdater)
+    extends GroupConverter {
+
+    private var currentArray: ArrayBuffer[Any] = _
+
+    private val elementConverter: Converter = {
+      val repeatedType = parquetSchema.getType(0)
+      val elementType = catalystSchema.elementType
+
+      if (isElementType(repeatedType, elementType)) {
+        newConverter(repeatedType, elementType, new ParentContainerUpdater {
+          override def set(value: Any): Unit = currentArray += value
+        })
+      } else {
+        new ElementConverter(repeatedType.asGroupType().getType(0), elementType)
+      }
+    }
+
+    override def getConverter(fieldIndex: Int): Converter = elementConverter
+
+    override def end(): Unit = updater.set(currentArray)
+
+    // NOTE: We can't reuse the mutable `ArrayBuffer` here and must instantiate a new buffer for the
+    // next value.  `Row.copy()` only copies row cells, it doesn't do deep copy to objects stored
+    // in row cells.
+    override def start(): Unit = currentArray = ArrayBuffer.empty[Any]
+
+    // scalastyle:off
+    /**
+     * Returns whether the given type is the element type of a list or is a syntactic group with
+     * one field that is the element type.  This is determined by checking whether the type can be
+     * a syntactic group and by checking whether a potential syntactic group matches the expected
+     * schema.
+     * {{{
+     *   <list-repetition> group <name> (LIST) {
+     *     repeated group list {                          <-- repeatedType points here
+     *       <element-repetition> <element-type> element;
+     *     }
+     *   }
+     * }}}
+     * In short, here we handle Parquet list backwards-compatibility rules on the read path.  This
+     * method is based on `AvroIndexedRecordConverter.isElementType`.
+     *
+     * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
+     */
+    // scalastyle:on
+    private def isElementType(parquetRepeatedType: Type, catalystElementType: DataType): Boolean = {
+      (parquetRepeatedType, catalystElementType) match {
+        case (t: PrimitiveType, _) => true
+        case (t: GroupType, _) if t.getFieldCount > 1 => true
+        case (t: GroupType, StructType(Array(f))) if f.name == t.getFieldName(0) => true
+        case _ => false
+      }
+    }
+
+    /** Array element converter */
+    private final class ElementConverter(parquetType: Type, catalystType: DataType)
+      extends GroupConverter {
+
+      private var currentElement: Any = _
+
+      private val converter = newConverter(parquetType, catalystType, new ParentContainerUpdater {
+        override def set(value: Any): Unit = currentElement = value
+      })
+
+      override def getConverter(fieldIndex: Int): Converter = converter
+
+      override def end(): Unit = currentArray += currentElement
+
+      override def start(): Unit = currentElement = null
+    }
+  }
+
+  /** Parquet converter for maps */
+  private final class CatalystMapConverter(
+      parquetType: GroupType,
+      catalystType: MapType,
+      updater: ParentContainerUpdater)
+    extends GroupConverter {
+
+    private var currentMap: mutable.Map[Any, Any] = _
+
+    private val keyValueConverter = {
+      val repeatedType = parquetType.getType(0).asGroupType()
+      new KeyValueConverter(
+        repeatedType.getType(0),
+        repeatedType.getType(1),
+        catalystType.keyType,
+        catalystType.valueType)
+    }
+
+    override def getConverter(fieldIndex: Int): Converter = keyValueConverter
+
+    override def end(): Unit = updater.set(currentMap)
+
+    // NOTE: We can't reuse the mutable Map here and must instantiate a new `Map` for the next
+    // value.  `Row.copy()` only copies row cells, it doesn't do deep copy to objects stored in row
+    // cells.
+    override def start(): Unit = currentMap = mutable.Map.empty[Any, Any]
+
+    /** Parquet converter for key-value pairs within the map. */
+    private final class KeyValueConverter(
+        parquetKeyType: Type,
+        parquetValueType: Type,
+        catalystKeyType: DataType,
+        catalystValueType: DataType)
+      extends GroupConverter {
+
+      private var currentKey: Any = _
+
+      private var currentValue: Any = _
+
+      private val converters = Array(
+        // Converter for keys
+        newConverter(parquetKeyType, catalystKeyType, new ParentContainerUpdater {
+          override def set(value: Any): Unit = currentKey = value
+        }),
+
+        // Converter for values
+        newConverter(parquetValueType, catalystValueType, new ParentContainerUpdater {
+          override def set(value: Any): Unit = currentValue = value
+        }))
+
+      override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)
+
+      override def end(): Unit = currentMap(currentKey) = currentValue
+
+      override def start(): Unit = {
+        currentKey = null
+        currentValue = null
+      }
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index 4ab274ec17a02..de3a72d8146c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -358,9 +358,24 @@ private[parquet] class CatalystSchemaConverter(
       case DateType =>
         Types.primitive(INT32, repetition).as(DATE).named(field.name)
 
-      // NOTE: !! This timestamp type is not specified in Parquet format spec !!
-      // However, Impala and older versions of Spark SQL use INT96 to store timestamps with
-      // nanosecond precision (not TIME_MILLIS or TIMESTAMP_MILLIS described in the spec).
+      // NOTE: Spark SQL TimestampType is NOT a well defined type in Parquet format spec.
+      //
+      // As stated in PARQUET-323, Parquet `INT96` was originally introduced to represent nanosecond
+      // timestamp in Impala for some historical reasons, it's not recommended to be used for any
+      // other types and will probably be deprecated in future Parquet format spec.  That's the
+      // reason why Parquet format spec only defines `TIMESTAMP_MILLIS` and `TIMESTAMP_MICROS` which
+      // are both logical types annotating `INT64`.
+      //
+      // Originally, Spark SQL uses the same nanosecond timestamp type as Impala and Hive.  Starting
+      // from Spark 1.5.0, we resort to a timestamp type with 100 ns precision so that we can store
+      // a timestamp into a `Long`.  This design decision is subject to change though, for example,
+      // we may resort to microsecond precision in the future.
+      //
+      // For Parquet, we plan to write all `TimestampType` value as `TIMESTAMP_MICROS`, but it's
+      // currently not implemented yet because parquet-mr 1.7.0 (the version we're currently using)
+      // hasn't implemented `TIMESTAMP_MICROS` yet.
+      //
+      // TODO Implements `TIMESTAMP_MICROS` once parquet-mr has that.
       case TimestampType =>
         Types.primitive(INT96, repetition).named(field.name)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index 86a77bf965daa..be0a2029d233b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -17,61 +17,15 @@
 
 package org.apache.spark.sql.parquet
 
-import java.nio.ByteOrder
-
-import scala.collection.mutable.{ArrayBuffer, Buffer, HashMap}
-
-import org.apache.parquet.Preconditions
-import org.apache.parquet.column.Dictionary
-import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter}
-import org.apache.parquet.schema.MessageType
-
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.parquet.CatalystConverter.FieldType
-import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
-
-/**
- * Collection of converters of Parquet types (group and primitive types) that
- * model arrays and maps. The conversions are partly based on the AvroParquet
- * converters that are part of Parquet in order to be able to process these
- * types.
- *
- * There are several types of converters:
- * <ul>
- *   <li>[[org.apache.spark.sql.parquet.CatalystPrimitiveConverter]] for primitive
- *   (numeric, boolean and String) types</li>
- *   <li>[[org.apache.spark.sql.parquet.CatalystNativeArrayConverter]] for arrays
- *   of native JVM element types; note: currently null values are not supported!</li>
- *   <li>[[org.apache.spark.sql.parquet.CatalystArrayConverter]] for arrays of
- *   arbitrary element types (including nested element types); note: currently
- *   null values are not supported!</li>
- *   <li>[[org.apache.spark.sql.parquet.CatalystStructConverter]] for structs</li>
- *   <li>[[org.apache.spark.sql.parquet.CatalystMapConverter]] for maps; note:
- *   currently null values are not supported!</li>
- *   <li>[[org.apache.spark.sql.parquet.CatalystPrimitiveRowConverter]] for rows
- *   of only primitive element types</li>
- *   <li>[[org.apache.spark.sql.parquet.CatalystGroupConverter]] for other nested
- *   records, including the top-level row record</li>
- * </ul>
- */
 
 private[sql] object CatalystConverter {
-  // The type internally used for fields
-  type FieldType = StructField
-
   // This is mostly Parquet convention (see, e.g., `ConversionPatterns`).
   // Note that "array" for the array elements is chosen by ParquetAvro.
   // Using a different value will result in Parquet silently dropping columns.
   val ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME = "bag"
   val ARRAY_ELEMENTS_SCHEMA_NAME = "array"
-  // SPARK-4520: Thrift generated parquet files have different array element
-  // schema names than avro. Thrift parquet uses array_schema_name + "_tuple"
-  // as opposed to "array" used by default. For more information, check
-  // TestThriftSchemaConverter.java in parquet.thrift.
-  val THRIFT_ARRAY_ELEMENTS_SCHEMA_NAME_SUFFIX = "_tuple"
+
   val MAP_KEY_SCHEMA_NAME = "key"
   val MAP_VALUE_SCHEMA_NAME = "value"
   val MAP_SCHEMA_NAME = "map"
@@ -80,787 +34,4 @@ private[sql] object CatalystConverter {
   type ArrayScalaType[T] = Seq[T]
   type StructScalaType[T] = InternalRow
   type MapScalaType[K, V] = Map[K, V]
-
-  protected[parquet] def createConverter(
-      field: FieldType,
-      fieldIndex: Int,
-      parent: CatalystConverter): Converter = {
-    val fieldType: DataType = field.dataType
-    fieldType match {
-      case udt: UserDefinedType[_] => {
-        createConverter(field.copy(dataType = udt.sqlType), fieldIndex, parent)
-      }
-      // For native JVM types we use a converter with native arrays
-      case ArrayType(elementType: AtomicType, false) => {
-        new CatalystNativeArrayConverter(elementType, fieldIndex, parent)
-      }
-      // This is for other types of arrays, including those with nested fields
-      case ArrayType(elementType: DataType, false) => {
-        new CatalystArrayConverter(elementType, fieldIndex, parent)
-      }
-      case ArrayType(elementType: DataType, true) => {
-        new CatalystArrayContainsNullConverter(elementType, fieldIndex, parent)
-      }
-      case StructType(fields: Array[StructField]) => {
-        new CatalystStructConverter(fields, fieldIndex, parent)
-      }
-      case MapType(keyType: DataType, valueType: DataType, valueContainsNull: Boolean) => {
-        new CatalystMapConverter(
-          Array(
-            new FieldType(MAP_KEY_SCHEMA_NAME, keyType, false),
-            new FieldType(MAP_VALUE_SCHEMA_NAME, valueType, valueContainsNull)),
-          fieldIndex,
-          parent)
-      }
-      // Strings, Shorts and Bytes do not have a corresponding type in Parquet
-      // so we need to treat them separately
-      case StringType =>
-        new CatalystPrimitiveStringConverter(parent, fieldIndex)
-      case ShortType => {
-        new CatalystPrimitiveConverter(parent, fieldIndex) {
-          override def addInt(value: Int): Unit =
-            parent.updateShort(fieldIndex, value.asInstanceOf[ShortType.InternalType])
-        }
-      }
-      case ByteType => {
-        new CatalystPrimitiveConverter(parent, fieldIndex) {
-          override def addInt(value: Int): Unit =
-            parent.updateByte(fieldIndex, value.asInstanceOf[ByteType.InternalType])
-        }
-      }
-      case DateType => {
-        new CatalystPrimitiveConverter(parent, fieldIndex) {
-          override def addInt(value: Int): Unit =
-            parent.updateDate(fieldIndex, value.asInstanceOf[DateType.InternalType])
-        }
-      }
-      case d: DecimalType => {
-        new CatalystPrimitiveConverter(parent, fieldIndex) {
-          override def addBinary(value: Binary): Unit =
-            parent.updateDecimal(fieldIndex, value, d)
-        }
-      }
-      case TimestampType => {
-        new CatalystPrimitiveConverter(parent, fieldIndex) {
-          override def addBinary(value: Binary): Unit =
-            parent.updateTimestamp(fieldIndex, value)
-        }
-      }
-      // All other primitive types use the default converter
-      case ctype: DataType if ParquetTypesConverter.isPrimitiveType(ctype) => {
-        // note: need the type tag here!
-        new CatalystPrimitiveConverter(parent, fieldIndex)
-      }
-      case _ => throw new RuntimeException(
-        s"unable to convert datatype ${field.dataType.toString} in CatalystConverter")
-    }
-  }
-
-  protected[parquet] def createRootConverter(
-      parquetSchema: MessageType,
-      attributes: Seq[Attribute]): CatalystConverter = {
-    // For non-nested types we use the optimized Row converter
-    if (attributes.forall(a => ParquetTypesConverter.isPrimitiveType(a.dataType))) {
-      new CatalystPrimitiveRowConverter(attributes.toArray)
-    } else {
-      new CatalystGroupConverter(attributes.toArray)
-    }
-  }
-}
-
-private[parquet] abstract class CatalystConverter extends GroupConverter {
-  /**
-   * The number of fields this group has
-   */
-  protected[parquet] val size: Int
-
-  /**
-   * The index of this converter in the parent
-   */
-  protected[parquet] val index: Int
-
-  /**
-   * The parent converter
-   */
-  protected[parquet] val parent: CatalystConverter
-
-  /**
-   * Called by child converters to update their value in its parent (this).
-   * Note that if possible the more specific update methods below should be used
-   * to avoid auto-boxing of native JVM types.
-   *
-   * @param fieldIndex
-   * @param value
-   */
-  protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit
-
-  protected[parquet] def updateBoolean(fieldIndex: Int, value: Boolean): Unit =
-    updateField(fieldIndex, value)
-
-  protected[parquet] def updateInt(fieldIndex: Int, value: Int): Unit =
-    updateField(fieldIndex, value)
-
-  protected[parquet] def updateDate(fieldIndex: Int, value: Int): Unit =
-    updateField(fieldIndex, value)
-
-  protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit =
-    updateField(fieldIndex, value)
-
-  protected[parquet] def updateShort(fieldIndex: Int, value: Short): Unit =
-    updateField(fieldIndex, value)
-
-  protected[parquet] def updateByte(fieldIndex: Int, value: Byte): Unit =
-    updateField(fieldIndex, value)
-
-  protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit =
-    updateField(fieldIndex, value)
-
-  protected[parquet] def updateFloat(fieldIndex: Int, value: Float): Unit =
-    updateField(fieldIndex, value)
-
-  protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit =
-    updateField(fieldIndex, value.getBytes)
-
-  protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit =
-    updateField(fieldIndex, UTF8String.fromBytes(value))
-
-  protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit =
-    updateField(fieldIndex, readTimestamp(value))
-
-  protected[parquet] def updateDecimal(fieldIndex: Int, value: Binary, ctype: DecimalType): Unit =
-    updateField(fieldIndex, readDecimal(new Decimal(), value, ctype))
-
-  protected[parquet] def isRootConverter: Boolean = parent == null
-
-  protected[parquet] def clearBuffer(): Unit
-
-  /**
-   * Should only be called in the root (group) converter!
-   *
-   * @return
-   */
-  def getCurrentRecord: InternalRow = throw new UnsupportedOperationException
-
-  /**
-   * Read a decimal value from a Parquet Binary into "dest". Only supports decimals that fit in
-   * a long (i.e. precision <= 18)
-   *
-   * Returned value is needed by CatalystConverter, which doesn't reuse the Decimal object.
-   */
-  protected[parquet] def readDecimal(dest: Decimal, value: Binary, ctype: DecimalType): Decimal = {
-    val precision = ctype.precisionInfo.get.precision
-    val scale = ctype.precisionInfo.get.scale
-    val bytes = value.getBytes
-    require(bytes.length <= 16, "Decimal field too large to read")
-    var unscaled = 0L
-    var i = 0
-    while (i < bytes.length) {
-      unscaled = (unscaled << 8) | (bytes(i) & 0xFF)
-      i += 1
-    }
-    // Make sure unscaled has the right sign, by sign-extending the first bit
-    val numBits = 8 * bytes.length
-    unscaled = (unscaled << (64 - numBits)) >> (64 - numBits)
-    dest.set(unscaled, precision, scale)
-  }
-
-  /**
-   * Read a Timestamp value from a Parquet Int96Value
-   */
-  protected[parquet] def readTimestamp(value: Binary): Long = {
-    Preconditions.checkArgument(value.length() == 12, "Must be 12 bytes")
-    val buf = value.toByteBuffer
-    buf.order(ByteOrder.LITTLE_ENDIAN)
-    val timeOfDayNanos = buf.getLong
-    val julianDay = buf.getInt
-    DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos)
-  }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that is able to convert a Parquet record
- * to a [[org.apache.spark.sql.catalyst.expressions.InternalRow]] object.
- *
- * @param schema The corresponding Catalyst schema in the form of a list of attributes.
- */
-private[parquet] class CatalystGroupConverter(
-    protected[parquet] val schema: Array[FieldType],
-    protected[parquet] val index: Int,
-    protected[parquet] val parent: CatalystConverter,
-    protected[parquet] var current: ArrayBuffer[Any],
-    protected[parquet] var buffer: ArrayBuffer[InternalRow])
-  extends CatalystConverter {
-
-  def this(schema: Array[FieldType], index: Int, parent: CatalystConverter) =
-    this(
-      schema,
-      index,
-      parent,
-      current = null,
-      buffer = new ArrayBuffer[InternalRow](
-        CatalystArrayConverter.INITIAL_ARRAY_SIZE))
-
-  /**
-   * This constructor is used for the root converter only!
-   */
-  def this(attributes: Array[Attribute]) =
-    this(attributes.map(a => new FieldType(a.name, a.dataType, a.nullable)), 0, null)
-
-  protected [parquet] val converters: Array[Converter] =
-    schema.zipWithIndex.map {
-      case (field, idx) => CatalystConverter.createConverter(field, idx, this)
-    }.toArray
-
-  override val size = schema.size
-
-  override def getCurrentRecord: InternalRow = {
-    assert(isRootConverter, "getCurrentRecord should only be called in root group converter!")
-    // TODO: use iterators if possible
-    // Note: this will ever only be called in the root converter when the record has been
-    // fully processed. Therefore it will be difficult to use mutable rows instead, since
-    // any non-root converter never would be sure when it would be safe to re-use the buffer.
-    new GenericInternalRow(current.toArray)
-  }
-
-  override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)
-
-  // for child converters to update upstream values
-  override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
-    current.update(fieldIndex, value)
-  }
-
-  override protected[parquet] def clearBuffer(): Unit = buffer.clear()
-
-  override def start(): Unit = {
-    current = ArrayBuffer.fill(size)(null)
-    converters.foreach { converter =>
-      if (!converter.isPrimitive) {
-        converter.asInstanceOf[CatalystConverter].clearBuffer()
-      }
-    }
-  }
-
-  override def end(): Unit = {
-    if (!isRootConverter) {
-      assert(current != null) // there should be no empty groups
-      buffer.append(new GenericInternalRow(current.toArray))
-      parent.updateField(index, new GenericInternalRow(buffer.toArray.asInstanceOf[Array[Any]]))
-    }
-  }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that is able to convert a Parquet record
- * to a [[org.apache.spark.sql.catalyst.expressions.InternalRow]] object. Note that his
- * converter is optimized for rows of primitive types (non-nested records).
- */
-private[parquet] class CatalystPrimitiveRowConverter(
-    protected[parquet] val schema: Array[FieldType],
-    protected[parquet] var current: MutableRow)
-  extends CatalystConverter {
-
-  // This constructor is used for the root converter only
-  def this(attributes: Array[Attribute]) =
-    this(
-      attributes.map(a => new FieldType(a.name, a.dataType, a.nullable)),
-      new SpecificMutableRow(attributes.map(_.dataType)))
-
-  protected [parquet] val converters: Array[Converter] =
-    schema.zipWithIndex.map {
-      case (field, idx) => CatalystConverter.createConverter(field, idx, this)
-    }.toArray
-
-  override val size = schema.size
-
-  override val index = 0
-
-  override val parent = null
-
-  // Should be only called in root group converter!
-  override def getCurrentRecord: InternalRow = current
-
-  override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)
-
-  // for child converters to update upstream values
-  override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
-    throw new UnsupportedOperationException // child converters should use the
-    // specific update methods below
-  }
-
-  override protected[parquet] def clearBuffer(): Unit = {}
-
-  override def start(): Unit = {
-    var i = 0
-    while (i < size) {
-      current.setNullAt(i)
-      i = i + 1
-    }
-  }
-
-  override def end(): Unit = {}
-
-  // Overridden here to avoid auto-boxing for primitive types
-  override protected[parquet] def updateBoolean(fieldIndex: Int, value: Boolean): Unit =
-    current.setBoolean(fieldIndex, value)
-
-  override protected[parquet] def updateInt(fieldIndex: Int, value: Int): Unit =
-    current.setInt(fieldIndex, value)
-
-  override protected[parquet] def updateDate(fieldIndex: Int, value: Int): Unit =
-    current.setInt(fieldIndex, value)
-
-  override protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit =
-    current.setLong(fieldIndex, value)
-
-  override protected[parquet] def updateShort(fieldIndex: Int, value: Short): Unit =
-    current.setShort(fieldIndex, value)
-
-  override protected[parquet] def updateByte(fieldIndex: Int, value: Byte): Unit =
-    current.setByte(fieldIndex, value)
-
-  override protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit =
-    current.setDouble(fieldIndex, value)
-
-  override protected[parquet] def updateFloat(fieldIndex: Int, value: Float): Unit =
-    current.setFloat(fieldIndex, value)
-
-  override protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit =
-    current.update(fieldIndex, value.getBytes)
-
-  override protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit =
-    current.update(fieldIndex, UTF8String.fromBytes(value))
-
-  override protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit =
-    current.setLong(fieldIndex, readTimestamp(value))
-
-  override protected[parquet] def updateDecimal(
-      fieldIndex: Int, value: Binary, ctype: DecimalType): Unit = {
-    var decimal = current(fieldIndex).asInstanceOf[Decimal]
-    if (decimal == null) {
-      decimal = new Decimal
-      current(fieldIndex) = decimal
-    }
-    readDecimal(decimal, value, ctype)
-  }
-}
-
-/**
- * A `parquet.io.api.PrimitiveConverter` that converts Parquet types to Catalyst types.
- *
- * @param parent The parent group converter.
- * @param fieldIndex The index inside the record.
- */
-private[parquet] class CatalystPrimitiveConverter(
-    parent: CatalystConverter,
-    fieldIndex: Int) extends PrimitiveConverter {
-  override def addBinary(value: Binary): Unit =
-    parent.updateBinary(fieldIndex, value)
-
-  override def addBoolean(value: Boolean): Unit =
-    parent.updateBoolean(fieldIndex, value)
-
-  override def addDouble(value: Double): Unit =
-    parent.updateDouble(fieldIndex, value)
-
-  override def addFloat(value: Float): Unit =
-    parent.updateFloat(fieldIndex, value)
-
-  override def addInt(value: Int): Unit =
-    parent.updateInt(fieldIndex, value)
-
-  override def addLong(value: Long): Unit =
-    parent.updateLong(fieldIndex, value)
-}
-
-/**
- * A `parquet.io.api.PrimitiveConverter` that converts Parquet Binary to Catalyst String.
- * Supports dictionaries to reduce Binary to String conversion overhead.
- *
- * Follows pattern in Parquet of using dictionaries, where supported, for String conversion.
- *
- * @param parent The parent group converter.
- * @param fieldIndex The index inside the record.
- */
-private[parquet] class CatalystPrimitiveStringConverter(parent: CatalystConverter, fieldIndex: Int)
-  extends CatalystPrimitiveConverter(parent, fieldIndex) {
-
-  private[this] var dict: Array[Array[Byte]] = null
-
-  override def hasDictionarySupport: Boolean = true
-
-  override def setDictionary(dictionary: Dictionary): Unit =
-    dict = Array.tabulate(dictionary.getMaxId + 1) { dictionary.decodeToBinary(_).getBytes }
-
-  override def addValueFromDictionary(dictionaryId: Int): Unit =
-    parent.updateString(fieldIndex, dict(dictionaryId))
-
-  override def addBinary(value: Binary): Unit =
-    parent.updateString(fieldIndex, value.getBytes)
-}
-
-private[parquet] object CatalystArrayConverter {
-  val INITIAL_ARRAY_SIZE = 20
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that converts a single-element groups that
- * match the characteristics of an array (see
- * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
- * [[org.apache.spark.sql.types.ArrayType]].
- *
- * @param elementType The type of the array elements (complex or primitive)
- * @param index The position of this (array) field inside its parent converter
- * @param parent The parent converter
- * @param buffer A data buffer
- */
-private[parquet] class CatalystArrayConverter(
-    val elementType: DataType,
-    val index: Int,
-    protected[parquet] val parent: CatalystConverter,
-    protected[parquet] var buffer: Buffer[Any])
-  extends CatalystConverter {
-
-  def this(elementType: DataType, index: Int, parent: CatalystConverter) =
-    this(
-      elementType,
-      index,
-      parent,
-      new ArrayBuffer[Any](CatalystArrayConverter.INITIAL_ARRAY_SIZE))
-
-  protected[parquet] val converter: Converter = CatalystConverter.createConverter(
-    new CatalystConverter.FieldType(
-      CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME,
-      elementType,
-      false),
-    fieldIndex = 0,
-    parent = this)
-
-  override def getConverter(fieldIndex: Int): Converter = converter
-
-  // arrays have only one (repeated) field, which is its elements
-  override val size = 1
-
-  override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
-    // fieldIndex is ignored (assumed to be zero but not checked)
-    if (value == null) {
-      throw new IllegalArgumentException("Null values inside Parquet arrays are not supported!")
-    }
-    buffer += value
-  }
-
-  override protected[parquet] def clearBuffer(): Unit = {
-    buffer.clear()
-  }
-
-  override def start(): Unit = {
-    if (!converter.isPrimitive) {
-      converter.asInstanceOf[CatalystConverter].clearBuffer()
-    }
-  }
-
-  override def end(): Unit = {
-    assert(parent != null)
-    // here we need to make sure to use ArrayScalaType
-    parent.updateField(index, buffer.toArray.toSeq)
-    clearBuffer()
-  }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that converts a single-element groups that
- * match the characteristics of an array (see
- * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
- * [[org.apache.spark.sql.types.ArrayType]].
- *
- * @param elementType The type of the array elements (native)
- * @param index The position of this (array) field inside its parent converter
- * @param parent The parent converter
- * @param capacity The (initial) capacity of the buffer
- */
-private[parquet] class CatalystNativeArrayConverter(
-    val elementType: AtomicType,
-    val index: Int,
-    protected[parquet] val parent: CatalystConverter,
-    protected[parquet] var capacity: Int = CatalystArrayConverter.INITIAL_ARRAY_SIZE)
-  extends CatalystConverter {
-
-  type NativeType = elementType.InternalType
-
-  private var buffer: Array[NativeType] = elementType.classTag.newArray(capacity)
-
-  private var elements: Int = 0
-
-  protected[parquet] val converter: Converter = CatalystConverter.createConverter(
-    new CatalystConverter.FieldType(
-      CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME,
-      elementType,
-      false),
-    fieldIndex = 0,
-    parent = this)
-
-  override def getConverter(fieldIndex: Int): Converter = converter
-
-  // arrays have only one (repeated) field, which is its elements
-  override val size = 1
-
-  override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit =
-    throw new UnsupportedOperationException
-
-  // Overridden here to avoid auto-boxing for primitive types
-  override protected[parquet] def updateBoolean(fieldIndex: Int, value: Boolean): Unit = {
-    checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[NativeType]
-    elements += 1
-  }
-
-  override protected[parquet] def updateInt(fieldIndex: Int, value: Int): Unit = {
-    checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[NativeType]
-    elements += 1
-  }
-
-  override protected[parquet] def updateShort(fieldIndex: Int, value: Short): Unit = {
-    checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[NativeType]
-    elements += 1
-  }
-
-  override protected[parquet] def updateByte(fieldIndex: Int, value: Byte): Unit = {
-    checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[NativeType]
-    elements += 1
-  }
-
-  override protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit = {
-    checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[NativeType]
-    elements += 1
-  }
-
-  override protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit = {
-    checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[NativeType]
-    elements += 1
-  }
-
-  override protected[parquet] def updateFloat(fieldIndex: Int, value: Float): Unit = {
-    checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[NativeType]
-    elements += 1
-  }
-
-  override protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit = {
-    checkGrowBuffer()
-    buffer(elements) = value.getBytes.asInstanceOf[NativeType]
-    elements += 1
-  }
-
-  override protected[parquet] def updateString(fieldIndex: Int, value: Array[Byte]): Unit = {
-    checkGrowBuffer()
-    buffer(elements) = UTF8String.fromBytes(value).asInstanceOf[NativeType]
-    elements += 1
-  }
-
-  override protected[parquet] def clearBuffer(): Unit = {
-    elements = 0
-  }
-
-  override def start(): Unit = {}
-
-  override def end(): Unit = {
-    assert(parent != null)
-    // here we need to make sure to use ArrayScalaType
-    parent.updateField(
-      index,
-      buffer.slice(0, elements).toSeq)
-    clearBuffer()
-  }
-
-  private def checkGrowBuffer(): Unit = {
-    if (elements >= capacity) {
-      val newCapacity = 2 * capacity
-      val tmp: Array[NativeType] = elementType.classTag.newArray(newCapacity)
-      Array.copy(buffer, 0, tmp, 0, capacity)
-      buffer = tmp
-      capacity = newCapacity
-    }
-  }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that converts a single-element groups that
- * match the characteristics of an array contains null (see
- * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
- * [[org.apache.spark.sql.types.ArrayType]].
- *
- * @param elementType The type of the array elements (complex or primitive)
- * @param index The position of this (array) field inside its parent converter
- * @param parent The parent converter
- * @param buffer A data buffer
- */
-private[parquet] class CatalystArrayContainsNullConverter(
-    val elementType: DataType,
-    val index: Int,
-    protected[parquet] val parent: CatalystConverter,
-    protected[parquet] var buffer: Buffer[Any])
-  extends CatalystConverter {
-
-  def this(elementType: DataType, index: Int, parent: CatalystConverter) =
-    this(
-      elementType,
-      index,
-      parent,
-      new ArrayBuffer[Any](CatalystArrayConverter.INITIAL_ARRAY_SIZE))
-
-  protected[parquet] val converter: Converter = new CatalystConverter {
-
-    private var current: Any = null
-
-    val converter = CatalystConverter.createConverter(
-      new CatalystConverter.FieldType(
-        CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME,
-        elementType,
-        false),
-      fieldIndex = 0,
-      parent = this)
-
-    override def getConverter(fieldIndex: Int): Converter = converter
-
-    override def end(): Unit = parent.updateField(index, current)
-
-    override def start(): Unit = {
-      current = null
-    }
-
-    override protected[parquet] val size: Int = 1
-    override protected[parquet] val index: Int = 0
-    override protected[parquet] val parent = CatalystArrayContainsNullConverter.this
-
-    override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
-      current = value
-    }
-
-    override protected[parquet] def clearBuffer(): Unit = {}
-  }
-
-  override def getConverter(fieldIndex: Int): Converter = converter
-
-  // arrays have only one (repeated) field, which is its elements
-  override val size = 1
-
-  override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
-    buffer += value
-  }
-
-  override protected[parquet] def clearBuffer(): Unit = {
-    buffer.clear()
-  }
-
-  override def start(): Unit = {}
-
-  override def end(): Unit = {
-    assert(parent != null)
-    // here we need to make sure to use ArrayScalaType
-    parent.updateField(index, buffer.toArray.toSeq)
-    clearBuffer()
-  }
-}
-
-/**
- * This converter is for multi-element groups of primitive or complex types
- * that have repetition level optional or required (so struct fields).
- *
- * @param schema The corresponding Catalyst schema in the form of a list of
- *               attributes.
- * @param index
- * @param parent
- */
-private[parquet] class CatalystStructConverter(
-    override protected[parquet] val schema: Array[FieldType],
-    override protected[parquet] val index: Int,
-    override protected[parquet] val parent: CatalystConverter)
-  extends CatalystGroupConverter(schema, index, parent) {
-
-  override protected[parquet] def clearBuffer(): Unit = {}
-
-  // TODO: think about reusing the buffer
-  override def end(): Unit = {
-    assert(!isRootConverter)
-    // here we need to make sure to use StructScalaType
-    // Note: we need to actually make a copy of the array since we
-    // may be in a nested field
-    parent.updateField(index, new GenericInternalRow(current.toArray))
-  }
-}
-
-/**
- * A `parquet.io.api.GroupConverter` that converts two-element groups that
- * match the characteristics of a map (see
- * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
- * [[org.apache.spark.sql.types.MapType]].
- *
- * @param schema
- * @param index
- * @param parent
- */
-private[parquet] class CatalystMapConverter(
-    protected[parquet] val schema: Array[FieldType],
-    override protected[parquet] val index: Int,
-    override protected[parquet] val parent: CatalystConverter)
-  extends CatalystConverter {
-
-  private val map = new HashMap[Any, Any]()
-
-  private val keyValueConverter = new CatalystConverter {
-    private var currentKey: Any = null
-    private var currentValue: Any = null
-    val keyConverter = CatalystConverter.createConverter(schema(0), 0, this)
-    val valueConverter = CatalystConverter.createConverter(schema(1), 1, this)
-
-    override def getConverter(fieldIndex: Int): Converter = {
-      if (fieldIndex == 0) keyConverter else valueConverter
-    }
-
-    override def end(): Unit = CatalystMapConverter.this.map += currentKey -> currentValue
-
-    override def start(): Unit = {
-      currentKey = null
-      currentValue = null
-    }
-
-    override protected[parquet] val size: Int = 2
-    override protected[parquet] val index: Int = 0
-    override protected[parquet] val parent: CatalystConverter = CatalystMapConverter.this
-
-    override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = {
-      fieldIndex match {
-        case 0 =>
-          currentKey = value
-        case 1 =>
-          currentValue = value
-        case _ =>
-          new RuntimePermission(s"trying to update Map with fieldIndex $fieldIndex")
-      }
-    }
-
-    override protected[parquet] def clearBuffer(): Unit = {}
-  }
-
-  override protected[parquet] val size: Int = 1
-
-  override protected[parquet] def clearBuffer(): Unit = {}
-
-  override def start(): Unit = {
-    map.clear()
-  }
-
-  override def end(): Unit = {
-    // here we need to make sure to use MapScalaType
-    parent.updateField(index, map.toMap)
-  }
-
-  override def getConverter(fieldIndex: Int): Converter = keyValueConverter
-
-  override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit =
-    throw new UnsupportedOperationException
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 8402cd756140d..e8851ddb68026 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -17,14 +17,17 @@
 
 package org.apache.spark.sql.parquet
 
-import java.nio.{ByteOrder, ByteBuffer}
+import java.nio.{ByteBuffer, ByteOrder}
+import java.util
 import java.util.{HashMap => JHashMap}
 
+import scala.collection.JavaConversions._
+
 import org.apache.hadoop.conf.Configuration
 import org.apache.parquet.column.ParquetProperties
 import org.apache.parquet.hadoop.ParquetOutputFormat
 import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
-import org.apache.parquet.hadoop.api.{ReadSupport, WriteSupport}
+import org.apache.parquet.hadoop.api.{InitContext, ReadSupport, WriteSupport}
 import org.apache.parquet.io.api._
 import org.apache.parquet.schema.MessageType
 
@@ -36,87 +39,133 @@ import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
 /**
- * A `parquet.io.api.RecordMaterializer` for Rows.
+ * A [[RecordMaterializer]] for Catalyst rows.
  *
- *@param root The root group converter for the record.
+ * @param parquetSchema Parquet schema of the records to be read
+ * @param catalystSchema Catalyst schema of the rows to be constructed
  */
-private[parquet] class RowRecordMaterializer(root: CatalystConverter)
+private[parquet] class RowRecordMaterializer(parquetSchema: MessageType, catalystSchema: StructType)
   extends RecordMaterializer[InternalRow] {
 
-  def this(parquetSchema: MessageType, attributes: Seq[Attribute]) =
-    this(CatalystConverter.createRootConverter(parquetSchema, attributes))
+  private val rootConverter = new CatalystRowConverter(parquetSchema, catalystSchema, NoopUpdater)
 
-  override def getCurrentRecord: InternalRow = root.getCurrentRecord
+  override def getCurrentRecord: InternalRow = rootConverter.currentRow
 
-  override def getRootConverter: GroupConverter = root.asInstanceOf[GroupConverter]
+  override def getRootConverter: GroupConverter = rootConverter
 }
 
-/**
- * A `parquet.hadoop.api.ReadSupport` for Row objects.
- */
 private[parquet] class RowReadSupport extends ReadSupport[InternalRow] with Logging {
-
   override def prepareForRead(
       conf: Configuration,
-      stringMap: java.util.Map[String, String],
+      keyValueMetaData: util.Map[String, String],
       fileSchema: MessageType,
       readContext: ReadContext): RecordMaterializer[InternalRow] = {
-    log.debug(s"preparing for read with Parquet file schema $fileSchema")
-    // Note: this very much imitates AvroParquet
-    val parquetSchema = readContext.getRequestedSchema
-    var schema: Seq[Attribute] = null
-
-    if (readContext.getReadSupportMetadata != null) {
-      // first try to find the read schema inside the metadata (can result from projections)
-      if (
-        readContext
-          .getReadSupportMetadata
-          .get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA) != null) {
-        schema = ParquetTypesConverter.convertFromString(
-          readContext.getReadSupportMetadata.get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA))
-      } else {
-        // if unavailable, try the schema that was read originally from the file or provided
-        // during the creation of the Parquet relation
-        if (readContext.getReadSupportMetadata.get(RowReadSupport.SPARK_METADATA_KEY) != null) {
-          schema = ParquetTypesConverter.convertFromString(
-            readContext.getReadSupportMetadata.get(RowReadSupport.SPARK_METADATA_KEY))
-        }
+    log.debug(s"Preparing for read Parquet file with message type: $fileSchema")
+
+    val toCatalyst = new CatalystSchemaConverter(conf)
+    val parquetRequestedSchema = readContext.getRequestedSchema
+
+    val catalystRequestedSchema =
+      Option(readContext.getReadSupportMetadata).map(_.toMap).flatMap { metadata =>
+        metadata
+          // First tries to read requested schema, which may result from projections
+          .get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA)
+          // If not available, tries to read Catalyst schema from file metadata.  It's only
+          // available if the target file is written by Spark SQL.
+          .orElse(metadata.get(RowReadSupport.SPARK_METADATA_KEY))
+      }.map(StructType.fromString).getOrElse {
+        logDebug("Catalyst schema not available, falling back to Parquet schema")
+        toCatalyst.convert(parquetRequestedSchema)
       }
-    }
-    // if both unavailable, fall back to deducing the schema from the given Parquet schema
-    // TODO: Why it can be null?
-    if (schema == null)  {
-      log.debug("falling back to Parquet read schema")
-      schema = ParquetTypesConverter.convertToAttributes(parquetSchema, false, true)
-    }
-    log.debug(s"list of attributes that will be read: $schema")
-    new RowRecordMaterializer(parquetSchema, schema)
+
+    logDebug(s"Catalyst schema used to read Parquet files: $catalystRequestedSchema")
+    new RowRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema)
   }
 
-  override def init(
-      configuration: Configuration,
-      keyValueMetaData: java.util.Map[String, String],
-      fileSchema: MessageType): ReadContext = {
-    var parquetSchema = fileSchema
-    val metadata = new JHashMap[String, String]()
-    val requestedAttributes = RowReadSupport.getRequestedSchema(configuration)
-
-    if (requestedAttributes != null) {
-      // If the parquet file is thrift derived, there is a good chance that
-      // it will have the thrift class in metadata.
-      val isThriftDerived = keyValueMetaData.keySet().contains("thrift.class")
-      parquetSchema = ParquetTypesConverter.convertFromAttributes(requestedAttributes)
-      metadata.put(
-        RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA,
-        ParquetTypesConverter.convertToString(requestedAttributes))
-    }
+  override def init(context: InitContext): ReadContext = {
+    val conf = context.getConfiguration
+
+    // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst
+    // schema of this file from its the metadata.
+    val maybeRowSchema = Option(conf.get(RowWriteSupport.SPARK_ROW_SCHEMA))
+
+    // Optional schema of requested columns, in the form of a string serialized from a Catalyst
+    // `StructType` containing all requested columns.
+    val maybeRequestedSchema = Option(conf.get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA))
+
+    // Below we construct a Parquet schema containing all requested columns.  This schema tells
+    // Parquet which columns to read.
+    //
+    // If `maybeRequestedSchema` is defined, we assemble an equivalent Parquet schema.  Otherwise,
+    // we have to fallback to the full file schema which contains all columns in the file.
+    // Obviously this may waste IO bandwidth since it may read more columns than requested.
+    //
+    // Two things to note:
+    //
+    // 1. It's possible that some requested columns don't exist in the target Parquet file.  For
+    //    example, in the case of schema merging, the globally merged schema may contain extra
+    //    columns gathered from other Parquet files.  These columns will be simply filled with nulls
+    //    when actually reading the target Parquet file.
+    //
+    // 2. When `maybeRequestedSchema` is available, we can't simply convert the Catalyst schema to
+    //    Parquet schema using `CatalystSchemaConverter`, because the mapping is not unique due to
+    //    non-standard behaviors of some Parquet libraries/tools.  For example, a Parquet file
+    //    containing a single integer array field `f1` may have the following legacy 2-level
+    //    structure:
+    //
+    //      message root {
+    //        optional group f1 (LIST) {
+    //          required INT32 element;
+    //        }
+    //      }
+    //
+    //    while `CatalystSchemaConverter` may generate a standard 3-level structure:
+    //
+    //      message root {
+    //        optional group f1 (LIST) {
+    //          repeated group list {
+    //            required INT32 element;
+    //          }
+    //        }
+    //      }
+    //
+    //    Apparently, we can't use the 2nd schema to read the target Parquet file as they have
+    //    different physical structures.
+    val parquetRequestedSchema =
+      maybeRequestedSchema.fold(context.getFileSchema) { schemaString =>
+        val toParquet = new CatalystSchemaConverter(conf)
+        val fileSchema = context.getFileSchema.asGroupType()
+        val fileFieldNames = fileSchema.getFields.map(_.getName).toSet
+
+        StructType
+          // Deserializes the Catalyst schema of requested columns
+          .fromString(schemaString)
+          .map { field =>
+            if (fileFieldNames.contains(field.name)) {
+              // If the field exists in the target Parquet file, extracts the field type from the
+              // full file schema and makes a single-field Parquet schema
+              new MessageType("root", fileSchema.getType(field.name))
+            } else {
+              // Otherwise, just resorts to `CatalystSchemaConverter`
+              toParquet.convert(StructType(Array(field)))
+            }
+          }
+          // Merges all single-field Parquet schemas to form a complete schema for all requested
+          // columns.  Note that it's possible that no columns are requested at all (e.g., count
+          // some partition column of a partitioned Parquet table). That's why `fold` is used here
+          // and always fallback to an empty Parquet schema.
+          .fold(new MessageType("root")) {
+            _ union _
+          }
+      }
 
-    val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
-    if (origAttributesStr != null) {
-      metadata.put(RowReadSupport.SPARK_METADATA_KEY, origAttributesStr)
-    }
+    val metadata =
+      Map.empty[String, String] ++
+        maybeRequestedSchema.map(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++
+        maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _)
 
-    new ReadSupport.ReadContext(parquetSchema, metadata)
+    logInfo(s"Going to read Parquet file with these requested columns: $parquetRequestedSchema")
+    new ReadContext(parquetRequestedSchema, metadata)
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index ce456e7fbe17e..01dd6f471bd7c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -259,6 +259,10 @@ private[sql] class ParquetRelation2(
       broadcastedConf: Broadcast[SerializableConfiguration]): RDD[Row] = {
     val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA)
     val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown
+    val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString
+    val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp
+    val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec
+
     // Create the function to set variable Parquet confs at both driver and executor side.
     val initLocalJobFuncOpt =
       ParquetRelation2.initializeLocalJobFunc(
@@ -266,7 +270,11 @@ private[sql] class ParquetRelation2(
         filters,
         dataSchema,
         useMetadataCache,
-        parquetFilterPushDown) _
+        parquetFilterPushDown,
+        assumeBinaryIsString,
+        assumeInt96IsTimestamp,
+        followParquetFormatSpec) _
+
     // Create the function to set input paths at the driver side.
     val setInputPaths = ParquetRelation2.initializeDriverSideJobFunc(inputFiles) _
 
@@ -471,9 +479,12 @@ private[sql] object ParquetRelation2 extends Logging {
       filters: Array[Filter],
       dataSchema: StructType,
       useMetadataCache: Boolean,
-      parquetFilterPushDown: Boolean)(job: Job): Unit = {
+      parquetFilterPushDown: Boolean,
+      assumeBinaryIsString: Boolean,
+      assumeInt96IsTimestamp: Boolean,
+      followParquetFormatSpec: Boolean)(job: Job): Unit = {
     val conf = job.getConfiguration
-    conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[RowReadSupport].getName())
+    conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[RowReadSupport].getName)
 
     // Try to push down filters when filter push-down is enabled.
     if (parquetFilterPushDown) {
@@ -497,6 +508,11 @@ private[sql] object ParquetRelation2 extends Logging {
 
     // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata
     conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache)
+
+    // Sets flags for Parquet schema conversion
+    conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key, assumeBinaryIsString)
+    conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, assumeInt96IsTimestamp)
+    conf.setBoolean(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, followParquetFormatSpec)
   }
 
   /** This closure sets input paths at the driver side. */
diff --git a/sql/core/src/test/README.md b/sql/core/src/test/README.md
new file mode 100644
index 0000000000000..3dd9861b4896d
--- /dev/null
+++ b/sql/core/src/test/README.md
@@ -0,0 +1,33 @@
+# Notes for Parquet compatibility tests
+
+The following directories and files are used for Parquet compatibility tests:
+
+```
+.
+├── README.md                   # This file
+├── avro
+│   ├── parquet-compat.avdl     # Testing Avro IDL
+│   └── parquet-compat.avpr     # !! NO TOUCH !! Protocol file generated from parquet-compat.avdl
+├── gen-java                    # !! NO TOUCH !! Generated Java code
+├── scripts
+│   └── gen-code.sh             # Script used to generate Java code for Thrift and Avro
+└── thrift
+    └── parquet-compat.thrift   # Testing Thrift schema
+```
+
+Generated Java code are used in the following test suites:
+
+- `org.apache.spark.sql.parquet.ParquetAvroCompatibilitySuite`
+- `org.apache.spark.sql.parquet.ParquetThriftCompatibilitySuite`
+
+To avoid code generation during build time, Java code generated from testing Thrift schema and Avro IDL are also checked in.
+
+When updating the testing Thrift schema and Avro IDL, please run `gen-code.sh` to update all the generated Java code.
+
+## Prerequisites
+
+Please ensure `avro-tools` and `thrift` are installed.  You may install these two on Mac OS X via:
+
+```bash
+$ brew install thrift avro-tools
+```
diff --git a/sql/core/src/test/avro/parquet-compat.avdl b/sql/core/src/test/avro/parquet-compat.avdl
new file mode 100644
index 0000000000000..24729f6143e6c
--- /dev/null
+++ b/sql/core/src/test/avro/parquet-compat.avdl
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This is a test protocol for testing parquet-avro compatibility.
+@namespace("org.apache.spark.sql.parquet.test.avro")
+protocol CompatibilityTest {
+    record Nested {
+        array<int> nested_ints_column;
+        string nested_string_column;
+    }
+
+    record ParquetAvroCompat {
+        boolean bool_column;
+        int int_column;
+        long long_column;
+        float float_column;
+        double double_column;
+        bytes binary_column;
+        string string_column;
+
+        union { null, boolean } maybe_bool_column;
+        union { null, int } maybe_int_column;
+        union { null, long } maybe_long_column;
+        union { null, float } maybe_float_column;
+        union { null, double } maybe_double_column;
+        union { null, bytes } maybe_binary_column;
+        union { null, string } maybe_string_column;
+
+        array<string> strings_column;
+        map<int> string_to_int_column;
+        map<array<Nested>> complex_column;
+    }
+}
diff --git a/sql/core/src/test/avro/parquet-compat.avpr b/sql/core/src/test/avro/parquet-compat.avpr
new file mode 100644
index 0000000000000..a83b7c990dd2e
--- /dev/null
+++ b/sql/core/src/test/avro/parquet-compat.avpr
@@ -0,0 +1,86 @@
+{
+  "protocol" : "CompatibilityTest",
+  "namespace" : "org.apache.spark.sql.parquet.test.avro",
+  "types" : [ {
+    "type" : "record",
+    "name" : "Nested",
+    "fields" : [ {
+      "name" : "nested_ints_column",
+      "type" : {
+        "type" : "array",
+        "items" : "int"
+      }
+    }, {
+      "name" : "nested_string_column",
+      "type" : "string"
+    } ]
+  }, {
+    "type" : "record",
+    "name" : "ParquetAvroCompat",
+    "fields" : [ {
+      "name" : "bool_column",
+      "type" : "boolean"
+    }, {
+      "name" : "int_column",
+      "type" : "int"
+    }, {
+      "name" : "long_column",
+      "type" : "long"
+    }, {
+      "name" : "float_column",
+      "type" : "float"
+    }, {
+      "name" : "double_column",
+      "type" : "double"
+    }, {
+      "name" : "binary_column",
+      "type" : "bytes"
+    }, {
+      "name" : "string_column",
+      "type" : "string"
+    }, {
+      "name" : "maybe_bool_column",
+      "type" : [ "null", "boolean" ]
+    }, {
+      "name" : "maybe_int_column",
+      "type" : [ "null", "int" ]
+    }, {
+      "name" : "maybe_long_column",
+      "type" : [ "null", "long" ]
+    }, {
+      "name" : "maybe_float_column",
+      "type" : [ "null", "float" ]
+    }, {
+      "name" : "maybe_double_column",
+      "type" : [ "null", "double" ]
+    }, {
+      "name" : "maybe_binary_column",
+      "type" : [ "null", "bytes" ]
+    }, {
+      "name" : "maybe_string_column",
+      "type" : [ "null", "string" ]
+    }, {
+      "name" : "strings_column",
+      "type" : {
+        "type" : "array",
+        "items" : "string"
+      }
+    }, {
+      "name" : "string_to_int_column",
+      "type" : {
+        "type" : "map",
+        "values" : "int"
+      }
+    }, {
+      "name" : "complex_column",
+      "type" : {
+        "type" : "map",
+        "values" : {
+          "type" : "array",
+          "items" : "Nested"
+        }
+      }
+    } ]
+  } ],
+  "messages" : { }
+}
\ No newline at end of file
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java
new file mode 100644
index 0000000000000..daec65a5bbe57
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java
@@ -0,0 +1,17 @@
+/**
+ * Autogenerated by Avro
+ * 
+ * DO NOT EDIT DIRECTLY
+ */
+package org.apache.spark.sql.parquet.test.avro;
+
+@SuppressWarnings("all")
+@org.apache.avro.specific.AvroGenerated
+public interface CompatibilityTest {
+  public static final org.apache.avro.Protocol PROTOCOL = org.apache.avro.Protocol.parse("{\"protocol\":\"CompatibilityTest\",\"namespace\":\"org.apache.spark.sql.parquet.test.avro\",\"types\":[{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]},{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":\"Nested\"},\"avro.java.string\":\"String\"}}]}],\"messages\":{}}");
+
+  @SuppressWarnings("all")
+  public interface Callback extends CompatibilityTest {
+    public static final org.apache.avro.Protocol PROTOCOL = org.apache.spark.sql.parquet.test.avro.CompatibilityTest.PROTOCOL;
+  }
+}
\ No newline at end of file
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java
new file mode 100644
index 0000000000000..051f1ee903863
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java
@@ -0,0 +1,196 @@
+/**
+ * Autogenerated by Avro
+ * 
+ * DO NOT EDIT DIRECTLY
+ */
+package org.apache.spark.sql.parquet.test.avro;  
+@SuppressWarnings("all")
+@org.apache.avro.specific.AvroGenerated
+public class Nested extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {
+  public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Nested\",\"namespace\":\"org.apache.spark.sql.parquet.test.avro\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}");
+  public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; }
+  @Deprecated public java.util.List<java.lang.Integer> nested_ints_column;
+  @Deprecated public java.lang.String nested_string_column;
+
+  /**
+   * Default constructor.  Note that this does not initialize fields
+   * to their default values from the schema.  If that is desired then
+   * one should use <code>newBuilder()</code>. 
+   */
+  public Nested() {}
+
+  /**
+   * All-args constructor.
+   */
+  public Nested(java.util.List<java.lang.Integer> nested_ints_column, java.lang.String nested_string_column) {
+    this.nested_ints_column = nested_ints_column;
+    this.nested_string_column = nested_string_column;
+  }
+
+  public org.apache.avro.Schema getSchema() { return SCHEMA$; }
+  // Used by DatumWriter.  Applications should not call. 
+  public java.lang.Object get(int field$) {
+    switch (field$) {
+    case 0: return nested_ints_column;
+    case 1: return nested_string_column;
+    default: throw new org.apache.avro.AvroRuntimeException("Bad index");
+    }
+  }
+  // Used by DatumReader.  Applications should not call. 
+  @SuppressWarnings(value="unchecked")
+  public void put(int field$, java.lang.Object value$) {
+    switch (field$) {
+    case 0: nested_ints_column = (java.util.List<java.lang.Integer>)value$; break;
+    case 1: nested_string_column = (java.lang.String)value$; break;
+    default: throw new org.apache.avro.AvroRuntimeException("Bad index");
+    }
+  }
+
+  /**
+   * Gets the value of the 'nested_ints_column' field.
+   */
+  public java.util.List<java.lang.Integer> getNestedIntsColumn() {
+    return nested_ints_column;
+  }
+
+  /**
+   * Sets the value of the 'nested_ints_column' field.
+   * @param value the value to set.
+   */
+  public void setNestedIntsColumn(java.util.List<java.lang.Integer> value) {
+    this.nested_ints_column = value;
+  }
+
+  /**
+   * Gets the value of the 'nested_string_column' field.
+   */
+  public java.lang.String getNestedStringColumn() {
+    return nested_string_column;
+  }
+
+  /**
+   * Sets the value of the 'nested_string_column' field.
+   * @param value the value to set.
+   */
+  public void setNestedStringColumn(java.lang.String value) {
+    this.nested_string_column = value;
+  }
+
+  /** Creates a new Nested RecordBuilder */
+  public static org.apache.spark.sql.parquet.test.avro.Nested.Builder newBuilder() {
+    return new org.apache.spark.sql.parquet.test.avro.Nested.Builder();
+  }
+  
+  /** Creates a new Nested RecordBuilder by copying an existing Builder */
+  public static org.apache.spark.sql.parquet.test.avro.Nested.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.Nested.Builder other) {
+    return new org.apache.spark.sql.parquet.test.avro.Nested.Builder(other);
+  }
+  
+  /** Creates a new Nested RecordBuilder by copying an existing Nested instance */
+  public static org.apache.spark.sql.parquet.test.avro.Nested.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.Nested other) {
+    return new org.apache.spark.sql.parquet.test.avro.Nested.Builder(other);
+  }
+  
+  /**
+   * RecordBuilder for Nested instances.
+   */
+  public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase<Nested>
+    implements org.apache.avro.data.RecordBuilder<Nested> {
+
+    private java.util.List<java.lang.Integer> nested_ints_column;
+    private java.lang.String nested_string_column;
+
+    /** Creates a new Builder */
+    private Builder() {
+      super(org.apache.spark.sql.parquet.test.avro.Nested.SCHEMA$);
+    }
+    
+    /** Creates a Builder by copying an existing Builder */
+    private Builder(org.apache.spark.sql.parquet.test.avro.Nested.Builder other) {
+      super(other);
+      if (isValidValue(fields()[0], other.nested_ints_column)) {
+        this.nested_ints_column = data().deepCopy(fields()[0].schema(), other.nested_ints_column);
+        fieldSetFlags()[0] = true;
+      }
+      if (isValidValue(fields()[1], other.nested_string_column)) {
+        this.nested_string_column = data().deepCopy(fields()[1].schema(), other.nested_string_column);
+        fieldSetFlags()[1] = true;
+      }
+    }
+    
+    /** Creates a Builder by copying an existing Nested instance */
+    private Builder(org.apache.spark.sql.parquet.test.avro.Nested other) {
+            super(org.apache.spark.sql.parquet.test.avro.Nested.SCHEMA$);
+      if (isValidValue(fields()[0], other.nested_ints_column)) {
+        this.nested_ints_column = data().deepCopy(fields()[0].schema(), other.nested_ints_column);
+        fieldSetFlags()[0] = true;
+      }
+      if (isValidValue(fields()[1], other.nested_string_column)) {
+        this.nested_string_column = data().deepCopy(fields()[1].schema(), other.nested_string_column);
+        fieldSetFlags()[1] = true;
+      }
+    }
+
+    /** Gets the value of the 'nested_ints_column' field */
+    public java.util.List<java.lang.Integer> getNestedIntsColumn() {
+      return nested_ints_column;
+    }
+    
+    /** Sets the value of the 'nested_ints_column' field */
+    public org.apache.spark.sql.parquet.test.avro.Nested.Builder setNestedIntsColumn(java.util.List<java.lang.Integer> value) {
+      validate(fields()[0], value);
+      this.nested_ints_column = value;
+      fieldSetFlags()[0] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'nested_ints_column' field has been set */
+    public boolean hasNestedIntsColumn() {
+      return fieldSetFlags()[0];
+    }
+    
+    /** Clears the value of the 'nested_ints_column' field */
+    public org.apache.spark.sql.parquet.test.avro.Nested.Builder clearNestedIntsColumn() {
+      nested_ints_column = null;
+      fieldSetFlags()[0] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'nested_string_column' field */
+    public java.lang.String getNestedStringColumn() {
+      return nested_string_column;
+    }
+    
+    /** Sets the value of the 'nested_string_column' field */
+    public org.apache.spark.sql.parquet.test.avro.Nested.Builder setNestedStringColumn(java.lang.String value) {
+      validate(fields()[1], value);
+      this.nested_string_column = value;
+      fieldSetFlags()[1] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'nested_string_column' field has been set */
+    public boolean hasNestedStringColumn() {
+      return fieldSetFlags()[1];
+    }
+    
+    /** Clears the value of the 'nested_string_column' field */
+    public org.apache.spark.sql.parquet.test.avro.Nested.Builder clearNestedStringColumn() {
+      nested_string_column = null;
+      fieldSetFlags()[1] = false;
+      return this;
+    }
+
+    @Override
+    public Nested build() {
+      try {
+        Nested record = new Nested();
+        record.nested_ints_column = fieldSetFlags()[0] ? this.nested_ints_column : (java.util.List<java.lang.Integer>) defaultValue(fields()[0]);
+        record.nested_string_column = fieldSetFlags()[1] ? this.nested_string_column : (java.lang.String) defaultValue(fields()[1]);
+        return record;
+      } catch (Exception e) {
+        throw new org.apache.avro.AvroRuntimeException(e);
+      }
+    }
+  }
+}
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java
new file mode 100644
index 0000000000000..354c9d73cca31
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/ParquetAvroCompat.java
@@ -0,0 +1,1001 @@
+/**
+ * Autogenerated by Avro
+ * 
+ * DO NOT EDIT DIRECTLY
+ */
+package org.apache.spark.sql.parquet.test.avro;  
+@SuppressWarnings("all")
+@org.apache.avro.specific.AvroGenerated
+public class ParquetAvroCompat extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {
+  public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"ParquetAvroCompat\",\"namespace\":\"org.apache.spark.sql.parquet.test.avro\",\"fields\":[{\"name\":\"bool_column\",\"type\":\"boolean\"},{\"name\":\"int_column\",\"type\":\"int\"},{\"name\":\"long_column\",\"type\":\"long\"},{\"name\":\"float_column\",\"type\":\"float\"},{\"name\":\"double_column\",\"type\":\"double\"},{\"name\":\"binary_column\",\"type\":\"bytes\"},{\"name\":\"string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"maybe_bool_column\",\"type\":[\"null\",\"boolean\"]},{\"name\":\"maybe_int_column\",\"type\":[\"null\",\"int\"]},{\"name\":\"maybe_long_column\",\"type\":[\"null\",\"long\"]},{\"name\":\"maybe_float_column\",\"type\":[\"null\",\"float\"]},{\"name\":\"maybe_double_column\",\"type\":[\"null\",\"double\"]},{\"name\":\"maybe_binary_column\",\"type\":[\"null\",\"bytes\"]},{\"name\":\"maybe_string_column\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}]},{\"name\":\"strings_column\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}},{\"name\":\"string_to_int_column\",\"type\":{\"type\":\"map\",\"values\":\"int\",\"avro.java.string\":\"String\"}},{\"name\":\"complex_column\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"Nested\",\"fields\":[{\"name\":\"nested_ints_column\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"nested_string_column\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]}},\"avro.java.string\":\"String\"}}]}");
+  public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; }
+  @Deprecated public boolean bool_column;
+  @Deprecated public int int_column;
+  @Deprecated public long long_column;
+  @Deprecated public float float_column;
+  @Deprecated public double double_column;
+  @Deprecated public java.nio.ByteBuffer binary_column;
+  @Deprecated public java.lang.String string_column;
+  @Deprecated public java.lang.Boolean maybe_bool_column;
+  @Deprecated public java.lang.Integer maybe_int_column;
+  @Deprecated public java.lang.Long maybe_long_column;
+  @Deprecated public java.lang.Float maybe_float_column;
+  @Deprecated public java.lang.Double maybe_double_column;
+  @Deprecated public java.nio.ByteBuffer maybe_binary_column;
+  @Deprecated public java.lang.String maybe_string_column;
+  @Deprecated public java.util.List<java.lang.String> strings_column;
+  @Deprecated public java.util.Map<java.lang.String,java.lang.Integer> string_to_int_column;
+  @Deprecated public java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> complex_column;
+
+  /**
+   * Default constructor.  Note that this does not initialize fields
+   * to their default values from the schema.  If that is desired then
+   * one should use <code>newBuilder()</code>. 
+   */
+  public ParquetAvroCompat() {}
+
+  /**
+   * All-args constructor.
+   */
+  public ParquetAvroCompat(java.lang.Boolean bool_column, java.lang.Integer int_column, java.lang.Long long_column, java.lang.Float float_column, java.lang.Double double_column, java.nio.ByteBuffer binary_column, java.lang.String string_column, java.lang.Boolean maybe_bool_column, java.lang.Integer maybe_int_column, java.lang.Long maybe_long_column, java.lang.Float maybe_float_column, java.lang.Double maybe_double_column, java.nio.ByteBuffer maybe_binary_column, java.lang.String maybe_string_column, java.util.List<java.lang.String> strings_column, java.util.Map<java.lang.String,java.lang.Integer> string_to_int_column, java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> complex_column) {
+    this.bool_column = bool_column;
+    this.int_column = int_column;
+    this.long_column = long_column;
+    this.float_column = float_column;
+    this.double_column = double_column;
+    this.binary_column = binary_column;
+    this.string_column = string_column;
+    this.maybe_bool_column = maybe_bool_column;
+    this.maybe_int_column = maybe_int_column;
+    this.maybe_long_column = maybe_long_column;
+    this.maybe_float_column = maybe_float_column;
+    this.maybe_double_column = maybe_double_column;
+    this.maybe_binary_column = maybe_binary_column;
+    this.maybe_string_column = maybe_string_column;
+    this.strings_column = strings_column;
+    this.string_to_int_column = string_to_int_column;
+    this.complex_column = complex_column;
+  }
+
+  public org.apache.avro.Schema getSchema() { return SCHEMA$; }
+  // Used by DatumWriter.  Applications should not call. 
+  public java.lang.Object get(int field$) {
+    switch (field$) {
+    case 0: return bool_column;
+    case 1: return int_column;
+    case 2: return long_column;
+    case 3: return float_column;
+    case 4: return double_column;
+    case 5: return binary_column;
+    case 6: return string_column;
+    case 7: return maybe_bool_column;
+    case 8: return maybe_int_column;
+    case 9: return maybe_long_column;
+    case 10: return maybe_float_column;
+    case 11: return maybe_double_column;
+    case 12: return maybe_binary_column;
+    case 13: return maybe_string_column;
+    case 14: return strings_column;
+    case 15: return string_to_int_column;
+    case 16: return complex_column;
+    default: throw new org.apache.avro.AvroRuntimeException("Bad index");
+    }
+  }
+  // Used by DatumReader.  Applications should not call. 
+  @SuppressWarnings(value="unchecked")
+  public void put(int field$, java.lang.Object value$) {
+    switch (field$) {
+    case 0: bool_column = (java.lang.Boolean)value$; break;
+    case 1: int_column = (java.lang.Integer)value$; break;
+    case 2: long_column = (java.lang.Long)value$; break;
+    case 3: float_column = (java.lang.Float)value$; break;
+    case 4: double_column = (java.lang.Double)value$; break;
+    case 5: binary_column = (java.nio.ByteBuffer)value$; break;
+    case 6: string_column = (java.lang.String)value$; break;
+    case 7: maybe_bool_column = (java.lang.Boolean)value$; break;
+    case 8: maybe_int_column = (java.lang.Integer)value$; break;
+    case 9: maybe_long_column = (java.lang.Long)value$; break;
+    case 10: maybe_float_column = (java.lang.Float)value$; break;
+    case 11: maybe_double_column = (java.lang.Double)value$; break;
+    case 12: maybe_binary_column = (java.nio.ByteBuffer)value$; break;
+    case 13: maybe_string_column = (java.lang.String)value$; break;
+    case 14: strings_column = (java.util.List<java.lang.String>)value$; break;
+    case 15: string_to_int_column = (java.util.Map<java.lang.String,java.lang.Integer>)value$; break;
+    case 16: complex_column = (java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>>)value$; break;
+    default: throw new org.apache.avro.AvroRuntimeException("Bad index");
+    }
+  }
+
+  /**
+   * Gets the value of the 'bool_column' field.
+   */
+  public java.lang.Boolean getBoolColumn() {
+    return bool_column;
+  }
+
+  /**
+   * Sets the value of the 'bool_column' field.
+   * @param value the value to set.
+   */
+  public void setBoolColumn(java.lang.Boolean value) {
+    this.bool_column = value;
+  }
+
+  /**
+   * Gets the value of the 'int_column' field.
+   */
+  public java.lang.Integer getIntColumn() {
+    return int_column;
+  }
+
+  /**
+   * Sets the value of the 'int_column' field.
+   * @param value the value to set.
+   */
+  public void setIntColumn(java.lang.Integer value) {
+    this.int_column = value;
+  }
+
+  /**
+   * Gets the value of the 'long_column' field.
+   */
+  public java.lang.Long getLongColumn() {
+    return long_column;
+  }
+
+  /**
+   * Sets the value of the 'long_column' field.
+   * @param value the value to set.
+   */
+  public void setLongColumn(java.lang.Long value) {
+    this.long_column = value;
+  }
+
+  /**
+   * Gets the value of the 'float_column' field.
+   */
+  public java.lang.Float getFloatColumn() {
+    return float_column;
+  }
+
+  /**
+   * Sets the value of the 'float_column' field.
+   * @param value the value to set.
+   */
+  public void setFloatColumn(java.lang.Float value) {
+    this.float_column = value;
+  }
+
+  /**
+   * Gets the value of the 'double_column' field.
+   */
+  public java.lang.Double getDoubleColumn() {
+    return double_column;
+  }
+
+  /**
+   * Sets the value of the 'double_column' field.
+   * @param value the value to set.
+   */
+  public void setDoubleColumn(java.lang.Double value) {
+    this.double_column = value;
+  }
+
+  /**
+   * Gets the value of the 'binary_column' field.
+   */
+  public java.nio.ByteBuffer getBinaryColumn() {
+    return binary_column;
+  }
+
+  /**
+   * Sets the value of the 'binary_column' field.
+   * @param value the value to set.
+   */
+  public void setBinaryColumn(java.nio.ByteBuffer value) {
+    this.binary_column = value;
+  }
+
+  /**
+   * Gets the value of the 'string_column' field.
+   */
+  public java.lang.String getStringColumn() {
+    return string_column;
+  }
+
+  /**
+   * Sets the value of the 'string_column' field.
+   * @param value the value to set.
+   */
+  public void setStringColumn(java.lang.String value) {
+    this.string_column = value;
+  }
+
+  /**
+   * Gets the value of the 'maybe_bool_column' field.
+   */
+  public java.lang.Boolean getMaybeBoolColumn() {
+    return maybe_bool_column;
+  }
+
+  /**
+   * Sets the value of the 'maybe_bool_column' field.
+   * @param value the value to set.
+   */
+  public void setMaybeBoolColumn(java.lang.Boolean value) {
+    this.maybe_bool_column = value;
+  }
+
+  /**
+   * Gets the value of the 'maybe_int_column' field.
+   */
+  public java.lang.Integer getMaybeIntColumn() {
+    return maybe_int_column;
+  }
+
+  /**
+   * Sets the value of the 'maybe_int_column' field.
+   * @param value the value to set.
+   */
+  public void setMaybeIntColumn(java.lang.Integer value) {
+    this.maybe_int_column = value;
+  }
+
+  /**
+   * Gets the value of the 'maybe_long_column' field.
+   */
+  public java.lang.Long getMaybeLongColumn() {
+    return maybe_long_column;
+  }
+
+  /**
+   * Sets the value of the 'maybe_long_column' field.
+   * @param value the value to set.
+   */
+  public void setMaybeLongColumn(java.lang.Long value) {
+    this.maybe_long_column = value;
+  }
+
+  /**
+   * Gets the value of the 'maybe_float_column' field.
+   */
+  public java.lang.Float getMaybeFloatColumn() {
+    return maybe_float_column;
+  }
+
+  /**
+   * Sets the value of the 'maybe_float_column' field.
+   * @param value the value to set.
+   */
+  public void setMaybeFloatColumn(java.lang.Float value) {
+    this.maybe_float_column = value;
+  }
+
+  /**
+   * Gets the value of the 'maybe_double_column' field.
+   */
+  public java.lang.Double getMaybeDoubleColumn() {
+    return maybe_double_column;
+  }
+
+  /**
+   * Sets the value of the 'maybe_double_column' field.
+   * @param value the value to set.
+   */
+  public void setMaybeDoubleColumn(java.lang.Double value) {
+    this.maybe_double_column = value;
+  }
+
+  /**
+   * Gets the value of the 'maybe_binary_column' field.
+   */
+  public java.nio.ByteBuffer getMaybeBinaryColumn() {
+    return maybe_binary_column;
+  }
+
+  /**
+   * Sets the value of the 'maybe_binary_column' field.
+   * @param value the value to set.
+   */
+  public void setMaybeBinaryColumn(java.nio.ByteBuffer value) {
+    this.maybe_binary_column = value;
+  }
+
+  /**
+   * Gets the value of the 'maybe_string_column' field.
+   */
+  public java.lang.String getMaybeStringColumn() {
+    return maybe_string_column;
+  }
+
+  /**
+   * Sets the value of the 'maybe_string_column' field.
+   * @param value the value to set.
+   */
+  public void setMaybeStringColumn(java.lang.String value) {
+    this.maybe_string_column = value;
+  }
+
+  /**
+   * Gets the value of the 'strings_column' field.
+   */
+  public java.util.List<java.lang.String> getStringsColumn() {
+    return strings_column;
+  }
+
+  /**
+   * Sets the value of the 'strings_column' field.
+   * @param value the value to set.
+   */
+  public void setStringsColumn(java.util.List<java.lang.String> value) {
+    this.strings_column = value;
+  }
+
+  /**
+   * Gets the value of the 'string_to_int_column' field.
+   */
+  public java.util.Map<java.lang.String,java.lang.Integer> getStringToIntColumn() {
+    return string_to_int_column;
+  }
+
+  /**
+   * Sets the value of the 'string_to_int_column' field.
+   * @param value the value to set.
+   */
+  public void setStringToIntColumn(java.util.Map<java.lang.String,java.lang.Integer> value) {
+    this.string_to_int_column = value;
+  }
+
+  /**
+   * Gets the value of the 'complex_column' field.
+   */
+  public java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> getComplexColumn() {
+    return complex_column;
+  }
+
+  /**
+   * Sets the value of the 'complex_column' field.
+   * @param value the value to set.
+   */
+  public void setComplexColumn(java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> value) {
+    this.complex_column = value;
+  }
+
+  /** Creates a new ParquetAvroCompat RecordBuilder */
+  public static org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder newBuilder() {
+    return new org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder();
+  }
+  
+  /** Creates a new ParquetAvroCompat RecordBuilder by copying an existing Builder */
+  public static org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder other) {
+    return new org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder(other);
+  }
+  
+  /** Creates a new ParquetAvroCompat RecordBuilder by copying an existing ParquetAvroCompat instance */
+  public static org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder newBuilder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat other) {
+    return new org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder(other);
+  }
+  
+  /**
+   * RecordBuilder for ParquetAvroCompat instances.
+   */
+  public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase<ParquetAvroCompat>
+    implements org.apache.avro.data.RecordBuilder<ParquetAvroCompat> {
+
+    private boolean bool_column;
+    private int int_column;
+    private long long_column;
+    private float float_column;
+    private double double_column;
+    private java.nio.ByteBuffer binary_column;
+    private java.lang.String string_column;
+    private java.lang.Boolean maybe_bool_column;
+    private java.lang.Integer maybe_int_column;
+    private java.lang.Long maybe_long_column;
+    private java.lang.Float maybe_float_column;
+    private java.lang.Double maybe_double_column;
+    private java.nio.ByteBuffer maybe_binary_column;
+    private java.lang.String maybe_string_column;
+    private java.util.List<java.lang.String> strings_column;
+    private java.util.Map<java.lang.String,java.lang.Integer> string_to_int_column;
+    private java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> complex_column;
+
+    /** Creates a new Builder */
+    private Builder() {
+      super(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.SCHEMA$);
+    }
+    
+    /** Creates a Builder by copying an existing Builder */
+    private Builder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder other) {
+      super(other);
+      if (isValidValue(fields()[0], other.bool_column)) {
+        this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column);
+        fieldSetFlags()[0] = true;
+      }
+      if (isValidValue(fields()[1], other.int_column)) {
+        this.int_column = data().deepCopy(fields()[1].schema(), other.int_column);
+        fieldSetFlags()[1] = true;
+      }
+      if (isValidValue(fields()[2], other.long_column)) {
+        this.long_column = data().deepCopy(fields()[2].schema(), other.long_column);
+        fieldSetFlags()[2] = true;
+      }
+      if (isValidValue(fields()[3], other.float_column)) {
+        this.float_column = data().deepCopy(fields()[3].schema(), other.float_column);
+        fieldSetFlags()[3] = true;
+      }
+      if (isValidValue(fields()[4], other.double_column)) {
+        this.double_column = data().deepCopy(fields()[4].schema(), other.double_column);
+        fieldSetFlags()[4] = true;
+      }
+      if (isValidValue(fields()[5], other.binary_column)) {
+        this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column);
+        fieldSetFlags()[5] = true;
+      }
+      if (isValidValue(fields()[6], other.string_column)) {
+        this.string_column = data().deepCopy(fields()[6].schema(), other.string_column);
+        fieldSetFlags()[6] = true;
+      }
+      if (isValidValue(fields()[7], other.maybe_bool_column)) {
+        this.maybe_bool_column = data().deepCopy(fields()[7].schema(), other.maybe_bool_column);
+        fieldSetFlags()[7] = true;
+      }
+      if (isValidValue(fields()[8], other.maybe_int_column)) {
+        this.maybe_int_column = data().deepCopy(fields()[8].schema(), other.maybe_int_column);
+        fieldSetFlags()[8] = true;
+      }
+      if (isValidValue(fields()[9], other.maybe_long_column)) {
+        this.maybe_long_column = data().deepCopy(fields()[9].schema(), other.maybe_long_column);
+        fieldSetFlags()[9] = true;
+      }
+      if (isValidValue(fields()[10], other.maybe_float_column)) {
+        this.maybe_float_column = data().deepCopy(fields()[10].schema(), other.maybe_float_column);
+        fieldSetFlags()[10] = true;
+      }
+      if (isValidValue(fields()[11], other.maybe_double_column)) {
+        this.maybe_double_column = data().deepCopy(fields()[11].schema(), other.maybe_double_column);
+        fieldSetFlags()[11] = true;
+      }
+      if (isValidValue(fields()[12], other.maybe_binary_column)) {
+        this.maybe_binary_column = data().deepCopy(fields()[12].schema(), other.maybe_binary_column);
+        fieldSetFlags()[12] = true;
+      }
+      if (isValidValue(fields()[13], other.maybe_string_column)) {
+        this.maybe_string_column = data().deepCopy(fields()[13].schema(), other.maybe_string_column);
+        fieldSetFlags()[13] = true;
+      }
+      if (isValidValue(fields()[14], other.strings_column)) {
+        this.strings_column = data().deepCopy(fields()[14].schema(), other.strings_column);
+        fieldSetFlags()[14] = true;
+      }
+      if (isValidValue(fields()[15], other.string_to_int_column)) {
+        this.string_to_int_column = data().deepCopy(fields()[15].schema(), other.string_to_int_column);
+        fieldSetFlags()[15] = true;
+      }
+      if (isValidValue(fields()[16], other.complex_column)) {
+        this.complex_column = data().deepCopy(fields()[16].schema(), other.complex_column);
+        fieldSetFlags()[16] = true;
+      }
+    }
+    
+    /** Creates a Builder by copying an existing ParquetAvroCompat instance */
+    private Builder(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat other) {
+            super(org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.SCHEMA$);
+      if (isValidValue(fields()[0], other.bool_column)) {
+        this.bool_column = data().deepCopy(fields()[0].schema(), other.bool_column);
+        fieldSetFlags()[0] = true;
+      }
+      if (isValidValue(fields()[1], other.int_column)) {
+        this.int_column = data().deepCopy(fields()[1].schema(), other.int_column);
+        fieldSetFlags()[1] = true;
+      }
+      if (isValidValue(fields()[2], other.long_column)) {
+        this.long_column = data().deepCopy(fields()[2].schema(), other.long_column);
+        fieldSetFlags()[2] = true;
+      }
+      if (isValidValue(fields()[3], other.float_column)) {
+        this.float_column = data().deepCopy(fields()[3].schema(), other.float_column);
+        fieldSetFlags()[3] = true;
+      }
+      if (isValidValue(fields()[4], other.double_column)) {
+        this.double_column = data().deepCopy(fields()[4].schema(), other.double_column);
+        fieldSetFlags()[4] = true;
+      }
+      if (isValidValue(fields()[5], other.binary_column)) {
+        this.binary_column = data().deepCopy(fields()[5].schema(), other.binary_column);
+        fieldSetFlags()[5] = true;
+      }
+      if (isValidValue(fields()[6], other.string_column)) {
+        this.string_column = data().deepCopy(fields()[6].schema(), other.string_column);
+        fieldSetFlags()[6] = true;
+      }
+      if (isValidValue(fields()[7], other.maybe_bool_column)) {
+        this.maybe_bool_column = data().deepCopy(fields()[7].schema(), other.maybe_bool_column);
+        fieldSetFlags()[7] = true;
+      }
+      if (isValidValue(fields()[8], other.maybe_int_column)) {
+        this.maybe_int_column = data().deepCopy(fields()[8].schema(), other.maybe_int_column);
+        fieldSetFlags()[8] = true;
+      }
+      if (isValidValue(fields()[9], other.maybe_long_column)) {
+        this.maybe_long_column = data().deepCopy(fields()[9].schema(), other.maybe_long_column);
+        fieldSetFlags()[9] = true;
+      }
+      if (isValidValue(fields()[10], other.maybe_float_column)) {
+        this.maybe_float_column = data().deepCopy(fields()[10].schema(), other.maybe_float_column);
+        fieldSetFlags()[10] = true;
+      }
+      if (isValidValue(fields()[11], other.maybe_double_column)) {
+        this.maybe_double_column = data().deepCopy(fields()[11].schema(), other.maybe_double_column);
+        fieldSetFlags()[11] = true;
+      }
+      if (isValidValue(fields()[12], other.maybe_binary_column)) {
+        this.maybe_binary_column = data().deepCopy(fields()[12].schema(), other.maybe_binary_column);
+        fieldSetFlags()[12] = true;
+      }
+      if (isValidValue(fields()[13], other.maybe_string_column)) {
+        this.maybe_string_column = data().deepCopy(fields()[13].schema(), other.maybe_string_column);
+        fieldSetFlags()[13] = true;
+      }
+      if (isValidValue(fields()[14], other.strings_column)) {
+        this.strings_column = data().deepCopy(fields()[14].schema(), other.strings_column);
+        fieldSetFlags()[14] = true;
+      }
+      if (isValidValue(fields()[15], other.string_to_int_column)) {
+        this.string_to_int_column = data().deepCopy(fields()[15].schema(), other.string_to_int_column);
+        fieldSetFlags()[15] = true;
+      }
+      if (isValidValue(fields()[16], other.complex_column)) {
+        this.complex_column = data().deepCopy(fields()[16].schema(), other.complex_column);
+        fieldSetFlags()[16] = true;
+      }
+    }
+
+    /** Gets the value of the 'bool_column' field */
+    public java.lang.Boolean getBoolColumn() {
+      return bool_column;
+    }
+    
+    /** Sets the value of the 'bool_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setBoolColumn(boolean value) {
+      validate(fields()[0], value);
+      this.bool_column = value;
+      fieldSetFlags()[0] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'bool_column' field has been set */
+    public boolean hasBoolColumn() {
+      return fieldSetFlags()[0];
+    }
+    
+    /** Clears the value of the 'bool_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearBoolColumn() {
+      fieldSetFlags()[0] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'int_column' field */
+    public java.lang.Integer getIntColumn() {
+      return int_column;
+    }
+    
+    /** Sets the value of the 'int_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setIntColumn(int value) {
+      validate(fields()[1], value);
+      this.int_column = value;
+      fieldSetFlags()[1] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'int_column' field has been set */
+    public boolean hasIntColumn() {
+      return fieldSetFlags()[1];
+    }
+    
+    /** Clears the value of the 'int_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearIntColumn() {
+      fieldSetFlags()[1] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'long_column' field */
+    public java.lang.Long getLongColumn() {
+      return long_column;
+    }
+    
+    /** Sets the value of the 'long_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setLongColumn(long value) {
+      validate(fields()[2], value);
+      this.long_column = value;
+      fieldSetFlags()[2] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'long_column' field has been set */
+    public boolean hasLongColumn() {
+      return fieldSetFlags()[2];
+    }
+    
+    /** Clears the value of the 'long_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearLongColumn() {
+      fieldSetFlags()[2] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'float_column' field */
+    public java.lang.Float getFloatColumn() {
+      return float_column;
+    }
+    
+    /** Sets the value of the 'float_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setFloatColumn(float value) {
+      validate(fields()[3], value);
+      this.float_column = value;
+      fieldSetFlags()[3] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'float_column' field has been set */
+    public boolean hasFloatColumn() {
+      return fieldSetFlags()[3];
+    }
+    
+    /** Clears the value of the 'float_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearFloatColumn() {
+      fieldSetFlags()[3] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'double_column' field */
+    public java.lang.Double getDoubleColumn() {
+      return double_column;
+    }
+    
+    /** Sets the value of the 'double_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setDoubleColumn(double value) {
+      validate(fields()[4], value);
+      this.double_column = value;
+      fieldSetFlags()[4] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'double_column' field has been set */
+    public boolean hasDoubleColumn() {
+      return fieldSetFlags()[4];
+    }
+    
+    /** Clears the value of the 'double_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearDoubleColumn() {
+      fieldSetFlags()[4] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'binary_column' field */
+    public java.nio.ByteBuffer getBinaryColumn() {
+      return binary_column;
+    }
+    
+    /** Sets the value of the 'binary_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setBinaryColumn(java.nio.ByteBuffer value) {
+      validate(fields()[5], value);
+      this.binary_column = value;
+      fieldSetFlags()[5] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'binary_column' field has been set */
+    public boolean hasBinaryColumn() {
+      return fieldSetFlags()[5];
+    }
+    
+    /** Clears the value of the 'binary_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearBinaryColumn() {
+      binary_column = null;
+      fieldSetFlags()[5] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'string_column' field */
+    public java.lang.String getStringColumn() {
+      return string_column;
+    }
+    
+    /** Sets the value of the 'string_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setStringColumn(java.lang.String value) {
+      validate(fields()[6], value);
+      this.string_column = value;
+      fieldSetFlags()[6] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'string_column' field has been set */
+    public boolean hasStringColumn() {
+      return fieldSetFlags()[6];
+    }
+    
+    /** Clears the value of the 'string_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearStringColumn() {
+      string_column = null;
+      fieldSetFlags()[6] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'maybe_bool_column' field */
+    public java.lang.Boolean getMaybeBoolColumn() {
+      return maybe_bool_column;
+    }
+    
+    /** Sets the value of the 'maybe_bool_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeBoolColumn(java.lang.Boolean value) {
+      validate(fields()[7], value);
+      this.maybe_bool_column = value;
+      fieldSetFlags()[7] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'maybe_bool_column' field has been set */
+    public boolean hasMaybeBoolColumn() {
+      return fieldSetFlags()[7];
+    }
+    
+    /** Clears the value of the 'maybe_bool_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeBoolColumn() {
+      maybe_bool_column = null;
+      fieldSetFlags()[7] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'maybe_int_column' field */
+    public java.lang.Integer getMaybeIntColumn() {
+      return maybe_int_column;
+    }
+    
+    /** Sets the value of the 'maybe_int_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeIntColumn(java.lang.Integer value) {
+      validate(fields()[8], value);
+      this.maybe_int_column = value;
+      fieldSetFlags()[8] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'maybe_int_column' field has been set */
+    public boolean hasMaybeIntColumn() {
+      return fieldSetFlags()[8];
+    }
+    
+    /** Clears the value of the 'maybe_int_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeIntColumn() {
+      maybe_int_column = null;
+      fieldSetFlags()[8] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'maybe_long_column' field */
+    public java.lang.Long getMaybeLongColumn() {
+      return maybe_long_column;
+    }
+    
+    /** Sets the value of the 'maybe_long_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeLongColumn(java.lang.Long value) {
+      validate(fields()[9], value);
+      this.maybe_long_column = value;
+      fieldSetFlags()[9] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'maybe_long_column' field has been set */
+    public boolean hasMaybeLongColumn() {
+      return fieldSetFlags()[9];
+    }
+    
+    /** Clears the value of the 'maybe_long_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeLongColumn() {
+      maybe_long_column = null;
+      fieldSetFlags()[9] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'maybe_float_column' field */
+    public java.lang.Float getMaybeFloatColumn() {
+      return maybe_float_column;
+    }
+    
+    /** Sets the value of the 'maybe_float_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeFloatColumn(java.lang.Float value) {
+      validate(fields()[10], value);
+      this.maybe_float_column = value;
+      fieldSetFlags()[10] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'maybe_float_column' field has been set */
+    public boolean hasMaybeFloatColumn() {
+      return fieldSetFlags()[10];
+    }
+    
+    /** Clears the value of the 'maybe_float_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeFloatColumn() {
+      maybe_float_column = null;
+      fieldSetFlags()[10] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'maybe_double_column' field */
+    public java.lang.Double getMaybeDoubleColumn() {
+      return maybe_double_column;
+    }
+    
+    /** Sets the value of the 'maybe_double_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeDoubleColumn(java.lang.Double value) {
+      validate(fields()[11], value);
+      this.maybe_double_column = value;
+      fieldSetFlags()[11] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'maybe_double_column' field has been set */
+    public boolean hasMaybeDoubleColumn() {
+      return fieldSetFlags()[11];
+    }
+    
+    /** Clears the value of the 'maybe_double_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeDoubleColumn() {
+      maybe_double_column = null;
+      fieldSetFlags()[11] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'maybe_binary_column' field */
+    public java.nio.ByteBuffer getMaybeBinaryColumn() {
+      return maybe_binary_column;
+    }
+    
+    /** Sets the value of the 'maybe_binary_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeBinaryColumn(java.nio.ByteBuffer value) {
+      validate(fields()[12], value);
+      this.maybe_binary_column = value;
+      fieldSetFlags()[12] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'maybe_binary_column' field has been set */
+    public boolean hasMaybeBinaryColumn() {
+      return fieldSetFlags()[12];
+    }
+    
+    /** Clears the value of the 'maybe_binary_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeBinaryColumn() {
+      maybe_binary_column = null;
+      fieldSetFlags()[12] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'maybe_string_column' field */
+    public java.lang.String getMaybeStringColumn() {
+      return maybe_string_column;
+    }
+    
+    /** Sets the value of the 'maybe_string_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setMaybeStringColumn(java.lang.String value) {
+      validate(fields()[13], value);
+      this.maybe_string_column = value;
+      fieldSetFlags()[13] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'maybe_string_column' field has been set */
+    public boolean hasMaybeStringColumn() {
+      return fieldSetFlags()[13];
+    }
+    
+    /** Clears the value of the 'maybe_string_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearMaybeStringColumn() {
+      maybe_string_column = null;
+      fieldSetFlags()[13] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'strings_column' field */
+    public java.util.List<java.lang.String> getStringsColumn() {
+      return strings_column;
+    }
+    
+    /** Sets the value of the 'strings_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setStringsColumn(java.util.List<java.lang.String> value) {
+      validate(fields()[14], value);
+      this.strings_column = value;
+      fieldSetFlags()[14] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'strings_column' field has been set */
+    public boolean hasStringsColumn() {
+      return fieldSetFlags()[14];
+    }
+    
+    /** Clears the value of the 'strings_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearStringsColumn() {
+      strings_column = null;
+      fieldSetFlags()[14] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'string_to_int_column' field */
+    public java.util.Map<java.lang.String,java.lang.Integer> getStringToIntColumn() {
+      return string_to_int_column;
+    }
+    
+    /** Sets the value of the 'string_to_int_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setStringToIntColumn(java.util.Map<java.lang.String,java.lang.Integer> value) {
+      validate(fields()[15], value);
+      this.string_to_int_column = value;
+      fieldSetFlags()[15] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'string_to_int_column' field has been set */
+    public boolean hasStringToIntColumn() {
+      return fieldSetFlags()[15];
+    }
+    
+    /** Clears the value of the 'string_to_int_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearStringToIntColumn() {
+      string_to_int_column = null;
+      fieldSetFlags()[15] = false;
+      return this;
+    }
+
+    /** Gets the value of the 'complex_column' field */
+    public java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> getComplexColumn() {
+      return complex_column;
+    }
+    
+    /** Sets the value of the 'complex_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder setComplexColumn(java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>> value) {
+      validate(fields()[16], value);
+      this.complex_column = value;
+      fieldSetFlags()[16] = true;
+      return this; 
+    }
+    
+    /** Checks whether the 'complex_column' field has been set */
+    public boolean hasComplexColumn() {
+      return fieldSetFlags()[16];
+    }
+    
+    /** Clears the value of the 'complex_column' field */
+    public org.apache.spark.sql.parquet.test.avro.ParquetAvroCompat.Builder clearComplexColumn() {
+      complex_column = null;
+      fieldSetFlags()[16] = false;
+      return this;
+    }
+
+    @Override
+    public ParquetAvroCompat build() {
+      try {
+        ParquetAvroCompat record = new ParquetAvroCompat();
+        record.bool_column = fieldSetFlags()[0] ? this.bool_column : (java.lang.Boolean) defaultValue(fields()[0]);
+        record.int_column = fieldSetFlags()[1] ? this.int_column : (java.lang.Integer) defaultValue(fields()[1]);
+        record.long_column = fieldSetFlags()[2] ? this.long_column : (java.lang.Long) defaultValue(fields()[2]);
+        record.float_column = fieldSetFlags()[3] ? this.float_column : (java.lang.Float) defaultValue(fields()[3]);
+        record.double_column = fieldSetFlags()[4] ? this.double_column : (java.lang.Double) defaultValue(fields()[4]);
+        record.binary_column = fieldSetFlags()[5] ? this.binary_column : (java.nio.ByteBuffer) defaultValue(fields()[5]);
+        record.string_column = fieldSetFlags()[6] ? this.string_column : (java.lang.String) defaultValue(fields()[6]);
+        record.maybe_bool_column = fieldSetFlags()[7] ? this.maybe_bool_column : (java.lang.Boolean) defaultValue(fields()[7]);
+        record.maybe_int_column = fieldSetFlags()[8] ? this.maybe_int_column : (java.lang.Integer) defaultValue(fields()[8]);
+        record.maybe_long_column = fieldSetFlags()[9] ? this.maybe_long_column : (java.lang.Long) defaultValue(fields()[9]);
+        record.maybe_float_column = fieldSetFlags()[10] ? this.maybe_float_column : (java.lang.Float) defaultValue(fields()[10]);
+        record.maybe_double_column = fieldSetFlags()[11] ? this.maybe_double_column : (java.lang.Double) defaultValue(fields()[11]);
+        record.maybe_binary_column = fieldSetFlags()[12] ? this.maybe_binary_column : (java.nio.ByteBuffer) defaultValue(fields()[12]);
+        record.maybe_string_column = fieldSetFlags()[13] ? this.maybe_string_column : (java.lang.String) defaultValue(fields()[13]);
+        record.strings_column = fieldSetFlags()[14] ? this.strings_column : (java.util.List<java.lang.String>) defaultValue(fields()[14]);
+        record.string_to_int_column = fieldSetFlags()[15] ? this.string_to_int_column : (java.util.Map<java.lang.String,java.lang.Integer>) defaultValue(fields()[15]);
+        record.complex_column = fieldSetFlags()[16] ? this.complex_column : (java.util.Map<java.lang.String,java.util.List<org.apache.spark.sql.parquet.test.avro.Nested>>) defaultValue(fields()[16]);
+        return record;
+      } catch (Exception e) {
+        throw new org.apache.avro.AvroRuntimeException(e);
+      }
+    }
+  }
+}
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java
new file mode 100644
index 0000000000000..281e60cc3ae34
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java
@@ -0,0 +1,541 @@
+/**
+ * Autogenerated by Thrift Compiler (0.9.2)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ *  @generated
+ */
+package org.apache.spark.sql.parquet.test.thrift;
+
+import org.apache.thrift.scheme.IScheme;
+import org.apache.thrift.scheme.SchemeFactory;
+import org.apache.thrift.scheme.StandardScheme;
+
+import org.apache.thrift.scheme.TupleScheme;
+import org.apache.thrift.protocol.TTupleProtocol;
+import org.apache.thrift.protocol.TProtocolException;
+import org.apache.thrift.EncodingUtils;
+import org.apache.thrift.TException;
+import org.apache.thrift.async.AsyncMethodCallback;
+import org.apache.thrift.server.AbstractNonblockingServer.*;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.EnumMap;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.EnumSet;
+import java.util.Collections;
+import java.util.BitSet;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import javax.annotation.Generated;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@SuppressWarnings({"cast", "rawtypes", "serial", "unchecked"})
+@Generated(value = "Autogenerated by Thrift Compiler (0.9.2)", date = "2015-7-7")
+public class Nested implements org.apache.thrift.TBase<Nested, Nested._Fields>, java.io.Serializable, Cloneable, Comparable<Nested> {
+  private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("Nested");
+
+  private static final org.apache.thrift.protocol.TField NESTED_INTS_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("nestedIntsColumn", org.apache.thrift.protocol.TType.LIST, (short)1);
+  private static final org.apache.thrift.protocol.TField NESTED_STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("nestedStringColumn", org.apache.thrift.protocol.TType.STRING, (short)2);
+
+  private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
+  static {
+    schemes.put(StandardScheme.class, new NestedStandardSchemeFactory());
+    schemes.put(TupleScheme.class, new NestedTupleSchemeFactory());
+  }
+
+  public List<Integer> nestedIntsColumn; // required
+  public String nestedStringColumn; // required
+
+  /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
+  public enum _Fields implements org.apache.thrift.TFieldIdEnum {
+    NESTED_INTS_COLUMN((short)1, "nestedIntsColumn"),
+    NESTED_STRING_COLUMN((short)2, "nestedStringColumn");
+
+    private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
+
+    static {
+      for (_Fields field : EnumSet.allOf(_Fields.class)) {
+        byName.put(field.getFieldName(), field);
+      }
+    }
+
+    /**
+     * Find the _Fields constant that matches fieldId, or null if its not found.
+     */
+    public static _Fields findByThriftId(int fieldId) {
+      switch(fieldId) {
+        case 1: // NESTED_INTS_COLUMN
+          return NESTED_INTS_COLUMN;
+        case 2: // NESTED_STRING_COLUMN
+          return NESTED_STRING_COLUMN;
+        default:
+          return null;
+      }
+    }
+
+    /**
+     * Find the _Fields constant that matches fieldId, throwing an exception
+     * if it is not found.
+     */
+    public static _Fields findByThriftIdOrThrow(int fieldId) {
+      _Fields fields = findByThriftId(fieldId);
+      if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!");
+      return fields;
+    }
+
+    /**
+     * Find the _Fields constant that matches name, or null if its not found.
+     */
+    public static _Fields findByName(String name) {
+      return byName.get(name);
+    }
+
+    private final short _thriftId;
+    private final String _fieldName;
+
+    _Fields(short thriftId, String fieldName) {
+      _thriftId = thriftId;
+      _fieldName = fieldName;
+    }
+
+    public short getThriftFieldId() {
+      return _thriftId;
+    }
+
+    public String getFieldName() {
+      return _fieldName;
+    }
+  }
+
+  // isset id assignments
+  public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
+  static {
+    Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
+    tmpMap.put(_Fields.NESTED_INTS_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("nestedIntsColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, 
+            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32))));
+    tmpMap.put(_Fields.NESTED_STRING_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("nestedStringColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
+    metaDataMap = Collections.unmodifiableMap(tmpMap);
+    org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(Nested.class, metaDataMap);
+  }
+
+  public Nested() {
+  }
+
+  public Nested(
+    List<Integer> nestedIntsColumn,
+    String nestedStringColumn)
+  {
+    this();
+    this.nestedIntsColumn = nestedIntsColumn;
+    this.nestedStringColumn = nestedStringColumn;
+  }
+
+  /**
+   * Performs a deep copy on <i>other</i>.
+   */
+  public Nested(Nested other) {
+    if (other.isSetNestedIntsColumn()) {
+      List<Integer> __this__nestedIntsColumn = new ArrayList<Integer>(other.nestedIntsColumn);
+      this.nestedIntsColumn = __this__nestedIntsColumn;
+    }
+    if (other.isSetNestedStringColumn()) {
+      this.nestedStringColumn = other.nestedStringColumn;
+    }
+  }
+
+  public Nested deepCopy() {
+    return new Nested(this);
+  }
+
+  @Override
+  public void clear() {
+    this.nestedIntsColumn = null;
+    this.nestedStringColumn = null;
+  }
+
+  public int getNestedIntsColumnSize() {
+    return (this.nestedIntsColumn == null) ? 0 : this.nestedIntsColumn.size();
+  }
+
+  public java.util.Iterator<Integer> getNestedIntsColumnIterator() {
+    return (this.nestedIntsColumn == null) ? null : this.nestedIntsColumn.iterator();
+  }
+
+  public void addToNestedIntsColumn(int elem) {
+    if (this.nestedIntsColumn == null) {
+      this.nestedIntsColumn = new ArrayList<Integer>();
+    }
+    this.nestedIntsColumn.add(elem);
+  }
+
+  public List<Integer> getNestedIntsColumn() {
+    return this.nestedIntsColumn;
+  }
+
+  public Nested setNestedIntsColumn(List<Integer> nestedIntsColumn) {
+    this.nestedIntsColumn = nestedIntsColumn;
+    return this;
+  }
+
+  public void unsetNestedIntsColumn() {
+    this.nestedIntsColumn = null;
+  }
+
+  /** Returns true if field nestedIntsColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetNestedIntsColumn() {
+    return this.nestedIntsColumn != null;
+  }
+
+  public void setNestedIntsColumnIsSet(boolean value) {
+    if (!value) {
+      this.nestedIntsColumn = null;
+    }
+  }
+
+  public String getNestedStringColumn() {
+    return this.nestedStringColumn;
+  }
+
+  public Nested setNestedStringColumn(String nestedStringColumn) {
+    this.nestedStringColumn = nestedStringColumn;
+    return this;
+  }
+
+  public void unsetNestedStringColumn() {
+    this.nestedStringColumn = null;
+  }
+
+  /** Returns true if field nestedStringColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetNestedStringColumn() {
+    return this.nestedStringColumn != null;
+  }
+
+  public void setNestedStringColumnIsSet(boolean value) {
+    if (!value) {
+      this.nestedStringColumn = null;
+    }
+  }
+
+  public void setFieldValue(_Fields field, Object value) {
+    switch (field) {
+    case NESTED_INTS_COLUMN:
+      if (value == null) {
+        unsetNestedIntsColumn();
+      } else {
+        setNestedIntsColumn((List<Integer>)value);
+      }
+      break;
+
+    case NESTED_STRING_COLUMN:
+      if (value == null) {
+        unsetNestedStringColumn();
+      } else {
+        setNestedStringColumn((String)value);
+      }
+      break;
+
+    }
+  }
+
+  public Object getFieldValue(_Fields field) {
+    switch (field) {
+    case NESTED_INTS_COLUMN:
+      return getNestedIntsColumn();
+
+    case NESTED_STRING_COLUMN:
+      return getNestedStringColumn();
+
+    }
+    throw new IllegalStateException();
+  }
+
+  /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */
+  public boolean isSet(_Fields field) {
+    if (field == null) {
+      throw new IllegalArgumentException();
+    }
+
+    switch (field) {
+    case NESTED_INTS_COLUMN:
+      return isSetNestedIntsColumn();
+    case NESTED_STRING_COLUMN:
+      return isSetNestedStringColumn();
+    }
+    throw new IllegalStateException();
+  }
+
+  @Override
+  public boolean equals(Object that) {
+    if (that == null)
+      return false;
+    if (that instanceof Nested)
+      return this.equals((Nested)that);
+    return false;
+  }
+
+  public boolean equals(Nested that) {
+    if (that == null)
+      return false;
+
+    boolean this_present_nestedIntsColumn = true && this.isSetNestedIntsColumn();
+    boolean that_present_nestedIntsColumn = true && that.isSetNestedIntsColumn();
+    if (this_present_nestedIntsColumn || that_present_nestedIntsColumn) {
+      if (!(this_present_nestedIntsColumn && that_present_nestedIntsColumn))
+        return false;
+      if (!this.nestedIntsColumn.equals(that.nestedIntsColumn))
+        return false;
+    }
+
+    boolean this_present_nestedStringColumn = true && this.isSetNestedStringColumn();
+    boolean that_present_nestedStringColumn = true && that.isSetNestedStringColumn();
+    if (this_present_nestedStringColumn || that_present_nestedStringColumn) {
+      if (!(this_present_nestedStringColumn && that_present_nestedStringColumn))
+        return false;
+      if (!this.nestedStringColumn.equals(that.nestedStringColumn))
+        return false;
+    }
+
+    return true;
+  }
+
+  @Override
+  public int hashCode() {
+    List<Object> list = new ArrayList<Object>();
+
+    boolean present_nestedIntsColumn = true && (isSetNestedIntsColumn());
+    list.add(present_nestedIntsColumn);
+    if (present_nestedIntsColumn)
+      list.add(nestedIntsColumn);
+
+    boolean present_nestedStringColumn = true && (isSetNestedStringColumn());
+    list.add(present_nestedStringColumn);
+    if (present_nestedStringColumn)
+      list.add(nestedStringColumn);
+
+    return list.hashCode();
+  }
+
+  @Override
+  public int compareTo(Nested other) {
+    if (!getClass().equals(other.getClass())) {
+      return getClass().getName().compareTo(other.getClass().getName());
+    }
+
+    int lastComparison = 0;
+
+    lastComparison = Boolean.valueOf(isSetNestedIntsColumn()).compareTo(other.isSetNestedIntsColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetNestedIntsColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.nestedIntsColumn, other.nestedIntsColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetNestedStringColumn()).compareTo(other.isSetNestedStringColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetNestedStringColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.nestedStringColumn, other.nestedStringColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    return 0;
+  }
+
+  public _Fields fieldForId(int fieldId) {
+    return _Fields.findByThriftId(fieldId);
+  }
+
+  public void read(org.apache.thrift.protocol.TProtocol iprot) throws org.apache.thrift.TException {
+    schemes.get(iprot.getScheme()).getScheme().read(iprot, this);
+  }
+
+  public void write(org.apache.thrift.protocol.TProtocol oprot) throws org.apache.thrift.TException {
+    schemes.get(oprot.getScheme()).getScheme().write(oprot, this);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder("Nested(");
+    boolean first = true;
+
+    sb.append("nestedIntsColumn:");
+    if (this.nestedIntsColumn == null) {
+      sb.append("null");
+    } else {
+      sb.append(this.nestedIntsColumn);
+    }
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("nestedStringColumn:");
+    if (this.nestedStringColumn == null) {
+      sb.append("null");
+    } else {
+      sb.append(this.nestedStringColumn);
+    }
+    first = false;
+    sb.append(")");
+    return sb.toString();
+  }
+
+  public void validate() throws org.apache.thrift.TException {
+    // check for required fields
+    if (nestedIntsColumn == null) {
+      throw new org.apache.thrift.protocol.TProtocolException("Required field 'nestedIntsColumn' was not present! Struct: " + toString());
+    }
+    if (nestedStringColumn == null) {
+      throw new org.apache.thrift.protocol.TProtocolException("Required field 'nestedStringColumn' was not present! Struct: " + toString());
+    }
+    // check for sub-struct validity
+  }
+
+  private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
+    try {
+      write(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(out)));
+    } catch (org.apache.thrift.TException te) {
+      throw new java.io.IOException(te);
+    }
+  }
+
+  private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
+    try {
+      read(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(in)));
+    } catch (org.apache.thrift.TException te) {
+      throw new java.io.IOException(te);
+    }
+  }
+
+  private static class NestedStandardSchemeFactory implements SchemeFactory {
+    public NestedStandardScheme getScheme() {
+      return new NestedStandardScheme();
+    }
+  }
+
+  private static class NestedStandardScheme extends StandardScheme<Nested> {
+
+    public void read(org.apache.thrift.protocol.TProtocol iprot, Nested struct) throws org.apache.thrift.TException {
+      org.apache.thrift.protocol.TField schemeField;
+      iprot.readStructBegin();
+      while (true)
+      {
+        schemeField = iprot.readFieldBegin();
+        if (schemeField.type == org.apache.thrift.protocol.TType.STOP) { 
+          break;
+        }
+        switch (schemeField.id) {
+          case 1: // NESTED_INTS_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
+              {
+                org.apache.thrift.protocol.TList _list0 = iprot.readListBegin();
+                struct.nestedIntsColumn = new ArrayList<Integer>(_list0.size);
+                int _elem1;
+                for (int _i2 = 0; _i2 < _list0.size; ++_i2)
+                {
+                  _elem1 = iprot.readI32();
+                  struct.nestedIntsColumn.add(_elem1);
+                }
+                iprot.readListEnd();
+              }
+              struct.setNestedIntsColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 2: // NESTED_STRING_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
+              struct.nestedStringColumn = iprot.readString();
+              struct.setNestedStringColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          default:
+            org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+        }
+        iprot.readFieldEnd();
+      }
+      iprot.readStructEnd();
+
+      // check for required fields of primitive type, which can't be checked in the validate method
+      struct.validate();
+    }
+
+    public void write(org.apache.thrift.protocol.TProtocol oprot, Nested struct) throws org.apache.thrift.TException {
+      struct.validate();
+
+      oprot.writeStructBegin(STRUCT_DESC);
+      if (struct.nestedIntsColumn != null) {
+        oprot.writeFieldBegin(NESTED_INTS_COLUMN_FIELD_DESC);
+        {
+          oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.I32, struct.nestedIntsColumn.size()));
+          for (int _iter3 : struct.nestedIntsColumn)
+          {
+            oprot.writeI32(_iter3);
+          }
+          oprot.writeListEnd();
+        }
+        oprot.writeFieldEnd();
+      }
+      if (struct.nestedStringColumn != null) {
+        oprot.writeFieldBegin(NESTED_STRING_COLUMN_FIELD_DESC);
+        oprot.writeString(struct.nestedStringColumn);
+        oprot.writeFieldEnd();
+      }
+      oprot.writeFieldStop();
+      oprot.writeStructEnd();
+    }
+
+  }
+
+  private static class NestedTupleSchemeFactory implements SchemeFactory {
+    public NestedTupleScheme getScheme() {
+      return new NestedTupleScheme();
+    }
+  }
+
+  private static class NestedTupleScheme extends TupleScheme<Nested> {
+
+    @Override
+    public void write(org.apache.thrift.protocol.TProtocol prot, Nested struct) throws org.apache.thrift.TException {
+      TTupleProtocol oprot = (TTupleProtocol) prot;
+      {
+        oprot.writeI32(struct.nestedIntsColumn.size());
+        for (int _iter4 : struct.nestedIntsColumn)
+        {
+          oprot.writeI32(_iter4);
+        }
+      }
+      oprot.writeString(struct.nestedStringColumn);
+    }
+
+    @Override
+    public void read(org.apache.thrift.protocol.TProtocol prot, Nested struct) throws org.apache.thrift.TException {
+      TTupleProtocol iprot = (TTupleProtocol) prot;
+      {
+        org.apache.thrift.protocol.TList _list5 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.I32, iprot.readI32());
+        struct.nestedIntsColumn = new ArrayList<Integer>(_list5.size);
+        int _elem6;
+        for (int _i7 = 0; _i7 < _list5.size; ++_i7)
+        {
+          _elem6 = iprot.readI32();
+          struct.nestedIntsColumn.add(_elem6);
+        }
+      }
+      struct.setNestedIntsColumnIsSet(true);
+      struct.nestedStringColumn = iprot.readString();
+      struct.setNestedStringColumnIsSet(true);
+    }
+  }
+
+}
+
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java
new file mode 100644
index 0000000000000..326ae9dbaa0d1
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java
@@ -0,0 +1,2808 @@
+/**
+ * Autogenerated by Thrift Compiler (0.9.2)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ *  @generated
+ */
+package org.apache.spark.sql.parquet.test.thrift;
+
+import org.apache.thrift.scheme.IScheme;
+import org.apache.thrift.scheme.SchemeFactory;
+import org.apache.thrift.scheme.StandardScheme;
+
+import org.apache.thrift.scheme.TupleScheme;
+import org.apache.thrift.protocol.TTupleProtocol;
+import org.apache.thrift.protocol.TProtocolException;
+import org.apache.thrift.EncodingUtils;
+import org.apache.thrift.TException;
+import org.apache.thrift.async.AsyncMethodCallback;
+import org.apache.thrift.server.AbstractNonblockingServer.*;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.EnumMap;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.EnumSet;
+import java.util.Collections;
+import java.util.BitSet;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import javax.annotation.Generated;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@SuppressWarnings({"cast", "rawtypes", "serial", "unchecked"})
+/**
+ * This is a test struct for testing parquet-thrift compatibility.
+ */
+@Generated(value = "Autogenerated by Thrift Compiler (0.9.2)", date = "2015-7-7")
+public class ParquetThriftCompat implements org.apache.thrift.TBase<ParquetThriftCompat, ParquetThriftCompat._Fields>, java.io.Serializable, Cloneable, Comparable<ParquetThriftCompat> {
+  private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("ParquetThriftCompat");
+
+  private static final org.apache.thrift.protocol.TField BOOL_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("boolColumn", org.apache.thrift.protocol.TType.BOOL, (short)1);
+  private static final org.apache.thrift.protocol.TField BYTE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("byteColumn", org.apache.thrift.protocol.TType.BYTE, (short)2);
+  private static final org.apache.thrift.protocol.TField SHORT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("shortColumn", org.apache.thrift.protocol.TType.I16, (short)3);
+  private static final org.apache.thrift.protocol.TField INT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("intColumn", org.apache.thrift.protocol.TType.I32, (short)4);
+  private static final org.apache.thrift.protocol.TField LONG_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("longColumn", org.apache.thrift.protocol.TType.I64, (short)5);
+  private static final org.apache.thrift.protocol.TField DOUBLE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("doubleColumn", org.apache.thrift.protocol.TType.DOUBLE, (short)6);
+  private static final org.apache.thrift.protocol.TField BINARY_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("binaryColumn", org.apache.thrift.protocol.TType.STRING, (short)7);
+  private static final org.apache.thrift.protocol.TField STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("stringColumn", org.apache.thrift.protocol.TType.STRING, (short)8);
+  private static final org.apache.thrift.protocol.TField ENUM_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("enumColumn", org.apache.thrift.protocol.TType.I32, (short)9);
+  private static final org.apache.thrift.protocol.TField MAYBE_BOOL_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeBoolColumn", org.apache.thrift.protocol.TType.BOOL, (short)10);
+  private static final org.apache.thrift.protocol.TField MAYBE_BYTE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeByteColumn", org.apache.thrift.protocol.TType.BYTE, (short)11);
+  private static final org.apache.thrift.protocol.TField MAYBE_SHORT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeShortColumn", org.apache.thrift.protocol.TType.I16, (short)12);
+  private static final org.apache.thrift.protocol.TField MAYBE_INT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeIntColumn", org.apache.thrift.protocol.TType.I32, (short)13);
+  private static final org.apache.thrift.protocol.TField MAYBE_LONG_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeLongColumn", org.apache.thrift.protocol.TType.I64, (short)14);
+  private static final org.apache.thrift.protocol.TField MAYBE_DOUBLE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeDoubleColumn", org.apache.thrift.protocol.TType.DOUBLE, (short)15);
+  private static final org.apache.thrift.protocol.TField MAYBE_BINARY_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeBinaryColumn", org.apache.thrift.protocol.TType.STRING, (short)16);
+  private static final org.apache.thrift.protocol.TField MAYBE_STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeStringColumn", org.apache.thrift.protocol.TType.STRING, (short)17);
+  private static final org.apache.thrift.protocol.TField MAYBE_ENUM_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeEnumColumn", org.apache.thrift.protocol.TType.I32, (short)18);
+  private static final org.apache.thrift.protocol.TField STRINGS_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("stringsColumn", org.apache.thrift.protocol.TType.LIST, (short)19);
+  private static final org.apache.thrift.protocol.TField INT_SET_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("intSetColumn", org.apache.thrift.protocol.TType.SET, (short)20);
+  private static final org.apache.thrift.protocol.TField INT_TO_STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("intToStringColumn", org.apache.thrift.protocol.TType.MAP, (short)21);
+  private static final org.apache.thrift.protocol.TField COMPLEX_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("complexColumn", org.apache.thrift.protocol.TType.MAP, (short)22);
+
+  private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
+  static {
+    schemes.put(StandardScheme.class, new ParquetThriftCompatStandardSchemeFactory());
+    schemes.put(TupleScheme.class, new ParquetThriftCompatTupleSchemeFactory());
+  }
+
+  public boolean boolColumn; // required
+  public byte byteColumn; // required
+  public short shortColumn; // required
+  public int intColumn; // required
+  public long longColumn; // required
+  public double doubleColumn; // required
+  public ByteBuffer binaryColumn; // required
+  public String stringColumn; // required
+  /**
+   * 
+   * @see Suit
+   */
+  public Suit enumColumn; // required
+  public boolean maybeBoolColumn; // optional
+  public byte maybeByteColumn; // optional
+  public short maybeShortColumn; // optional
+  public int maybeIntColumn; // optional
+  public long maybeLongColumn; // optional
+  public double maybeDoubleColumn; // optional
+  public ByteBuffer maybeBinaryColumn; // optional
+  public String maybeStringColumn; // optional
+  /**
+   * 
+   * @see Suit
+   */
+  public Suit maybeEnumColumn; // optional
+  public List<String> stringsColumn; // required
+  public Set<Integer> intSetColumn; // required
+  public Map<Integer,String> intToStringColumn; // required
+  public Map<Integer,List<Nested>> complexColumn; // required
+
+  /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
+  public enum _Fields implements org.apache.thrift.TFieldIdEnum {
+    BOOL_COLUMN((short)1, "boolColumn"),
+    BYTE_COLUMN((short)2, "byteColumn"),
+    SHORT_COLUMN((short)3, "shortColumn"),
+    INT_COLUMN((short)4, "intColumn"),
+    LONG_COLUMN((short)5, "longColumn"),
+    DOUBLE_COLUMN((short)6, "doubleColumn"),
+    BINARY_COLUMN((short)7, "binaryColumn"),
+    STRING_COLUMN((short)8, "stringColumn"),
+    /**
+     * 
+     * @see Suit
+     */
+    ENUM_COLUMN((short)9, "enumColumn"),
+    MAYBE_BOOL_COLUMN((short)10, "maybeBoolColumn"),
+    MAYBE_BYTE_COLUMN((short)11, "maybeByteColumn"),
+    MAYBE_SHORT_COLUMN((short)12, "maybeShortColumn"),
+    MAYBE_INT_COLUMN((short)13, "maybeIntColumn"),
+    MAYBE_LONG_COLUMN((short)14, "maybeLongColumn"),
+    MAYBE_DOUBLE_COLUMN((short)15, "maybeDoubleColumn"),
+    MAYBE_BINARY_COLUMN((short)16, "maybeBinaryColumn"),
+    MAYBE_STRING_COLUMN((short)17, "maybeStringColumn"),
+    /**
+     * 
+     * @see Suit
+     */
+    MAYBE_ENUM_COLUMN((short)18, "maybeEnumColumn"),
+    STRINGS_COLUMN((short)19, "stringsColumn"),
+    INT_SET_COLUMN((short)20, "intSetColumn"),
+    INT_TO_STRING_COLUMN((short)21, "intToStringColumn"),
+    COMPLEX_COLUMN((short)22, "complexColumn");
+
+    private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
+
+    static {
+      for (_Fields field : EnumSet.allOf(_Fields.class)) {
+        byName.put(field.getFieldName(), field);
+      }
+    }
+
+    /**
+     * Find the _Fields constant that matches fieldId, or null if its not found.
+     */
+    public static _Fields findByThriftId(int fieldId) {
+      switch(fieldId) {
+        case 1: // BOOL_COLUMN
+          return BOOL_COLUMN;
+        case 2: // BYTE_COLUMN
+          return BYTE_COLUMN;
+        case 3: // SHORT_COLUMN
+          return SHORT_COLUMN;
+        case 4: // INT_COLUMN
+          return INT_COLUMN;
+        case 5: // LONG_COLUMN
+          return LONG_COLUMN;
+        case 6: // DOUBLE_COLUMN
+          return DOUBLE_COLUMN;
+        case 7: // BINARY_COLUMN
+          return BINARY_COLUMN;
+        case 8: // STRING_COLUMN
+          return STRING_COLUMN;
+        case 9: // ENUM_COLUMN
+          return ENUM_COLUMN;
+        case 10: // MAYBE_BOOL_COLUMN
+          return MAYBE_BOOL_COLUMN;
+        case 11: // MAYBE_BYTE_COLUMN
+          return MAYBE_BYTE_COLUMN;
+        case 12: // MAYBE_SHORT_COLUMN
+          return MAYBE_SHORT_COLUMN;
+        case 13: // MAYBE_INT_COLUMN
+          return MAYBE_INT_COLUMN;
+        case 14: // MAYBE_LONG_COLUMN
+          return MAYBE_LONG_COLUMN;
+        case 15: // MAYBE_DOUBLE_COLUMN
+          return MAYBE_DOUBLE_COLUMN;
+        case 16: // MAYBE_BINARY_COLUMN
+          return MAYBE_BINARY_COLUMN;
+        case 17: // MAYBE_STRING_COLUMN
+          return MAYBE_STRING_COLUMN;
+        case 18: // MAYBE_ENUM_COLUMN
+          return MAYBE_ENUM_COLUMN;
+        case 19: // STRINGS_COLUMN
+          return STRINGS_COLUMN;
+        case 20: // INT_SET_COLUMN
+          return INT_SET_COLUMN;
+        case 21: // INT_TO_STRING_COLUMN
+          return INT_TO_STRING_COLUMN;
+        case 22: // COMPLEX_COLUMN
+          return COMPLEX_COLUMN;
+        default:
+          return null;
+      }
+    }
+
+    /**
+     * Find the _Fields constant that matches fieldId, throwing an exception
+     * if it is not found.
+     */
+    public static _Fields findByThriftIdOrThrow(int fieldId) {
+      _Fields fields = findByThriftId(fieldId);
+      if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!");
+      return fields;
+    }
+
+    /**
+     * Find the _Fields constant that matches name, or null if its not found.
+     */
+    public static _Fields findByName(String name) {
+      return byName.get(name);
+    }
+
+    private final short _thriftId;
+    private final String _fieldName;
+
+    _Fields(short thriftId, String fieldName) {
+      _thriftId = thriftId;
+      _fieldName = fieldName;
+    }
+
+    public short getThriftFieldId() {
+      return _thriftId;
+    }
+
+    public String getFieldName() {
+      return _fieldName;
+    }
+  }
+
+  // isset id assignments
+  private static final int __BOOLCOLUMN_ISSET_ID = 0;
+  private static final int __BYTECOLUMN_ISSET_ID = 1;
+  private static final int __SHORTCOLUMN_ISSET_ID = 2;
+  private static final int __INTCOLUMN_ISSET_ID = 3;
+  private static final int __LONGCOLUMN_ISSET_ID = 4;
+  private static final int __DOUBLECOLUMN_ISSET_ID = 5;
+  private static final int __MAYBEBOOLCOLUMN_ISSET_ID = 6;
+  private static final int __MAYBEBYTECOLUMN_ISSET_ID = 7;
+  private static final int __MAYBESHORTCOLUMN_ISSET_ID = 8;
+  private static final int __MAYBEINTCOLUMN_ISSET_ID = 9;
+  private static final int __MAYBELONGCOLUMN_ISSET_ID = 10;
+  private static final int __MAYBEDOUBLECOLUMN_ISSET_ID = 11;
+  private short __isset_bitfield = 0;
+  private static final _Fields optionals[] = {_Fields.MAYBE_BOOL_COLUMN,_Fields.MAYBE_BYTE_COLUMN,_Fields.MAYBE_SHORT_COLUMN,_Fields.MAYBE_INT_COLUMN,_Fields.MAYBE_LONG_COLUMN,_Fields.MAYBE_DOUBLE_COLUMN,_Fields.MAYBE_BINARY_COLUMN,_Fields.MAYBE_STRING_COLUMN,_Fields.MAYBE_ENUM_COLUMN};
+  public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
+  static {
+    Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
+    tmpMap.put(_Fields.BOOL_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("boolColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BOOL)));
+    tmpMap.put(_Fields.BYTE_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("byteColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BYTE)));
+    tmpMap.put(_Fields.SHORT_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("shortColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I16)));
+    tmpMap.put(_Fields.INT_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("intColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
+    tmpMap.put(_Fields.LONG_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("longColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I64)));
+    tmpMap.put(_Fields.DOUBLE_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("doubleColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.DOUBLE)));
+    tmpMap.put(_Fields.BINARY_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("binaryColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING        , true)));
+    tmpMap.put(_Fields.STRING_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("stringColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
+    tmpMap.put(_Fields.ENUM_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("enumColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.EnumMetaData(org.apache.thrift.protocol.TType.ENUM, Suit.class)));
+    tmpMap.put(_Fields.MAYBE_BOOL_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeBoolColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BOOL)));
+    tmpMap.put(_Fields.MAYBE_BYTE_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeByteColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BYTE)));
+    tmpMap.put(_Fields.MAYBE_SHORT_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeShortColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I16)));
+    tmpMap.put(_Fields.MAYBE_INT_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeIntColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
+    tmpMap.put(_Fields.MAYBE_LONG_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeLongColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I64)));
+    tmpMap.put(_Fields.MAYBE_DOUBLE_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeDoubleColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.DOUBLE)));
+    tmpMap.put(_Fields.MAYBE_BINARY_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeBinaryColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING        , true)));
+    tmpMap.put(_Fields.MAYBE_STRING_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeStringColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
+    tmpMap.put(_Fields.MAYBE_ENUM_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeEnumColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
+        new org.apache.thrift.meta_data.EnumMetaData(org.apache.thrift.protocol.TType.ENUM, Suit.class)));
+    tmpMap.put(_Fields.STRINGS_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("stringsColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, 
+            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING))));
+    tmpMap.put(_Fields.INT_SET_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("intSetColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.SetMetaData(org.apache.thrift.protocol.TType.SET, 
+            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32))));
+    tmpMap.put(_Fields.INT_TO_STRING_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("intToStringColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.MapMetaData(org.apache.thrift.protocol.TType.MAP, 
+            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32), 
+            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING))));
+    tmpMap.put(_Fields.COMPLEX_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("complexColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
+        new org.apache.thrift.meta_data.MapMetaData(org.apache.thrift.protocol.TType.MAP, 
+            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32), 
+            new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, 
+                new org.apache.thrift.meta_data.StructMetaData(org.apache.thrift.protocol.TType.STRUCT, Nested.class)))));
+    metaDataMap = Collections.unmodifiableMap(tmpMap);
+    org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(ParquetThriftCompat.class, metaDataMap);
+  }
+
+  public ParquetThriftCompat() {
+  }
+
+  public ParquetThriftCompat(
+    boolean boolColumn,
+    byte byteColumn,
+    short shortColumn,
+    int intColumn,
+    long longColumn,
+    double doubleColumn,
+    ByteBuffer binaryColumn,
+    String stringColumn,
+    Suit enumColumn,
+    List<String> stringsColumn,
+    Set<Integer> intSetColumn,
+    Map<Integer,String> intToStringColumn,
+    Map<Integer,List<Nested>> complexColumn)
+  {
+    this();
+    this.boolColumn = boolColumn;
+    setBoolColumnIsSet(true);
+    this.byteColumn = byteColumn;
+    setByteColumnIsSet(true);
+    this.shortColumn = shortColumn;
+    setShortColumnIsSet(true);
+    this.intColumn = intColumn;
+    setIntColumnIsSet(true);
+    this.longColumn = longColumn;
+    setLongColumnIsSet(true);
+    this.doubleColumn = doubleColumn;
+    setDoubleColumnIsSet(true);
+    this.binaryColumn = org.apache.thrift.TBaseHelper.copyBinary(binaryColumn);
+    this.stringColumn = stringColumn;
+    this.enumColumn = enumColumn;
+    this.stringsColumn = stringsColumn;
+    this.intSetColumn = intSetColumn;
+    this.intToStringColumn = intToStringColumn;
+    this.complexColumn = complexColumn;
+  }
+
+  /**
+   * Performs a deep copy on <i>other</i>.
+   */
+  public ParquetThriftCompat(ParquetThriftCompat other) {
+    __isset_bitfield = other.__isset_bitfield;
+    this.boolColumn = other.boolColumn;
+    this.byteColumn = other.byteColumn;
+    this.shortColumn = other.shortColumn;
+    this.intColumn = other.intColumn;
+    this.longColumn = other.longColumn;
+    this.doubleColumn = other.doubleColumn;
+    if (other.isSetBinaryColumn()) {
+      this.binaryColumn = org.apache.thrift.TBaseHelper.copyBinary(other.binaryColumn);
+    }
+    if (other.isSetStringColumn()) {
+      this.stringColumn = other.stringColumn;
+    }
+    if (other.isSetEnumColumn()) {
+      this.enumColumn = other.enumColumn;
+    }
+    this.maybeBoolColumn = other.maybeBoolColumn;
+    this.maybeByteColumn = other.maybeByteColumn;
+    this.maybeShortColumn = other.maybeShortColumn;
+    this.maybeIntColumn = other.maybeIntColumn;
+    this.maybeLongColumn = other.maybeLongColumn;
+    this.maybeDoubleColumn = other.maybeDoubleColumn;
+    if (other.isSetMaybeBinaryColumn()) {
+      this.maybeBinaryColumn = org.apache.thrift.TBaseHelper.copyBinary(other.maybeBinaryColumn);
+    }
+    if (other.isSetMaybeStringColumn()) {
+      this.maybeStringColumn = other.maybeStringColumn;
+    }
+    if (other.isSetMaybeEnumColumn()) {
+      this.maybeEnumColumn = other.maybeEnumColumn;
+    }
+    if (other.isSetStringsColumn()) {
+      List<String> __this__stringsColumn = new ArrayList<String>(other.stringsColumn);
+      this.stringsColumn = __this__stringsColumn;
+    }
+    if (other.isSetIntSetColumn()) {
+      Set<Integer> __this__intSetColumn = new HashSet<Integer>(other.intSetColumn);
+      this.intSetColumn = __this__intSetColumn;
+    }
+    if (other.isSetIntToStringColumn()) {
+      Map<Integer,String> __this__intToStringColumn = new HashMap<Integer,String>(other.intToStringColumn);
+      this.intToStringColumn = __this__intToStringColumn;
+    }
+    if (other.isSetComplexColumn()) {
+      Map<Integer,List<Nested>> __this__complexColumn = new HashMap<Integer,List<Nested>>(other.complexColumn.size());
+      for (Map.Entry<Integer, List<Nested>> other_element : other.complexColumn.entrySet()) {
+
+        Integer other_element_key = other_element.getKey();
+        List<Nested> other_element_value = other_element.getValue();
+
+        Integer __this__complexColumn_copy_key = other_element_key;
+
+        List<Nested> __this__complexColumn_copy_value = new ArrayList<Nested>(other_element_value.size());
+        for (Nested other_element_value_element : other_element_value) {
+          __this__complexColumn_copy_value.add(new Nested(other_element_value_element));
+        }
+
+        __this__complexColumn.put(__this__complexColumn_copy_key, __this__complexColumn_copy_value);
+      }
+      this.complexColumn = __this__complexColumn;
+    }
+  }
+
+  public ParquetThriftCompat deepCopy() {
+    return new ParquetThriftCompat(this);
+  }
+
+  @Override
+  public void clear() {
+    setBoolColumnIsSet(false);
+    this.boolColumn = false;
+    setByteColumnIsSet(false);
+    this.byteColumn = 0;
+    setShortColumnIsSet(false);
+    this.shortColumn = 0;
+    setIntColumnIsSet(false);
+    this.intColumn = 0;
+    setLongColumnIsSet(false);
+    this.longColumn = 0;
+    setDoubleColumnIsSet(false);
+    this.doubleColumn = 0.0;
+    this.binaryColumn = null;
+    this.stringColumn = null;
+    this.enumColumn = null;
+    setMaybeBoolColumnIsSet(false);
+    this.maybeBoolColumn = false;
+    setMaybeByteColumnIsSet(false);
+    this.maybeByteColumn = 0;
+    setMaybeShortColumnIsSet(false);
+    this.maybeShortColumn = 0;
+    setMaybeIntColumnIsSet(false);
+    this.maybeIntColumn = 0;
+    setMaybeLongColumnIsSet(false);
+    this.maybeLongColumn = 0;
+    setMaybeDoubleColumnIsSet(false);
+    this.maybeDoubleColumn = 0.0;
+    this.maybeBinaryColumn = null;
+    this.maybeStringColumn = null;
+    this.maybeEnumColumn = null;
+    this.stringsColumn = null;
+    this.intSetColumn = null;
+    this.intToStringColumn = null;
+    this.complexColumn = null;
+  }
+
+  public boolean isBoolColumn() {
+    return this.boolColumn;
+  }
+
+  public ParquetThriftCompat setBoolColumn(boolean boolColumn) {
+    this.boolColumn = boolColumn;
+    setBoolColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetBoolColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __BOOLCOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field boolColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetBoolColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __BOOLCOLUMN_ISSET_ID);
+  }
+
+  public void setBoolColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __BOOLCOLUMN_ISSET_ID, value);
+  }
+
+  public byte getByteColumn() {
+    return this.byteColumn;
+  }
+
+  public ParquetThriftCompat setByteColumn(byte byteColumn) {
+    this.byteColumn = byteColumn;
+    setByteColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetByteColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __BYTECOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field byteColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetByteColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __BYTECOLUMN_ISSET_ID);
+  }
+
+  public void setByteColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __BYTECOLUMN_ISSET_ID, value);
+  }
+
+  public short getShortColumn() {
+    return this.shortColumn;
+  }
+
+  public ParquetThriftCompat setShortColumn(short shortColumn) {
+    this.shortColumn = shortColumn;
+    setShortColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetShortColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __SHORTCOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field shortColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetShortColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __SHORTCOLUMN_ISSET_ID);
+  }
+
+  public void setShortColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __SHORTCOLUMN_ISSET_ID, value);
+  }
+
+  public int getIntColumn() {
+    return this.intColumn;
+  }
+
+  public ParquetThriftCompat setIntColumn(int intColumn) {
+    this.intColumn = intColumn;
+    setIntColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetIntColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __INTCOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field intColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetIntColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __INTCOLUMN_ISSET_ID);
+  }
+
+  public void setIntColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __INTCOLUMN_ISSET_ID, value);
+  }
+
+  public long getLongColumn() {
+    return this.longColumn;
+  }
+
+  public ParquetThriftCompat setLongColumn(long longColumn) {
+    this.longColumn = longColumn;
+    setLongColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetLongColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __LONGCOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field longColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetLongColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __LONGCOLUMN_ISSET_ID);
+  }
+
+  public void setLongColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __LONGCOLUMN_ISSET_ID, value);
+  }
+
+  public double getDoubleColumn() {
+    return this.doubleColumn;
+  }
+
+  public ParquetThriftCompat setDoubleColumn(double doubleColumn) {
+    this.doubleColumn = doubleColumn;
+    setDoubleColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetDoubleColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __DOUBLECOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field doubleColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetDoubleColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __DOUBLECOLUMN_ISSET_ID);
+  }
+
+  public void setDoubleColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __DOUBLECOLUMN_ISSET_ID, value);
+  }
+
+  public byte[] getBinaryColumn() {
+    setBinaryColumn(org.apache.thrift.TBaseHelper.rightSize(binaryColumn));
+    return binaryColumn == null ? null : binaryColumn.array();
+  }
+
+  public ByteBuffer bufferForBinaryColumn() {
+    return org.apache.thrift.TBaseHelper.copyBinary(binaryColumn);
+  }
+
+  public ParquetThriftCompat setBinaryColumn(byte[] binaryColumn) {
+    this.binaryColumn = binaryColumn == null ? (ByteBuffer)null : ByteBuffer.wrap(Arrays.copyOf(binaryColumn, binaryColumn.length));
+    return this;
+  }
+
+  public ParquetThriftCompat setBinaryColumn(ByteBuffer binaryColumn) {
+    this.binaryColumn = org.apache.thrift.TBaseHelper.copyBinary(binaryColumn);
+    return this;
+  }
+
+  public void unsetBinaryColumn() {
+    this.binaryColumn = null;
+  }
+
+  /** Returns true if field binaryColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetBinaryColumn() {
+    return this.binaryColumn != null;
+  }
+
+  public void setBinaryColumnIsSet(boolean value) {
+    if (!value) {
+      this.binaryColumn = null;
+    }
+  }
+
+  public String getStringColumn() {
+    return this.stringColumn;
+  }
+
+  public ParquetThriftCompat setStringColumn(String stringColumn) {
+    this.stringColumn = stringColumn;
+    return this;
+  }
+
+  public void unsetStringColumn() {
+    this.stringColumn = null;
+  }
+
+  /** Returns true if field stringColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetStringColumn() {
+    return this.stringColumn != null;
+  }
+
+  public void setStringColumnIsSet(boolean value) {
+    if (!value) {
+      this.stringColumn = null;
+    }
+  }
+
+  /**
+   * 
+   * @see Suit
+   */
+  public Suit getEnumColumn() {
+    return this.enumColumn;
+  }
+
+  /**
+   * 
+   * @see Suit
+   */
+  public ParquetThriftCompat setEnumColumn(Suit enumColumn) {
+    this.enumColumn = enumColumn;
+    return this;
+  }
+
+  public void unsetEnumColumn() {
+    this.enumColumn = null;
+  }
+
+  /** Returns true if field enumColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetEnumColumn() {
+    return this.enumColumn != null;
+  }
+
+  public void setEnumColumnIsSet(boolean value) {
+    if (!value) {
+      this.enumColumn = null;
+    }
+  }
+
+  public boolean isMaybeBoolColumn() {
+    return this.maybeBoolColumn;
+  }
+
+  public ParquetThriftCompat setMaybeBoolColumn(boolean maybeBoolColumn) {
+    this.maybeBoolColumn = maybeBoolColumn;
+    setMaybeBoolColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetMaybeBoolColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBEBOOLCOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field maybeBoolColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetMaybeBoolColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __MAYBEBOOLCOLUMN_ISSET_ID);
+  }
+
+  public void setMaybeBoolColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBEBOOLCOLUMN_ISSET_ID, value);
+  }
+
+  public byte getMaybeByteColumn() {
+    return this.maybeByteColumn;
+  }
+
+  public ParquetThriftCompat setMaybeByteColumn(byte maybeByteColumn) {
+    this.maybeByteColumn = maybeByteColumn;
+    setMaybeByteColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetMaybeByteColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBEBYTECOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field maybeByteColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetMaybeByteColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __MAYBEBYTECOLUMN_ISSET_ID);
+  }
+
+  public void setMaybeByteColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBEBYTECOLUMN_ISSET_ID, value);
+  }
+
+  public short getMaybeShortColumn() {
+    return this.maybeShortColumn;
+  }
+
+  public ParquetThriftCompat setMaybeShortColumn(short maybeShortColumn) {
+    this.maybeShortColumn = maybeShortColumn;
+    setMaybeShortColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetMaybeShortColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBESHORTCOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field maybeShortColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetMaybeShortColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __MAYBESHORTCOLUMN_ISSET_ID);
+  }
+
+  public void setMaybeShortColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBESHORTCOLUMN_ISSET_ID, value);
+  }
+
+  public int getMaybeIntColumn() {
+    return this.maybeIntColumn;
+  }
+
+  public ParquetThriftCompat setMaybeIntColumn(int maybeIntColumn) {
+    this.maybeIntColumn = maybeIntColumn;
+    setMaybeIntColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetMaybeIntColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBEINTCOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field maybeIntColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetMaybeIntColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __MAYBEINTCOLUMN_ISSET_ID);
+  }
+
+  public void setMaybeIntColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBEINTCOLUMN_ISSET_ID, value);
+  }
+
+  public long getMaybeLongColumn() {
+    return this.maybeLongColumn;
+  }
+
+  public ParquetThriftCompat setMaybeLongColumn(long maybeLongColumn) {
+    this.maybeLongColumn = maybeLongColumn;
+    setMaybeLongColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetMaybeLongColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBELONGCOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field maybeLongColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetMaybeLongColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __MAYBELONGCOLUMN_ISSET_ID);
+  }
+
+  public void setMaybeLongColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBELONGCOLUMN_ISSET_ID, value);
+  }
+
+  public double getMaybeDoubleColumn() {
+    return this.maybeDoubleColumn;
+  }
+
+  public ParquetThriftCompat setMaybeDoubleColumn(double maybeDoubleColumn) {
+    this.maybeDoubleColumn = maybeDoubleColumn;
+    setMaybeDoubleColumnIsSet(true);
+    return this;
+  }
+
+  public void unsetMaybeDoubleColumn() {
+    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBEDOUBLECOLUMN_ISSET_ID);
+  }
+
+  /** Returns true if field maybeDoubleColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetMaybeDoubleColumn() {
+    return EncodingUtils.testBit(__isset_bitfield, __MAYBEDOUBLECOLUMN_ISSET_ID);
+  }
+
+  public void setMaybeDoubleColumnIsSet(boolean value) {
+    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBEDOUBLECOLUMN_ISSET_ID, value);
+  }
+
+  public byte[] getMaybeBinaryColumn() {
+    setMaybeBinaryColumn(org.apache.thrift.TBaseHelper.rightSize(maybeBinaryColumn));
+    return maybeBinaryColumn == null ? null : maybeBinaryColumn.array();
+  }
+
+  public ByteBuffer bufferForMaybeBinaryColumn() {
+    return org.apache.thrift.TBaseHelper.copyBinary(maybeBinaryColumn);
+  }
+
+  public ParquetThriftCompat setMaybeBinaryColumn(byte[] maybeBinaryColumn) {
+    this.maybeBinaryColumn = maybeBinaryColumn == null ? (ByteBuffer)null : ByteBuffer.wrap(Arrays.copyOf(maybeBinaryColumn, maybeBinaryColumn.length));
+    return this;
+  }
+
+  public ParquetThriftCompat setMaybeBinaryColumn(ByteBuffer maybeBinaryColumn) {
+    this.maybeBinaryColumn = org.apache.thrift.TBaseHelper.copyBinary(maybeBinaryColumn);
+    return this;
+  }
+
+  public void unsetMaybeBinaryColumn() {
+    this.maybeBinaryColumn = null;
+  }
+
+  /** Returns true if field maybeBinaryColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetMaybeBinaryColumn() {
+    return this.maybeBinaryColumn != null;
+  }
+
+  public void setMaybeBinaryColumnIsSet(boolean value) {
+    if (!value) {
+      this.maybeBinaryColumn = null;
+    }
+  }
+
+  public String getMaybeStringColumn() {
+    return this.maybeStringColumn;
+  }
+
+  public ParquetThriftCompat setMaybeStringColumn(String maybeStringColumn) {
+    this.maybeStringColumn = maybeStringColumn;
+    return this;
+  }
+
+  public void unsetMaybeStringColumn() {
+    this.maybeStringColumn = null;
+  }
+
+  /** Returns true if field maybeStringColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetMaybeStringColumn() {
+    return this.maybeStringColumn != null;
+  }
+
+  public void setMaybeStringColumnIsSet(boolean value) {
+    if (!value) {
+      this.maybeStringColumn = null;
+    }
+  }
+
+  /**
+   * 
+   * @see Suit
+   */
+  public Suit getMaybeEnumColumn() {
+    return this.maybeEnumColumn;
+  }
+
+  /**
+   * 
+   * @see Suit
+   */
+  public ParquetThriftCompat setMaybeEnumColumn(Suit maybeEnumColumn) {
+    this.maybeEnumColumn = maybeEnumColumn;
+    return this;
+  }
+
+  public void unsetMaybeEnumColumn() {
+    this.maybeEnumColumn = null;
+  }
+
+  /** Returns true if field maybeEnumColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetMaybeEnumColumn() {
+    return this.maybeEnumColumn != null;
+  }
+
+  public void setMaybeEnumColumnIsSet(boolean value) {
+    if (!value) {
+      this.maybeEnumColumn = null;
+    }
+  }
+
+  public int getStringsColumnSize() {
+    return (this.stringsColumn == null) ? 0 : this.stringsColumn.size();
+  }
+
+  public java.util.Iterator<String> getStringsColumnIterator() {
+    return (this.stringsColumn == null) ? null : this.stringsColumn.iterator();
+  }
+
+  public void addToStringsColumn(String elem) {
+    if (this.stringsColumn == null) {
+      this.stringsColumn = new ArrayList<String>();
+    }
+    this.stringsColumn.add(elem);
+  }
+
+  public List<String> getStringsColumn() {
+    return this.stringsColumn;
+  }
+
+  public ParquetThriftCompat setStringsColumn(List<String> stringsColumn) {
+    this.stringsColumn = stringsColumn;
+    return this;
+  }
+
+  public void unsetStringsColumn() {
+    this.stringsColumn = null;
+  }
+
+  /** Returns true if field stringsColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetStringsColumn() {
+    return this.stringsColumn != null;
+  }
+
+  public void setStringsColumnIsSet(boolean value) {
+    if (!value) {
+      this.stringsColumn = null;
+    }
+  }
+
+  public int getIntSetColumnSize() {
+    return (this.intSetColumn == null) ? 0 : this.intSetColumn.size();
+  }
+
+  public java.util.Iterator<Integer> getIntSetColumnIterator() {
+    return (this.intSetColumn == null) ? null : this.intSetColumn.iterator();
+  }
+
+  public void addToIntSetColumn(int elem) {
+    if (this.intSetColumn == null) {
+      this.intSetColumn = new HashSet<Integer>();
+    }
+    this.intSetColumn.add(elem);
+  }
+
+  public Set<Integer> getIntSetColumn() {
+    return this.intSetColumn;
+  }
+
+  public ParquetThriftCompat setIntSetColumn(Set<Integer> intSetColumn) {
+    this.intSetColumn = intSetColumn;
+    return this;
+  }
+
+  public void unsetIntSetColumn() {
+    this.intSetColumn = null;
+  }
+
+  /** Returns true if field intSetColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetIntSetColumn() {
+    return this.intSetColumn != null;
+  }
+
+  public void setIntSetColumnIsSet(boolean value) {
+    if (!value) {
+      this.intSetColumn = null;
+    }
+  }
+
+  public int getIntToStringColumnSize() {
+    return (this.intToStringColumn == null) ? 0 : this.intToStringColumn.size();
+  }
+
+  public void putToIntToStringColumn(int key, String val) {
+    if (this.intToStringColumn == null) {
+      this.intToStringColumn = new HashMap<Integer,String>();
+    }
+    this.intToStringColumn.put(key, val);
+  }
+
+  public Map<Integer,String> getIntToStringColumn() {
+    return this.intToStringColumn;
+  }
+
+  public ParquetThriftCompat setIntToStringColumn(Map<Integer,String> intToStringColumn) {
+    this.intToStringColumn = intToStringColumn;
+    return this;
+  }
+
+  public void unsetIntToStringColumn() {
+    this.intToStringColumn = null;
+  }
+
+  /** Returns true if field intToStringColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetIntToStringColumn() {
+    return this.intToStringColumn != null;
+  }
+
+  public void setIntToStringColumnIsSet(boolean value) {
+    if (!value) {
+      this.intToStringColumn = null;
+    }
+  }
+
+  public int getComplexColumnSize() {
+    return (this.complexColumn == null) ? 0 : this.complexColumn.size();
+  }
+
+  public void putToComplexColumn(int key, List<Nested> val) {
+    if (this.complexColumn == null) {
+      this.complexColumn = new HashMap<Integer,List<Nested>>();
+    }
+    this.complexColumn.put(key, val);
+  }
+
+  public Map<Integer,List<Nested>> getComplexColumn() {
+    return this.complexColumn;
+  }
+
+  public ParquetThriftCompat setComplexColumn(Map<Integer,List<Nested>> complexColumn) {
+    this.complexColumn = complexColumn;
+    return this;
+  }
+
+  public void unsetComplexColumn() {
+    this.complexColumn = null;
+  }
+
+  /** Returns true if field complexColumn is set (has been assigned a value) and false otherwise */
+  public boolean isSetComplexColumn() {
+    return this.complexColumn != null;
+  }
+
+  public void setComplexColumnIsSet(boolean value) {
+    if (!value) {
+      this.complexColumn = null;
+    }
+  }
+
+  public void setFieldValue(_Fields field, Object value) {
+    switch (field) {
+    case BOOL_COLUMN:
+      if (value == null) {
+        unsetBoolColumn();
+      } else {
+        setBoolColumn((Boolean)value);
+      }
+      break;
+
+    case BYTE_COLUMN:
+      if (value == null) {
+        unsetByteColumn();
+      } else {
+        setByteColumn((Byte)value);
+      }
+      break;
+
+    case SHORT_COLUMN:
+      if (value == null) {
+        unsetShortColumn();
+      } else {
+        setShortColumn((Short)value);
+      }
+      break;
+
+    case INT_COLUMN:
+      if (value == null) {
+        unsetIntColumn();
+      } else {
+        setIntColumn((Integer)value);
+      }
+      break;
+
+    case LONG_COLUMN:
+      if (value == null) {
+        unsetLongColumn();
+      } else {
+        setLongColumn((Long)value);
+      }
+      break;
+
+    case DOUBLE_COLUMN:
+      if (value == null) {
+        unsetDoubleColumn();
+      } else {
+        setDoubleColumn((Double)value);
+      }
+      break;
+
+    case BINARY_COLUMN:
+      if (value == null) {
+        unsetBinaryColumn();
+      } else {
+        setBinaryColumn((ByteBuffer)value);
+      }
+      break;
+
+    case STRING_COLUMN:
+      if (value == null) {
+        unsetStringColumn();
+      } else {
+        setStringColumn((String)value);
+      }
+      break;
+
+    case ENUM_COLUMN:
+      if (value == null) {
+        unsetEnumColumn();
+      } else {
+        setEnumColumn((Suit)value);
+      }
+      break;
+
+    case MAYBE_BOOL_COLUMN:
+      if (value == null) {
+        unsetMaybeBoolColumn();
+      } else {
+        setMaybeBoolColumn((Boolean)value);
+      }
+      break;
+
+    case MAYBE_BYTE_COLUMN:
+      if (value == null) {
+        unsetMaybeByteColumn();
+      } else {
+        setMaybeByteColumn((Byte)value);
+      }
+      break;
+
+    case MAYBE_SHORT_COLUMN:
+      if (value == null) {
+        unsetMaybeShortColumn();
+      } else {
+        setMaybeShortColumn((Short)value);
+      }
+      break;
+
+    case MAYBE_INT_COLUMN:
+      if (value == null) {
+        unsetMaybeIntColumn();
+      } else {
+        setMaybeIntColumn((Integer)value);
+      }
+      break;
+
+    case MAYBE_LONG_COLUMN:
+      if (value == null) {
+        unsetMaybeLongColumn();
+      } else {
+        setMaybeLongColumn((Long)value);
+      }
+      break;
+
+    case MAYBE_DOUBLE_COLUMN:
+      if (value == null) {
+        unsetMaybeDoubleColumn();
+      } else {
+        setMaybeDoubleColumn((Double)value);
+      }
+      break;
+
+    case MAYBE_BINARY_COLUMN:
+      if (value == null) {
+        unsetMaybeBinaryColumn();
+      } else {
+        setMaybeBinaryColumn((ByteBuffer)value);
+      }
+      break;
+
+    case MAYBE_STRING_COLUMN:
+      if (value == null) {
+        unsetMaybeStringColumn();
+      } else {
+        setMaybeStringColumn((String)value);
+      }
+      break;
+
+    case MAYBE_ENUM_COLUMN:
+      if (value == null) {
+        unsetMaybeEnumColumn();
+      } else {
+        setMaybeEnumColumn((Suit)value);
+      }
+      break;
+
+    case STRINGS_COLUMN:
+      if (value == null) {
+        unsetStringsColumn();
+      } else {
+        setStringsColumn((List<String>)value);
+      }
+      break;
+
+    case INT_SET_COLUMN:
+      if (value == null) {
+        unsetIntSetColumn();
+      } else {
+        setIntSetColumn((Set<Integer>)value);
+      }
+      break;
+
+    case INT_TO_STRING_COLUMN:
+      if (value == null) {
+        unsetIntToStringColumn();
+      } else {
+        setIntToStringColumn((Map<Integer,String>)value);
+      }
+      break;
+
+    case COMPLEX_COLUMN:
+      if (value == null) {
+        unsetComplexColumn();
+      } else {
+        setComplexColumn((Map<Integer,List<Nested>>)value);
+      }
+      break;
+
+    }
+  }
+
+  public Object getFieldValue(_Fields field) {
+    switch (field) {
+    case BOOL_COLUMN:
+      return Boolean.valueOf(isBoolColumn());
+
+    case BYTE_COLUMN:
+      return Byte.valueOf(getByteColumn());
+
+    case SHORT_COLUMN:
+      return Short.valueOf(getShortColumn());
+
+    case INT_COLUMN:
+      return Integer.valueOf(getIntColumn());
+
+    case LONG_COLUMN:
+      return Long.valueOf(getLongColumn());
+
+    case DOUBLE_COLUMN:
+      return Double.valueOf(getDoubleColumn());
+
+    case BINARY_COLUMN:
+      return getBinaryColumn();
+
+    case STRING_COLUMN:
+      return getStringColumn();
+
+    case ENUM_COLUMN:
+      return getEnumColumn();
+
+    case MAYBE_BOOL_COLUMN:
+      return Boolean.valueOf(isMaybeBoolColumn());
+
+    case MAYBE_BYTE_COLUMN:
+      return Byte.valueOf(getMaybeByteColumn());
+
+    case MAYBE_SHORT_COLUMN:
+      return Short.valueOf(getMaybeShortColumn());
+
+    case MAYBE_INT_COLUMN:
+      return Integer.valueOf(getMaybeIntColumn());
+
+    case MAYBE_LONG_COLUMN:
+      return Long.valueOf(getMaybeLongColumn());
+
+    case MAYBE_DOUBLE_COLUMN:
+      return Double.valueOf(getMaybeDoubleColumn());
+
+    case MAYBE_BINARY_COLUMN:
+      return getMaybeBinaryColumn();
+
+    case MAYBE_STRING_COLUMN:
+      return getMaybeStringColumn();
+
+    case MAYBE_ENUM_COLUMN:
+      return getMaybeEnumColumn();
+
+    case STRINGS_COLUMN:
+      return getStringsColumn();
+
+    case INT_SET_COLUMN:
+      return getIntSetColumn();
+
+    case INT_TO_STRING_COLUMN:
+      return getIntToStringColumn();
+
+    case COMPLEX_COLUMN:
+      return getComplexColumn();
+
+    }
+    throw new IllegalStateException();
+  }
+
+  /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */
+  public boolean isSet(_Fields field) {
+    if (field == null) {
+      throw new IllegalArgumentException();
+    }
+
+    switch (field) {
+    case BOOL_COLUMN:
+      return isSetBoolColumn();
+    case BYTE_COLUMN:
+      return isSetByteColumn();
+    case SHORT_COLUMN:
+      return isSetShortColumn();
+    case INT_COLUMN:
+      return isSetIntColumn();
+    case LONG_COLUMN:
+      return isSetLongColumn();
+    case DOUBLE_COLUMN:
+      return isSetDoubleColumn();
+    case BINARY_COLUMN:
+      return isSetBinaryColumn();
+    case STRING_COLUMN:
+      return isSetStringColumn();
+    case ENUM_COLUMN:
+      return isSetEnumColumn();
+    case MAYBE_BOOL_COLUMN:
+      return isSetMaybeBoolColumn();
+    case MAYBE_BYTE_COLUMN:
+      return isSetMaybeByteColumn();
+    case MAYBE_SHORT_COLUMN:
+      return isSetMaybeShortColumn();
+    case MAYBE_INT_COLUMN:
+      return isSetMaybeIntColumn();
+    case MAYBE_LONG_COLUMN:
+      return isSetMaybeLongColumn();
+    case MAYBE_DOUBLE_COLUMN:
+      return isSetMaybeDoubleColumn();
+    case MAYBE_BINARY_COLUMN:
+      return isSetMaybeBinaryColumn();
+    case MAYBE_STRING_COLUMN:
+      return isSetMaybeStringColumn();
+    case MAYBE_ENUM_COLUMN:
+      return isSetMaybeEnumColumn();
+    case STRINGS_COLUMN:
+      return isSetStringsColumn();
+    case INT_SET_COLUMN:
+      return isSetIntSetColumn();
+    case INT_TO_STRING_COLUMN:
+      return isSetIntToStringColumn();
+    case COMPLEX_COLUMN:
+      return isSetComplexColumn();
+    }
+    throw new IllegalStateException();
+  }
+
+  @Override
+  public boolean equals(Object that) {
+    if (that == null)
+      return false;
+    if (that instanceof ParquetThriftCompat)
+      return this.equals((ParquetThriftCompat)that);
+    return false;
+  }
+
+  public boolean equals(ParquetThriftCompat that) {
+    if (that == null)
+      return false;
+
+    boolean this_present_boolColumn = true;
+    boolean that_present_boolColumn = true;
+    if (this_present_boolColumn || that_present_boolColumn) {
+      if (!(this_present_boolColumn && that_present_boolColumn))
+        return false;
+      if (this.boolColumn != that.boolColumn)
+        return false;
+    }
+
+    boolean this_present_byteColumn = true;
+    boolean that_present_byteColumn = true;
+    if (this_present_byteColumn || that_present_byteColumn) {
+      if (!(this_present_byteColumn && that_present_byteColumn))
+        return false;
+      if (this.byteColumn != that.byteColumn)
+        return false;
+    }
+
+    boolean this_present_shortColumn = true;
+    boolean that_present_shortColumn = true;
+    if (this_present_shortColumn || that_present_shortColumn) {
+      if (!(this_present_shortColumn && that_present_shortColumn))
+        return false;
+      if (this.shortColumn != that.shortColumn)
+        return false;
+    }
+
+    boolean this_present_intColumn = true;
+    boolean that_present_intColumn = true;
+    if (this_present_intColumn || that_present_intColumn) {
+      if (!(this_present_intColumn && that_present_intColumn))
+        return false;
+      if (this.intColumn != that.intColumn)
+        return false;
+    }
+
+    boolean this_present_longColumn = true;
+    boolean that_present_longColumn = true;
+    if (this_present_longColumn || that_present_longColumn) {
+      if (!(this_present_longColumn && that_present_longColumn))
+        return false;
+      if (this.longColumn != that.longColumn)
+        return false;
+    }
+
+    boolean this_present_doubleColumn = true;
+    boolean that_present_doubleColumn = true;
+    if (this_present_doubleColumn || that_present_doubleColumn) {
+      if (!(this_present_doubleColumn && that_present_doubleColumn))
+        return false;
+      if (this.doubleColumn != that.doubleColumn)
+        return false;
+    }
+
+    boolean this_present_binaryColumn = true && this.isSetBinaryColumn();
+    boolean that_present_binaryColumn = true && that.isSetBinaryColumn();
+    if (this_present_binaryColumn || that_present_binaryColumn) {
+      if (!(this_present_binaryColumn && that_present_binaryColumn))
+        return false;
+      if (!this.binaryColumn.equals(that.binaryColumn))
+        return false;
+    }
+
+    boolean this_present_stringColumn = true && this.isSetStringColumn();
+    boolean that_present_stringColumn = true && that.isSetStringColumn();
+    if (this_present_stringColumn || that_present_stringColumn) {
+      if (!(this_present_stringColumn && that_present_stringColumn))
+        return false;
+      if (!this.stringColumn.equals(that.stringColumn))
+        return false;
+    }
+
+    boolean this_present_enumColumn = true && this.isSetEnumColumn();
+    boolean that_present_enumColumn = true && that.isSetEnumColumn();
+    if (this_present_enumColumn || that_present_enumColumn) {
+      if (!(this_present_enumColumn && that_present_enumColumn))
+        return false;
+      if (!this.enumColumn.equals(that.enumColumn))
+        return false;
+    }
+
+    boolean this_present_maybeBoolColumn = true && this.isSetMaybeBoolColumn();
+    boolean that_present_maybeBoolColumn = true && that.isSetMaybeBoolColumn();
+    if (this_present_maybeBoolColumn || that_present_maybeBoolColumn) {
+      if (!(this_present_maybeBoolColumn && that_present_maybeBoolColumn))
+        return false;
+      if (this.maybeBoolColumn != that.maybeBoolColumn)
+        return false;
+    }
+
+    boolean this_present_maybeByteColumn = true && this.isSetMaybeByteColumn();
+    boolean that_present_maybeByteColumn = true && that.isSetMaybeByteColumn();
+    if (this_present_maybeByteColumn || that_present_maybeByteColumn) {
+      if (!(this_present_maybeByteColumn && that_present_maybeByteColumn))
+        return false;
+      if (this.maybeByteColumn != that.maybeByteColumn)
+        return false;
+    }
+
+    boolean this_present_maybeShortColumn = true && this.isSetMaybeShortColumn();
+    boolean that_present_maybeShortColumn = true && that.isSetMaybeShortColumn();
+    if (this_present_maybeShortColumn || that_present_maybeShortColumn) {
+      if (!(this_present_maybeShortColumn && that_present_maybeShortColumn))
+        return false;
+      if (this.maybeShortColumn != that.maybeShortColumn)
+        return false;
+    }
+
+    boolean this_present_maybeIntColumn = true && this.isSetMaybeIntColumn();
+    boolean that_present_maybeIntColumn = true && that.isSetMaybeIntColumn();
+    if (this_present_maybeIntColumn || that_present_maybeIntColumn) {
+      if (!(this_present_maybeIntColumn && that_present_maybeIntColumn))
+        return false;
+      if (this.maybeIntColumn != that.maybeIntColumn)
+        return false;
+    }
+
+    boolean this_present_maybeLongColumn = true && this.isSetMaybeLongColumn();
+    boolean that_present_maybeLongColumn = true && that.isSetMaybeLongColumn();
+    if (this_present_maybeLongColumn || that_present_maybeLongColumn) {
+      if (!(this_present_maybeLongColumn && that_present_maybeLongColumn))
+        return false;
+      if (this.maybeLongColumn != that.maybeLongColumn)
+        return false;
+    }
+
+    boolean this_present_maybeDoubleColumn = true && this.isSetMaybeDoubleColumn();
+    boolean that_present_maybeDoubleColumn = true && that.isSetMaybeDoubleColumn();
+    if (this_present_maybeDoubleColumn || that_present_maybeDoubleColumn) {
+      if (!(this_present_maybeDoubleColumn && that_present_maybeDoubleColumn))
+        return false;
+      if (this.maybeDoubleColumn != that.maybeDoubleColumn)
+        return false;
+    }
+
+    boolean this_present_maybeBinaryColumn = true && this.isSetMaybeBinaryColumn();
+    boolean that_present_maybeBinaryColumn = true && that.isSetMaybeBinaryColumn();
+    if (this_present_maybeBinaryColumn || that_present_maybeBinaryColumn) {
+      if (!(this_present_maybeBinaryColumn && that_present_maybeBinaryColumn))
+        return false;
+      if (!this.maybeBinaryColumn.equals(that.maybeBinaryColumn))
+        return false;
+    }
+
+    boolean this_present_maybeStringColumn = true && this.isSetMaybeStringColumn();
+    boolean that_present_maybeStringColumn = true && that.isSetMaybeStringColumn();
+    if (this_present_maybeStringColumn || that_present_maybeStringColumn) {
+      if (!(this_present_maybeStringColumn && that_present_maybeStringColumn))
+        return false;
+      if (!this.maybeStringColumn.equals(that.maybeStringColumn))
+        return false;
+    }
+
+    boolean this_present_maybeEnumColumn = true && this.isSetMaybeEnumColumn();
+    boolean that_present_maybeEnumColumn = true && that.isSetMaybeEnumColumn();
+    if (this_present_maybeEnumColumn || that_present_maybeEnumColumn) {
+      if (!(this_present_maybeEnumColumn && that_present_maybeEnumColumn))
+        return false;
+      if (!this.maybeEnumColumn.equals(that.maybeEnumColumn))
+        return false;
+    }
+
+    boolean this_present_stringsColumn = true && this.isSetStringsColumn();
+    boolean that_present_stringsColumn = true && that.isSetStringsColumn();
+    if (this_present_stringsColumn || that_present_stringsColumn) {
+      if (!(this_present_stringsColumn && that_present_stringsColumn))
+        return false;
+      if (!this.stringsColumn.equals(that.stringsColumn))
+        return false;
+    }
+
+    boolean this_present_intSetColumn = true && this.isSetIntSetColumn();
+    boolean that_present_intSetColumn = true && that.isSetIntSetColumn();
+    if (this_present_intSetColumn || that_present_intSetColumn) {
+      if (!(this_present_intSetColumn && that_present_intSetColumn))
+        return false;
+      if (!this.intSetColumn.equals(that.intSetColumn))
+        return false;
+    }
+
+    boolean this_present_intToStringColumn = true && this.isSetIntToStringColumn();
+    boolean that_present_intToStringColumn = true && that.isSetIntToStringColumn();
+    if (this_present_intToStringColumn || that_present_intToStringColumn) {
+      if (!(this_present_intToStringColumn && that_present_intToStringColumn))
+        return false;
+      if (!this.intToStringColumn.equals(that.intToStringColumn))
+        return false;
+    }
+
+    boolean this_present_complexColumn = true && this.isSetComplexColumn();
+    boolean that_present_complexColumn = true && that.isSetComplexColumn();
+    if (this_present_complexColumn || that_present_complexColumn) {
+      if (!(this_present_complexColumn && that_present_complexColumn))
+        return false;
+      if (!this.complexColumn.equals(that.complexColumn))
+        return false;
+    }
+
+    return true;
+  }
+
+  @Override
+  public int hashCode() {
+    List<Object> list = new ArrayList<Object>();
+
+    boolean present_boolColumn = true;
+    list.add(present_boolColumn);
+    if (present_boolColumn)
+      list.add(boolColumn);
+
+    boolean present_byteColumn = true;
+    list.add(present_byteColumn);
+    if (present_byteColumn)
+      list.add(byteColumn);
+
+    boolean present_shortColumn = true;
+    list.add(present_shortColumn);
+    if (present_shortColumn)
+      list.add(shortColumn);
+
+    boolean present_intColumn = true;
+    list.add(present_intColumn);
+    if (present_intColumn)
+      list.add(intColumn);
+
+    boolean present_longColumn = true;
+    list.add(present_longColumn);
+    if (present_longColumn)
+      list.add(longColumn);
+
+    boolean present_doubleColumn = true;
+    list.add(present_doubleColumn);
+    if (present_doubleColumn)
+      list.add(doubleColumn);
+
+    boolean present_binaryColumn = true && (isSetBinaryColumn());
+    list.add(present_binaryColumn);
+    if (present_binaryColumn)
+      list.add(binaryColumn);
+
+    boolean present_stringColumn = true && (isSetStringColumn());
+    list.add(present_stringColumn);
+    if (present_stringColumn)
+      list.add(stringColumn);
+
+    boolean present_enumColumn = true && (isSetEnumColumn());
+    list.add(present_enumColumn);
+    if (present_enumColumn)
+      list.add(enumColumn.getValue());
+
+    boolean present_maybeBoolColumn = true && (isSetMaybeBoolColumn());
+    list.add(present_maybeBoolColumn);
+    if (present_maybeBoolColumn)
+      list.add(maybeBoolColumn);
+
+    boolean present_maybeByteColumn = true && (isSetMaybeByteColumn());
+    list.add(present_maybeByteColumn);
+    if (present_maybeByteColumn)
+      list.add(maybeByteColumn);
+
+    boolean present_maybeShortColumn = true && (isSetMaybeShortColumn());
+    list.add(present_maybeShortColumn);
+    if (present_maybeShortColumn)
+      list.add(maybeShortColumn);
+
+    boolean present_maybeIntColumn = true && (isSetMaybeIntColumn());
+    list.add(present_maybeIntColumn);
+    if (present_maybeIntColumn)
+      list.add(maybeIntColumn);
+
+    boolean present_maybeLongColumn = true && (isSetMaybeLongColumn());
+    list.add(present_maybeLongColumn);
+    if (present_maybeLongColumn)
+      list.add(maybeLongColumn);
+
+    boolean present_maybeDoubleColumn = true && (isSetMaybeDoubleColumn());
+    list.add(present_maybeDoubleColumn);
+    if (present_maybeDoubleColumn)
+      list.add(maybeDoubleColumn);
+
+    boolean present_maybeBinaryColumn = true && (isSetMaybeBinaryColumn());
+    list.add(present_maybeBinaryColumn);
+    if (present_maybeBinaryColumn)
+      list.add(maybeBinaryColumn);
+
+    boolean present_maybeStringColumn = true && (isSetMaybeStringColumn());
+    list.add(present_maybeStringColumn);
+    if (present_maybeStringColumn)
+      list.add(maybeStringColumn);
+
+    boolean present_maybeEnumColumn = true && (isSetMaybeEnumColumn());
+    list.add(present_maybeEnumColumn);
+    if (present_maybeEnumColumn)
+      list.add(maybeEnumColumn.getValue());
+
+    boolean present_stringsColumn = true && (isSetStringsColumn());
+    list.add(present_stringsColumn);
+    if (present_stringsColumn)
+      list.add(stringsColumn);
+
+    boolean present_intSetColumn = true && (isSetIntSetColumn());
+    list.add(present_intSetColumn);
+    if (present_intSetColumn)
+      list.add(intSetColumn);
+
+    boolean present_intToStringColumn = true && (isSetIntToStringColumn());
+    list.add(present_intToStringColumn);
+    if (present_intToStringColumn)
+      list.add(intToStringColumn);
+
+    boolean present_complexColumn = true && (isSetComplexColumn());
+    list.add(present_complexColumn);
+    if (present_complexColumn)
+      list.add(complexColumn);
+
+    return list.hashCode();
+  }
+
+  @Override
+  public int compareTo(ParquetThriftCompat other) {
+    if (!getClass().equals(other.getClass())) {
+      return getClass().getName().compareTo(other.getClass().getName());
+    }
+
+    int lastComparison = 0;
+
+    lastComparison = Boolean.valueOf(isSetBoolColumn()).compareTo(other.isSetBoolColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetBoolColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.boolColumn, other.boolColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetByteColumn()).compareTo(other.isSetByteColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetByteColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.byteColumn, other.byteColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetShortColumn()).compareTo(other.isSetShortColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetShortColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.shortColumn, other.shortColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetIntColumn()).compareTo(other.isSetIntColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetIntColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.intColumn, other.intColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetLongColumn()).compareTo(other.isSetLongColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetLongColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.longColumn, other.longColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetDoubleColumn()).compareTo(other.isSetDoubleColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetDoubleColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.doubleColumn, other.doubleColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetBinaryColumn()).compareTo(other.isSetBinaryColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetBinaryColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.binaryColumn, other.binaryColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetStringColumn()).compareTo(other.isSetStringColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetStringColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.stringColumn, other.stringColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetEnumColumn()).compareTo(other.isSetEnumColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetEnumColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.enumColumn, other.enumColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMaybeBoolColumn()).compareTo(other.isSetMaybeBoolColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMaybeBoolColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeBoolColumn, other.maybeBoolColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMaybeByteColumn()).compareTo(other.isSetMaybeByteColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMaybeByteColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeByteColumn, other.maybeByteColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMaybeShortColumn()).compareTo(other.isSetMaybeShortColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMaybeShortColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeShortColumn, other.maybeShortColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMaybeIntColumn()).compareTo(other.isSetMaybeIntColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMaybeIntColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeIntColumn, other.maybeIntColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMaybeLongColumn()).compareTo(other.isSetMaybeLongColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMaybeLongColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeLongColumn, other.maybeLongColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMaybeDoubleColumn()).compareTo(other.isSetMaybeDoubleColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMaybeDoubleColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeDoubleColumn, other.maybeDoubleColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMaybeBinaryColumn()).compareTo(other.isSetMaybeBinaryColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMaybeBinaryColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeBinaryColumn, other.maybeBinaryColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMaybeStringColumn()).compareTo(other.isSetMaybeStringColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMaybeStringColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeStringColumn, other.maybeStringColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetMaybeEnumColumn()).compareTo(other.isSetMaybeEnumColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetMaybeEnumColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeEnumColumn, other.maybeEnumColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetStringsColumn()).compareTo(other.isSetStringsColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetStringsColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.stringsColumn, other.stringsColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetIntSetColumn()).compareTo(other.isSetIntSetColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetIntSetColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.intSetColumn, other.intSetColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetIntToStringColumn()).compareTo(other.isSetIntToStringColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetIntToStringColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.intToStringColumn, other.intToStringColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    lastComparison = Boolean.valueOf(isSetComplexColumn()).compareTo(other.isSetComplexColumn());
+    if (lastComparison != 0) {
+      return lastComparison;
+    }
+    if (isSetComplexColumn()) {
+      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.complexColumn, other.complexColumn);
+      if (lastComparison != 0) {
+        return lastComparison;
+      }
+    }
+    return 0;
+  }
+
+  public _Fields fieldForId(int fieldId) {
+    return _Fields.findByThriftId(fieldId);
+  }
+
+  public void read(org.apache.thrift.protocol.TProtocol iprot) throws org.apache.thrift.TException {
+    schemes.get(iprot.getScheme()).getScheme().read(iprot, this);
+  }
+
+  public void write(org.apache.thrift.protocol.TProtocol oprot) throws org.apache.thrift.TException {
+    schemes.get(oprot.getScheme()).getScheme().write(oprot, this);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder("ParquetThriftCompat(");
+    boolean first = true;
+
+    sb.append("boolColumn:");
+    sb.append(this.boolColumn);
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("byteColumn:");
+    sb.append(this.byteColumn);
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("shortColumn:");
+    sb.append(this.shortColumn);
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("intColumn:");
+    sb.append(this.intColumn);
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("longColumn:");
+    sb.append(this.longColumn);
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("doubleColumn:");
+    sb.append(this.doubleColumn);
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("binaryColumn:");
+    if (this.binaryColumn == null) {
+      sb.append("null");
+    } else {
+      org.apache.thrift.TBaseHelper.toString(this.binaryColumn, sb);
+    }
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("stringColumn:");
+    if (this.stringColumn == null) {
+      sb.append("null");
+    } else {
+      sb.append(this.stringColumn);
+    }
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("enumColumn:");
+    if (this.enumColumn == null) {
+      sb.append("null");
+    } else {
+      sb.append(this.enumColumn);
+    }
+    first = false;
+    if (isSetMaybeBoolColumn()) {
+      if (!first) sb.append(", ");
+      sb.append("maybeBoolColumn:");
+      sb.append(this.maybeBoolColumn);
+      first = false;
+    }
+    if (isSetMaybeByteColumn()) {
+      if (!first) sb.append(", ");
+      sb.append("maybeByteColumn:");
+      sb.append(this.maybeByteColumn);
+      first = false;
+    }
+    if (isSetMaybeShortColumn()) {
+      if (!first) sb.append(", ");
+      sb.append("maybeShortColumn:");
+      sb.append(this.maybeShortColumn);
+      first = false;
+    }
+    if (isSetMaybeIntColumn()) {
+      if (!first) sb.append(", ");
+      sb.append("maybeIntColumn:");
+      sb.append(this.maybeIntColumn);
+      first = false;
+    }
+    if (isSetMaybeLongColumn()) {
+      if (!first) sb.append(", ");
+      sb.append("maybeLongColumn:");
+      sb.append(this.maybeLongColumn);
+      first = false;
+    }
+    if (isSetMaybeDoubleColumn()) {
+      if (!first) sb.append(", ");
+      sb.append("maybeDoubleColumn:");
+      sb.append(this.maybeDoubleColumn);
+      first = false;
+    }
+    if (isSetMaybeBinaryColumn()) {
+      if (!first) sb.append(", ");
+      sb.append("maybeBinaryColumn:");
+      if (this.maybeBinaryColumn == null) {
+        sb.append("null");
+      } else {
+        org.apache.thrift.TBaseHelper.toString(this.maybeBinaryColumn, sb);
+      }
+      first = false;
+    }
+    if (isSetMaybeStringColumn()) {
+      if (!first) sb.append(", ");
+      sb.append("maybeStringColumn:");
+      if (this.maybeStringColumn == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.maybeStringColumn);
+      }
+      first = false;
+    }
+    if (isSetMaybeEnumColumn()) {
+      if (!first) sb.append(", ");
+      sb.append("maybeEnumColumn:");
+      if (this.maybeEnumColumn == null) {
+        sb.append("null");
+      } else {
+        sb.append(this.maybeEnumColumn);
+      }
+      first = false;
+    }
+    if (!first) sb.append(", ");
+    sb.append("stringsColumn:");
+    if (this.stringsColumn == null) {
+      sb.append("null");
+    } else {
+      sb.append(this.stringsColumn);
+    }
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("intSetColumn:");
+    if (this.intSetColumn == null) {
+      sb.append("null");
+    } else {
+      sb.append(this.intSetColumn);
+    }
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("intToStringColumn:");
+    if (this.intToStringColumn == null) {
+      sb.append("null");
+    } else {
+      sb.append(this.intToStringColumn);
+    }
+    first = false;
+    if (!first) sb.append(", ");
+    sb.append("complexColumn:");
+    if (this.complexColumn == null) {
+      sb.append("null");
+    } else {
+      sb.append(this.complexColumn);
+    }
+    first = false;
+    sb.append(")");
+    return sb.toString();
+  }
+
+  public void validate() throws org.apache.thrift.TException {
+    // check for required fields
+    // alas, we cannot check 'boolColumn' because it's a primitive and you chose the non-beans generator.
+    // alas, we cannot check 'byteColumn' because it's a primitive and you chose the non-beans generator.
+    // alas, we cannot check 'shortColumn' because it's a primitive and you chose the non-beans generator.
+    // alas, we cannot check 'intColumn' because it's a primitive and you chose the non-beans generator.
+    // alas, we cannot check 'longColumn' because it's a primitive and you chose the non-beans generator.
+    // alas, we cannot check 'doubleColumn' because it's a primitive and you chose the non-beans generator.
+    if (binaryColumn == null) {
+      throw new org.apache.thrift.protocol.TProtocolException("Required field 'binaryColumn' was not present! Struct: " + toString());
+    }
+    if (stringColumn == null) {
+      throw new org.apache.thrift.protocol.TProtocolException("Required field 'stringColumn' was not present! Struct: " + toString());
+    }
+    if (enumColumn == null) {
+      throw new org.apache.thrift.protocol.TProtocolException("Required field 'enumColumn' was not present! Struct: " + toString());
+    }
+    if (stringsColumn == null) {
+      throw new org.apache.thrift.protocol.TProtocolException("Required field 'stringsColumn' was not present! Struct: " + toString());
+    }
+    if (intSetColumn == null) {
+      throw new org.apache.thrift.protocol.TProtocolException("Required field 'intSetColumn' was not present! Struct: " + toString());
+    }
+    if (intToStringColumn == null) {
+      throw new org.apache.thrift.protocol.TProtocolException("Required field 'intToStringColumn' was not present! Struct: " + toString());
+    }
+    if (complexColumn == null) {
+      throw new org.apache.thrift.protocol.TProtocolException("Required field 'complexColumn' was not present! Struct: " + toString());
+    }
+    // check for sub-struct validity
+  }
+
+  private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
+    try {
+      write(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(out)));
+    } catch (org.apache.thrift.TException te) {
+      throw new java.io.IOException(te);
+    }
+  }
+
+  private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
+    try {
+      // it doesn't seem like you should have to do this, but java serialization is wacky, and doesn't call the default constructor.
+      __isset_bitfield = 0;
+      read(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(in)));
+    } catch (org.apache.thrift.TException te) {
+      throw new java.io.IOException(te);
+    }
+  }
+
+  private static class ParquetThriftCompatStandardSchemeFactory implements SchemeFactory {
+    public ParquetThriftCompatStandardScheme getScheme() {
+      return new ParquetThriftCompatStandardScheme();
+    }
+  }
+
+  private static class ParquetThriftCompatStandardScheme extends StandardScheme<ParquetThriftCompat> {
+
+    public void read(org.apache.thrift.protocol.TProtocol iprot, ParquetThriftCompat struct) throws org.apache.thrift.TException {
+      org.apache.thrift.protocol.TField schemeField;
+      iprot.readStructBegin();
+      while (true)
+      {
+        schemeField = iprot.readFieldBegin();
+        if (schemeField.type == org.apache.thrift.protocol.TType.STOP) { 
+          break;
+        }
+        switch (schemeField.id) {
+          case 1: // BOOL_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.BOOL) {
+              struct.boolColumn = iprot.readBool();
+              struct.setBoolColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 2: // BYTE_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.BYTE) {
+              struct.byteColumn = iprot.readByte();
+              struct.setByteColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 3: // SHORT_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.I16) {
+              struct.shortColumn = iprot.readI16();
+              struct.setShortColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 4: // INT_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+              struct.intColumn = iprot.readI32();
+              struct.setIntColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 5: // LONG_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.I64) {
+              struct.longColumn = iprot.readI64();
+              struct.setLongColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 6: // DOUBLE_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.DOUBLE) {
+              struct.doubleColumn = iprot.readDouble();
+              struct.setDoubleColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 7: // BINARY_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
+              struct.binaryColumn = iprot.readBinary();
+              struct.setBinaryColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 8: // STRING_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
+              struct.stringColumn = iprot.readString();
+              struct.setStringColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 9: // ENUM_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+              struct.enumColumn = org.apache.spark.sql.parquet.test.thrift.Suit.findByValue(iprot.readI32());
+              struct.setEnumColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 10: // MAYBE_BOOL_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.BOOL) {
+              struct.maybeBoolColumn = iprot.readBool();
+              struct.setMaybeBoolColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 11: // MAYBE_BYTE_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.BYTE) {
+              struct.maybeByteColumn = iprot.readByte();
+              struct.setMaybeByteColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 12: // MAYBE_SHORT_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.I16) {
+              struct.maybeShortColumn = iprot.readI16();
+              struct.setMaybeShortColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 13: // MAYBE_INT_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+              struct.maybeIntColumn = iprot.readI32();
+              struct.setMaybeIntColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 14: // MAYBE_LONG_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.I64) {
+              struct.maybeLongColumn = iprot.readI64();
+              struct.setMaybeLongColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 15: // MAYBE_DOUBLE_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.DOUBLE) {
+              struct.maybeDoubleColumn = iprot.readDouble();
+              struct.setMaybeDoubleColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 16: // MAYBE_BINARY_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
+              struct.maybeBinaryColumn = iprot.readBinary();
+              struct.setMaybeBinaryColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 17: // MAYBE_STRING_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
+              struct.maybeStringColumn = iprot.readString();
+              struct.setMaybeStringColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 18: // MAYBE_ENUM_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+              struct.maybeEnumColumn = org.apache.spark.sql.parquet.test.thrift.Suit.findByValue(iprot.readI32());
+              struct.setMaybeEnumColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 19: // STRINGS_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
+              {
+                org.apache.thrift.protocol.TList _list8 = iprot.readListBegin();
+                struct.stringsColumn = new ArrayList<String>(_list8.size);
+                String _elem9;
+                for (int _i10 = 0; _i10 < _list8.size; ++_i10)
+                {
+                  _elem9 = iprot.readString();
+                  struct.stringsColumn.add(_elem9);
+                }
+                iprot.readListEnd();
+              }
+              struct.setStringsColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 20: // INT_SET_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.SET) {
+              {
+                org.apache.thrift.protocol.TSet _set11 = iprot.readSetBegin();
+                struct.intSetColumn = new HashSet<Integer>(2*_set11.size);
+                int _elem12;
+                for (int _i13 = 0; _i13 < _set11.size; ++_i13)
+                {
+                  _elem12 = iprot.readI32();
+                  struct.intSetColumn.add(_elem12);
+                }
+                iprot.readSetEnd();
+              }
+              struct.setIntSetColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 21: // INT_TO_STRING_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
+              {
+                org.apache.thrift.protocol.TMap _map14 = iprot.readMapBegin();
+                struct.intToStringColumn = new HashMap<Integer,String>(2*_map14.size);
+                int _key15;
+                String _val16;
+                for (int _i17 = 0; _i17 < _map14.size; ++_i17)
+                {
+                  _key15 = iprot.readI32();
+                  _val16 = iprot.readString();
+                  struct.intToStringColumn.put(_key15, _val16);
+                }
+                iprot.readMapEnd();
+              }
+              struct.setIntToStringColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          case 22: // COMPLEX_COLUMN
+            if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
+              {
+                org.apache.thrift.protocol.TMap _map18 = iprot.readMapBegin();
+                struct.complexColumn = new HashMap<Integer,List<Nested>>(2*_map18.size);
+                int _key19;
+                List<Nested> _val20;
+                for (int _i21 = 0; _i21 < _map18.size; ++_i21)
+                {
+                  _key19 = iprot.readI32();
+                  {
+                    org.apache.thrift.protocol.TList _list22 = iprot.readListBegin();
+                    _val20 = new ArrayList<Nested>(_list22.size);
+                    Nested _elem23;
+                    for (int _i24 = 0; _i24 < _list22.size; ++_i24)
+                    {
+                      _elem23 = new Nested();
+                      _elem23.read(iprot);
+                      _val20.add(_elem23);
+                    }
+                    iprot.readListEnd();
+                  }
+                  struct.complexColumn.put(_key19, _val20);
+                }
+                iprot.readMapEnd();
+              }
+              struct.setComplexColumnIsSet(true);
+            } else { 
+              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+            }
+            break;
+          default:
+            org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+        }
+        iprot.readFieldEnd();
+      }
+      iprot.readStructEnd();
+
+      // check for required fields of primitive type, which can't be checked in the validate method
+      if (!struct.isSetBoolColumn()) {
+        throw new org.apache.thrift.protocol.TProtocolException("Required field 'boolColumn' was not found in serialized data! Struct: " + toString());
+      }
+      if (!struct.isSetByteColumn()) {
+        throw new org.apache.thrift.protocol.TProtocolException("Required field 'byteColumn' was not found in serialized data! Struct: " + toString());
+      }
+      if (!struct.isSetShortColumn()) {
+        throw new org.apache.thrift.protocol.TProtocolException("Required field 'shortColumn' was not found in serialized data! Struct: " + toString());
+      }
+      if (!struct.isSetIntColumn()) {
+        throw new org.apache.thrift.protocol.TProtocolException("Required field 'intColumn' was not found in serialized data! Struct: " + toString());
+      }
+      if (!struct.isSetLongColumn()) {
+        throw new org.apache.thrift.protocol.TProtocolException("Required field 'longColumn' was not found in serialized data! Struct: " + toString());
+      }
+      if (!struct.isSetDoubleColumn()) {
+        throw new org.apache.thrift.protocol.TProtocolException("Required field 'doubleColumn' was not found in serialized data! Struct: " + toString());
+      }
+      struct.validate();
+    }
+
+    public void write(org.apache.thrift.protocol.TProtocol oprot, ParquetThriftCompat struct) throws org.apache.thrift.TException {
+      struct.validate();
+
+      oprot.writeStructBegin(STRUCT_DESC);
+      oprot.writeFieldBegin(BOOL_COLUMN_FIELD_DESC);
+      oprot.writeBool(struct.boolColumn);
+      oprot.writeFieldEnd();
+      oprot.writeFieldBegin(BYTE_COLUMN_FIELD_DESC);
+      oprot.writeByte(struct.byteColumn);
+      oprot.writeFieldEnd();
+      oprot.writeFieldBegin(SHORT_COLUMN_FIELD_DESC);
+      oprot.writeI16(struct.shortColumn);
+      oprot.writeFieldEnd();
+      oprot.writeFieldBegin(INT_COLUMN_FIELD_DESC);
+      oprot.writeI32(struct.intColumn);
+      oprot.writeFieldEnd();
+      oprot.writeFieldBegin(LONG_COLUMN_FIELD_DESC);
+      oprot.writeI64(struct.longColumn);
+      oprot.writeFieldEnd();
+      oprot.writeFieldBegin(DOUBLE_COLUMN_FIELD_DESC);
+      oprot.writeDouble(struct.doubleColumn);
+      oprot.writeFieldEnd();
+      if (struct.binaryColumn != null) {
+        oprot.writeFieldBegin(BINARY_COLUMN_FIELD_DESC);
+        oprot.writeBinary(struct.binaryColumn);
+        oprot.writeFieldEnd();
+      }
+      if (struct.stringColumn != null) {
+        oprot.writeFieldBegin(STRING_COLUMN_FIELD_DESC);
+        oprot.writeString(struct.stringColumn);
+        oprot.writeFieldEnd();
+      }
+      if (struct.enumColumn != null) {
+        oprot.writeFieldBegin(ENUM_COLUMN_FIELD_DESC);
+        oprot.writeI32(struct.enumColumn.getValue());
+        oprot.writeFieldEnd();
+      }
+      if (struct.isSetMaybeBoolColumn()) {
+        oprot.writeFieldBegin(MAYBE_BOOL_COLUMN_FIELD_DESC);
+        oprot.writeBool(struct.maybeBoolColumn);
+        oprot.writeFieldEnd();
+      }
+      if (struct.isSetMaybeByteColumn()) {
+        oprot.writeFieldBegin(MAYBE_BYTE_COLUMN_FIELD_DESC);
+        oprot.writeByte(struct.maybeByteColumn);
+        oprot.writeFieldEnd();
+      }
+      if (struct.isSetMaybeShortColumn()) {
+        oprot.writeFieldBegin(MAYBE_SHORT_COLUMN_FIELD_DESC);
+        oprot.writeI16(struct.maybeShortColumn);
+        oprot.writeFieldEnd();
+      }
+      if (struct.isSetMaybeIntColumn()) {
+        oprot.writeFieldBegin(MAYBE_INT_COLUMN_FIELD_DESC);
+        oprot.writeI32(struct.maybeIntColumn);
+        oprot.writeFieldEnd();
+      }
+      if (struct.isSetMaybeLongColumn()) {
+        oprot.writeFieldBegin(MAYBE_LONG_COLUMN_FIELD_DESC);
+        oprot.writeI64(struct.maybeLongColumn);
+        oprot.writeFieldEnd();
+      }
+      if (struct.isSetMaybeDoubleColumn()) {
+        oprot.writeFieldBegin(MAYBE_DOUBLE_COLUMN_FIELD_DESC);
+        oprot.writeDouble(struct.maybeDoubleColumn);
+        oprot.writeFieldEnd();
+      }
+      if (struct.maybeBinaryColumn != null) {
+        if (struct.isSetMaybeBinaryColumn()) {
+          oprot.writeFieldBegin(MAYBE_BINARY_COLUMN_FIELD_DESC);
+          oprot.writeBinary(struct.maybeBinaryColumn);
+          oprot.writeFieldEnd();
+        }
+      }
+      if (struct.maybeStringColumn != null) {
+        if (struct.isSetMaybeStringColumn()) {
+          oprot.writeFieldBegin(MAYBE_STRING_COLUMN_FIELD_DESC);
+          oprot.writeString(struct.maybeStringColumn);
+          oprot.writeFieldEnd();
+        }
+      }
+      if (struct.maybeEnumColumn != null) {
+        if (struct.isSetMaybeEnumColumn()) {
+          oprot.writeFieldBegin(MAYBE_ENUM_COLUMN_FIELD_DESC);
+          oprot.writeI32(struct.maybeEnumColumn.getValue());
+          oprot.writeFieldEnd();
+        }
+      }
+      if (struct.stringsColumn != null) {
+        oprot.writeFieldBegin(STRINGS_COLUMN_FIELD_DESC);
+        {
+          oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, struct.stringsColumn.size()));
+          for (String _iter25 : struct.stringsColumn)
+          {
+            oprot.writeString(_iter25);
+          }
+          oprot.writeListEnd();
+        }
+        oprot.writeFieldEnd();
+      }
+      if (struct.intSetColumn != null) {
+        oprot.writeFieldBegin(INT_SET_COLUMN_FIELD_DESC);
+        {
+          oprot.writeSetBegin(new org.apache.thrift.protocol.TSet(org.apache.thrift.protocol.TType.I32, struct.intSetColumn.size()));
+          for (int _iter26 : struct.intSetColumn)
+          {
+            oprot.writeI32(_iter26);
+          }
+          oprot.writeSetEnd();
+        }
+        oprot.writeFieldEnd();
+      }
+      if (struct.intToStringColumn != null) {
+        oprot.writeFieldBegin(INT_TO_STRING_COLUMN_FIELD_DESC);
+        {
+          oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.STRING, struct.intToStringColumn.size()));
+          for (Map.Entry<Integer, String> _iter27 : struct.intToStringColumn.entrySet())
+          {
+            oprot.writeI32(_iter27.getKey());
+            oprot.writeString(_iter27.getValue());
+          }
+          oprot.writeMapEnd();
+        }
+        oprot.writeFieldEnd();
+      }
+      if (struct.complexColumn != null) {
+        oprot.writeFieldBegin(COMPLEX_COLUMN_FIELD_DESC);
+        {
+          oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.LIST, struct.complexColumn.size()));
+          for (Map.Entry<Integer, List<Nested>> _iter28 : struct.complexColumn.entrySet())
+          {
+            oprot.writeI32(_iter28.getKey());
+            {
+              oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, _iter28.getValue().size()));
+              for (Nested _iter29 : _iter28.getValue())
+              {
+                _iter29.write(oprot);
+              }
+              oprot.writeListEnd();
+            }
+          }
+          oprot.writeMapEnd();
+        }
+        oprot.writeFieldEnd();
+      }
+      oprot.writeFieldStop();
+      oprot.writeStructEnd();
+    }
+
+  }
+
+  private static class ParquetThriftCompatTupleSchemeFactory implements SchemeFactory {
+    public ParquetThriftCompatTupleScheme getScheme() {
+      return new ParquetThriftCompatTupleScheme();
+    }
+  }
+
+  private static class ParquetThriftCompatTupleScheme extends TupleScheme<ParquetThriftCompat> {
+
+    @Override
+    public void write(org.apache.thrift.protocol.TProtocol prot, ParquetThriftCompat struct) throws org.apache.thrift.TException {
+      TTupleProtocol oprot = (TTupleProtocol) prot;
+      oprot.writeBool(struct.boolColumn);
+      oprot.writeByte(struct.byteColumn);
+      oprot.writeI16(struct.shortColumn);
+      oprot.writeI32(struct.intColumn);
+      oprot.writeI64(struct.longColumn);
+      oprot.writeDouble(struct.doubleColumn);
+      oprot.writeBinary(struct.binaryColumn);
+      oprot.writeString(struct.stringColumn);
+      oprot.writeI32(struct.enumColumn.getValue());
+      {
+        oprot.writeI32(struct.stringsColumn.size());
+        for (String _iter30 : struct.stringsColumn)
+        {
+          oprot.writeString(_iter30);
+        }
+      }
+      {
+        oprot.writeI32(struct.intSetColumn.size());
+        for (int _iter31 : struct.intSetColumn)
+        {
+          oprot.writeI32(_iter31);
+        }
+      }
+      {
+        oprot.writeI32(struct.intToStringColumn.size());
+        for (Map.Entry<Integer, String> _iter32 : struct.intToStringColumn.entrySet())
+        {
+          oprot.writeI32(_iter32.getKey());
+          oprot.writeString(_iter32.getValue());
+        }
+      }
+      {
+        oprot.writeI32(struct.complexColumn.size());
+        for (Map.Entry<Integer, List<Nested>> _iter33 : struct.complexColumn.entrySet())
+        {
+          oprot.writeI32(_iter33.getKey());
+          {
+            oprot.writeI32(_iter33.getValue().size());
+            for (Nested _iter34 : _iter33.getValue())
+            {
+              _iter34.write(oprot);
+            }
+          }
+        }
+      }
+      BitSet optionals = new BitSet();
+      if (struct.isSetMaybeBoolColumn()) {
+        optionals.set(0);
+      }
+      if (struct.isSetMaybeByteColumn()) {
+        optionals.set(1);
+      }
+      if (struct.isSetMaybeShortColumn()) {
+        optionals.set(2);
+      }
+      if (struct.isSetMaybeIntColumn()) {
+        optionals.set(3);
+      }
+      if (struct.isSetMaybeLongColumn()) {
+        optionals.set(4);
+      }
+      if (struct.isSetMaybeDoubleColumn()) {
+        optionals.set(5);
+      }
+      if (struct.isSetMaybeBinaryColumn()) {
+        optionals.set(6);
+      }
+      if (struct.isSetMaybeStringColumn()) {
+        optionals.set(7);
+      }
+      if (struct.isSetMaybeEnumColumn()) {
+        optionals.set(8);
+      }
+      oprot.writeBitSet(optionals, 9);
+      if (struct.isSetMaybeBoolColumn()) {
+        oprot.writeBool(struct.maybeBoolColumn);
+      }
+      if (struct.isSetMaybeByteColumn()) {
+        oprot.writeByte(struct.maybeByteColumn);
+      }
+      if (struct.isSetMaybeShortColumn()) {
+        oprot.writeI16(struct.maybeShortColumn);
+      }
+      if (struct.isSetMaybeIntColumn()) {
+        oprot.writeI32(struct.maybeIntColumn);
+      }
+      if (struct.isSetMaybeLongColumn()) {
+        oprot.writeI64(struct.maybeLongColumn);
+      }
+      if (struct.isSetMaybeDoubleColumn()) {
+        oprot.writeDouble(struct.maybeDoubleColumn);
+      }
+      if (struct.isSetMaybeBinaryColumn()) {
+        oprot.writeBinary(struct.maybeBinaryColumn);
+      }
+      if (struct.isSetMaybeStringColumn()) {
+        oprot.writeString(struct.maybeStringColumn);
+      }
+      if (struct.isSetMaybeEnumColumn()) {
+        oprot.writeI32(struct.maybeEnumColumn.getValue());
+      }
+    }
+
+    @Override
+    public void read(org.apache.thrift.protocol.TProtocol prot, ParquetThriftCompat struct) throws org.apache.thrift.TException {
+      TTupleProtocol iprot = (TTupleProtocol) prot;
+      struct.boolColumn = iprot.readBool();
+      struct.setBoolColumnIsSet(true);
+      struct.byteColumn = iprot.readByte();
+      struct.setByteColumnIsSet(true);
+      struct.shortColumn = iprot.readI16();
+      struct.setShortColumnIsSet(true);
+      struct.intColumn = iprot.readI32();
+      struct.setIntColumnIsSet(true);
+      struct.longColumn = iprot.readI64();
+      struct.setLongColumnIsSet(true);
+      struct.doubleColumn = iprot.readDouble();
+      struct.setDoubleColumnIsSet(true);
+      struct.binaryColumn = iprot.readBinary();
+      struct.setBinaryColumnIsSet(true);
+      struct.stringColumn = iprot.readString();
+      struct.setStringColumnIsSet(true);
+      struct.enumColumn = org.apache.spark.sql.parquet.test.thrift.Suit.findByValue(iprot.readI32());
+      struct.setEnumColumnIsSet(true);
+      {
+        org.apache.thrift.protocol.TList _list35 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
+        struct.stringsColumn = new ArrayList<String>(_list35.size);
+        String _elem36;
+        for (int _i37 = 0; _i37 < _list35.size; ++_i37)
+        {
+          _elem36 = iprot.readString();
+          struct.stringsColumn.add(_elem36);
+        }
+      }
+      struct.setStringsColumnIsSet(true);
+      {
+        org.apache.thrift.protocol.TSet _set38 = new org.apache.thrift.protocol.TSet(org.apache.thrift.protocol.TType.I32, iprot.readI32());
+        struct.intSetColumn = new HashSet<Integer>(2*_set38.size);
+        int _elem39;
+        for (int _i40 = 0; _i40 < _set38.size; ++_i40)
+        {
+          _elem39 = iprot.readI32();
+          struct.intSetColumn.add(_elem39);
+        }
+      }
+      struct.setIntSetColumnIsSet(true);
+      {
+        org.apache.thrift.protocol.TMap _map41 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.STRING, iprot.readI32());
+        struct.intToStringColumn = new HashMap<Integer,String>(2*_map41.size);
+        int _key42;
+        String _val43;
+        for (int _i44 = 0; _i44 < _map41.size; ++_i44)
+        {
+          _key42 = iprot.readI32();
+          _val43 = iprot.readString();
+          struct.intToStringColumn.put(_key42, _val43);
+        }
+      }
+      struct.setIntToStringColumnIsSet(true);
+      {
+        org.apache.thrift.protocol.TMap _map45 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
+        struct.complexColumn = new HashMap<Integer,List<Nested>>(2*_map45.size);
+        int _key46;
+        List<Nested> _val47;
+        for (int _i48 = 0; _i48 < _map45.size; ++_i48)
+        {
+          _key46 = iprot.readI32();
+          {
+            org.apache.thrift.protocol.TList _list49 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
+            _val47 = new ArrayList<Nested>(_list49.size);
+            Nested _elem50;
+            for (int _i51 = 0; _i51 < _list49.size; ++_i51)
+            {
+              _elem50 = new Nested();
+              _elem50.read(iprot);
+              _val47.add(_elem50);
+            }
+          }
+          struct.complexColumn.put(_key46, _val47);
+        }
+      }
+      struct.setComplexColumnIsSet(true);
+      BitSet incoming = iprot.readBitSet(9);
+      if (incoming.get(0)) {
+        struct.maybeBoolColumn = iprot.readBool();
+        struct.setMaybeBoolColumnIsSet(true);
+      }
+      if (incoming.get(1)) {
+        struct.maybeByteColumn = iprot.readByte();
+        struct.setMaybeByteColumnIsSet(true);
+      }
+      if (incoming.get(2)) {
+        struct.maybeShortColumn = iprot.readI16();
+        struct.setMaybeShortColumnIsSet(true);
+      }
+      if (incoming.get(3)) {
+        struct.maybeIntColumn = iprot.readI32();
+        struct.setMaybeIntColumnIsSet(true);
+      }
+      if (incoming.get(4)) {
+        struct.maybeLongColumn = iprot.readI64();
+        struct.setMaybeLongColumnIsSet(true);
+      }
+      if (incoming.get(5)) {
+        struct.maybeDoubleColumn = iprot.readDouble();
+        struct.setMaybeDoubleColumnIsSet(true);
+      }
+      if (incoming.get(6)) {
+        struct.maybeBinaryColumn = iprot.readBinary();
+        struct.setMaybeBinaryColumnIsSet(true);
+      }
+      if (incoming.get(7)) {
+        struct.maybeStringColumn = iprot.readString();
+        struct.setMaybeStringColumnIsSet(true);
+      }
+      if (incoming.get(8)) {
+        struct.maybeEnumColumn = org.apache.spark.sql.parquet.test.thrift.Suit.findByValue(iprot.readI32());
+        struct.setMaybeEnumColumnIsSet(true);
+      }
+    }
+  }
+
+}
+
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java
new file mode 100644
index 0000000000000..5315c6aae9372
--- /dev/null
+++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java
@@ -0,0 +1,51 @@
+/**
+ * Autogenerated by Thrift Compiler (0.9.2)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ *  @generated
+ */
+package org.apache.spark.sql.parquet.test.thrift;
+
+
+import java.util.Map;
+import java.util.HashMap;
+import org.apache.thrift.TEnum;
+
+public enum Suit implements org.apache.thrift.TEnum {
+  SPADES(0),
+  HEARTS(1),
+  DIAMONDS(2),
+  CLUBS(3);
+
+  private final int value;
+
+  private Suit(int value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the integer value of this enum value, as defined in the Thrift IDL.
+   */
+  public int getValue() {
+    return value;
+  }
+
+  /**
+   * Find a the enum type by its integer value, as defined in the Thrift IDL.
+   * @return null if the value is not found.
+   */
+  public static Suit findByValue(int value) { 
+    switch (value) {
+      case 0:
+        return SPADES;
+      case 1:
+        return HEARTS;
+      case 2:
+        return DIAMONDS;
+      case 3:
+        return CLUBS;
+      default:
+        return null;
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala
new file mode 100644
index 0000000000000..bfa427349ff6a
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.parquet
+
+import java.nio.ByteBuffer
+import java.util.{List => JList, Map => JMap}
+
+import scala.collection.JavaConversions._
+
+import org.apache.hadoop.fs.Path
+import org.apache.parquet.avro.AvroParquetWriter
+
+import org.apache.spark.sql.parquet.test.avro.{Nested, ParquetAvroCompat}
+import org.apache.spark.sql.test.TestSQLContext
+import org.apache.spark.sql.{Row, SQLContext}
+
+class ParquetAvroCompatibilitySuite extends ParquetCompatibilityTest {
+  import ParquetCompatibilityTest._
+
+  override val sqlContext: SQLContext = TestSQLContext
+
+  override protected def beforeAll(): Unit = {
+    super.beforeAll()
+
+    val writer =
+      new AvroParquetWriter[ParquetAvroCompat](
+        new Path(parquetStore.getCanonicalPath),
+        ParquetAvroCompat.getClassSchema)
+
+    (0 until 10).foreach(i => writer.write(makeParquetAvroCompat(i)))
+    writer.close()
+  }
+
+  test("Read Parquet file generated by parquet-avro") {
+    logInfo(
+      s"""Schema of the Parquet file written by parquet-avro:
+         |${readParquetSchema(parquetStore.getCanonicalPath)}
+       """.stripMargin)
+
+    checkAnswer(sqlContext.read.parquet(parquetStore.getCanonicalPath), (0 until 10).map { i =>
+      def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i)
+
+      Row(
+        i % 2 == 0,
+        i,
+        i.toLong * 10,
+        i.toFloat + 0.1f,
+        i.toDouble + 0.2d,
+        s"val_$i".getBytes,
+        s"val_$i",
+
+        nullable(i % 2 == 0: java.lang.Boolean),
+        nullable(i: Integer),
+        nullable(i.toLong: java.lang.Long),
+        nullable(i.toFloat + 0.1f: java.lang.Float),
+        nullable(i.toDouble + 0.2d: java.lang.Double),
+        nullable(s"val_$i".getBytes),
+        nullable(s"val_$i"),
+
+        Seq.tabulate(3)(n => s"arr_${i + n}"),
+        Seq.tabulate(3)(n => n.toString -> (i + n: Integer)).toMap,
+        Seq.tabulate(3) { n =>
+          (i + n).toString -> Seq.tabulate(3) { m =>
+            Row(Seq.tabulate(3)(j => i + j + m), s"val_${i + m}")
+          }
+        }.toMap)
+    })
+  }
+
+  def makeParquetAvroCompat(i: Int): ParquetAvroCompat = {
+    def nullable[T <: AnyRef] = makeNullable[T](i) _
+
+    def makeComplexColumn(i: Int): JMap[String, JList[Nested]] = {
+      mapAsJavaMap(Seq.tabulate(3) { n =>
+        (i + n).toString -> seqAsJavaList(Seq.tabulate(3) { m =>
+          Nested
+            .newBuilder()
+            .setNestedIntsColumn(seqAsJavaList(Seq.tabulate(3)(j => i + j + m)))
+            .setNestedStringColumn(s"val_${i + m}")
+            .build()
+        })
+      }.toMap)
+    }
+
+    ParquetAvroCompat
+      .newBuilder()
+      .setBoolColumn(i % 2 == 0)
+      .setIntColumn(i)
+      .setLongColumn(i.toLong * 10)
+      .setFloatColumn(i.toFloat + 0.1f)
+      .setDoubleColumn(i.toDouble + 0.2d)
+      .setBinaryColumn(ByteBuffer.wrap(s"val_$i".getBytes))
+      .setStringColumn(s"val_$i")
+
+      .setMaybeBoolColumn(nullable(i % 2 == 0: java.lang.Boolean))
+      .setMaybeIntColumn(nullable(i: Integer))
+      .setMaybeLongColumn(nullable(i.toLong: java.lang.Long))
+      .setMaybeFloatColumn(nullable(i.toFloat + 0.1f: java.lang.Float))
+      .setMaybeDoubleColumn(nullable(i.toDouble + 0.2d: java.lang.Double))
+      .setMaybeBinaryColumn(nullable(ByteBuffer.wrap(s"val_$i".getBytes)))
+      .setMaybeStringColumn(nullable(s"val_$i"))
+
+      .setStringsColumn(Seq.tabulate(3)(n => s"arr_${i + n}"))
+      .setStringToIntColumn(
+        mapAsJavaMap(Seq.tabulate(3)(n => n.toString -> (i + n: Integer)).toMap))
+      .setComplexColumn(makeComplexColumn(i))
+
+      .build()
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala
new file mode 100644
index 0000000000000..b4cdfd9e98f6f
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.parquet
+import java.io.File
+
+import scala.collection.JavaConversions._
+
+import org.apache.hadoop.fs.Path
+import org.apache.parquet.hadoop.ParquetFileReader
+import org.apache.parquet.schema.MessageType
+import org.scalatest.BeforeAndAfterAll
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.util.Utils
+
+abstract class ParquetCompatibilityTest extends QueryTest with ParquetTest with BeforeAndAfterAll {
+  protected var parquetStore: File = _
+
+  override protected def beforeAll(): Unit = {
+    parquetStore = Utils.createTempDir(namePrefix = "parquet-compat_")
+    parquetStore.delete()
+  }
+
+  override protected def afterAll(): Unit = {
+    Utils.deleteRecursively(parquetStore)
+  }
+
+  def readParquetSchema(path: String): MessageType = {
+    val fsPath = new Path(path)
+    val fs = fsPath.getFileSystem(configuration)
+    val parquetFiles = fs.listStatus(fsPath).toSeq.filterNot(_.getPath.getName.startsWith("_"))
+    val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true)
+    footers.head.getParquetMetadata.getFileMetaData.getSchema
+  }
+}
+
+object ParquetCompatibilityTest {
+  def makeNullable[T <: AnyRef](i: Int)(f: => T): T = {
+    if (i % 3 == 0) null.asInstanceOf[T] else f
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala
new file mode 100644
index 0000000000000..d22066cabc567
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.parquet
+
+import java.nio.ByteBuffer
+import java.util.{List => JList, Map => JMap}
+
+import scala.collection.JavaConversions._
+
+import org.apache.hadoop.fs.Path
+import org.apache.parquet.hadoop.metadata.CompressionCodecName
+import org.apache.parquet.thrift.ThriftParquetWriter
+
+import org.apache.spark.sql.parquet.test.thrift.{Nested, ParquetThriftCompat, Suit}
+import org.apache.spark.sql.test.TestSQLContext
+import org.apache.spark.sql.{Row, SQLContext}
+
+class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest {
+  import ParquetCompatibilityTest._
+
+  override val sqlContext: SQLContext = TestSQLContext
+
+  override protected def beforeAll(): Unit = {
+    super.beforeAll()
+
+    val writer =
+      new ThriftParquetWriter[ParquetThriftCompat](
+        new Path(parquetStore.getCanonicalPath),
+        classOf[ParquetThriftCompat],
+        CompressionCodecName.SNAPPY)
+
+    (0 until 10).foreach(i => writer.write(makeParquetThriftCompat(i)))
+    writer.close()
+  }
+
+  test("Read Parquet file generated by parquet-thrift") {
+    logInfo(
+      s"""Schema of the Parquet file written by parquet-thrift:
+         |${readParquetSchema(parquetStore.getCanonicalPath)}
+       """.stripMargin)
+
+    checkAnswer(sqlContext.read.parquet(parquetStore.getCanonicalPath), (0 until 10).map { i =>
+      def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i)
+
+      Row(
+        i % 2 == 0,
+        i.toByte,
+        (i + 1).toShort,
+        i + 2,
+        i.toLong * 10,
+        i.toDouble + 0.2d,
+        // Thrift `BINARY` values are actually unencoded `STRING` values, and thus are always
+        // treated as `BINARY (UTF8)` in parquet-thrift, since parquet-thrift always assume
+        // Thrift `STRING`s are encoded using UTF-8.
+        s"val_$i",
+        s"val_$i",
+        // Thrift ENUM values are converted to Parquet binaries containing UTF-8 strings
+        Suit.values()(i % 4).name(),
+
+        nullable(i % 2 == 0: java.lang.Boolean),
+        nullable(i.toByte: java.lang.Byte),
+        nullable((i + 1).toShort: java.lang.Short),
+        nullable(i + 2: Integer),
+        nullable((i * 10).toLong: java.lang.Long),
+        nullable(i.toDouble + 0.2d: java.lang.Double),
+        nullable(s"val_$i"),
+        nullable(s"val_$i"),
+        nullable(Suit.values()(i % 4).name()),
+
+        Seq.tabulate(3)(n => s"arr_${i + n}"),
+        // Thrift `SET`s are converted to Parquet `LIST`s
+        Seq(i),
+        Seq.tabulate(3)(n => (i + n: Integer) -> s"val_${i + n}").toMap,
+        Seq.tabulate(3) { n =>
+          (i + n) -> Seq.tabulate(3) { m =>
+            Row(Seq.tabulate(3)(j => i + j + m), s"val_${i + m}")
+          }
+        }.toMap)
+    })
+  }
+
+  def makeParquetThriftCompat(i: Int): ParquetThriftCompat = {
+    def makeComplexColumn(i: Int): JMap[Integer, JList[Nested]] = {
+      mapAsJavaMap(Seq.tabulate(3) { n =>
+        (i + n: Integer) -> seqAsJavaList(Seq.tabulate(3) { m =>
+          new Nested(
+            seqAsJavaList(Seq.tabulate(3)(j => i + j + m)),
+            s"val_${i + m}")
+        })
+      }.toMap)
+    }
+
+    val value =
+      new ParquetThriftCompat(
+        i % 2 == 0,
+        i.toByte,
+        (i + 1).toShort,
+        i + 2,
+        i.toLong * 10,
+        i.toDouble + 0.2d,
+        ByteBuffer.wrap(s"val_$i".getBytes),
+        s"val_$i",
+        Suit.values()(i % 4),
+
+        seqAsJavaList(Seq.tabulate(3)(n => s"arr_${i + n}")),
+        setAsJavaSet(Set(i)),
+        mapAsJavaMap(Seq.tabulate(3)(n => (i + n: Integer) -> s"val_${i + n}").toMap),
+        makeComplexColumn(i))
+
+    if (i % 3 == 0) {
+      value
+    } else {
+      value
+        .setMaybeBoolColumn(i % 2 == 0)
+        .setMaybeByteColumn(i.toByte)
+        .setMaybeShortColumn((i + 1).toShort)
+        .setMaybeIntColumn(i + 2)
+        .setMaybeLongColumn(i.toLong * 10)
+        .setMaybeDoubleColumn(i.toDouble + 0.2d)
+        .setMaybeBinaryColumn(ByteBuffer.wrap(s"val_$i".getBytes))
+        .setMaybeStringColumn(s"val_$i")
+        .setMaybeEnumColumn(Suit.values()(i % 4))
+    }
+  }
+}
diff --git a/sql/core/src/test/scripts/gen-code.sh b/sql/core/src/test/scripts/gen-code.sh
new file mode 100755
index 0000000000000..5d8d8ad08555c
--- /dev/null
+++ b/sql/core/src/test/scripts/gen-code.sh
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+cd $(dirname $0)/..
+BASEDIR=`pwd`
+cd -
+
+rm -rf $BASEDIR/gen-java
+mkdir -p $BASEDIR/gen-java
+
+thrift\
+    --gen java\
+    -out $BASEDIR/gen-java\
+    $BASEDIR/thrift/parquet-compat.thrift
+
+avro-tools idl $BASEDIR/avro/parquet-compat.avdl > $BASEDIR/avro/parquet-compat.avpr
+avro-tools compile -string protocol $BASEDIR/avro/parquet-compat.avpr $BASEDIR/gen-java
diff --git a/sql/core/src/test/thrift/parquet-compat.thrift b/sql/core/src/test/thrift/parquet-compat.thrift
new file mode 100644
index 0000000000000..fa5ed8c62306a
--- /dev/null
+++ b/sql/core/src/test/thrift/parquet-compat.thrift
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace java org.apache.spark.sql.parquet.test.thrift
+
+enum Suit {
+    SPADES,
+    HEARTS,
+    DIAMONDS,
+    CLUBS
+}
+
+struct Nested {
+    1: required list<i32> nestedIntsColumn;
+    2: required string nestedStringColumn;
+}
+
+/**
+ * This is a test struct for testing parquet-thrift compatibility.
+ */
+struct ParquetThriftCompat {
+    1: required bool boolColumn;
+    2: required byte byteColumn;
+    3: required i16 shortColumn;
+    4: required i32 intColumn;
+    5: required i64 longColumn;
+    6: required double doubleColumn;
+    7: required binary binaryColumn;
+    8: required string stringColumn;
+    9: required Suit enumColumn
+
+    10: optional bool maybeBoolColumn;
+    11: optional byte maybeByteColumn;
+    12: optional i16 maybeShortColumn;
+    13: optional i32 maybeIntColumn;
+    14: optional i64 maybeLongColumn;
+    15: optional double maybeDoubleColumn;
+    16: optional binary maybeBinaryColumn;
+    17: optional string maybeStringColumn;
+    18: optional Suit maybeEnumColumn;
+
+    19: required list<string> stringsColumn;
+    20: required set<i32> intSetColumn;
+    21: required map<i32, string> intToStringColumn;
+    22: required map<i32, list<Nested>> complexColumn;
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala
new file mode 100644
index 0000000000000..bb5f1febe9ad4
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.parquet.ParquetCompatibilityTest
+import org.apache.spark.sql.{Row, SQLConf, SQLContext}
+
+class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest {
+  import ParquetCompatibilityTest.makeNullable
+
+  override val sqlContext: SQLContext = TestHive
+
+  override protected def beforeAll(): Unit = {
+    super.beforeAll()
+
+    withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "false") {
+      withTempTable("data") {
+        sqlContext.sql(
+          s"""CREATE TABLE parquet_compat(
+             |  bool_column BOOLEAN,
+             |  byte_column TINYINT,
+             |  short_column SMALLINT,
+             |  int_column INT,
+             |  long_column BIGINT,
+             |  float_column FLOAT,
+             |  double_column DOUBLE,
+             |
+             |  strings_column ARRAY<STRING>,
+             |  int_to_string_column MAP<INT, STRING>
+             |)
+             |STORED AS PARQUET
+             |LOCATION '${parquetStore.getCanonicalPath}'
+           """.stripMargin)
+
+        val schema = sqlContext.table("parquet_compat").schema
+        val rowRDD = sqlContext.sparkContext.parallelize(makeRows).coalesce(1)
+        sqlContext.createDataFrame(rowRDD, schema).registerTempTable("data")
+        sqlContext.sql("INSERT INTO TABLE parquet_compat SELECT * FROM data")
+      }
+    }
+  }
+
+  override protected def afterAll(): Unit = {
+    sqlContext.sql("DROP TABLE parquet_compat")
+  }
+
+  test("Read Parquet file generated by parquet-hive") {
+    logInfo(
+      s"""Schema of the Parquet file written by parquet-hive:
+         |${readParquetSchema(parquetStore.getCanonicalPath)}
+       """.stripMargin)
+
+    // Unfortunately parquet-hive doesn't add `UTF8` annotation to BINARY when writing strings.
+    // Have to assume all BINARY values are strings here.
+    withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING.key -> "true") {
+      checkAnswer(sqlContext.read.parquet(parquetStore.getCanonicalPath), makeRows)
+    }
+  }
+
+  def makeRows: Seq[Row] = {
+    (0 until 10).map { i =>
+      def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i)
+
+      Row(
+        nullable(i % 2 == 0: java.lang.Boolean),
+        nullable(i.toByte: java.lang.Byte),
+        nullable((i + 1).toShort: java.lang.Short),
+        nullable(i + 2: Integer),
+        nullable(i.toLong * 10: java.lang.Long),
+        nullable(i.toFloat + 0.1f: java.lang.Float),
+        nullable(i.toDouble + 0.2d: java.lang.Double),
+        nullable(Seq.tabulate(3)(n => s"arr_${i + n}")),
+        nullable(Seq.tabulate(3)(n => (i + n: Integer) -> s"val_${i + n}").toMap))
+    }
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index c2e09800933b5..9d79a4b007d66 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -21,14 +21,16 @@ import java.io.File
 
 import org.scalatest.BeforeAndAfterAll
 
+import org.apache.spark.sql._
 import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD}
 import org.apache.spark.sql.hive.execution.HiveTableScan
+import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.parquet.{ParquetRelation2, ParquetTableScan}
 import org.apache.spark.sql.sources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation}
+import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{DataFrame, QueryTest, Row, SQLConf, SaveMode}
 import org.apache.spark.util.Utils
 
 // The data where the partitioning key exists only in the directory structure.
@@ -685,6 +687,31 @@ class ParquetSourceSuiteBase extends ParquetPartitioningTest {
 
     sql("drop table spark_6016_fix")
   }
+
+  test("SPARK-8811: compatibility with array of struct in Hive") {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+
+      withTable("array_of_struct") {
+        val conf = Seq(
+          HiveContext.CONVERT_METASTORE_PARQUET.key -> "false",
+          SQLConf.PARQUET_BINARY_AS_STRING.key -> "true",
+          SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key -> "true")
+
+        withSQLConf(conf: _*) {
+          sql(
+            s"""CREATE TABLE array_of_struct
+               |STORED AS PARQUET LOCATION '$path'
+               |AS SELECT '1st', '2nd', ARRAY(NAMED_STRUCT('a', 'val_a', 'b', 'val_b'))
+             """.stripMargin)
+
+          checkAnswer(
+            sqlContext.read.parquet(path),
+            Row("1st", "2nd", Seq(Row("val_a", "val_b"))))
+        }
+      }
+    }
+  }
 }
 
 class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase {
@@ -762,7 +789,9 @@ class ParquetDataSourceOffSourceSuite extends ParquetSourceSuiteBase {
 /**
  * A collection of tests for parquet data with various forms of partitioning.
  */
-abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll {
+abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with BeforeAndAfterAll {
+  override def sqlContext: SQLContext = TestHive
+
   var partitionedTableDir: File = null
   var normalTableDir: File = null
   var partitionedTableDirWithKey: File = null

From 381cb161ba4e3a30f2da3c4ef4ee19869d51f101 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 8 Jul 2015 16:21:28 -0700
Subject: [PATCH 065/549] [SPARK-8068] [MLLIB] Add confusionMatrix method at
 class MulticlassMetrics in pyspark/mllib

Add confusionMatrix method at class MulticlassMetrics in pyspark/mllib

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7286 from yanboliang/spark-8068 and squashes the following commits:

6109fe1 [Yanbo Liang] Add confusionMatrix method at class MulticlassMetrics in pyspark/mllib
---
 python/pyspark/mllib/evaluation.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index c5cf3a4e7ff22..f21403707e12a 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -152,6 +152,10 @@ class MulticlassMetrics(JavaModelWrapper):
     >>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
     ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)])
     >>> metrics = MulticlassMetrics(predictionAndLabels)
+    >>> metrics.confusionMatrix().toArray()
+    array([[ 2.,  1.,  1.],
+           [ 1.,  3.,  0.],
+           [ 0.,  0.,  1.]])
     >>> metrics.falsePositiveRate(0.0)
     0.2...
     >>> metrics.precision(1.0)
@@ -186,6 +190,13 @@ def __init__(self, predictionAndLabels):
         java_model = java_class(df._jdf)
         super(MulticlassMetrics, self).__init__(java_model)
 
+    def confusionMatrix(self):
+        """
+        Returns confusion matrix: predicted classes are in columns,
+        they are ordered by class label ascending, as in "labels".
+        """
+        return self.call("confusionMatrix")
+
     def truePositiveRate(self, label):
         """
         Returns true positive rate for a given label (category).

From 8c32b2e870c7c250a63e838718df833edf6dea07 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Wed, 8 Jul 2015 16:27:11 -0700
Subject: [PATCH 066/549] [SPARK-8877] [MLLIB] Public API for association rule
 generation

Adds FPGrowth.generateAssociationRules to public API for generating association rules after mining frequent itemsets.

Author: Feynman Liang <fliang@databricks.com>

Closes #7271 from feynmanliang/SPARK-8877 and squashes the following commits:

83b8baf [Feynman Liang] Add API Doc
867abff [Feynman Liang] Add FPGrowth.generateAssociationRules and change access modifiers for AssociationRules
---
 .../spark/mllib/fpm/AssociationRules.scala    |  5 ++-
 .../org/apache/spark/mllib/fpm/FPGrowth.scala | 11 ++++-
 .../spark/mllib/fpm/FPGrowthSuite.scala       | 42 +++++++++++++++++++
 3 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
index 4a0f842f3338d..7e2bbfe31c1b7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
@@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD
  * association rules which have a single item as the consequent.
  */
 @Experimental
-class AssociationRules private (
+class AssociationRules private[fpm] (
     private var minConfidence: Double) extends Logging with Serializable {
 
   /**
@@ -45,6 +45,7 @@ class AssociationRules private (
    * Sets the minimal confidence (default: `0.8`).
    */
   def setMinConfidence(minConfidence: Double): this.type = {
+    require(minConfidence >= 0.0 && minConfidence <= 1.0)
     this.minConfidence = minConfidence
     this
   }
@@ -91,7 +92,7 @@ object AssociationRules {
    * @tparam Item item type
    */
   @Experimental
-  class Rule[Item] private[mllib] (
+  class Rule[Item] private[fpm] (
       val antecedent: Array[Item],
       val consequent: Array[Item],
       freqUnion: Double,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index 0da59e812d5f9..9cb9a00dbd9c7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -40,7 +40,16 @@ import org.apache.spark.storage.StorageLevel
  * @tparam Item item type
  */
 @Experimental
-class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable
+class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable {
+  /**
+   * Generates association rules for the [[Item]]s in [[freqItemsets]].
+   * @param confidence minimal confidence of the rules produced
+   */
+  def generateAssociationRules(confidence: Double): RDD[AssociationRules.Rule[Item]] = {
+    val associationRules = new AssociationRules(confidence)
+    associationRules.run(freqItemsets)
+  }
+}
 
 /**
  * :: Experimental ::
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
index ddc296a428907..4a9bfdb348d9f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
@@ -132,6 +132,48 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(model1.freqItemsets.count() === 625)
   }
 
+  test("FP-Growth String type association rule generation") {
+    val transactions = Seq(
+      "r z h k p",
+      "z y x w v u t s",
+      "s x o n r",
+      "x z y m t s q e",
+      "z",
+      "x z y r q t p")
+      .map(_.split(" "))
+    val rdd = sc.parallelize(transactions, 2).cache()
+
+    /* Verify results using the `R` code:
+       transactions = as(sapply(
+         list("r z h k p",
+              "z y x w v u t s",
+              "s x o n r",
+              "x z y m t s q e",
+              "z",
+              "x z y r q t p"),
+         FUN=function(x) strsplit(x," ",fixed=TRUE)),
+         "transactions")
+       ars = apriori(transactions,
+                     parameter = list(support = 0.0, confidence = 0.5, target="rules", minlen=2))
+       arsDF = as(ars, "data.frame")
+       arsDF$support = arsDF$support * length(transactions)
+       names(arsDF)[names(arsDF) == "support"] = "freq"
+       > nrow(arsDF)
+       [1] 23
+       > sum(arsDF$confidence == 1)
+       [1] 23
+     */
+    val rules = (new FPGrowth())
+      .setMinSupport(0.5)
+      .setNumPartitions(2)
+      .run(rdd)
+      .generateAssociationRules(0.9)
+      .collect()
+
+    assert(rules.size === 23)
+    assert(rules.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
+  }
+
   test("FP-Growth using Int type") {
     val transactions = Seq(
       "1 2 3",

From f472b8cdc00839780dc79be0bbe53a098cde230c Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Wed, 8 Jul 2015 16:32:00 -0700
Subject: [PATCH 067/549] [SPARK-5016] [MLLIB] Distribute GMM mixture
 components to executors

Distribute expensive portions of computation for Gaussian mixture components (in particular, pre-computation of `MultivariateGaussian.rootSigmaInv`, the inverse covariance matrix and covariance determinant) across executors. Repost of PR#4654.

Notes for reviewers:
 * What should be the policy for when to distribute computation. Always? When numClusters > threshold? User-specified param?

TODO:
 * Performance testing and comparison for large number of clusters

Author: Feynman Liang <fliang@databricks.com>

Closes #7166 from feynmanliang/GMM_parallel_mixtures and squashes the following commits:

4f351fa [Feynman Liang] Update heuristic and scaladoc
5ea947e [Feynman Liang] Fix parallelization logic
00eb7db [Feynman Liang] Add helper method for GMM's M step, remove distributeGaussians flag
e7c8127 [Feynman Liang] Add distributeGaussians flag and tests
1da3c7f [Feynman Liang] Distribute mixtures
---
 .../mllib/clustering/GaussianMixture.scala    | 44 +++++++++++++++----
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index fc509d2ba1470..e459367333d26 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -140,6 +140,10 @@ class GaussianMixture private (
     // Get length of the input vectors
     val d = breezeData.first().length
 
+    // Heuristic to distribute the computation of the [[MultivariateGaussian]]s, approximately when
+    // d > 25 except for when k is very small
+    val distributeGaussians = ((k - 1.0) / k) * d > 25
+
     // Determine initial weights and corresponding Gaussians.
     // If the user supplied an initial GMM, we use those values, otherwise
     // we start with uniform weights, a random mean from the data, and
@@ -171,14 +175,25 @@ class GaussianMixture private (
       // Create new distributions based on the partial assignments
       // (often referred to as the "M" step in literature)
       val sumWeights = sums.weights.sum
-      var i = 0
-      while (i < k) {
-        val mu = sums.means(i) / sums.weights(i)
-        BLAS.syr(-sums.weights(i), Vectors.fromBreeze(mu),
-          Matrices.fromBreeze(sums.sigmas(i)).asInstanceOf[DenseMatrix])
-        weights(i) = sums.weights(i) / sumWeights
-        gaussians(i) = new MultivariateGaussian(mu, sums.sigmas(i) / sums.weights(i))
-        i = i + 1
+
+      if (distributeGaussians) {
+        val numPartitions = math.min(k, 1024)
+        val tuples =
+          Seq.tabulate(k)(i => (sums.means(i), sums.sigmas(i), sums.weights(i)))
+        val (ws, gs) = sc.parallelize(tuples, numPartitions).map { case (mean, sigma, weight) =>
+          updateWeightsAndGaussians(mean, sigma, weight, sumWeights)
+        }.collect.unzip
+        Array.copy(ws, 0, weights, 0, ws.length)
+        Array.copy(gs, 0, gaussians, 0, gs.length)
+      } else {
+        var i = 0
+        while (i < k) {
+          val (weight, gaussian) =
+            updateWeightsAndGaussians(sums.means(i), sums.sigmas(i), sums.weights(i), sumWeights)
+          weights(i) = weight
+          gaussians(i) = gaussian
+          i = i + 1
+        }
       }
 
       llhp = llh // current becomes previous
@@ -192,6 +207,19 @@ class GaussianMixture private (
   /** Java-friendly version of [[run()]] */
   def run(data: JavaRDD[Vector]): GaussianMixtureModel = run(data.rdd)
 
+  private def updateWeightsAndGaussians(
+      mean: BDV[Double],
+      sigma: BreezeMatrix[Double],
+      weight: Double,
+      sumWeights: Double): (Double, MultivariateGaussian) = {
+    val mu = (mean /= weight)
+    BLAS.syr(-weight, Vectors.fromBreeze(mu),
+      Matrices.fromBreeze(sigma).asInstanceOf[DenseMatrix])
+    val newWeight = weight / sumWeights
+    val newGaussian = new MultivariateGaussian(mu, sigma / weight)
+    (newWeight, newGaussian)
+  }
+
   /** Average of dense breeze vectors */
   private def vectorMean(x: IndexedSeq[BV[Double]]): BDV[Double] = {
     val v = BDV.zeros[Double](x(0).length)

From 2a4f88b6c16f2991e63b17c0e103bcd79f04dbbc Mon Sep 17 00:00:00 2001
From: Kousuke Saruta <sarutak@oss.nttdata.co.jp>
Date: Wed, 8 Jul 2015 18:09:39 -0700
Subject: [PATCH 068/549] [SPARK-8914][SQL] Remove RDDApi

As rxin suggested in #7298 , we should consider to remove `RDDApi`.

Author: Kousuke Saruta <sarutak@oss.nttdata.co.jp>

Closes #7302 from sarutak/remove-rddapi and squashes the following commits:

e495d35 [Kousuke Saruta] Fixed mima
cb7ebb9 [Kousuke Saruta] Removed overriding RDDApi
---
 project/MimaExcludes.scala                    |  5 ++
 .../org/apache/spark/sql/DataFrame.scala      | 39 ++++++-----
 .../scala/org/apache/spark/sql/RDDApi.scala   | 67 -------------------
 3 files changed, 24 insertions(+), 87 deletions(-)
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/RDDApi.scala

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 7346d804632bc..57a86bf8deb64 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -70,7 +70,12 @@ object MimaExcludes {
               "org.apache.spark.mllib.linalg.Matrix.numNonzeros"),
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.mllib.linalg.Matrix.numActives")
+          ) ++ Seq(
+            // SPARK-8914 Remove RDDApi
+            ProblemFilters.exclude[MissingClassProblem](
+            "org.apache.spark.sql.RDDApi")
           )
+
         case v if v.startsWith("1.4") =>
           Seq(
             MimaBuild.excludeSparkPackage("deploy"),
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index f33e19a0cb7dd..eeefc85255d14 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -115,8 +115,7 @@ private[sql] object DataFrame {
 @Experimental
 class DataFrame private[sql](
     @transient val sqlContext: SQLContext,
-    @DeveloperApi @transient val queryExecution: SQLContext#QueryExecution)
-  extends RDDApi[Row] with Serializable {
+    @DeveloperApi @transient val queryExecution: SQLContext#QueryExecution) extends Serializable {
 
   /**
    * A constructor that automatically analyzes the logical plan.
@@ -1320,14 +1319,14 @@ class DataFrame private[sql](
    * @group action
    * @since 1.3.0
    */
-  override def first(): Row = head()
+  def first(): Row = head()
 
   /**
    * Returns a new RDD by applying a function to all rows of this DataFrame.
    * @group rdd
    * @since 1.3.0
    */
-  override def map[R: ClassTag](f: Row => R): RDD[R] = rdd.map(f)
+  def map[R: ClassTag](f: Row => R): RDD[R] = rdd.map(f)
 
   /**
    * Returns a new RDD by first applying a function to all rows of this [[DataFrame]],
@@ -1335,14 +1334,14 @@ class DataFrame private[sql](
    * @group rdd
    * @since 1.3.0
    */
-  override def flatMap[R: ClassTag](f: Row => TraversableOnce[R]): RDD[R] = rdd.flatMap(f)
+  def flatMap[R: ClassTag](f: Row => TraversableOnce[R]): RDD[R] = rdd.flatMap(f)
 
   /**
    * Returns a new RDD by applying a function to each partition of this DataFrame.
    * @group rdd
    * @since 1.3.0
    */
-  override def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] = {
+  def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] = {
     rdd.mapPartitions(f)
   }
 
@@ -1351,49 +1350,49 @@ class DataFrame private[sql](
    * @group rdd
    * @since 1.3.0
    */
-  override def foreach(f: Row => Unit): Unit = rdd.foreach(f)
+  def foreach(f: Row => Unit): Unit = rdd.foreach(f)
 
   /**
    * Applies a function f to each partition of this [[DataFrame]].
    * @group rdd
    * @since 1.3.0
    */
-  override def foreachPartition(f: Iterator[Row] => Unit): Unit = rdd.foreachPartition(f)
+  def foreachPartition(f: Iterator[Row] => Unit): Unit = rdd.foreachPartition(f)
 
   /**
    * Returns the first `n` rows in the [[DataFrame]].
    * @group action
    * @since 1.3.0
    */
-  override def take(n: Int): Array[Row] = head(n)
+  def take(n: Int): Array[Row] = head(n)
 
   /**
    * Returns an array that contains all of [[Row]]s in this [[DataFrame]].
    * @group action
    * @since 1.3.0
    */
-  override def collect(): Array[Row] = queryExecution.executedPlan.executeCollect()
+  def collect(): Array[Row] = queryExecution.executedPlan.executeCollect()
 
   /**
    * Returns a Java list that contains all of [[Row]]s in this [[DataFrame]].
    * @group action
    * @since 1.3.0
    */
-  override def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(rdd.collect() : _*)
+  def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(rdd.collect() : _*)
 
   /**
    * Returns the number of rows in the [[DataFrame]].
    * @group action
    * @since 1.3.0
    */
-  override def count(): Long = groupBy().count().collect().head.getLong(0)
+  def count(): Long = groupBy().count().collect().head.getLong(0)
 
   /**
    * Returns a new [[DataFrame]] that has exactly `numPartitions` partitions.
    * @group rdd
    * @since 1.3.0
    */
-  override def repartition(numPartitions: Int): DataFrame = {
+  def repartition(numPartitions: Int): DataFrame = {
     Repartition(numPartitions, shuffle = true, logicalPlan)
   }
 
@@ -1405,7 +1404,7 @@ class DataFrame private[sql](
    * @group rdd
    * @since 1.4.0
    */
-  override def coalesce(numPartitions: Int): DataFrame = {
+  def coalesce(numPartitions: Int): DataFrame = {
     Repartition(numPartitions, shuffle = false, logicalPlan)
   }
 
@@ -1415,13 +1414,13 @@ class DataFrame private[sql](
    * @group dfops
    * @since 1.3.0
    */
-  override def distinct(): DataFrame = dropDuplicates()
+  def distinct(): DataFrame = dropDuplicates()
 
   /**
    * @group basic
    * @since 1.3.0
    */
-  override def persist(): this.type = {
+  def persist(): this.type = {
     sqlContext.cacheManager.cacheQuery(this)
     this
   }
@@ -1430,13 +1429,13 @@ class DataFrame private[sql](
    * @group basic
    * @since 1.3.0
    */
-  override def cache(): this.type = persist()
+  def cache(): this.type = persist()
 
   /**
    * @group basic
    * @since 1.3.0
    */
-  override def persist(newLevel: StorageLevel): this.type = {
+  def persist(newLevel: StorageLevel): this.type = {
     sqlContext.cacheManager.cacheQuery(this, None, newLevel)
     this
   }
@@ -1445,7 +1444,7 @@ class DataFrame private[sql](
    * @group basic
    * @since 1.3.0
    */
-  override def unpersist(blocking: Boolean): this.type = {
+  def unpersist(blocking: Boolean): this.type = {
     sqlContext.cacheManager.tryUncacheQuery(this, blocking)
     this
   }
@@ -1454,7 +1453,7 @@ class DataFrame private[sql](
    * @group basic
    * @since 1.3.0
    */
-  override def unpersist(): this.type = unpersist(blocking = false)
+  def unpersist(): this.type = unpersist(blocking = false)
 
   /////////////////////////////////////////////////////////////////////////////
   // I/O
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RDDApi.scala b/sql/core/src/main/scala/org/apache/spark/sql/RDDApi.scala
deleted file mode 100644
index 63dbab19947c0..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/RDDApi.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package org.apache.spark.sql
-
-import scala.reflect.ClassTag
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.storage.StorageLevel
-
-
-/**
- * An internal interface defining the RDD-like methods for [[DataFrame]].
- * Please use [[DataFrame]] directly, and do NOT use this.
- */
-private[sql] trait RDDApi[T] {
-
-  def cache(): this.type
-
-  def persist(): this.type
-
-  def persist(newLevel: StorageLevel): this.type
-
-  def unpersist(): this.type
-
-  def unpersist(blocking: Boolean): this.type
-
-  def map[R: ClassTag](f: T => R): RDD[R]
-
-  def flatMap[R: ClassTag](f: T => TraversableOnce[R]): RDD[R]
-
-  def mapPartitions[R: ClassTag](f: Iterator[T] => Iterator[R]): RDD[R]
-
-  def foreach(f: T => Unit): Unit
-
-  def foreachPartition(f: Iterator[T] => Unit): Unit
-
-  def take(n: Int): Array[T]
-
-  def collect(): Array[T]
-
-  def collectAsList(): java.util.List[T]
-
-  def count(): Long
-
-  def first(): T
-
-  def repartition(numPartitions: Int): DataFrame
-
-  def coalesce(numPartitions: Int): DataFrame
-
-  def distinct: DataFrame
-}

From 74d8d3d928cc9a7386b68588ac89ae042847d146 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Wed, 8 Jul 2015 18:22:53 -0700
Subject: [PATCH 069/549] [SPARK-8450] [SQL] [PYSARK] cleanup type converter
 for Python DataFrame

This PR fixes the converter for Python DataFrame, especially for DecimalType

Closes #7106

Author: Davies Liu <davies@databricks.com>

Closes #7131 from davies/decimal_python and squashes the following commits:

4d3c234 [Davies Liu] Merge branch 'master' of github.com:apache/spark into decimal_python
20531d6 [Davies Liu] Merge branch 'master' of github.com:apache/spark into decimal_python
7d73168 [Davies Liu] fix conflit
6cdd86a [Davies Liu] Merge branch 'master' of github.com:apache/spark into decimal_python
7104e97 [Davies Liu] improve type infer
9cd5a21 [Davies Liu] run python tests with SPARK_PREPEND_CLASSES
829a05b [Davies Liu] fix UDT in python
c99e8c5 [Davies Liu] fix mima
c46814a [Davies Liu] convert decimal for Python DataFrames
---
 .../apache/spark/mllib/linalg/Matrices.scala  | 10 +-
 .../apache/spark/mllib/linalg/Vectors.scala   | 16 +---
 project/MimaExcludes.scala                    |  5 +-
 python/pyspark/sql/tests.py                   | 13 +++
 python/pyspark/sql/types.py                   |  4 +
 python/run-tests.py                           |  3 +-
 .../org/apache/spark/sql/DataFrame.scala      |  4 +-
 .../org/apache/spark/sql/SQLContext.scala     | 28 +-----
 .../spark/sql/execution/pythonUDFs.scala      | 95 ++++++++++---------
 9 files changed, 84 insertions(+), 94 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 75e7004464af9..0df07663405a3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -24,9 +24,9 @@ import scala.collection.mutable.{ArrayBuilder => MArrayBuilder, HashSet => MHash
 import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, Matrix => BM}
 
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.sql.Row
-import org.apache.spark.sql.types._
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.types._
 
 /**
  * Trait for a local matrix.
@@ -147,7 +147,7 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
       ))
   }
 
-  override def serialize(obj: Any): Row = {
+  override def serialize(obj: Any): InternalRow = {
     val row = new GenericMutableRow(7)
     obj match {
       case sm: SparseMatrix =>
@@ -173,9 +173,7 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
 
   override def deserialize(datum: Any): Matrix = {
     datum match {
-      // TODO: something wrong with UDT serialization, should never happen.
-      case m: Matrix => m
-      case row: Row =>
+      case row: InternalRow =>
         require(row.length == 7,
           s"MatrixUDT.deserialize given row with length ${row.length} but requires length == 7")
         val tpe = row.getByte(0)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index c9c27425d2877..e048b01d92462 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -28,7 +28,7 @@ import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.mllib.util.NumericParser
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 import org.apache.spark.sql.types._
 
@@ -175,7 +175,7 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
       StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true)))
   }
 
-  override def serialize(obj: Any): Row = {
+  override def serialize(obj: Any): InternalRow = {
     obj match {
       case SparseVector(size, indices, values) =>
         val row = new GenericMutableRow(4)
@@ -191,17 +191,12 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
         row.setNullAt(2)
         row.update(3, values.toSeq)
         row
-      // TODO: There are bugs in UDT serialization because we don't have a clear separation between
-      // TODO: internal SQL types and language specific types (including UDT). UDT serialize and
-      // TODO: deserialize may get called twice. See SPARK-7186.
-      case row: Row =>
-        row
     }
   }
 
   override def deserialize(datum: Any): Vector = {
     datum match {
-      case row: Row =>
+      case row: InternalRow =>
         require(row.length == 4,
           s"VectorUDT.deserialize given row with length ${row.length} but requires length == 4")
         val tpe = row.getByte(0)
@@ -215,11 +210,6 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
             val values = row.getAs[Iterable[Double]](3).toArray
             new DenseVector(values)
         }
-      // TODO: There are bugs in UDT serialization because we don't have a clear separation between
-      // TODO: internal SQL types and language specific types (including UDT). UDT serialize and
-      // TODO: deserialize may get called twice. See SPARK-7186.
-      case v: Vector =>
-        v
     }
   }
 
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 57a86bf8deb64..821aadd477ef3 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -63,7 +63,10 @@ object MimaExcludes {
             // SQL execution is considered private.
             excludePackage("org.apache.spark.sql.execution"),
             // Parquet support is considered private.
-            excludePackage("org.apache.spark.sql.parquet")
+            excludePackage("org.apache.spark.sql.parquet"),
+            // local function inside a method
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.sql.SQLContext.org$apache$spark$sql$SQLContext$$needsConversion$1")
           ) ++ Seq(
             // SPARK-8479 Add numNonzeros and numActives to Matrix.
             ProblemFilters.exclude[MissingMethodProblem](
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 333378c7f1854..66827d48850d9 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -700,6 +700,19 @@ def test_time_with_timezone(self):
         self.assertTrue(now - now1 < datetime.timedelta(0.001))
         self.assertTrue(now - utcnow1 < datetime.timedelta(0.001))
 
+    def test_decimal(self):
+        from decimal import Decimal
+        schema = StructType([StructField("decimal", DecimalType(10, 5))])
+        df = self.sqlCtx.createDataFrame([(Decimal("3.14159"),)], schema)
+        row = df.select(df.decimal + 1).first()
+        self.assertEqual(row[0], Decimal("4.14159"))
+        tmpPath = tempfile.mkdtemp()
+        shutil.rmtree(tmpPath)
+        df.write.parquet(tmpPath)
+        df2 = self.sqlCtx.read.parquet(tmpPath)
+        row = df2.first()
+        self.assertEqual(row[0], Decimal("3.14159"))
+
     def test_dropna(self):
         schema = StructType([
             StructField("name", StringType(), True),
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 160df40d65cc1..7e64cb0b54dba 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -1069,6 +1069,10 @@ def _verify_type(obj, dataType):
     if obj is None:
         return
 
+    # StringType can work with any types
+    if isinstance(dataType, StringType):
+        return
+
     if isinstance(dataType, UserDefinedType):
         if not (hasattr(obj, '__UDT__') and obj.__UDT__ == dataType):
             raise ValueError("%r is not an instance of type %r" % (obj, dataType))
diff --git a/python/run-tests.py b/python/run-tests.py
index 7638854def2e8..cc560779373b3 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -72,7 +72,8 @@ def print_red(text):
 
 
 def run_individual_python_test(test_name, pyspark_python):
-    env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)}
+    env = dict(os.environ)
+    env.update({'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)})
     LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name)
     start_time = time.time()
     try:
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index eeefc85255d14..d9f987ae0252f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1549,8 +1549,8 @@ class DataFrame private[sql](
    * Converts a JavaRDD to a PythonRDD.
    */
   protected[sql] def javaToPython: JavaRDD[Array[Byte]] = {
-    val fieldTypes = schema.fields.map(_.dataType)
-    val jrdd = rdd.map(EvaluatePython.rowToArray(_, fieldTypes)).toJavaRDD()
+    val structType = schema  // capture it for closure
+    val jrdd = queryExecution.toRdd.map(EvaluatePython.toJava(_, structType)).toJavaRDD()
     SerDeUtil.javaToPython(jrdd)
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 079f31ab8fe6d..477dea9164726 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -1044,33 +1044,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
       rdd: RDD[Array[Any]],
       schema: StructType): DataFrame = {
 
-    def needsConversion(dataType: DataType): Boolean = dataType match {
-      case ByteType => true
-      case ShortType => true
-      case LongType => true
-      case FloatType => true
-      case DateType => true
-      case TimestampType => true
-      case StringType => true
-      case ArrayType(_, _) => true
-      case MapType(_, _, _) => true
-      case StructType(_) => true
-      case udt: UserDefinedType[_] => needsConversion(udt.sqlType)
-      case other => false
-    }
-
-    val convertedRdd = if (schema.fields.exists(f => needsConversion(f.dataType))) {
-      rdd.map(m => m.zip(schema.fields).map {
-        case (value, field) => EvaluatePython.fromJava(value, field.dataType)
-      })
-    } else {
-      rdd
-    }
-
-    val rowRdd = convertedRdd.mapPartitions { iter =>
-      iter.map { m => new GenericInternalRow(m): InternalRow}
-    }
-
+    val rowRdd = rdd.map(r => EvaluatePython.fromJava(r, schema).asInstanceOf[InternalRow])
     DataFrame(this, LogicalRDD(schema.toAttributes, rowRdd)(self))
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index 6946e798b71b0..1c8130b07c7fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -24,20 +24,19 @@ import scala.collection.JavaConverters._
 
 import net.razorvine.pickle.{Pickler, Unpickler}
 
-import org.apache.spark.{Accumulator, Logging => SparkLogging}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.api.python.{PythonBroadcast, PythonRDD}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.{Accumulator, Logging => SparkLogging}
 
 /**
  * A serialized version of a Python lambda function.  Suitable for use in a [[PythonRDD]].
@@ -125,59 +124,86 @@ object EvaluatePython {
     new EvaluatePython(udf, child, AttributeReference("pythonUDF", udf.dataType)())
 
   /**
-   * Helper for converting a Scala object to a java suitable for pyspark serialization.
+   * Helper for converting from Catalyst type to java type suitable for Pyrolite.
    */
   def toJava(obj: Any, dataType: DataType): Any = (obj, dataType) match {
     case (null, _) => null
 
-    case (row: Row, struct: StructType) =>
+    case (row: InternalRow, struct: StructType) =>
       val fields = struct.fields.map(field => field.dataType)
-      row.toSeq.zip(fields).map {
-        case (obj, dataType) => toJava(obj, dataType)
-      }.toArray
+      rowToArray(row, fields)
 
     case (seq: Seq[Any], array: ArrayType) =>
       seq.map(x => toJava(x, array.elementType)).asJava
-    case (list: JList[_], array: ArrayType) =>
-      list.map(x => toJava(x, array.elementType)).asJava
-    case (arr, array: ArrayType) if arr.getClass.isArray =>
-      arr.asInstanceOf[Array[Any]].map(x => toJava(x, array.elementType))
 
     case (obj: Map[_, _], mt: MapType) => obj.map {
       case (k, v) => (toJava(k, mt.keyType), toJava(v, mt.valueType))
     }.asJava
 
-    case (ud, udt: UserDefinedType[_]) => toJava(udt.serialize(ud), udt.sqlType)
+    case (ud, udt: UserDefinedType[_]) => toJava(ud, udt.sqlType)
 
     case (date: Int, DateType) => DateTimeUtils.toJavaDate(date)
     case (t: Long, TimestampType) => DateTimeUtils.toJavaTimestamp(t)
+
+    case (d: Decimal, _) => d.toJavaBigDecimal
+
     case (s: UTF8String, StringType) => s.toString
 
-    // Pyrolite can handle Timestamp and Decimal
     case (other, _) => other
   }
 
   /**
    * Convert Row into Java Array (for pickled into Python)
    */
-  def rowToArray(row: Row, fields: Seq[DataType]): Array[Any] = {
+  def rowToArray(row: InternalRow, fields: Seq[DataType]): Array[Any] = {
     // TODO: this is slow!
     row.toSeq.zip(fields).map {case (obj, dt) => toJava(obj, dt)}.toArray
   }
 
-  // Converts value to the type specified by the data type.
-  // Because Python does not have data types for TimestampType, FloatType, ShortType, and
-  // ByteType, we need to explicitly convert values in columns of these data types to the desired
-  // JVM data types.
+  /**
+   * Converts `obj` to the type specified by the data type, or returns null if the type of obj is
+   * unexpected. Because Python doesn't enforce the type.
+   */
   def fromJava(obj: Any, dataType: DataType): Any = (obj, dataType) match {
-    // TODO: We should check nullable
     case (null, _) => null
 
+    case (c: Boolean, BooleanType) => c
+
+    case (c: Int, ByteType) => c.toByte
+    case (c: Long, ByteType) => c.toByte
+
+    case (c: Int, ShortType) => c.toShort
+    case (c: Long, ShortType) => c.toShort
+
+    case (c: Int, IntegerType) => c
+    case (c: Long, IntegerType) => c.toInt
+
+    case (c: Int, LongType) => c.toLong
+    case (c: Long, LongType) => c
+
+    case (c: Double, FloatType) => c.toFloat
+
+    case (c: Double, DoubleType) => c
+
+    case (c: java.math.BigDecimal, dt: DecimalType) => Decimal(c)
+
+    case (c: Int, DateType) => c
+
+    case (c: Long, TimestampType) => c
+
+    case (c: String, StringType) => UTF8String.fromString(c)
+    case (c, StringType) =>
+      // If we get here, c is not a string. Call toString on it.
+      UTF8String.fromString(c.toString)
+
+    case (c: String, BinaryType) => c.getBytes("utf-8")
+    case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c
+
     case (c: java.util.List[_], ArrayType(elementType, _)) =>
-      c.map { e => fromJava(e, elementType)}: Seq[Any]
+      c.map { e => fromJava(e, elementType)}.toSeq
 
     case (c, ArrayType(elementType, _)) if c.getClass.isArray =>
-      c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType)): Seq[Any]
+      c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType)).toSeq
 
     case (c: java.util.Map[_, _], MapType(keyType, valueType, _)) => c.map {
       case (key, value) => (fromJava(key, keyType), fromJava(value, valueType))
@@ -188,30 +214,11 @@ object EvaluatePython {
         case (e, f) => fromJava(e, f.dataType)
       })
 
-    case (c: java.util.Calendar, DateType) =>
-      DateTimeUtils.fromJavaDate(new java.sql.Date(c.getTimeInMillis))
-
-    case (c: java.util.Calendar, TimestampType) =>
-      c.getTimeInMillis * 10000L
-    case (t: java.sql.Timestamp, TimestampType) =>
-      DateTimeUtils.fromJavaTimestamp(t)
-
-    case (_, udt: UserDefinedType[_]) =>
-      fromJava(obj, udt.sqlType)
-
-    case (c: Int, ByteType) => c.toByte
-    case (c: Long, ByteType) => c.toByte
-    case (c: Int, ShortType) => c.toShort
-    case (c: Long, ShortType) => c.toShort
-    case (c: Long, IntegerType) => c.toInt
-    case (c: Int, LongType) => c.toLong
-    case (c: Double, FloatType) => c.toFloat
-    case (c: String, StringType) => UTF8String.fromString(c)
-    case (c, StringType) =>
-      // If we get here, c is not a string. Call toString on it.
-      UTF8String.fromString(c.toString)
+    case (_, udt: UserDefinedType[_]) => fromJava(obj, udt.sqlType)
 
-    case (c, _) => c
+    // all other unexpected type should be null, or we will have runtime exception
+    // TODO(davies): we could improve this by try to cast the object to expected type
+    case (c, _) => null
   }
 }
 

From 28fa01e2ba146e823489f6d81c5eb3a76b20c71f Mon Sep 17 00:00:00 2001
From: Jonathan Alter <jonalter@users.noreply.github.com>
Date: Thu, 9 Jul 2015 03:28:51 +0100
Subject: [PATCH 070/549] [SPARK-8927] [DOCS] Format wrong for some config
 descriptions

A couple descriptions were not inside `<td></td>` and were being displayed immediately under the section title instead of in their row.

Author: Jonathan Alter <jonalter@users.noreply.github.com>

Closes #7292 from jonalter/docs-config and squashes the following commits:

5ce1570 [Jonathan Alter] [DOCS] Format wrong for some config descriptions
---
 docs/configuration.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index bebaf6f62e90a..892c02b27df32 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1007,9 +1007,9 @@ Apart from these, the following properties are also available, and may be useful
 <tr>
   <td><code>spark.rpc.numRetries</code></td>
   <td>3</td>
+  <td>
     Number of times to retry before an RPC task gives up.
     An RPC task will run at most times of this number.
-  <td>
   </td>
 </tr>
 <tr>
@@ -1029,8 +1029,8 @@ Apart from these, the following properties are also available, and may be useful
 <tr>
   <td><code>spark.rpc.lookupTimeout</code></td>
   <td>120s</td>
-    Duration for an RPC remote endpoint lookup operation to wait before timing out.
   <td>
+    Duration for an RPC remote endpoint lookup operation to wait before timing out.  
   </td>
 </tr>
 </table>

From a290814877308c6fa9b0f78b1a81145db7651ca4 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Wed, 8 Jul 2015 20:20:17 -0700
Subject: [PATCH 071/549] [SPARK-8866][SQL] use 1us precision for timestamp
 type

JIRA: https://issues.apache.org/jira/browse/SPARK-8866

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7283 from yijieshen/micro_timestamp and squashes the following commits:

dc735df [Yijie Shen] update CastSuite to avoid round error
714eaea [Yijie Shen] add timestamp_udf into blacklist due to precision lose
c3ca2f4 [Yijie Shen] fix unhandled case in CurrentTimestamp
8d4aa6b [Yijie Shen] use 1us precision for timestamp type
---
 python/pyspark/sql/types.py                   |  2 +-
 .../spark/sql/catalyst/expressions/Cast.scala | 18 ++++-----
 .../expressions/datetimeFunctions.scala       |  2 +-
 .../sql/catalyst/util/DateTimeUtils.scala     | 38 +++++++++----------
 .../sql/catalyst/expressions/CastSuite.scala  | 10 ++---
 .../catalyst/util/DateTimeUtilsSuite.scala    |  8 ++--
 .../apache/spark/sql/json/JacksonParser.scala |  4 +-
 .../org/apache/spark/sql/json/JsonRDD.scala   |  6 +--
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala |  2 +-
 .../execution/HiveCompatibilitySuite.scala    |  6 +--
 .../spark/sql/hive/HiveInspectors.scala       |  4 +-
 11 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 7e64cb0b54dba..fecfe6d71e9a7 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -775,7 +775,7 @@ def to_posix_timstamp(dt):
             if dt:
                 seconds = (calendar.timegm(dt.utctimetuple()) if dt.tzinfo
                            else time.mktime(dt.timetuple()))
-                return int(seconds * 1e7 + dt.microsecond * 10)
+                return int(seconds * 1e6 + dt.microsecond)
         return to_posix_timstamp
 
     else:
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 662ceeca7782d..567feca7136f9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -186,7 +186,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case ByteType =>
       buildCast[Byte](_, b => longToTimestamp(b.toLong))
     case DateType =>
-      buildCast[Int](_, d => DateTimeUtils.daysToMillis(d) * 10000)
+      buildCast[Int](_, d => DateTimeUtils.daysToMillis(d) * 1000)
     // TimestampWritable.decimalToTimestamp
     case DecimalType() =>
       buildCast[Decimal](_, d => decimalToTimestamp(d))
@@ -207,16 +207,16 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
   }
 
   private[this] def decimalToTimestamp(d: Decimal): Long = {
-    (d.toBigDecimal * 10000000L).longValue()
+    (d.toBigDecimal * 1000000L).longValue()
   }
 
-  // converting milliseconds to 100ns
-  private[this] def longToTimestamp(t: Long): Long = t * 10000L
-  // converting 100ns to seconds
-  private[this] def timestampToLong(ts: Long): Long = math.floor(ts.toDouble / 10000000L).toLong
-  // converting 100ns to seconds in double
+  // converting milliseconds to us
+  private[this] def longToTimestamp(t: Long): Long = t * 1000L
+  // converting us to seconds
+  private[this] def timestampToLong(ts: Long): Long = math.floor(ts.toDouble / 1000000L).toLong
+  // converting us to seconds in double
   private[this] def timestampToDouble(ts: Long): Double = {
-    ts / 10000000.0
+    ts / 1000000.0
   }
 
   // DateConverter
@@ -229,7 +229,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case TimestampType =>
       // throw valid precision more than seconds, according to Hive.
       // Timestamp.nanos is in 0 to 999,999,999, no more than a second.
-      buildCast[Long](_, t => DateTimeUtils.millisToDays(t / 10000L))
+      buildCast[Long](_, t => DateTimeUtils.millisToDays(t / 1000L))
     // Hive throws this exception as a Semantic Exception
     // It is never possible to compare result when hive return with exception,
     // so we can return null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index a492b966a5e31..dd5ec330a771b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -51,6 +51,6 @@ case class CurrentTimestamp() extends LeafExpression {
   override def dataType: DataType = TimestampType
 
   override def eval(input: InternalRow): Any = {
-    System.currentTimeMillis() * 10000L
+    System.currentTimeMillis() * 1000L
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 4269ad5d56737..c1ddee3ef0230 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -34,8 +34,8 @@ object DateTimeUtils {
   // see http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian
   final val JULIAN_DAY_OF_EPOCH = 2440587  // and .5
   final val SECONDS_PER_DAY = 60 * 60 * 24L
-  final val HUNDRED_NANOS_PER_SECOND = 1000L * 1000L * 10L
-  final val NANOS_PER_SECOND = HUNDRED_NANOS_PER_SECOND * 100
+  final val MICROS_PER_SECOND = 1000L * 1000L
+  final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
 
 
   // Java TimeZone has no mention of thread safety. Use thread local instance to be safe.
@@ -77,8 +77,8 @@ object DateTimeUtils {
     threadLocalDateFormat.get.format(toJavaDate(days))
 
   // Converts Timestamp to string according to Hive TimestampWritable convention.
-  def timestampToString(num100ns: Long): String = {
-    val ts = toJavaTimestamp(num100ns)
+  def timestampToString(us: Long): String = {
+    val ts = toJavaTimestamp(us)
     val timestampString = ts.toString
     val formatted = threadLocalTimestampFormat.get.format(ts)
 
@@ -132,52 +132,52 @@ object DateTimeUtils {
   }
 
   /**
-   * Returns a java.sql.Timestamp from number of 100ns since epoch.
+   * Returns a java.sql.Timestamp from number of micros since epoch.
    */
-  def toJavaTimestamp(num100ns: Long): Timestamp = {
+  def toJavaTimestamp(us: Long): Timestamp = {
     // setNanos() will overwrite the millisecond part, so the milliseconds should be
     // cut off at seconds
-    var seconds = num100ns / HUNDRED_NANOS_PER_SECOND
-    var nanos = num100ns % HUNDRED_NANOS_PER_SECOND
+    var seconds = us / MICROS_PER_SECOND
+    var micros = us % MICROS_PER_SECOND
     // setNanos() can not accept negative value
-    if (nanos < 0) {
-      nanos += HUNDRED_NANOS_PER_SECOND
+    if (micros < 0) {
+      micros += MICROS_PER_SECOND
       seconds -= 1
     }
     val t = new Timestamp(seconds * 1000)
-    t.setNanos(nanos.toInt * 100)
+    t.setNanos(micros.toInt * 1000)
     t
   }
 
   /**
-   * Returns the number of 100ns since epoch from java.sql.Timestamp.
+   * Returns the number of micros since epoch from java.sql.Timestamp.
    */
   def fromJavaTimestamp(t: Timestamp): Long = {
     if (t != null) {
-      t.getTime() * 10000L + (t.getNanos().toLong / 100) % 10000L
+      t.getTime() * 1000L + (t.getNanos().toLong / 1000) % 1000L
     } else {
       0L
     }
   }
 
   /**
-   * Returns the number of 100ns (hundred of nanoseconds) since epoch from Julian day
+   * Returns the number of microseconds since epoch from Julian day
    * and nanoseconds in a day
    */
   def fromJulianDay(day: Int, nanoseconds: Long): Long = {
     // use Long to avoid rounding errors
     val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY - SECONDS_PER_DAY / 2
-    seconds * HUNDRED_NANOS_PER_SECOND + nanoseconds / 100L
+    seconds * MICROS_PER_SECOND + nanoseconds / 1000L
   }
 
   /**
-   * Returns Julian day and nanoseconds in a day from the number of 100ns (hundred of nanoseconds)
+   * Returns Julian day and nanoseconds in a day from the number of microseconds
    */
-  def toJulianDay(num100ns: Long): (Int, Long) = {
-    val seconds = num100ns / HUNDRED_NANOS_PER_SECOND + SECONDS_PER_DAY / 2
+  def toJulianDay(us: Long): (Int, Long) = {
+    val seconds = us / MICROS_PER_SECOND + SECONDS_PER_DAY / 2
     val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH
     val secondsInDay = seconds % SECONDS_PER_DAY
-    val nanos = (num100ns % HUNDRED_NANOS_PER_SECOND) * 100L
+    val nanos = (us % MICROS_PER_SECOND) * 1000L
     (day.toInt, secondsInDay * NANOS_PER_SECOND + nanos)
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 518961e38396f..919fdd470b79a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -293,15 +293,15 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("cast from timestamp") {
-    val millis = 15 * 1000 + 2
-    val seconds = millis * 1000 + 2
+    val millis = 15 * 1000 + 3
+    val seconds = millis * 1000 + 3
     val ts = new Timestamp(millis)
     val tss = new Timestamp(seconds)
     checkEvaluation(cast(ts, ShortType), 15.toShort)
     checkEvaluation(cast(ts, IntegerType), 15)
     checkEvaluation(cast(ts, LongType), 15.toLong)
-    checkEvaluation(cast(ts, FloatType), 15.002f)
-    checkEvaluation(cast(ts, DoubleType), 15.002)
+    checkEvaluation(cast(ts, FloatType), 15.003f)
+    checkEvaluation(cast(ts, DoubleType), 15.003)
     checkEvaluation(cast(cast(tss, ShortType), TimestampType), DateTimeUtils.fromJavaTimestamp(ts))
     checkEvaluation(cast(cast(tss, IntegerType), TimestampType),
       DateTimeUtils.fromJavaTimestamp(ts))
@@ -317,7 +317,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
       Decimal(1))
 
     // A test for higher precision than millis
-    checkEvaluation(cast(cast(0.0000001, TimestampType), DoubleType), 0.0000001)
+    checkEvaluation(cast(cast(0.000001, TimestampType), DoubleType), 0.000001)
 
     checkEvaluation(cast(Double.NaN, TimestampType), null)
     checkEvaluation(cast(1.0 / 0.0, TimestampType), null)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index 1d4a60c81efc5..f63ac191e7366 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -24,11 +24,11 @@ import org.apache.spark.SparkFunSuite
 
 class DateTimeUtilsSuite extends SparkFunSuite {
 
-  test("timestamp and 100ns") {
+  test("timestamp and us") {
     val now = new Timestamp(System.currentTimeMillis())
-    now.setNanos(100)
+    now.setNanos(1000)
     val ns = DateTimeUtils.fromJavaTimestamp(now)
-    assert(ns % 10000000L === 1)
+    assert(ns % 1000000L === 1)
     assert(DateTimeUtils.toJavaTimestamp(ns) === now)
 
     List(-111111111111L, -1L, 0, 1L, 111111111111L).foreach { t =>
@@ -38,7 +38,7 @@ class DateTimeUtilsSuite extends SparkFunSuite {
     }
   }
 
-  test("100ns and julian day") {
+  test("us and julian day") {
     val (d, ns) = DateTimeUtils.toJulianDay(0)
     assert(d === DateTimeUtils.JULIAN_DAY_OF_EPOCH)
     assert(ns === DateTimeUtils.SECONDS_PER_DAY / 2 * DateTimeUtils.NANOS_PER_SECOND)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
index 4b8ab63b5ab39..381e7ed54428f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
@@ -67,10 +67,10 @@ private[sql] object JacksonParser {
         DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(parser.getText).getTime)
 
       case (VALUE_STRING, TimestampType) =>
-        DateTimeUtils.stringToTime(parser.getText).getTime * 10000L
+        DateTimeUtils.stringToTime(parser.getText).getTime * 1000L
 
       case (VALUE_NUMBER_INT, TimestampType) =>
-        parser.getLongValue * 10000L
+        parser.getLongValue * 1000L
 
       case (_, StringType) =>
         val writer = new ByteArrayOutputStream()
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index 01ba05cbd14f1..b392a51bf7dce 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -401,9 +401,9 @@ private[sql] object JsonRDD extends Logging {
 
   private def toTimestamp(value: Any): Long = {
     value match {
-      case value: java.lang.Integer => value.asInstanceOf[Int].toLong * 10000L
-      case value: java.lang.Long => value * 10000L
-      case value: java.lang.String => DateTimeUtils.stringToTime(value).getTime * 10000L
+      case value: java.lang.Integer => value.asInstanceOf[Int].toLong * 1000L
+      case value: java.lang.Long => value * 1000L
+      case value: java.lang.String => DateTimeUtils.stringToTime(value).getTime * 1000L
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 69ab1c292d221..566a52dc1b784 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -326,7 +326,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
     assert(cal.get(Calendar.HOUR) === 11)
     assert(cal.get(Calendar.MINUTE) === 22)
     assert(cal.get(Calendar.SECOND) === 33)
-    assert(rows(0).getAs[java.sql.Timestamp](2).getNanos === 543543500)
+    assert(rows(0).getAs[java.sql.Timestamp](2).getNanos === 543543000)
   }
 
   test("test DATE types") {
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 415a81644c58f..c884c399281a8 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -254,9 +254,10 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     // the answer is sensitive for jdk version
     "udf_java_method",
 
-    // Spark SQL use Long for TimestampType, lose the precision under 100ns
+    // Spark SQL use Long for TimestampType, lose the precision under 1us
     "timestamp_1",
-    "timestamp_2"
+    "timestamp_2",
+    "timestamp_udf"
   )
 
   /**
@@ -803,7 +804,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "timestamp_comparison",
     "timestamp_lazy",
     "timestamp_null",
-    "timestamp_udf",
     "touch",
     "transform_ppr1",
     "transform_ppr2",
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 4cba17524af6c..a8f2ee37cb8ed 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -267,7 +267,7 @@ private[hive] trait HiveInspectors {
         poi.getWritableConstantValue.getHiveDecimal)
     case poi: WritableConstantTimestampObjectInspector =>
       val t = poi.getWritableConstantValue
-      t.getSeconds * 10000000L + t.getNanos / 100L
+      t.getSeconds * 1000000L + t.getNanos / 1000L
     case poi: WritableConstantIntObjectInspector =>
       poi.getWritableConstantValue.get()
     case poi: WritableConstantDoubleObjectInspector =>
@@ -332,7 +332,7 @@ private[hive] trait HiveInspectors {
       case x: DateObjectInspector => DateTimeUtils.fromJavaDate(x.getPrimitiveJavaObject(data))
       case x: TimestampObjectInspector if x.preferWritable() =>
         val t = x.getPrimitiveWritableObject(data)
-        t.getSeconds * 10000000L + t.getNanos / 100
+        t.getSeconds * 1000000L + t.getNanos / 1000L
       case ti: TimestampObjectInspector =>
         DateTimeUtils.fromJavaTimestamp(ti.getPrimitiveJavaObject(data))
       case _ => pi.getPrimitiveJavaObject(data)

From b55499a44ab74e33378211fb0d6940905d7c6318 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Wed, 8 Jul 2015 20:28:05 -0700
Subject: [PATCH 072/549] [SPARK-8932] Support copy() for UnsafeRows that do
 not use ObjectPools

We call Row.copy() in many places throughout SQL but UnsafeRow currently throws UnsupportedOperationException when copy() is called.

Supporting copying when ObjectPool is used may be difficult, since we may need to handle deep-copying of objects in the pool. In addition, this copy() method needs to produce a self-contained row object which may be passed around / buffered by downstream code which does not understand the UnsafeRow format.

In the long run, we'll need to figure out how to handle the ObjectPool corner cases, but this may be unnecessary if other changes are made. Therefore, in order to unblock my sort patch (#6444) I propose that we support copy() for the cases where UnsafeRow does not use an ObjectPool and continue to throw UnsupportedOperationException when an ObjectPool is used.

This patch accomplishes this by modifying UnsafeRow so that it knows the size of the row's backing data in order to be able to copy it into a byte array.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7306 from JoshRosen/SPARK-8932 and squashes the following commits:

338e6bf [Josh Rosen] Support copy for UnsafeRows that do not use ObjectPools.
---
 .../UnsafeFixedWidthAggregationMap.java       | 12 +++--
 .../sql/catalyst/expressions/UnsafeRow.java   | 32 +++++++++++-
 .../expressions/UnsafeRowConverter.scala      | 10 +++-
 .../expressions/UnsafeRowConverterSuite.scala | 52 ++++++++++++++-----
 4 files changed, 87 insertions(+), 19 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
index 1e79f4b2e88e5..79d55b36dab01 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
@@ -120,9 +120,11 @@ public UnsafeFixedWidthAggregationMap(
     this.bufferPool = new ObjectPool(initialCapacity);
 
     InternalRow initRow = initProjection.apply(emptyRow);
-    this.emptyBuffer = new byte[bufferConverter.getSizeRequirement(initRow)];
+    int emptyBufferSize = bufferConverter.getSizeRequirement(initRow);
+    this.emptyBuffer = new byte[emptyBufferSize];
     int writtenLength = bufferConverter.writeRow(
-      initRow, emptyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, bufferPool);
+      initRow, emptyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, emptyBufferSize,
+      bufferPool);
     assert (writtenLength == emptyBuffer.length): "Size requirement calculation was wrong!";
     // re-use the empty buffer only when there is no object saved in pool.
     reuseEmptyBuffer = bufferPool.size() == 0;
@@ -142,6 +144,7 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
       groupingKey,
       groupingKeyConversionScratchSpace,
       PlatformDependent.BYTE_ARRAY_OFFSET,
+      groupingKeySize,
       keyPool);
     assert (groupingKeySize == actualGroupingKeySize) : "Size requirement calculation was wrong!";
 
@@ -157,7 +160,7 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
         // There is some objects referenced by emptyBuffer, so generate a new one
         InternalRow initRow = initProjection.apply(emptyRow);
         bufferConverter.writeRow(initRow, emptyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET,
-          bufferPool);
+          groupingKeySize, bufferPool);
       }
       loc.putNewKey(
         groupingKeyConversionScratchSpace,
@@ -175,6 +178,7 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
       address.getBaseObject(),
       address.getBaseOffset(),
       bufferConverter.numFields(),
+      loc.getValueLength(),
       bufferPool
     );
     return currentBuffer;
@@ -214,12 +218,14 @@ public MapEntry next() {
           keyAddress.getBaseObject(),
           keyAddress.getBaseOffset(),
           keyConverter.numFields(),
+          loc.getKeyLength(),
           keyPool
         );
         entry.value.pointTo(
           valueAddress.getBaseObject(),
           valueAddress.getBaseOffset(),
           bufferConverter.numFields(),
+          loc.getValueLength(),
           bufferPool
         );
         return entry;
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index aeb64b045812f..edb7202245289 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -68,6 +68,9 @@ public final class UnsafeRow extends MutableRow {
   /** The number of fields in this row, used for calculating the bitset width (and in assertions) */
   private int numFields;
 
+  /** The size of this row's backing data, in bytes) */
+  private int sizeInBytes;
+
   public int length() { return numFields; }
 
   /** The width of the null tracking bit set, in bytes */
@@ -95,14 +98,17 @@ public UnsafeRow() { }
    * @param baseObject the base object
    * @param baseOffset the offset within the base object
    * @param numFields the number of fields in this row
+   * @param sizeInBytes the size of this row's backing data, in bytes
    * @param pool the object pool to hold arbitrary objects
    */
-  public void pointTo(Object baseObject, long baseOffset, int numFields, ObjectPool pool) {
+  public void pointTo(
+      Object baseObject, long baseOffset, int numFields, int sizeInBytes, ObjectPool pool) {
     assert numFields >= 0 : "numFields should >= 0";
     this.bitSetWidthInBytes = calculateBitSetWidthInBytes(numFields);
     this.baseObject = baseObject;
     this.baseOffset = baseOffset;
     this.numFields = numFields;
+    this.sizeInBytes = sizeInBytes;
     this.pool = pool;
   }
 
@@ -336,9 +342,31 @@ public double getDouble(int i) {
     }
   }
 
+  /**
+   * Copies this row, returning a self-contained UnsafeRow that stores its data in an internal
+   * byte array rather than referencing data stored in a data page.
+   * <p>
+   * This method is only supported on UnsafeRows that do not use ObjectPools.
+   */
   @Override
   public InternalRow copy() {
-    throw new UnsupportedOperationException();
+    if (pool != null) {
+      throw new UnsupportedOperationException(
+        "Copy is not supported for UnsafeRows that use object pools");
+    } else {
+      UnsafeRow rowCopy = new UnsafeRow();
+      final byte[] rowDataCopy = new byte[sizeInBytes];
+      PlatformDependent.copyMemory(
+        baseObject,
+        baseOffset,
+        rowDataCopy,
+        PlatformDependent.BYTE_ARRAY_OFFSET,
+        sizeInBytes
+      );
+      rowCopy.pointTo(
+        rowDataCopy, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, sizeInBytes, null);
+      return rowCopy;
+    }
   }
 
   @Override
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
index 1f395497a9839..6af5e6200e57b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
@@ -70,10 +70,16 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) {
    * @param row the row to convert
    * @param baseObject the base object of the destination address
    * @param baseOffset the base offset of the destination address
+   * @param rowLengthInBytes the length calculated by `getSizeRequirement(row)`
    * @return the number of bytes written. This should be equal to `getSizeRequirement(row)`.
    */
-  def writeRow(row: InternalRow, baseObject: Object, baseOffset: Long, pool: ObjectPool): Int = {
-    unsafeRow.pointTo(baseObject, baseOffset, writers.length, pool)
+  def writeRow(
+      row: InternalRow,
+      baseObject: Object,
+      baseOffset: Long,
+      rowLengthInBytes: Int,
+      pool: ObjectPool): Int = {
+    unsafeRow.pointTo(baseObject, baseOffset, writers.length, rowLengthInBytes, pool)
 
     if (writers.length > 0) {
       // zero-out the bitset
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index 96d4e64ea344a..d00aeb4dfbf47 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -44,19 +44,32 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     val sizeRequired: Int = converter.getSizeRequirement(row)
     assert(sizeRequired === 8 + (3 * 8))
     val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
-    val numBytesWritten = converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, null)
+    val numBytesWritten =
+      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired, null)
     assert(numBytesWritten === sizeRequired)
 
     val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, null)
+    unsafeRow.pointTo(
+      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired, null)
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.getLong(1) === 1)
     assert(unsafeRow.getInt(2) === 2)
 
+    // We can copy UnsafeRows as long as they don't reference ObjectPools
+    val unsafeRowCopy = unsafeRow.copy()
+    assert(unsafeRowCopy.getLong(0) === 0)
+    assert(unsafeRowCopy.getLong(1) === 1)
+    assert(unsafeRowCopy.getInt(2) === 2)
+
     unsafeRow.setLong(1, 3)
     assert(unsafeRow.getLong(1) === 3)
     unsafeRow.setInt(2, 4)
     assert(unsafeRow.getInt(2) === 4)
+
+    // Mutating the original row should not have changed the copy
+    assert(unsafeRowCopy.getLong(0) === 0)
+    assert(unsafeRowCopy.getLong(1) === 1)
+    assert(unsafeRowCopy.getInt(2) === 2)
   }
 
   test("basic conversion with primitive, string and binary types") {
@@ -73,12 +86,14 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       ByteArrayMethods.roundNumberOfBytesToNearestWord("Hello".getBytes.length) +
       ByteArrayMethods.roundNumberOfBytesToNearestWord("World".getBytes.length))
     val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
-    val numBytesWritten = converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, null)
+    val numBytesWritten = converter.writeRow(
+      row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired, null)
     assert(numBytesWritten === sizeRequired)
 
     val unsafeRow = new UnsafeRow()
     val pool = new ObjectPool(10)
-    unsafeRow.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, pool)
+    unsafeRow.pointTo(
+      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired, pool)
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.getString(1) === "Hello")
     assert(unsafeRow.get(2) === "World".getBytes)
@@ -96,6 +111,11 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     unsafeRow.update(2, "Hello World".getBytes)
     assert(unsafeRow.get(2) === "Hello World".getBytes)
     assert(pool.size === 2)
+
+    // We do not support copy() for UnsafeRows that reference ObjectPools
+    intercept[UnsupportedOperationException] {
+      unsafeRow.copy()
+    }
   }
 
   test("basic conversion with primitive, decimal and array") {
@@ -111,12 +131,14 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     val sizeRequired: Int = converter.getSizeRequirement(row)
     assert(sizeRequired === 8 + (8 * 3))
     val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
-    val numBytesWritten = converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, pool)
+    val numBytesWritten =
+      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired, pool)
     assert(numBytesWritten === sizeRequired)
     assert(pool.size === 2)
 
     val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, pool)
+    unsafeRow.pointTo(
+      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired, pool)
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.get(1) === Decimal(1))
     assert(unsafeRow.get(2) === Array(2))
@@ -142,11 +164,13 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(sizeRequired === 8 + (8 * 4) +
       ByteArrayMethods.roundNumberOfBytesToNearestWord("Hello".getBytes.length))
     val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
-    val numBytesWritten = converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, null)
+    val numBytesWritten =
+      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired, null)
     assert(numBytesWritten === sizeRequired)
 
     val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, null)
+    unsafeRow.pointTo(
+      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired, null)
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.getString(1) === "Hello")
     // Date is represented as Int in unsafeRow
@@ -190,12 +214,14 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     val sizeRequired: Int = converter.getSizeRequirement(rowWithAllNullColumns)
     val createdFromNullBuffer: Array[Long] = new Array[Long](sizeRequired / 8)
     val numBytesWritten = converter.writeRow(
-      rowWithAllNullColumns, createdFromNullBuffer, PlatformDependent.LONG_ARRAY_OFFSET, null)
+      rowWithAllNullColumns, createdFromNullBuffer, PlatformDependent.LONG_ARRAY_OFFSET,
+      sizeRequired, null)
     assert(numBytesWritten === sizeRequired)
 
     val createdFromNull = new UnsafeRow()
     createdFromNull.pointTo(
-      createdFromNullBuffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, null)
+      createdFromNullBuffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length,
+      sizeRequired, null)
     for (i <- 0 to fieldTypes.length - 1) {
       assert(createdFromNull.isNullAt(i))
     }
@@ -233,10 +259,12 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     val pool = new ObjectPool(1)
     val setToNullAfterCreationBuffer: Array[Long] = new Array[Long](sizeRequired / 8 + 2)
     converter.writeRow(
-      rowWithNoNullColumns, setToNullAfterCreationBuffer, PlatformDependent.LONG_ARRAY_OFFSET, pool)
+      rowWithNoNullColumns, setToNullAfterCreationBuffer, PlatformDependent.LONG_ARRAY_OFFSET,
+      sizeRequired, pool)
     val setToNullAfterCreation = new UnsafeRow()
     setToNullAfterCreation.pointTo(
-      setToNullAfterCreationBuffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, pool)
+      setToNullAfterCreationBuffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length,
+      sizeRequired, pool)
 
     assert(setToNullAfterCreation.isNullAt(0) === rowWithNoNullColumns.isNullAt(0))
     assert(setToNullAfterCreation.getBoolean(1) === rowWithNoNullColumns.getBoolean(1))

From 47ef423f860c3109d50c7e321616b267f4296e34 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Wed, 8 Jul 2015 20:29:08 -0700
Subject: [PATCH 073/549] [SPARK-8910] Fix MiMa flaky due to port contention
 issue

Due to the way MiMa works, we currently start a `SQLContext` pretty early on. This causes us to start a `SparkUI` that attempts to bind to port 4040. Because many tests run in parallel on the Jenkins machines, this  causes port contention sometimes and fails the MiMa tests.

Note that we already disabled the SparkUI for scalatests. However, the MiMa test is run before we even have a chance to load the default scalatest settings, so we need to explicitly disable the UI ourselves.

Author: Andrew Or <andrew@databricks.com>

Closes #7300 from andrewor14/mima-flaky and squashes the following commits:

b55a547 [Andrew Or] Do not enable SparkUI during tests
---
 .../scala/org/apache/spark/sql/test/TestSQLContext.scala  | 8 ++++----
 .../scala/org/apache/spark/sql/hive/test/TestHive.scala   | 7 ++++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala
index 9fa394525d65c..b3a4231da91c2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala
@@ -26,10 +26,10 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 /** A SQLContext that can be used for local testing. */
 class LocalSQLContext
   extends SQLContext(
-    new SparkContext(
-      "local[2]",
-      "TestSQLContext",
-      new SparkConf().set("spark.sql.testkey", "true"))) {
+    new SparkContext("local[2]", "TestSQLContext", new SparkConf()
+      .set("spark.sql.testkey", "true")
+      // SPARK-8910
+      .set("spark.ui.enabled", "false"))) {
 
   override protected[sql] def createSession(): SQLSession = {
     new this.SQLSession()
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 7978fdacaedba..0f217bc66869f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -53,9 +53,10 @@ object TestHive
       "TestSQLContext",
       new SparkConf()
         .set("spark.sql.test", "")
-        .set(
-          "spark.sql.hive.metastore.barrierPrefixes",
-          "org.apache.spark.sql.hive.execution.PairSerDe")))
+        .set("spark.sql.hive.metastore.barrierPrefixes",
+          "org.apache.spark.sql.hive.execution.PairSerDe")
+        // SPARK-8910
+        .set("spark.ui.enabled", "false")))
 
 /**
  * A locally running test instance of Spark's Hive execution engine.

From aba5784dab24c03ddad89f7a1b5d3d0dc8d109be Mon Sep 17 00:00:00 2001
From: Kousuke Saruta <sarutak@oss.nttdata.co.jp>
Date: Thu, 9 Jul 2015 13:28:17 +0900
Subject: [PATCH 074/549] [SPARK-8937] [TEST] A setting
 `spark.unsafe.exceptionOnMemoryLeak ` is missing in ScalaTest config.

`spark.unsafe.exceptionOnMemoryLeak` is present in the config of surefire.

```
        <!-- Surefire runs all Java tests -->
        <plugin>
          <groupId>org.apache.maven.plugins</groupId>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.18.1</version>
          <!-- Note config is repeated in scalatest config -->
...

<spark.unsafe.exceptionOnMemoryLeak>true</spark.unsafe.exceptionOnMemoryLeak>
            </systemProperties>
...
```

 but is absent in the config ScalaTest.

Author: Kousuke Saruta <sarutak@oss.nttdata.co.jp>

Closes #7308 from sarutak/add-setting-for-memory-leak and squashes the following commits:

95644e7 [Kousuke Saruta] Added a setting for memory leak
---
 pom.xml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pom.xml b/pom.xml
index 9cf2471b51304..529e47f8b5253 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1339,6 +1339,7 @@
               <spark.ui.enabled>false</spark.ui.enabled>
               <spark.ui.showConsoleProgress>false</spark.ui.showConsoleProgress>
               <spark.driver.allowMultipleContexts>true</spark.driver.allowMultipleContexts>
+              <spark.unsafe.exceptionOnMemoryLeak>true</spark.unsafe.exceptionOnMemoryLeak>
             </systemProperties>
           </configuration>
           <executions>

From 768907eb7b0d3c11a420ef281454e36167011c89 Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Wed, 8 Jul 2015 22:05:58 -0700
Subject: [PATCH 075/549] [SPARK-8926][SQL] Good errors for ExpectsInputType
 expressions

For example: `cannot resolve 'testfunction(null)' due to data type mismatch: argument 1 is expected to be of type int, however, null is of type datetype.`

Author: Michael Armbrust <michael@databricks.com>

Closes #7303 from marmbrus/expectsTypeErrors and squashes the following commits:

c654a0e [Michael Armbrust] fix udts and make errors pretty
137160d [Michael Armbrust] style
5428fda [Michael Armbrust] style
10fac82 [Michael Armbrust] [SPARK-8926][SQL] Good errors for ExpectsInputType expressions
---
 .../catalyst/analysis/HiveTypeCoercion.scala  |  12 +-
 .../expressions/ExpectsInputTypes.scala       |  13 +-
 .../spark/sql/types/AbstractDataType.scala    |  30 +++-
 .../apache/spark/sql/types/ArrayType.scala    |   8 +-
 .../org/apache/spark/sql/types/DataType.scala |   4 +-
 .../apache/spark/sql/types/DecimalType.scala  |   8 +-
 .../org/apache/spark/sql/types/MapType.scala  |   8 +-
 .../apache/spark/sql/types/StructType.scala   |   8 +-
 .../spark/sql/types/UserDefinedType.scala     |   5 +-
 .../analysis/AnalysisErrorSuite.scala         | 167 ++++++++++++++++++
 .../sql/catalyst/analysis/AnalysisSuite.scala | 126 ++-----------
 .../analysis/HiveTypeCoercionSuite.scala      |   8 +
 .../apache/spark/sql/hive/HiveContext.scala   |   2 +-
 13 files changed, 256 insertions(+), 143 deletions(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 5367b7f3308ee..8cb71995eb818 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -702,11 +702,19 @@ object HiveTypeCoercion {
       @Nullable val ret: Expression = (inType, expectedType) match {
 
         // If the expected type is already a parent of the input type, no need to cast.
-        case _ if expectedType.isParentOf(inType) => e
+        case _ if expectedType.isSameType(inType) => e
 
         // Cast null type (usually from null literals) into target types
         case (NullType, target) => Cast(e, target.defaultConcreteType)
 
+        // If the function accepts any numeric type (i.e. the ADT `NumericType`) and the input is
+        // already a number, leave it as is.
+        case (_: NumericType, NumericType) => e
+
+        // If the function accepts any numeric type and the input is a string, we follow the hive
+        // convention and cast that input into a double
+        case (StringType, NumericType) => Cast(e, NumericType.defaultConcreteType)
+
         // Implicit cast among numeric types
         // If input is a numeric type but not decimal, and we expect a decimal type,
         // cast the input to unlimited precision decimal.
@@ -732,7 +740,7 @@ object HiveTypeCoercion {
         // First see if we can find our input type in the type collection. If we can, then just
         // use the current expression; otherwise, find the first one we can implicitly cast.
         case (_, TypeCollection(types)) =>
-          if (types.exists(_.isParentOf(inType))) {
+          if (types.exists(_.isSameType(inType))) {
             e
           } else {
             types.flatMap(implicitCast(e, _)).headOption.orNull
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
index 916e30154d4f1..986cc09499d1f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
@@ -37,7 +37,16 @@ trait ExpectsInputTypes { self: Expression =>
   def inputTypes: Seq[AbstractDataType]
 
   override def checkInputDataTypes(): TypeCheckResult = {
-    // TODO: implement proper type checking.
-    TypeCheckResult.TypeCheckSuccess
+    val mismatches = children.zip(inputTypes).zipWithIndex.collect {
+      case ((child, expected), idx) if !expected.acceptsType(child.dataType) =>
+        s"Argument ${idx + 1} is expected to be of type ${expected.simpleString}, " +
+        s"however, ${child.prettyString} is of type ${child.dataType.simpleString}."
+    }
+
+    if (mismatches.isEmpty) {
+      TypeCheckResult.TypeCheckSuccess
+    } else {
+      TypeCheckResult.TypeCheckFailure(mismatches.mkString(" "))
+    }
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
index fb1b47e946214..ad75fa2e31d90 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
@@ -34,9 +34,16 @@ private[sql] abstract class AbstractDataType {
   private[sql] def defaultConcreteType: DataType
 
   /**
-   * Returns true if this data type is a parent of the `childCandidate`.
+   * Returns true if this data type is the same type as `other`.  This is different that equality
+   * as equality will also consider data type parametrization, such as decimal precision.
    */
-  private[sql] def isParentOf(childCandidate: DataType): Boolean
+  private[sql] def isSameType(other: DataType): Boolean
+
+  /**
+   * Returns true if `other` is an acceptable input type for a function that expectes this,
+   * possibly abstract, DataType.
+   */
+  private[sql] def acceptsType(other: DataType): Boolean = isSameType(other)
 
   /** Readable string representation for the type. */
   private[sql] def simpleString: String
@@ -58,11 +65,14 @@ private[sql] class TypeCollection(private val types: Seq[AbstractDataType])
 
   require(types.nonEmpty, s"TypeCollection ($types) cannot be empty")
 
-  private[sql] override def defaultConcreteType: DataType = types.head.defaultConcreteType
+  override private[sql] def defaultConcreteType: DataType = types.head.defaultConcreteType
+
+  override private[sql] def isSameType(other: DataType): Boolean = false
 
-  private[sql] override def isParentOf(childCandidate: DataType): Boolean = false
+  override private[sql] def acceptsType(other: DataType): Boolean =
+    types.exists(_.isSameType(other))
 
-  private[sql] override def simpleString: String = {
+  override private[sql] def simpleString: String = {
     types.map(_.simpleString).mkString("(", " or ", ")")
   }
 }
@@ -108,7 +118,7 @@ abstract class NumericType extends AtomicType {
 }
 
 
-private[sql] object NumericType {
+private[sql] object NumericType extends AbstractDataType {
   /**
    * Enables matching against NumericType for expressions:
    * {{{
@@ -117,6 +127,14 @@ private[sql] object NumericType {
    * }}}
    */
   def unapply(e: Expression): Boolean = e.dataType.isInstanceOf[NumericType]
+
+  override private[sql] def defaultConcreteType: DataType = DoubleType
+
+  override private[sql] def simpleString: String = "numeric"
+
+  override private[sql] def isSameType(other: DataType): Boolean = false
+
+  override private[sql] def acceptsType(other: DataType): Boolean = other.isInstanceOf[NumericType]
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
index 43413ec761e6b..76ca7a84c1d1a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
@@ -26,13 +26,13 @@ object ArrayType extends AbstractDataType {
   /** Construct a [[ArrayType]] object with the given element type. The `containsNull` is true. */
   def apply(elementType: DataType): ArrayType = ArrayType(elementType, containsNull = true)
 
-  private[sql] override def defaultConcreteType: DataType = ArrayType(NullType, containsNull = true)
+  override private[sql] def defaultConcreteType: DataType = ArrayType(NullType, containsNull = true)
 
-  private[sql] override def isParentOf(childCandidate: DataType): Boolean = {
-    childCandidate.isInstanceOf[ArrayType]
+  override private[sql] def isSameType(other: DataType): Boolean = {
+    other.isInstanceOf[ArrayType]
   }
 
-  private[sql] override def simpleString: String = "array"
+  override private[sql] def simpleString: String = "array"
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index a4c2da8e05f5d..57718228e490f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -76,9 +76,9 @@ abstract class DataType extends AbstractDataType {
    */
   private[spark] def asNullable: DataType
 
-  private[sql] override def defaultConcreteType: DataType = this
+  override private[sql] def defaultConcreteType: DataType = this
 
-  private[sql] override def isParentOf(childCandidate: DataType): Boolean = this == childCandidate
+  override private[sql] def isSameType(other: DataType): Boolean = this == other
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
index 127b16ff85bed..a1cafeab1704d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
@@ -84,13 +84,13 @@ case class DecimalType(precisionInfo: Option[PrecisionInfo]) extends FractionalT
 /** Extra factory methods and pattern matchers for Decimals */
 object DecimalType extends AbstractDataType {
 
-  private[sql] override def defaultConcreteType: DataType = Unlimited
+  override private[sql] def defaultConcreteType: DataType = Unlimited
 
-  private[sql] override def isParentOf(childCandidate: DataType): Boolean = {
-    childCandidate.isInstanceOf[DecimalType]
+  override private[sql] def isSameType(other: DataType): Boolean = {
+    other.isInstanceOf[DecimalType]
   }
 
-  private[sql] override def simpleString: String = "decimal"
+  override private[sql] def simpleString: String = "decimal"
 
   val Unlimited: DecimalType = DecimalType(None)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
index 868dea13d971e..ddead10bc2171 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
@@ -69,13 +69,13 @@ case class MapType(
 
 object MapType extends AbstractDataType {
 
-  private[sql] override def defaultConcreteType: DataType = apply(NullType, NullType)
+  override private[sql] def defaultConcreteType: DataType = apply(NullType, NullType)
 
-  private[sql] override def isParentOf(childCandidate: DataType): Boolean = {
-    childCandidate.isInstanceOf[MapType]
+  override private[sql] def isSameType(other: DataType): Boolean = {
+    other.isInstanceOf[MapType]
   }
 
-  private[sql] override def simpleString: String = "map"
+  override private[sql] def simpleString: String = "map"
 
   /**
    * Construct a [[MapType]] object with the given key type and value type.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index e2d3f53f7d978..e0b8ff91786a7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -303,13 +303,13 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
 
 object StructType extends AbstractDataType {
 
-  private[sql] override def defaultConcreteType: DataType = new StructType
+  override private[sql] def defaultConcreteType: DataType = new StructType
 
-  private[sql] override def isParentOf(childCandidate: DataType): Boolean = {
-    childCandidate.isInstanceOf[StructType]
+  override private[sql] def isSameType(other: DataType): Boolean = {
+    other.isInstanceOf[StructType]
   }
 
-  private[sql] override def simpleString: String = "struct"
+  override private[sql] def simpleString: String = "struct"
 
   private[sql] def fromString(raw: String): StructType = DataType.fromString(raw) match {
     case t: StructType => t
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala
index 6b20505c6009a..e47cfb4833bd8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala
@@ -77,5 +77,8 @@ abstract class UserDefinedType[UserType] extends DataType with Serializable {
    * For UDT, asNullable will not change the nullability of its internal sqlType and just returns
    * itself.
    */
-  private[spark] override def asNullable: UserDefinedType[UserType] = this
+  override private[spark] def asNullable: UserDefinedType[UserType] = this
+
+  override private[sql] def acceptsType(dataType: DataType) =
+    this.getClass == dataType.getClass
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
new file mode 100644
index 0000000000000..73236c3acbca2
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.analysis
+
+import org.scalatest.BeforeAndAfter
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.catalyst.{InternalRow, SimpleCatalystConf}
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+
+case class TestFunction(
+    children: Seq[Expression],
+    inputTypes: Seq[AbstractDataType]) extends Expression with ExpectsInputTypes {
+  override def nullable: Boolean = true
+  override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
+  override def dataType: DataType = StringType
+}
+
+case class UnresolvedTestPlan() extends LeafNode {
+  override lazy val resolved = false
+  override def output: Seq[Attribute] = Nil
+}
+
+class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter {
+  import AnalysisSuite._
+
+  def errorTest(
+      name: String,
+      plan: LogicalPlan,
+      errorMessages: Seq[String],
+      caseSensitive: Boolean = true): Unit = {
+    test(name) {
+      val error = intercept[AnalysisException] {
+        if (caseSensitive) {
+          caseSensitiveAnalyze(plan)
+        } else {
+          caseInsensitiveAnalyze(plan)
+        }
+      }
+
+      errorMessages.foreach(m => assert(error.getMessage.toLowerCase contains m.toLowerCase))
+    }
+  }
+
+  val dateLit = Literal.create(null, DateType)
+
+  errorTest(
+    "single invalid type, single arg",
+    testRelation.select(TestFunction(dateLit :: Nil, IntegerType :: Nil).as('a)),
+    "cannot resolve" :: "testfunction" :: "argument 1" :: "expected to be of type int" ::
+    "null is of type date" ::Nil)
+
+  errorTest(
+    "single invalid type, second arg",
+    testRelation.select(
+      TestFunction(dateLit :: dateLit :: Nil, DateType :: IntegerType :: Nil).as('a)),
+    "cannot resolve" :: "testfunction" :: "argument 2" :: "expected to be of type int" ::
+    "null is of type date" ::Nil)
+
+  errorTest(
+    "multiple invalid type",
+    testRelation.select(
+      TestFunction(dateLit :: dateLit :: Nil, IntegerType :: IntegerType :: Nil).as('a)),
+    "cannot resolve" :: "testfunction" :: "argument 1" :: "argument 2" ::
+    "expected to be of type int" :: "null is of type date" ::Nil)
+
+  errorTest(
+    "unresolved window function",
+    testRelation2.select(
+      WindowExpression(
+        UnresolvedWindowFunction(
+          "lead",
+          UnresolvedAttribute("c") :: Nil),
+        WindowSpecDefinition(
+          UnresolvedAttribute("a") :: Nil,
+          SortOrder(UnresolvedAttribute("b"), Ascending) :: Nil,
+          UnspecifiedFrame)).as('window)),
+    "lead" :: "window functions currently requires a HiveContext" :: Nil)
+
+  errorTest(
+    "too many generators",
+    listRelation.select(Explode('list).as('a), Explode('list).as('b)),
+    "only one generator" :: "explode" :: Nil)
+
+  errorTest(
+    "unresolved attributes",
+    testRelation.select('abcd),
+    "cannot resolve" :: "abcd" :: Nil)
+
+  errorTest(
+    "bad casts",
+    testRelation.select(Literal(1).cast(BinaryType).as('badCast)),
+    "cannot cast" :: Literal(1).dataType.simpleString :: BinaryType.simpleString :: Nil)
+
+  errorTest(
+    "non-boolean filters",
+    testRelation.where(Literal(1)),
+    "filter" :: "'1'" :: "not a boolean" :: Literal(1).dataType.simpleString :: Nil)
+
+  errorTest(
+    "missing group by",
+    testRelation2.groupBy('a)('b),
+    "'b'" :: "group by" :: Nil
+  )
+
+  errorTest(
+    "ambiguous field",
+    nestedRelation.select($"top.duplicateField"),
+    "Ambiguous reference to fields" :: "duplicateField" :: Nil,
+    caseSensitive = false)
+
+  errorTest(
+    "ambiguous field due to case insensitivity",
+    nestedRelation.select($"top.differentCase"),
+    "Ambiguous reference to fields" :: "differentCase" :: "differentcase" :: Nil,
+    caseSensitive = false)
+
+  errorTest(
+    "missing field",
+    nestedRelation2.select($"top.c"),
+    "No such struct field" :: "aField" :: "bField" :: "cField" :: Nil,
+    caseSensitive = false)
+
+  errorTest(
+    "catch all unresolved plan",
+    UnresolvedTestPlan(),
+    "unresolved" :: Nil)
+
+
+  test("SPARK-6452 regression test") {
+    // CheckAnalysis should throw AnalysisException when Aggregate contains missing attribute(s)
+    val plan =
+      Aggregate(
+        Nil,
+        Alias(Sum(AttributeReference("a", IntegerType)(exprId = ExprId(1))), "b")() :: Nil,
+        LocalRelation(
+          AttributeReference("a", IntegerType)(exprId = ExprId(2))))
+
+    assert(plan.resolved)
+
+    val message = intercept[AnalysisException] {
+      caseSensitiveAnalyze(plan)
+    }.getMessage
+
+    assert(message.contains("resolved attribute(s) a#1 missing from a#2"))
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 77ca080f366cd..58df1de983a09 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.SimpleCatalystConf
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 
-class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
+object AnalysisSuite {
   val caseSensitiveConf = new SimpleCatalystConf(true)
   val caseInsensitiveConf = new SimpleCatalystConf(false)
 
@@ -61,25 +61,28 @@ class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
   val nestedRelation = LocalRelation(
     AttributeReference("top", StructType(
       StructField("duplicateField", StringType) ::
-      StructField("duplicateField", StringType) ::
-      StructField("differentCase", StringType) ::
-      StructField("differentcase", StringType) :: Nil
+        StructField("duplicateField", StringType) ::
+        StructField("differentCase", StringType) ::
+        StructField("differentcase", StringType) :: Nil
     ))())
 
   val nestedRelation2 = LocalRelation(
     AttributeReference("top", StructType(
       StructField("aField", StringType) ::
-      StructField("bField", StringType) ::
-      StructField("cField", StringType) :: Nil
+        StructField("bField", StringType) ::
+        StructField("cField", StringType) :: Nil
     ))())
 
   val listRelation = LocalRelation(
     AttributeReference("list", ArrayType(IntegerType))())
 
-  before {
-    caseSensitiveCatalog.registerTable(Seq("TaBlE"), testRelation)
-    caseInsensitiveCatalog.registerTable(Seq("TaBlE"), testRelation)
-  }
+  caseSensitiveCatalog.registerTable(Seq("TaBlE"), testRelation)
+  caseInsensitiveCatalog.registerTable(Seq("TaBlE"), testRelation)
+}
+
+
+class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
+  import AnalysisSuite._
 
   test("union project *") {
     val plan = (1 to 100)
@@ -149,91 +152,6 @@ class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
       caseInsensitiveAnalyzer.execute(UnresolvedRelation(Seq("TaBlE"), None)) === testRelation)
   }
 
-  def errorTest(
-      name: String,
-      plan: LogicalPlan,
-      errorMessages: Seq[String],
-      caseSensitive: Boolean = true): Unit = {
-    test(name) {
-      val error = intercept[AnalysisException] {
-        if (caseSensitive) {
-          caseSensitiveAnalyze(plan)
-        } else {
-          caseInsensitiveAnalyze(plan)
-        }
-      }
-
-      errorMessages.foreach(m => assert(error.getMessage.toLowerCase contains m.toLowerCase))
-    }
-  }
-
-  errorTest(
-    "unresolved window function",
-    testRelation2.select(
-      WindowExpression(
-        UnresolvedWindowFunction(
-          "lead",
-          UnresolvedAttribute("c") :: Nil),
-        WindowSpecDefinition(
-          UnresolvedAttribute("a") :: Nil,
-          SortOrder(UnresolvedAttribute("b"), Ascending) :: Nil,
-          UnspecifiedFrame)).as('window)),
-      "lead" :: "window functions currently requires a HiveContext" :: Nil)
-
-  errorTest(
-    "too many generators",
-    listRelation.select(Explode('list).as('a), Explode('list).as('b)),
-    "only one generator" :: "explode" :: Nil)
-
-  errorTest(
-    "unresolved attributes",
-    testRelation.select('abcd),
-    "cannot resolve" :: "abcd" :: Nil)
-
-  errorTest(
-    "bad casts",
-    testRelation.select(Literal(1).cast(BinaryType).as('badCast)),
-    "cannot cast" :: Literal(1).dataType.simpleString :: BinaryType.simpleString :: Nil)
-
-  errorTest(
-    "non-boolean filters",
-    testRelation.where(Literal(1)),
-    "filter" :: "'1'" :: "not a boolean" :: Literal(1).dataType.simpleString :: Nil)
-
-  errorTest(
-    "missing group by",
-    testRelation2.groupBy('a)('b),
-    "'b'" :: "group by" :: Nil
-  )
-
-  errorTest(
-    "ambiguous field",
-    nestedRelation.select($"top.duplicateField"),
-    "Ambiguous reference to fields" :: "duplicateField" :: Nil,
-    caseSensitive = false)
-
-  errorTest(
-    "ambiguous field due to case insensitivity",
-    nestedRelation.select($"top.differentCase"),
-    "Ambiguous reference to fields" :: "differentCase" :: "differentcase" :: Nil,
-    caseSensitive = false)
-
-  errorTest(
-    "missing field",
-    nestedRelation2.select($"top.c"),
-    "No such struct field" :: "aField" :: "bField" :: "cField" :: Nil,
-    caseSensitive = false)
-
-  case class UnresolvedTestPlan() extends LeafNode {
-    override lazy val resolved = false
-    override def output: Seq[Attribute] = Nil
-  }
-
-  errorTest(
-    "catch all unresolved plan",
-    UnresolvedTestPlan(),
-    "unresolved" :: Nil)
-
 
   test("divide should be casted into fractional types") {
     val testRelation2 = LocalRelation(
@@ -258,22 +176,4 @@ class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
     assert(pl(3).dataType == DecimalType.Unlimited)
     assert(pl(4).dataType == DoubleType)
   }
-
-  test("SPARK-6452 regression test") {
-    // CheckAnalysis should throw AnalysisException when Aggregate contains missing attribute(s)
-    val plan =
-      Aggregate(
-        Nil,
-        Alias(Sum(AttributeReference("a", IntegerType)(exprId = ExprId(1))), "b")() :: Nil,
-        LocalRelation(
-          AttributeReference("a", IntegerType)(exprId = ExprId(2))))
-
-    assert(plan.resolved)
-
-    val message = intercept[AnalysisException] {
-      caseSensitiveAnalyze(plan)
-    }.getMessage
-
-    assert(message.contains("resolved attribute(s) a#1 missing from a#2"))
-  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 93db33d44eb25..6e3aa0eebeb15 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -77,6 +77,14 @@ class HiveTypeCoercionSuite extends PlanTest {
     shouldCast(DecimalType(10, 2), TypeCollection(IntegerType, DecimalType), DecimalType(10, 2))
     shouldCast(DecimalType(10, 2), TypeCollection(DecimalType, IntegerType), DecimalType(10, 2))
     shouldCast(IntegerType, TypeCollection(DecimalType(10, 2), StringType), DecimalType(10, 2))
+
+    shouldCast(StringType, NumericType, DoubleType)
+
+    // NumericType should not be changed when function accepts any of them.
+    Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType,
+      DecimalType.Unlimited, DecimalType(10, 2)).foreach { tpe =>
+      shouldCast(tpe, NumericType, tpe)
+    }
   }
 
   test("ineligible implicit type cast") {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 439d8cab5f257..bbc39b892b79e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -359,7 +359,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
     hiveconf.set(key, value)
   }
 
-  private[sql] override def setConf[T](entry: SQLConfEntry[T], value: T): Unit = {
+  override private[sql] def setConf[T](entry: SQLConfEntry[T], value: T): Unit = {
     setConf(entry.key, entry.stringConverter(value))
   }
 

From a240bf3b44b15d0da5182d6ebec281dbdc5439e8 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 8 Jul 2015 22:08:50 -0700
Subject: [PATCH 076/549] Closes #7310.


From 3dab0da42940a46f0c4aa4853bdb5c64c4cb2613 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Wed, 8 Jul 2015 22:09:12 -0700
Subject: [PATCH 077/549] [SPARK-8928] [SQL] Makes CatalystSchemaConverter
 sticking to 1.4.x- when handling Parquet LISTs in compatible mode

This PR is based on #7209 authored by Sephiroth-Lin.

Author: Weizhong Lin <linweizhong@huawei.com>

Closes #7304 from liancheng/spark-8928 and squashes the following commits:

75267fe [Cheng Lian] Makes CatalystSchemaConverter sticking to 1.4.x- when handling LISTs in compatible mode
---
 .../spark/sql/parquet/CatalystSchemaConverter.scala    |  6 ++++--
 .../apache/spark/sql/parquet/ParquetSchemaSuite.scala  | 10 +++++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index de3a72d8146c5..1ea6926af6d5b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -461,7 +461,8 @@ private[parquet] class CatalystSchemaConverter(
           field.name,
           Types
             .buildGroup(REPEATED)
-            .addField(convertField(StructField("element", elementType, nullable)))
+            // "array_element" is the name chosen by parquet-hive (1.7.0 and prior version)
+            .addField(convertField(StructField("array_element", elementType, nullable)))
             .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME))
 
       // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level
@@ -474,7 +475,8 @@ private[parquet] class CatalystSchemaConverter(
         ConversionPatterns.listType(
           repetition,
           field.name,
-          convertField(StructField("element", elementType, nullable), REPEATED))
+          // "array" is the name chosen by parquet-avro (1.7.0 and prior version)
+          convertField(StructField("array", elementType, nullable), REPEATED))
 
       // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by
       // MAP_KEY_VALUE.  This is covered by `convertGroupField(field: GroupType): DataType`.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
index 35d3c33f99a06..fa629392674bd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
@@ -174,7 +174,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
     """
       |message root {
       |  optional group _1 (LIST) {
-      |    repeated int32 element;
+      |    repeated int32 array;
       |  }
       |}
     """.stripMargin)
@@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
       |message root {
       |  optional group _1 (LIST) {
       |    repeated group bag {
-      |      optional int32 element;
+      |      optional int32 array_element;
       |    }
       |  }
       |}
@@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
       |        optional binary _1 (UTF8);
       |        optional group _2 (LIST) {
       |          repeated group bag {
-      |            optional group element {
+      |            optional group array_element {
       |              required int32 _1;
       |              required double _2;
       |            }
@@ -616,7 +616,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     """message root {
       |  optional group f1 (LIST) {
       |    repeated group bag {
-      |      optional int32 element;
+      |      optional int32 array_element;
       |    }
       |  }
       |}
@@ -648,7 +648,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (LIST) {
-      |    repeated int32 element;
+      |    repeated int32 array;
       |  }
       |}
     """.stripMargin)

From c056484c0741e2a03d4a916538e1b9e3e65e71c3 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Wed, 8 Jul 2015 22:14:38 -0700
Subject: [PATCH 078/549] Revert "[SPARK-8928] [SQL] Makes
 CatalystSchemaConverter sticking to 1.4.x- when handling Parquet LISTs in
 compatible mode"

This reverts commit 3dab0da42940a46f0c4aa4853bdb5c64c4cb2613.
---
 .../spark/sql/parquet/CatalystSchemaConverter.scala    |  6 ++----
 .../apache/spark/sql/parquet/ParquetSchemaSuite.scala  | 10 +++++-----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index 1ea6926af6d5b..de3a72d8146c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -461,8 +461,7 @@ private[parquet] class CatalystSchemaConverter(
           field.name,
           Types
             .buildGroup(REPEATED)
-            // "array_element" is the name chosen by parquet-hive (1.7.0 and prior version)
-            .addField(convertField(StructField("array_element", elementType, nullable)))
+            .addField(convertField(StructField("element", elementType, nullable)))
             .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME))
 
       // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level
@@ -475,8 +474,7 @@ private[parquet] class CatalystSchemaConverter(
         ConversionPatterns.listType(
           repetition,
           field.name,
-          // "array" is the name chosen by parquet-avro (1.7.0 and prior version)
-          convertField(StructField("array", elementType, nullable), REPEATED))
+          convertField(StructField("element", elementType, nullable), REPEATED))
 
       // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by
       // MAP_KEY_VALUE.  This is covered by `convertGroupField(field: GroupType): DataType`.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
index fa629392674bd..35d3c33f99a06 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
@@ -174,7 +174,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
     """
       |message root {
       |  optional group _1 (LIST) {
-      |    repeated int32 array;
+      |    repeated int32 element;
       |  }
       |}
     """.stripMargin)
@@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
       |message root {
       |  optional group _1 (LIST) {
       |    repeated group bag {
-      |      optional int32 array_element;
+      |      optional int32 element;
       |    }
       |  }
       |}
@@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
       |        optional binary _1 (UTF8);
       |        optional group _2 (LIST) {
       |          repeated group bag {
-      |            optional group array_element {
+      |            optional group element {
       |              required int32 _1;
       |              required double _2;
       |            }
@@ -616,7 +616,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     """message root {
       |  optional group f1 (LIST) {
       |    repeated group bag {
-      |      optional int32 array_element;
+      |      optional int32 element;
       |    }
       |  }
       |}
@@ -648,7 +648,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (LIST) {
-      |    repeated int32 array;
+      |    repeated int32 element;
       |  }
       |}
     """.stripMargin)

From 851e247caad0977cfd4998254d9602624e06539f Mon Sep 17 00:00:00 2001
From: Weizhong Lin <linweizhong@huawei.com>
Date: Wed, 8 Jul 2015 22:18:39 -0700
Subject: [PATCH 079/549] [SPARK-8928] [SQL] Makes CatalystSchemaConverter
 sticking to 1.4.x- when handling Parquet LISTs in compatible mode

This PR is based on #7209 authored by Sephiroth-Lin.

Author: Weizhong Lin <linweizhong@huawei.com>

Closes #7314 from liancheng/spark-8928 and squashes the following commits:

75267fe [Cheng Lian] Makes CatalystSchemaConverter sticking to 1.4.x- when handling LISTs in compatible mode
---
 .../spark/sql/parquet/CatalystSchemaConverter.scala    |  6 ++++--
 .../apache/spark/sql/parquet/ParquetSchemaSuite.scala  | 10 +++++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index de3a72d8146c5..1ea6926af6d5b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -461,7 +461,8 @@ private[parquet] class CatalystSchemaConverter(
           field.name,
           Types
             .buildGroup(REPEATED)
-            .addField(convertField(StructField("element", elementType, nullable)))
+            // "array_element" is the name chosen by parquet-hive (1.7.0 and prior version)
+            .addField(convertField(StructField("array_element", elementType, nullable)))
             .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME))
 
       // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level
@@ -474,7 +475,8 @@ private[parquet] class CatalystSchemaConverter(
         ConversionPatterns.listType(
           repetition,
           field.name,
-          convertField(StructField("element", elementType, nullable), REPEATED))
+          // "array" is the name chosen by parquet-avro (1.7.0 and prior version)
+          convertField(StructField("array", elementType, nullable), REPEATED))
 
       // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by
       // MAP_KEY_VALUE.  This is covered by `convertGroupField(field: GroupType): DataType`.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
index 35d3c33f99a06..fa629392674bd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
@@ -174,7 +174,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
     """
       |message root {
       |  optional group _1 (LIST) {
-      |    repeated int32 element;
+      |    repeated int32 array;
       |  }
       |}
     """.stripMargin)
@@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
       |message root {
       |  optional group _1 (LIST) {
       |    repeated group bag {
-      |      optional int32 element;
+      |      optional int32 array_element;
       |    }
       |  }
       |}
@@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
       |        optional binary _1 (UTF8);
       |        optional group _2 (LIST) {
       |          repeated group bag {
-      |            optional group element {
+      |            optional group array_element {
       |              required int32 _1;
       |              required double _2;
       |            }
@@ -616,7 +616,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     """message root {
       |  optional group f1 (LIST) {
       |    repeated group bag {
-      |      optional int32 element;
+      |      optional int32 array_element;
       |    }
       |  }
       |}
@@ -648,7 +648,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (LIST) {
-      |    repeated int32 element;
+      |    repeated int32 array;
       |  }
       |}
     """.stripMargin)

From 09cb0d9c2dcb83818ced22ff9bd6a51688ea7ffe Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Thu, 9 Jul 2015 00:26:25 -0700
Subject: [PATCH 080/549] [SPARK-8942][SQL] use double not decimal when cast
 double and float to timestamp

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7312 from cloud-fan/minor and squashes the following commits:

a4589fa [Wenchen Fan] use double not decimal when cast double and float to timestamp
---
 .../spark/sql/catalyst/expressions/Cast.scala  | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 567feca7136f9..7f2383dedc035 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -192,23 +192,18 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
       buildCast[Decimal](_, d => decimalToTimestamp(d))
     // TimestampWritable.doubleToTimestamp
     case DoubleType =>
-      buildCast[Double](_, d => try {
-        decimalToTimestamp(Decimal(d))
-      } catch {
-        case _: NumberFormatException => null
-      })
+      buildCast[Double](_, d => doubleToTimestamp(d))
     // TimestampWritable.floatToTimestamp
     case FloatType =>
-      buildCast[Float](_, f => try {
-        decimalToTimestamp(Decimal(f))
-      } catch {
-        case _: NumberFormatException => null
-      })
+      buildCast[Float](_, f => doubleToTimestamp(f.toDouble))
   }
 
   private[this] def decimalToTimestamp(d: Decimal): Long = {
     (d.toBigDecimal * 1000000L).longValue()
   }
+  private[this] def doubleToTimestamp(d: Double): Any = {
+    if (d.isNaN || d.isInfinite) null else (d * 1000000L).toLong
+  }
 
   // converting milliseconds to us
   private[this] def longToTimestamp(t: Long): Long = t * 1000L
@@ -396,8 +391,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     buildCast[InternalRow](_, row => {
       var i = 0
       while (i < row.length) {
-        val v = row(i)
-        newRow.update(i, if (v == null) null else casts(i)(v))
+        newRow.update(i, if (row.isNullAt(i)) null else casts(i)(row(i)))
         i += 1
       }
       newRow.copy()

From f88b12537ee81d914ef7c51a08f80cb28d93c8ed Mon Sep 17 00:00:00 2001
From: lewuathe <lewuathe@me.com>
Date: Thu, 9 Jul 2015 08:16:26 -0700
Subject: [PATCH 081/549] [SPARK-6266] [MLLIB] PySpark SparseVector missing doc
 for size, indices, values

Write missing pydocs in `SparseVector` attributes.

Author: lewuathe <lewuathe@me.com>

Closes #7290 from Lewuathe/SPARK-6266 and squashes the following commits:

51d9895 [lewuathe] Update docs
0480d35 [lewuathe] Merge branch 'master' into SPARK-6266
ba42cf3 [lewuathe] [SPARK-6266] PySpark SparseVector missing doc for size, indices, values
---
 python/pyspark/mllib/linalg.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 51ac198305711..040886f71775b 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -445,8 +445,10 @@ def __init__(self, size, *args):
         values (sorted by index).
 
         :param size: Size of the vector.
-        :param args: Non-zero entries, as a dictionary, list of tupes,
-               or two sorted lists containing indices and values.
+        :param args: Active entries, as a dictionary {index: value, ...},
+          a list of tuples [(index, value), ...], or a list of strictly i
+          ncreasing indices and a list of corresponding values [index, ...],
+          [value, ...]. Inactive entries are treated as zeros.
 
         >>> SparseVector(4, {1: 1.0, 3: 5.5})
         SparseVector(4, {1: 1.0, 3: 5.5})
@@ -456,6 +458,7 @@ def __init__(self, size, *args):
         SparseVector(4, {1: 1.0, 3: 5.5})
         """
         self.size = int(size)
+        """ Size of the vector. """
         assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments"
         if len(args) == 1:
             pairs = args[0]
@@ -463,7 +466,9 @@ def __init__(self, size, *args):
                 pairs = pairs.items()
             pairs = sorted(pairs)
             self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
+            """ A list of indices corresponding to active entries. """
             self.values = np.array([p[1] for p in pairs], dtype=np.float64)
+            """ A list of values corresponding to active entries. """
         else:
             if isinstance(args[0], bytes):
                 assert isinstance(args[1], bytes), "values should be string too"

From 23448a9e988a1b92bd05ee8c6c1a096c83375a12 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 9 Jul 2015 09:20:16 -0700
Subject: [PATCH 082/549] [SPARK-8931] [SQL] Fallback to interpreted evaluation
 if failed to compile in codegen

Exception will not be catched during tests.

cc marmbrus rxin

Author: Davies Liu <davies@databricks.com>

Closes #7309 from davies/fallback and squashes the following commits:

969a612 [Davies Liu] throw exception during tests
f844f77 [Davies Liu] fallback
a3091bc [Davies Liu] Merge branch 'master' of github.com:apache/spark into fallback
364a0d6 [Davies Liu] fallback to interpret mode if failed to compile
---
 .../spark/sql/execution/SparkPlan.scala       | 51 +++++++++++++++++--
 .../apache/spark/sql/sources/commands.scala   | 13 ++++-
 2 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index ca53186383237..4d7d8626a0ecc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -153,12 +153,24 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
     buf.toArray.map(converter(_).asInstanceOf[Row])
   }
 
+  private[this] def isTesting: Boolean = sys.props.contains("spark.testing")
+
   protected def newProjection(
       expressions: Seq[Expression], inputSchema: Seq[Attribute]): Projection = {
     log.debug(
       s"Creating Projection: $expressions, inputSchema: $inputSchema, codegen:$codegenEnabled")
     if (codegenEnabled) {
-      GenerateProjection.generate(expressions, inputSchema)
+      try {
+        GenerateProjection.generate(expressions, inputSchema)
+      } catch {
+        case e: Exception =>
+          if (isTesting) {
+            throw e
+          } else {
+            log.error("Failed to generate projection, fallback to interpret", e)
+            new InterpretedProjection(expressions, inputSchema)
+          }
+      }
     } else {
       new InterpretedProjection(expressions, inputSchema)
     }
@@ -170,17 +182,36 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
     log.debug(
       s"Creating MutableProj: $expressions, inputSchema: $inputSchema, codegen:$codegenEnabled")
     if(codegenEnabled) {
-      GenerateMutableProjection.generate(expressions, inputSchema)
+      try {
+        GenerateMutableProjection.generate(expressions, inputSchema)
+      } catch {
+        case e: Exception =>
+          if (isTesting) {
+            throw e
+          } else {
+            log.error("Failed to generate mutable projection, fallback to interpreted", e)
+            () => new InterpretedMutableProjection(expressions, inputSchema)
+          }
+      }
     } else {
       () => new InterpretedMutableProjection(expressions, inputSchema)
     }
   }
 
-
   protected def newPredicate(
       expression: Expression, inputSchema: Seq[Attribute]): (InternalRow) => Boolean = {
     if (codegenEnabled) {
-      GeneratePredicate.generate(expression, inputSchema)
+      try {
+        GeneratePredicate.generate(expression, inputSchema)
+      } catch {
+        case e: Exception =>
+          if (isTesting) {
+            throw e
+          } else {
+            log.error("Failed to generate predicate, fallback to interpreted", e)
+            InterpretedPredicate.create(expression, inputSchema)
+          }
+      }
     } else {
       InterpretedPredicate.create(expression, inputSchema)
     }
@@ -190,7 +221,17 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
       order: Seq[SortOrder],
       inputSchema: Seq[Attribute]): Ordering[InternalRow] = {
     if (codegenEnabled) {
-      GenerateOrdering.generate(order, inputSchema)
+      try {
+        GenerateOrdering.generate(order, inputSchema)
+      } catch {
+        case e: Exception =>
+          if (isTesting) {
+            throw e
+          } else {
+            log.error("Failed to generate ordering, fallback to interpreted", e)
+            new RowOrdering(order, inputSchema)
+          }
+      }
     } else {
       new RowOrdering(order, inputSchema)
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index ecbc889770625..9189d176111d5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -276,7 +276,18 @@ private[sql] case class InsertIntoHadoopFsRelation(
     log.debug(
       s"Creating Projection: $expressions, inputSchema: $inputSchema, codegen:$codegenEnabled")
     if (codegenEnabled) {
-      GenerateProjection.generate(expressions, inputSchema)
+
+      try {
+        GenerateProjection.generate(expressions, inputSchema)
+      } catch {
+        case e: Exception =>
+          if (sys.props.contains("spark.testing")) {
+            throw e
+          } else {
+            log.error("failed to generate projection, fallback to interpreted", e)
+            new InterpretedProjection(expressions, inputSchema)
+          }
+      }
     } else {
       new InterpretedProjection(expressions, inputSchema)
     }

From a1964e9d902bb31f001893da8bc81f6dce08c908 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Thu, 9 Jul 2015 09:22:24 -0700
Subject: [PATCH 083/549] [SPARK-8830] [SQL] native levenshtein distance

Jira: https://issues.apache.org/jira/browse/SPARK-8830

rxin and HuJiayin can you have a look on it.

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7236 from tarekauel/native-levenshtein-distance and squashes the following commits:

ee4c4de [Tarek Auel] [SPARK-8830] implemented improvement proposals
c252e71 [Tarek Auel] [SPARK-8830] removed chartAt; use unsafe method for byte array comparison
ddf2222 [Tarek Auel] Merge branch 'master' into native-levenshtein-distance
179920a [Tarek Auel] [SPARK-8830] added description
5e9ed54 [Tarek Auel] [SPARK-8830] removed StringUtils import
dce4308 [Tarek Auel] [SPARK-8830] native levenshtein distance
---
 .../expressions/stringOperations.scala        |  9 ++-
 .../expressions/StringFunctionsSuite.scala    |  5 ++
 .../apache/spark/unsafe/types/UTF8String.java | 66 ++++++++++++++++++-
 .../spark/unsafe/types/UTF8StringSuite.java   | 24 +++++++
 4 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 47fc7cdaa826c..57f436485becf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -284,13 +284,12 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
 
   override def dataType: DataType = IntegerType
 
-  protected override def nullSafeEval(input1: Any, input2: Any): Any =
-    StringUtils.getLevenshteinDistance(input1.toString, input2.toString)
+  protected override def nullSafeEval(leftValue: Any, rightValue: Any): Any =
+    leftValue.asInstanceOf[UTF8String].levenshteinDistance(rightValue.asInstanceOf[UTF8String])
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val stringUtils = classOf[StringUtils].getName
-    defineCodeGen(ctx, ev, (left, right) =>
-      s"$stringUtils.getLevenshteinDistance($left.toString(), $right.toString())")
+    nullSafeCodeGen(ctx, ev, (left, right) =>
+      s"${ev.primitive} = $left.levenshteinDistance($right);")
   }
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
index 1efbe1a245e83..69bef1c63e9dc 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -282,5 +282,10 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Levenshtein(Literal("abc"), Literal("abc")), 0)
     checkEvaluation(Levenshtein(Literal("kitten"), Literal("sitting")), 3)
     checkEvaluation(Levenshtein(Literal("frog"), Literal("fog")), 1)
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
+    checkEvaluation(Levenshtein(Literal("千世"), Literal("fog")), 3)
+    checkEvaluation(Levenshtein(Literal("世界千世"), Literal("大a界b")), 4)
+    // scalastyle:on
   }
 }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index d2a25096a5e7a..847d80ad583f6 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -99,8 +99,6 @@ public int numBytes() {
 
   /**
    * Returns the number of code points in it.
-   *
-   * This is only used by Substring() when `start` is negative.
    */
   public int numChars() {
     int len = 0;
@@ -254,6 +252,70 @@ public boolean equals(final Object other) {
     }
   }
 
+  /**
+   * Levenshtein distance is a metric for measuring the distance of two strings. The distance is
+   * defined by the minimum number of single-character edits (i.e. insertions, deletions or
+   * substitutions) that are required to change one of the strings into the other.
+   */
+  public int levenshteinDistance(UTF8String other) {
+    // Implementation adopted from org.apache.common.lang3.StringUtils.getLevenshteinDistance
+
+    int n = numChars();
+    int m = other.numChars();
+
+    if (n == 0) {
+      return m;
+    } else if (m == 0) {
+      return n;
+    }
+
+    UTF8String s, t;
+
+    if (n <= m) {
+      s = this;
+      t = other;
+    } else {
+      s = other;
+      t = this;
+      int swap;
+      swap = n;
+      n = m;
+      m = swap;
+    }
+
+    int p[] = new int[n + 1];
+    int d[] = new int[n + 1];
+    int swap[];
+
+    int i, i_bytes, j, j_bytes, num_bytes_j, cost;
+
+    for (i = 0; i <= n; i++) {
+      p[i] = i;
+    }
+
+    for (j = 0, j_bytes = 0; j < m; j_bytes += num_bytes_j, j++) {
+      num_bytes_j = numBytesForFirstByte(t.getByte(j_bytes));
+      d[0] = j + 1;
+
+      for (i = 0, i_bytes = 0; i < n; i_bytes += numBytesForFirstByte(s.getByte(i_bytes)), i++) {
+        if (s.getByte(i_bytes) != t.getByte(j_bytes) ||
+              num_bytes_j != numBytesForFirstByte(s.getByte(i_bytes))) {
+          cost = 1;
+        } else {
+          cost = (ByteArrayMethods.arrayEquals(t.base, t.offset + j_bytes, s.base,
+              s.offset + i_bytes, num_bytes_j)) ? 0 : 1;
+        }
+        d[i + 1] = Math.min(Math.min(d[i] + 1, p[i + 1] + 1), p[i] + cost);
+      }
+
+      swap = p;
+      p = d;
+      d = swap;
+    }
+
+    return p[n];
+  }
+
   @Override
   public int hashCode() {
     int result = 1;
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 8ec69ebac8b37..fb463ba17f50b 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -128,4 +128,28 @@ public void substring() {
     assertEquals(fromString("数据砖头").substring(3, 5), fromString("头"));
     assertEquals(fromString("ߵ梷").substring(0, 2), fromString("ߵ梷"));
   }
+  
+  @Test
+  public void levenshteinDistance() {
+    assertEquals(
+        UTF8String.fromString("").levenshteinDistance(UTF8String.fromString("")), 0);
+    assertEquals(
+        UTF8String.fromString("").levenshteinDistance(UTF8String.fromString("a")), 1);
+    assertEquals(
+        UTF8String.fromString("aaapppp").levenshteinDistance(UTF8String.fromString("")), 7);
+    assertEquals(
+        UTF8String.fromString("frog").levenshteinDistance(UTF8String.fromString("fog")), 1);
+    assertEquals(
+        UTF8String.fromString("fly").levenshteinDistance(UTF8String.fromString("ant")),3);
+    assertEquals(
+        UTF8String.fromString("elephant").levenshteinDistance(UTF8String.fromString("hippo")), 7);
+    assertEquals(
+        UTF8String.fromString("hippo").levenshteinDistance(UTF8String.fromString("elephant")), 7);
+    assertEquals(
+        UTF8String.fromString("hippo").levenshteinDistance(UTF8String.fromString("zzzzzzzz")), 8);
+    assertEquals(
+        UTF8String.fromString("hello").levenshteinDistance(UTF8String.fromString("hallo")),1);
+    assertEquals(
+        UTF8String.fromString("世界千世").levenshteinDistance(UTF8String.fromString("千a世b")),4);
+  }
 }

From 59cc38944fe5c1dffc6551775bd939e2ac66c65e Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Thu, 9 Jul 2015 09:57:12 -0700
Subject: [PATCH 084/549] [SPARK-8940] [SPARKR] Don't overwrite given schema in
 createDataFrame

JIRA: https://issues.apache.org/jira/browse/SPARK-8940

The given `schema` parameter will be overwritten in `createDataFrame` now. If it is not null, we shouldn't overwrite it.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7311 from viirya/df_not_overwrite_schema and squashes the following commits:

2385139 [Liang-Chi Hsieh] Don't overwrite given schema if it is not null.
---
 R/pkg/R/SQLContext.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index 9a743a3411533..30978bb50d339 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -86,7 +86,9 @@ infer_type <- function(x) {
 createDataFrame <- function(sqlContext, data, schema = NULL, samplingRatio = 1.0) {
   if (is.data.frame(data)) {
       # get the names of columns, they will be put into RDD
-      schema <- names(data)
+      if (is.null(schema)) {
+        schema <- names(data)
+      }
       n <- nrow(data)
       m <- ncol(data)
       # get rid of factor type

From e204d22bb70f28b1cc090ab60f12078479be4ae0 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 9 Jul 2015 10:01:01 -0700
Subject: [PATCH 085/549] [SPARK-8948][SQL] Remove ExtractValueWithOrdinal
 abstract class

Also added more documentation for the file.

Author: Reynold Xin <rxin@databricks.com>

Closes #7316 from rxin/extract-value and squashes the following commits:

069cb7e [Reynold Xin] Removed ExtractValueWithOrdinal.
621b705 [Reynold Xin] Reverted a line.
11ebd6c [Reynold Xin] [Minor][SQL] Improve documentation for complex type extractors.
---
 ...alue.scala => complexTypeExtractors.scala} | 54 ++++++++++++-------
 1 file changed, 34 insertions(+), 20 deletions(-)
 rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/{ExtractValue.scala => complexTypeExtractors.scala} (86%)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
similarity index 86%
rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
index 2b25ba03579ec..73cc930c45832 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExtractValue.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
@@ -25,6 +25,11 @@ import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types._
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// This file defines all the expressions to extract values out of complex types.
+// For example, getting a field out of an array, map, or struct.
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 
 object ExtractValue {
   /**
@@ -73,11 +78,10 @@ object ExtractValue {
     }
   }
 
-  def unapply(g: ExtractValue): Option[(Expression, Expression)] = {
-    g match {
-      case o: ExtractValueWithOrdinal => Some((o.child, o.ordinal))
-      case s: ExtractValueWithStruct => Some((s.child, null))
-    }
+  def unapply(g: ExtractValue): Option[(Expression, Expression)] = g match {
+    case o: GetArrayItem => Some((o.child, o.ordinal))
+    case o: GetMapValue => Some((o.child, o.key))
+    case s: ExtractValueWithStruct => Some((s.child, null))
   }
 
   /**
@@ -117,6 +121,8 @@ abstract class ExtractValueWithStruct extends UnaryExpression with ExtractValue
 
 /**
  * Returns the value of fields in the Struct `child`.
+ *
+ * No need to do type checking since it is handled by [[ExtractValue]].
  */
 case class GetStructField(child: Expression, field: StructField, ordinal: Int)
   extends ExtractValueWithStruct {
@@ -142,6 +148,8 @@ case class GetStructField(child: Expression, field: StructField, ordinal: Int)
 
 /**
  * Returns the array of value of fields in the Array of Struct `child`.
+ *
+ * No need to do type checking since it is handled by [[ExtractValue]].
  */
 case class GetArrayStructFields(
     child: Expression,
@@ -178,25 +186,21 @@ case class GetArrayStructFields(
   }
 }
 
-abstract class ExtractValueWithOrdinal extends BinaryExpression with ExtractValue {
-  self: Product =>
+/**
+ * Returns the field at `ordinal` in the Array `child`.
+ *
+ * No need to do type checking since it is handled by [[ExtractValue]].
+ */
+case class GetArrayItem(child: Expression, ordinal: Expression)
+  extends BinaryExpression with ExtractValue {
 
-  def ordinal: Expression
-  def child: Expression
+  override def toString: String = s"$child[$ordinal]"
 
   override def left: Expression = child
   override def right: Expression = ordinal
 
   /** `Null` is returned for invalid ordinals. */
   override def nullable: Boolean = true
-  override def toString: String = s"$child[$ordinal]"
-}
-
-/**
- * Returns the field at `ordinal` in the Array `child`
- */
-case class GetArrayItem(child: Expression, ordinal: Expression)
-  extends ExtractValueWithOrdinal {
 
   override def dataType: DataType = child.dataType.asInstanceOf[ArrayType].elementType
 
@@ -227,10 +231,20 @@ case class GetArrayItem(child: Expression, ordinal: Expression)
 }
 
 /**
- * Returns the value of key `ordinal` in Map `child`
+ * Returns the value of key `ordinal` in Map `child`.
+ *
+ * No need to do type checking since it is handled by [[ExtractValue]].
  */
-case class GetMapValue(child: Expression, ordinal: Expression)
-  extends ExtractValueWithOrdinal {
+case class GetMapValue(child: Expression, key: Expression)
+  extends BinaryExpression with ExtractValue {
+
+  override def toString: String = s"$child[$key]"
+
+  override def left: Expression = child
+  override def right: Expression = key
+
+  /** `Null` is returned for invalid ordinals. */
+  override def nullable: Boolean = true
 
   override def dataType: DataType = child.dataType.asInstanceOf[MapType].valueType
 

From a870a82fb6f57bb63bd6f1e95da944a30f67519a Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 9 Jul 2015 10:01:33 -0700
Subject: [PATCH 086/549] [SPARK-8926][SQL] Code review followup.

I merged https://github.com/apache/spark/pull/7303 so it unblocks another PR. This addresses my own code review comment for that PR.

Author: Reynold Xin <rxin@databricks.com>

Closes #7313 from rxin/adt and squashes the following commits:

7ade82b [Reynold Xin] Fixed unit tests.
f8d5533 [Reynold Xin] [SPARK-8926][SQL] Code review followup.
---
 .../catalyst/expressions/ExpectsInputTypes.scala |  4 ++--
 .../spark/sql/types/AbstractDataType.scala       | 16 ++++++++++++++++
 .../catalyst/analysis/AnalysisErrorSuite.scala   |  8 ++++----
 .../analysis/HiveTypeCoercionSuite.scala         |  1 +
 4 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
index 986cc09499d1f..3eb0eb195c80d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
@@ -39,8 +39,8 @@ trait ExpectsInputTypes { self: Expression =>
   override def checkInputDataTypes(): TypeCheckResult = {
     val mismatches = children.zip(inputTypes).zipWithIndex.collect {
       case ((child, expected), idx) if !expected.acceptsType(child.dataType) =>
-        s"Argument ${idx + 1} is expected to be of type ${expected.simpleString}, " +
-        s"however, ${child.prettyString} is of type ${child.dataType.simpleString}."
+        s"argument ${idx + 1} is expected to be of type ${expected.simpleString}, " +
+        s"however, '${child.prettyString}' is of type ${child.dataType.simpleString}."
     }
 
     if (mismatches.isEmpty) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
index ad75fa2e31d90..32f87440b4e37 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
@@ -36,12 +36,28 @@ private[sql] abstract class AbstractDataType {
   /**
    * Returns true if this data type is the same type as `other`.  This is different that equality
    * as equality will also consider data type parametrization, such as decimal precision.
+   *
+   * {{{
+   *   // this should return true
+   *   DecimalType.isSameType(DecimalType(10, 2))
+   *
+   *   // this should return false
+   *   NumericType.isSameType(DecimalType(10, 2))
+   * }}}
    */
   private[sql] def isSameType(other: DataType): Boolean
 
   /**
    * Returns true if `other` is an acceptable input type for a function that expectes this,
    * possibly abstract, DataType.
+   *
+   * {{{
+   *   // this should return true
+   *   DecimalType.isSameType(DecimalType(10, 2))
+   *
+   *   // this should return true as well
+   *   NumericType.acceptsType(DecimalType(10, 2))
+   * }}}
    */
   private[sql] def acceptsType(other: DataType): Boolean = isSameType(other)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
index 73236c3acbca2..9d0c69a2451d1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -58,7 +58,7 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter {
         }
       }
 
-      errorMessages.foreach(m => assert(error.getMessage.toLowerCase contains m.toLowerCase))
+      errorMessages.foreach(m => assert(error.getMessage.toLowerCase.contains(m.toLowerCase)))
     }
   }
 
@@ -68,21 +68,21 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter {
     "single invalid type, single arg",
     testRelation.select(TestFunction(dateLit :: Nil, IntegerType :: Nil).as('a)),
     "cannot resolve" :: "testfunction" :: "argument 1" :: "expected to be of type int" ::
-    "null is of type date" ::Nil)
+    "'null' is of type date" ::Nil)
 
   errorTest(
     "single invalid type, second arg",
     testRelation.select(
       TestFunction(dateLit :: dateLit :: Nil, DateType :: IntegerType :: Nil).as('a)),
     "cannot resolve" :: "testfunction" :: "argument 2" :: "expected to be of type int" ::
-    "null is of type date" ::Nil)
+    "'null' is of type date" ::Nil)
 
   errorTest(
     "multiple invalid type",
     testRelation.select(
       TestFunction(dateLit :: dateLit :: Nil, IntegerType :: IntegerType :: Nil).as('a)),
     "cannot resolve" :: "testfunction" :: "argument 1" :: "argument 2" ::
-    "expected to be of type int" :: "null is of type date" ::Nil)
+    "expected to be of type int" :: "'null' is of type date" ::Nil)
 
   errorTest(
     "unresolved window function",
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 6e3aa0eebeb15..acb9a433de903 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -79,6 +79,7 @@ class HiveTypeCoercionSuite extends PlanTest {
     shouldCast(IntegerType, TypeCollection(DecimalType(10, 2), StringType), DecimalType(10, 2))
 
     shouldCast(StringType, NumericType, DoubleType)
+    shouldCast(StringType, TypeCollection(NumericType, BinaryType), DoubleType)
 
     // NumericType should not be changed when function accepts any of them.
     Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType,

From f6c0bd5c3755b2f9bab633a5d478240fdaf1c593 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Thu, 9 Jul 2015 10:04:42 -0700
Subject: [PATCH 087/549] [SPARK-8938][SQL] Implement toString for Interval
 data type

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7315 from cloud-fan/toString and squashes the following commits:

4fc8d80 [Wenchen Fan] Implement toString for Interval data type
---
 .../apache/spark/sql/catalyst/SqlParser.scala | 24 ++++++--
 .../apache/spark/unsafe/types/Interval.java   | 42 +++++++++++++
 .../spark/unsafe/types/IntervalSuite.java     | 59 +++++++++++++++++++
 3 files changed, 119 insertions(+), 6 deletions(-)
 create mode 100644 unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index dedd8c8fa3620..d4ef04c2294a2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -353,22 +353,34 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     integral <~ intervalUnit("microsecond") ^^ { case num => num.toLong }
 
   protected lazy val millisecond: Parser[Long] =
-    integral <~ intervalUnit("millisecond") ^^ { case num => num.toLong * 1000 }
+    integral <~ intervalUnit("millisecond") ^^ {
+      case num => num.toLong * Interval.MICROS_PER_MILLI
+    }
 
   protected lazy val second: Parser[Long] =
-    integral <~ intervalUnit("second") ^^ { case num => num.toLong * 1000 * 1000 }
+    integral <~ intervalUnit("second") ^^ {
+      case num => num.toLong * Interval.MICROS_PER_SECOND
+    }
 
   protected lazy val minute: Parser[Long] =
-    integral <~ intervalUnit("minute") ^^ { case num => num.toLong * 1000 * 1000 * 60 }
+    integral <~ intervalUnit("minute") ^^ {
+      case num => num.toLong * Interval.MICROS_PER_MINUTE
+    }
 
   protected lazy val hour: Parser[Long] =
-    integral <~ intervalUnit("hour") ^^ { case num => num.toLong * 1000 * 1000 * 3600 }
+    integral <~ intervalUnit("hour") ^^ {
+      case num => num.toLong * Interval.MICROS_PER_HOUR
+    }
 
   protected lazy val day: Parser[Long] =
-    integral <~ intervalUnit("day") ^^ { case num => num.toLong * 1000 * 1000 * 3600 * 24 }
+    integral <~ intervalUnit("day") ^^ {
+      case num => num.toLong * Interval.MICROS_PER_DAY
+    }
 
   protected lazy val week: Parser[Long] =
-    integral <~ intervalUnit("week") ^^ { case num => num.toLong * 1000 * 1000 * 3600 * 24 * 7 }
+    integral <~ intervalUnit("week") ^^ {
+      case num => num.toLong * Interval.MICROS_PER_WEEK
+    }
 
   protected lazy val intervalLiteral: Parser[Literal] =
     INTERVAL ~> year.? ~ month.? ~ week.? ~ day.? ~ hour.? ~ minute.? ~ second.? ~
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
index 3eb67ede062d9..0af982d4844c2 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
@@ -23,6 +23,13 @@
  * The internal representation of interval type.
  */
 public final class Interval implements Serializable {
+  public static final long MICROS_PER_MILLI = 1000L;
+  public static final long MICROS_PER_SECOND = MICROS_PER_MILLI * 1000;
+  public static final long MICROS_PER_MINUTE = MICROS_PER_SECOND * 60;
+  public static final long MICROS_PER_HOUR = MICROS_PER_MINUTE * 60;
+  public static final long MICROS_PER_DAY = MICROS_PER_HOUR * 24;
+  public static final long MICROS_PER_WEEK = MICROS_PER_DAY * 7;
+
   public final int months;
   public final long microseconds;
 
@@ -44,4 +51,39 @@ public boolean equals(Object other) {
   public int hashCode() {
     return 31 * months + (int) microseconds;
   }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder("interval");
+
+    if (months != 0) {
+      appendUnit(sb, months / 12, "year");
+      appendUnit(sb, months % 12, "month");
+    }
+
+    if (microseconds != 0) {
+      long rest = microseconds;
+      appendUnit(sb, rest / MICROS_PER_WEEK, "week");
+      rest %= MICROS_PER_WEEK;
+      appendUnit(sb, rest / MICROS_PER_DAY, "day");
+      rest %= MICROS_PER_DAY;
+      appendUnit(sb, rest / MICROS_PER_HOUR, "hour");
+      rest %= MICROS_PER_HOUR;
+      appendUnit(sb, rest / MICROS_PER_MINUTE, "minute");
+      rest %= MICROS_PER_MINUTE;
+      appendUnit(sb, rest / MICROS_PER_SECOND, "second");
+      rest %= MICROS_PER_SECOND;
+      appendUnit(sb, rest / MICROS_PER_MILLI, "millisecond");
+      rest %= MICROS_PER_MILLI;
+      appendUnit(sb, rest, "microsecond");
+    }
+
+    return sb.toString();
+  }
+
+  private void appendUnit(StringBuilder sb, long value, String unit) {
+    if (value != 0) {
+      sb.append(" " + value + " " + unit + "s");
+    }
+  }
 }
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
new file mode 100644
index 0000000000000..0f4f38b2b03be
--- /dev/null
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
@@ -0,0 +1,59 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.unsafe.types;
+
+import org.junit.Test;
+
+import static junit.framework.Assert.*;
+import static org.apache.spark.unsafe.types.Interval.*;
+
+public class IntervalSuite {
+
+  @Test
+  public void equalsTest() {
+    Interval i1 = new Interval(3, 123);
+    Interval i2 = new Interval(3, 321);
+    Interval i3 = new Interval(1, 123);
+    Interval i4 = new Interval(3, 123);
+
+    assertNotSame(i1, i2);
+    assertNotSame(i1, i3);
+    assertNotSame(i2, i3);
+    assertEquals(i1, i4);
+  }
+
+  @Test
+  public void toStringTest() {
+    Interval i;
+
+    i = new Interval(34, 0);
+    assertEquals(i.toString(), "interval 2 years 10 months");
+
+    i = new Interval(-34, 0);
+    assertEquals(i.toString(), "interval -2 years -10 months");
+
+    i = new Interval(0, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
+    assertEquals(i.toString(), "interval 3 weeks 13 hours 123 microseconds");
+
+    i = new Interval(0, -3 * MICROS_PER_WEEK - 13 * MICROS_PER_HOUR - 123);
+    assertEquals(i.toString(), "interval -3 weeks -13 hours -123 microseconds");
+
+    i = new Interval(34, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
+    assertEquals(i.toString(), "interval 2 years 10 months 3 weeks 13 hours 123 microseconds");
+  }
+}

From c59e268d17cf10e46dbdbe760e2a7580a6364692 Mon Sep 17 00:00:00 2001
From: JPark <JPark@JPark.me>
Date: Thu, 9 Jul 2015 10:23:36 -0700
Subject: [PATCH 088/549] [SPARK-8863] [EC2] Check aws access key from aws
 credentials if there is no boto config

'spark_ec2.py' use boto to control ec2.
And boto can support '~/.aws/credentials' which is AWS CLI default configuration file.

We can check this information from ref of boto.

"A boto config file is a text file formatted like an .ini configuration file that specifies values for options that control the behavior of the boto library. In Unix/Linux systems, on startup, the boto library looks for configuration files in the following locations and in the following order:
/etc/boto.cfg - for site-wide settings that all users on this machine will use
(if profile is given) ~/.aws/credentials - for credentials shared between SDKs
(if profile is given) ~/.boto - for user-specific settings
~/.aws/credentials - for credentials shared between SDKs
~/.boto - for user-specific settings"

* ref of boto: http://boto.readthedocs.org/en/latest/boto_config_tut.html
* ref of aws cli : http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html

However 'spark_ec2.py' only check boto config & environment variable even if there is '~/.aws/credentials', and 'spark_ec2.py' is terminated.

So I changed to check '~/.aws/credentials'.

cc rxin

Jira : https://issues.apache.org/jira/browse/SPARK-8863

Author: JPark <JPark@JPark.me>

Closes #7252 from JuhongPark/master and squashes the following commits:

23c5792 [JPark] Check aws access key from aws credentials if there is no boto config
---
 ec2/spark_ec2.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index dd0c12d25980b..ae4f2ecc5bde7 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -325,14 +325,16 @@ def parse_args():
     home_dir = os.getenv('HOME')
     if home_dir is None or not os.path.isfile(home_dir + '/.boto'):
         if not os.path.isfile('/etc/boto.cfg'):
-            if os.getenv('AWS_ACCESS_KEY_ID') is None:
-                print("ERROR: The environment variable AWS_ACCESS_KEY_ID must be set",
-                      file=stderr)
-                sys.exit(1)
-            if os.getenv('AWS_SECRET_ACCESS_KEY') is None:
-                print("ERROR: The environment variable AWS_SECRET_ACCESS_KEY must be set",
-                      file=stderr)
-                sys.exit(1)
+            # If there is no boto config, check aws credentials
+            if not os.path.isfile(home_dir + '/.aws/credentials'):
+                if os.getenv('AWS_ACCESS_KEY_ID') is None:
+                    print("ERROR: The environment variable AWS_ACCESS_KEY_ID must be set",
+                          file=stderr)
+                    sys.exit(1)
+                if os.getenv('AWS_SECRET_ACCESS_KEY') is None:
+                    print("ERROR: The environment variable AWS_SECRET_ACCESS_KEY must be set",
+                          file=stderr)
+                    sys.exit(1)
     return (opts, action, cluster_name)
 
 
From 0cd84c86cac68600a74d84e50ad40c0c8b84822a Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Thu, 9 Jul 2015 10:26:38 -0700
Subject: [PATCH 089/549] [SPARK-8703] [ML] Add CountVectorizer as a ml
 transformer to convert document to words count vector

jira: https://issues.apache.org/jira/browse/SPARK-8703

Converts a text document to a sparse vector of token counts.

I can further add an estimator to extract vocabulary from corpus if that's appropriate.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #7084 from hhbyyh/countVectorization and squashes the following commits:

5f3f655 [Yuhao Yang] text change
24728e4 [Yuhao Yang] style improvement
576728a [Yuhao Yang] rename to model and some fix
1deca28 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into countVectorization
99b0c14 [Yuhao Yang] undo extension from HashingTF
12c2dc8 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into countVectorization
7ee1c31 [Yuhao Yang] extends HashingTF
809fb59 [Yuhao Yang] minor fix for ut
7c61fb3 [Yuhao Yang] add countVectorizer
---
 .../ml/feature/CountVectorizerModel.scala     | 82 +++++++++++++++++++
 .../ml/feature/CountVectorizorSuite.scala     | 73 +++++++++++++++++
 2 files changed, 155 insertions(+)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizerModel.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizorSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizerModel.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizerModel.scala
new file mode 100644
index 0000000000000..6b77de89a0330
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizerModel.scala
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.ml.feature
+
+import scala.collection.mutable
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.UnaryTransformer
+import org.apache.spark.ml.param.{ParamMap, ParamValidators, IntParam}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.mllib.linalg.{Vectors, VectorUDT, Vector}
+import org.apache.spark.sql.types.{StringType, ArrayType, DataType}
+
+/**
+ * :: Experimental ::
+ * Converts a text document to a sparse vector of token counts.
+ * @param vocabulary An Array over terms. Only the terms in the vocabulary will be counted.
+ */
+@Experimental
+class CountVectorizerModel (override val uid: String, val vocabulary: Array[String])
+  extends UnaryTransformer[Seq[String], Vector, CountVectorizerModel] {
+
+  def this(vocabulary: Array[String]) =
+    this(Identifiable.randomUID("cntVec"), vocabulary)
+
+  /**
+   * Corpus-specific filter to ignore scarce words in a document. For each document, terms with
+   * frequency (count) less than the given threshold are ignored.
+   * Default: 1
+   * @group param
+   */
+  val minTermFreq: IntParam = new IntParam(this, "minTermFreq",
+    "minimum frequency (count) filter used to neglect scarce words (>= 1). For each document, " +
+      "terms with frequency less than the given threshold are ignored.", ParamValidators.gtEq(1))
+
+  /** @group setParam */
+  def setMinTermFreq(value: Int): this.type = set(minTermFreq, value)
+
+  /** @group getParam */
+  def getMinTermFreq: Int = $(minTermFreq)
+
+  setDefault(minTermFreq -> 1)
+
+  override protected def createTransformFunc: Seq[String] => Vector = {
+    val dict = vocabulary.zipWithIndex.toMap
+    document =>
+      val termCounts = mutable.HashMap.empty[Int, Double]
+      document.foreach { term =>
+        dict.get(term) match {
+          case Some(index) => termCounts.put(index, termCounts.getOrElse(index, 0.0) + 1.0)
+          case None => // ignore terms not in the vocabulary
+        }
+      }
+      Vectors.sparse(dict.size, termCounts.filter(_._2 >= $(minTermFreq)).toSeq)
+  }
+
+  override protected def validateInputType(inputType: DataType): Unit = {
+    require(inputType.sameType(ArrayType(StringType)),
+      s"Input type must be ArrayType(StringType) but got $inputType.")
+  }
+
+  override protected def outputDataType: DataType = new VectorUDT()
+
+  override def copy(extra: ParamMap): CountVectorizerModel = {
+    val copied = new CountVectorizerModel(uid, vocabulary)
+    copyValues(copied, extra)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizorSuite.scala
new file mode 100644
index 0000000000000..e90d9d4ef21ff
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizorSuite.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+
+class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  test("params") {
+    ParamsSuite.checkParams(new CountVectorizerModel(Array("empty")))
+  }
+
+  test("CountVectorizerModel common cases") {
+    val df = sqlContext.createDataFrame(Seq(
+      (0, "a b c d".split(" ").toSeq,
+        Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))),
+      (1, "a b b c d  a".split(" ").toSeq,
+        Vectors.sparse(4, Seq((0, 2.0), (1, 2.0), (2, 1.0), (3, 1.0)))),
+      (2, "a".split(" ").toSeq, Vectors.sparse(4, Seq((0, 1.0)))),
+      (3, "".split(" ").toSeq, Vectors.sparse(4, Seq())), // empty string
+      (4, "a notInDict d".split(" ").toSeq,
+        Vectors.sparse(4, Seq((0, 1.0), (3, 1.0))))  // with words not in vocabulary
+    )).toDF("id", "words", "expected")
+    val cv = new CountVectorizerModel(Array("a", "b", "c", "d"))
+      .setInputCol("words")
+      .setOutputCol("features")
+    val output = cv.transform(df).collect()
+    output.foreach { p =>
+      val features = p.getAs[Vector]("features")
+      val expected = p.getAs[Vector]("expected")
+      assert(features ~== expected absTol 1e-14)
+    }
+  }
+
+  test("CountVectorizerModel with minTermFreq") {
+    val df = sqlContext.createDataFrame(Seq(
+      (0, "a a a b b c c c d ".split(" ").toSeq, Vectors.sparse(4, Seq((0, 3.0), (2, 3.0)))),
+      (1, "c c c c c c".split(" ").toSeq, Vectors.sparse(4, Seq((2, 6.0)))),
+      (2, "a".split(" ").toSeq, Vectors.sparse(4, Seq())),
+      (3, "e e e e e".split(" ").toSeq, Vectors.sparse(4, Seq())))
+    ).toDF("id", "words", "expected")
+    val cv = new CountVectorizerModel(Array("a", "b", "c", "d"))
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setMinTermFreq(3)
+    val output = cv.transform(df).collect()
+    output.foreach { p =>
+      val features = p.getAs[Vector]("features")
+      val expected = p.getAs[Vector]("expected")
+      assert(features ~== expected absTol 1e-14)
+    }
+  }
+}
+
+

From 0b0b9ceaf73de472198c9804fb7ae61fa2a2e097 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Thu, 9 Jul 2015 11:11:34 -0700
Subject: [PATCH 090/549] [SPARK-8247] [SPARK-8249] [SPARK-8252] [SPARK-8254]
 [SPARK-8257] [SPARK-8258] [SPARK-8259] [SPARK-8261] [SPARK-8262] [SPARK-8253]
 [SPARK-8260] [SPARK-8267] [SQL] Add String Expressions

Author: Cheng Hao <hao.cheng@intel.com>

Closes #6762 from chenghao-intel/str_funcs and squashes the following commits:

b09a909 [Cheng Hao] update the code as feedback
7ebbf4c [Cheng Hao] Add more string expressions
---
 .../catalyst/analysis/FunctionRegistry.scala  |  12 +
 .../expressions/stringOperations.scala        | 306 ++++++++++++++-
 .../expressions/StringFunctionsSuite.scala    | 138 +++++++
 .../org/apache/spark/sql/functions.scala      | 353 ++++++++++++++++++
 .../spark/sql/DataFrameFunctionsSuite.scala   | 132 ++++++-
 .../apache/spark/unsafe/types/UTF8String.java | 191 ++++++++++
 .../spark/unsafe/types/UTF8StringSuite.java   |  94 ++++-
 7 files changed, 1202 insertions(+), 24 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 5c25181e1cf50..f62d79f8cea6d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -147,12 +147,24 @@ object FunctionRegistry {
     expression[Base64]("base64"),
     expression[Encode]("encode"),
     expression[Decode]("decode"),
+    expression[StringInstr]("instr"),
     expression[Lower]("lcase"),
     expression[Lower]("lower"),
     expression[StringLength]("length"),
     expression[Levenshtein]("levenshtein"),
+    expression[StringLocate]("locate"),
+    expression[StringLPad]("lpad"),
+    expression[StringTrimLeft]("ltrim"),
+    expression[StringFormat]("printf"),
+    expression[StringRPad]("rpad"),
+    expression[StringRepeat]("repeat"),
+    expression[StringReverse]("reverse"),
+    expression[StringTrimRight]("rtrim"),
+    expression[StringSpace]("space"),
+    expression[StringSplit]("split"),
     expression[Substring]("substr"),
     expression[Substring]("substring"),
+    expression[StringTrim]("trim"),
     expression[UnBase64]("unbase64"),
     expression[Upper]("ucase"),
     expression[Unhex]("unhex"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 57f436485becf..f64899c1ed84c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.util.Locale
 import java.util.regex.Pattern
 
 import org.apache.commons.lang3.StringUtils
@@ -104,7 +105,7 @@ case class RLike(left: Expression, right: Expression)
   override def toString: String = s"$left RLIKE $right"
 }
 
-trait CaseConversionExpression extends ExpectsInputTypes {
+trait String2StringExpression extends ExpectsInputTypes {
   self: UnaryExpression =>
 
   def convert(v: UTF8String): UTF8String
@@ -119,7 +120,7 @@ trait CaseConversionExpression extends ExpectsInputTypes {
 /**
  * A function that converts the characters of a string to uppercase.
  */
-case class Upper(child: Expression) extends UnaryExpression with CaseConversionExpression {
+case class Upper(child: Expression) extends UnaryExpression with String2StringExpression {
 
   override def convert(v: UTF8String): UTF8String = v.toUpperCase
 
@@ -131,7 +132,7 @@ case class Upper(child: Expression) extends UnaryExpression with CaseConversionE
 /**
  * A function that converts the characters of a string to lowercase.
  */
-case class Lower(child: Expression) extends UnaryExpression with CaseConversionExpression {
+case class Lower(child: Expression) extends UnaryExpression with String2StringExpression {
 
   override def convert(v: UTF8String): UTF8String = v.toLowerCase
 
@@ -187,6 +188,301 @@ case class EndsWith(left: Expression, right: Expression)
   }
 }
 
+/**
+ * A function that trim the spaces from both ends for the specified string.
+ */
+case class StringTrim(child: Expression)
+  extends UnaryExpression with String2StringExpression {
+
+  def convert(v: UTF8String): UTF8String = v.trim()
+
+  override def prettyName: String = "trim"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, c => s"($c).trim()")
+  }
+}
+
+/**
+ * A function that trim the spaces from left end for given string.
+ */
+case class StringTrimLeft(child: Expression)
+  extends UnaryExpression with String2StringExpression {
+
+  def convert(v: UTF8String): UTF8String = v.trimLeft()
+
+  override def prettyName: String = "ltrim"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, c => s"($c).trimLeft()")
+  }
+}
+
+/**
+ * A function that trim the spaces from right end for given string.
+ */
+case class StringTrimRight(child: Expression)
+  extends UnaryExpression with String2StringExpression {
+
+  def convert(v: UTF8String): UTF8String = v.trimRight()
+
+  override def prettyName: String = "rtrim"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, c => s"($c).trimRight()")
+  }
+}
+
+/**
+ * A function that returns the position of the first occurrence of substr in the given string.
+ * Returns null if either of the arguments are null and
+ * returns 0 if substr could not be found in str.
+ *
+ * NOTE: that this is not zero based, but 1-based index. The first character in str has index 1.
+ */
+case class StringInstr(str: Expression, substr: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def left: Expression = str
+  override def right: Expression = substr
+  override def dataType: DataType = IntegerType
+  override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
+
+  override def nullSafeEval(string: Any, sub: Any): Any = {
+    string.asInstanceOf[UTF8String].indexOf(sub.asInstanceOf[UTF8String], 0) + 1
+  }
+
+  override def prettyName: String = "instr"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, (l, r) =>
+      s"($l).indexOf($r, 0) + 1")
+  }
+}
+
+/**
+ * A function that returns the position of the first occurrence of substr
+ * in given string after position pos.
+ */
+case class StringLocate(substr: Expression, str: Expression, start: Expression)
+  extends Expression with ExpectsInputTypes {
+
+  def this(substr: Expression, str: Expression) = {
+    this(substr, str, Literal(0))
+  }
+
+  override def children: Seq[Expression] = substr :: str :: start :: Nil
+  override def foldable: Boolean = children.forall(_.foldable)
+  override def nullable: Boolean = substr.nullable || str.nullable
+  override def dataType: DataType = IntegerType
+  override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
+
+  override def eval(input: InternalRow): Any = {
+    val s = start.eval(input)
+    if (s == null) {
+      // if the start position is null, we need to return 0, (conform to Hive)
+      0
+    } else {
+      val r = substr.eval(input)
+      if (r == null) {
+        null
+      } else {
+        val l = str.eval(input)
+        if (l == null) {
+          null
+        } else {
+          l.asInstanceOf[UTF8String].indexOf(
+            r.asInstanceOf[UTF8String],
+            s.asInstanceOf[Int]) + 1
+        }
+      }
+    }
+  }
+
+  override def prettyName: String = "locate"
+}
+
+/**
+ * Returns str, left-padded with pad to a length of len.
+ */
+case class StringLPad(str: Expression, len: Expression, pad: Expression)
+  extends Expression with ExpectsInputTypes {
+
+  override def children: Seq[Expression] = str :: len :: pad :: Nil
+  override def foldable: Boolean = children.forall(_.foldable)
+  override def nullable: Boolean = children.exists(_.nullable)
+  override def dataType: DataType = StringType
+  override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType, StringType)
+
+  override def eval(input: InternalRow): Any = {
+    val s = str.eval(input)
+    if (s == null) {
+      null
+    } else {
+      val l = len.eval(input)
+      if (l == null) {
+        null
+      } else {
+        val p = pad.eval(input)
+        if (p == null) {
+          null
+        } else {
+          val len = l.asInstanceOf[Int]
+          val str = s.asInstanceOf[UTF8String]
+          val pad = p.asInstanceOf[UTF8String]
+
+          str.lpad(len, pad)
+        }
+      }
+    }
+  }
+
+  override def prettyName: String = "lpad"
+}
+
+/**
+ * Returns str, right-padded with pad to a length of len.
+ */
+case class StringRPad(str: Expression, len: Expression, pad: Expression)
+  extends Expression with ExpectsInputTypes {
+
+  override def children: Seq[Expression] = str :: len :: pad :: Nil
+  override def foldable: Boolean = children.forall(_.foldable)
+  override def nullable: Boolean = children.exists(_.nullable)
+  override def dataType: DataType = StringType
+  override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType, StringType)
+
+  override def eval(input: InternalRow): Any = {
+    val s = str.eval(input)
+    if (s == null) {
+      null
+    } else {
+      val l = len.eval(input)
+      if (l == null) {
+        null
+      } else {
+        val p = pad.eval(input)
+        if (p == null) {
+          null
+        } else {
+          val len = l.asInstanceOf[Int]
+          val str = s.asInstanceOf[UTF8String]
+          val pad = p.asInstanceOf[UTF8String]
+
+          str.rpad(len, pad)
+        }
+      }
+    }
+  }
+
+  override def prettyName: String = "rpad"
+}
+
+/**
+ * Returns the input formatted according do printf-style format strings
+ */
+case class StringFormat(children: Expression*) extends Expression {
+
+  require(children.length >=1, "printf() should take at least 1 argument")
+
+  override def foldable: Boolean = children.forall(_.foldable)
+  override def nullable: Boolean = children(0).nullable
+  override def dataType: DataType = StringType
+  private def format: Expression = children(0)
+  private def args: Seq[Expression] = children.tail
+
+  override def eval(input: InternalRow): Any = {
+    val pattern = format.eval(input)
+    if (pattern == null) {
+      null
+    } else {
+      val sb = new StringBuffer()
+      val formatter = new java.util.Formatter(sb, Locale.US)
+
+      val arglist = args.map(_.eval(input).asInstanceOf[AnyRef])
+      formatter.format(pattern.asInstanceOf[UTF8String].toString(), arglist: _*)
+
+      UTF8String.fromString(sb.toString)
+    }
+  }
+
+  override def prettyName: String = "printf"
+}
+
+/**
+ * Returns the string which repeat the given string value n times.
+ */
+case class StringRepeat(str: Expression, times: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def left: Expression = str
+  override def right: Expression = times
+  override def dataType: DataType = StringType
+  override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType)
+
+  override def nullSafeEval(string: Any, n: Any): Any = {
+    string.asInstanceOf[UTF8String].repeat(n.asInstanceOf[Integer])
+  }
+
+  override def prettyName: String = "repeat"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, (l, r) => s"($l).repeat($r)")
+  }
+}
+
+/**
+ * Returns the reversed given string.
+ */
+case class StringReverse(child: Expression) extends UnaryExpression with String2StringExpression {
+  override def convert(v: UTF8String): UTF8String = v.reverse()
+
+  override def prettyName: String = "reverse"
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, c => s"($c).reverse()")
+  }
+}
+
+/**
+ * Returns a n spaces string.
+ */
+case class StringSpace(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+
+  override def dataType: DataType = StringType
+  override def inputTypes: Seq[DataType] = Seq(IntegerType)
+
+  override def nullSafeEval(s: Any): Any = {
+    val length = s.asInstanceOf[Integer]
+
+    val spaces = new Array[Byte](if (length < 0) 0 else length)
+    java.util.Arrays.fill(spaces, ' '.asInstanceOf[Byte])
+    UTF8String.fromBytes(spaces)
+  }
+
+  override def prettyName: String = "space"
+}
+
+/**
+ * Splits str around pat (pattern is a regular expression).
+ */
+case class StringSplit(str: Expression, pattern: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def left: Expression = str
+  override def right: Expression = pattern
+  override def dataType: DataType = ArrayType(StringType)
+  override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
+
+  override def nullSafeEval(string: Any, regex: Any): Any = {
+    val splits =
+      string.asInstanceOf[UTF8String].toString.split(regex.asInstanceOf[UTF8String].toString, -1)
+    splits.toSeq.map(UTF8String.fromString)
+  }
+
+  override def prettyName: String = "split"
+}
+
 /**
  * A function that takes a substring of its first argument starting at a given position.
  * Defined for String and Binary types.
@@ -199,8 +495,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
   }
 
   override def foldable: Boolean = str.foldable && pos.foldable && len.foldable
-
-  override  def nullable: Boolean = str.nullable || pos.nullable || len.nullable
+  override def nullable: Boolean = str.nullable || pos.nullable || len.nullable
 
   override def dataType: DataType = {
     if (!resolved) {
@@ -373,4 +668,3 @@ case class Encode(value: Expression, charset: Expression)
   }
 }
 
-
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
index 69bef1c63e9dc..b19f4ee37a109 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -288,4 +288,142 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Levenshtein(Literal("世界千世"), Literal("大a界b")), 4)
     // scalastyle:on
   }
+
+  test("TRIM/LTRIM/RTRIM") {
+    val s = 'a.string.at(0)
+    checkEvaluation(StringTrim(Literal(" aa  ")), "aa", create_row(" abdef "))
+    checkEvaluation(StringTrim(s), "abdef", create_row(" abdef "))
+
+    checkEvaluation(StringTrimLeft(Literal(" aa  ")), "aa  ", create_row(" abdef "))
+    checkEvaluation(StringTrimLeft(s), "abdef ", create_row(" abdef "))
+
+    checkEvaluation(StringTrimRight(Literal(" aa  ")), " aa", create_row(" abdef "))
+    checkEvaluation(StringTrimRight(s), " abdef", create_row(" abdef "))
+
+    // scalastyle:off
+    // non ascii characters are not allowed in the source code, so we disable the scalastyle.
+    checkEvaluation(StringTrimRight(s), "  花花世界", create_row("  花花世界 "))
+    checkEvaluation(StringTrimLeft(s), "花花世界 ", create_row("  花花世界 "))
+    checkEvaluation(StringTrim(s), "花花世界", create_row("  花花世界 "))
+    // scalastyle:on
+  }
+
+  test("FORMAT") {
+    val f = 'f.string.at(0)
+    val d1 = 'd.int.at(1)
+    val s1 = 's.int.at(2)
+
+    val row1 = create_row("aa%d%s", 12, "cc")
+    val row2 = create_row(null, 12, "cc")
+    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1)
+    checkEvaluation(StringFormat(Literal("aa")), "aa", create_row(null))
+    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1)
+
+    checkEvaluation(StringFormat(f, d1, s1), "aa12cc", row1)
+    checkEvaluation(StringFormat(f, d1, s1), null, row2)
+  }
+
+  test("INSTR") {
+    val s1 = 'a.string.at(0)
+    val s2 = 'b.string.at(1)
+    val s3 = 'c.string.at(2)
+    val row1 = create_row("aaads", "aa", "zz")
+
+    checkEvaluation(StringInstr(Literal("aaads"), Literal("aa")), 1, row1)
+    checkEvaluation(StringInstr(Literal("aaads"), Literal("de")), 0, row1)
+    checkEvaluation(StringInstr(Literal.create(null, StringType), Literal("de")), null, row1)
+    checkEvaluation(StringInstr(Literal("aaads"), Literal.create(null, StringType)), null, row1)
+
+    checkEvaluation(StringInstr(s1, s2), 1, row1)
+    checkEvaluation(StringInstr(s1, s3), 0, row1)
+
+    // scalastyle:off
+    // non ascii characters are not allowed in the source code, so we disable the scalastyle.
+    checkEvaluation(StringInstr(s1, s2), 3, create_row("花花世界", "世界"))
+    checkEvaluation(StringInstr(s1, s2), 1, create_row("花花世界", "花"))
+    checkEvaluation(StringInstr(s1, s2), 0, create_row("花花世界", "小"))
+    // scalastyle:on
+  }
+
+  test("LOCATE") {
+    val s1 = 'a.string.at(0)
+    val s2 = 'b.string.at(1)
+    val s3 = 'c.string.at(2)
+    val s4 = 'd.int.at(3)
+    val row1 = create_row("aaads", "aa", "zz", 1)
+
+    checkEvaluation(new StringLocate(Literal("aa"), Literal("aaads")), 1, row1)
+    checkEvaluation(StringLocate(Literal("aa"), Literal("aaads"), Literal(1)), 2, row1)
+    checkEvaluation(StringLocate(Literal("aa"), Literal("aaads"), Literal(2)), 0, row1)
+    checkEvaluation(new StringLocate(Literal("de"), Literal("aaads")), 0, row1)
+    checkEvaluation(StringLocate(Literal("de"), Literal("aaads"), 1), 0, row1)
+
+    checkEvaluation(new StringLocate(s2, s1), 1, row1)
+    checkEvaluation(StringLocate(s2, s1, s4), 2, row1)
+    checkEvaluation(new StringLocate(s3, s1), 0, row1)
+    checkEvaluation(StringLocate(s3, s1, Literal.create(null, IntegerType)), 0, row1)
+  }
+
+  test("LPAD/RPAD") {
+    val s1 = 'a.string.at(0)
+    val s2 = 'b.int.at(1)
+    val s3 = 'c.string.at(2)
+    val row1 = create_row("hi", 5, "??")
+    val row2 = create_row("hi", 1, "?")
+    val row3 = create_row(null, 1, "?")
+
+    checkEvaluation(StringLPad(Literal("hi"), Literal(5), Literal("??")), "???hi", row1)
+    checkEvaluation(StringLPad(Literal("hi"), Literal(1), Literal("??")), "h", row1)
+    checkEvaluation(StringLPad(s1, s2, s3), "???hi", row1)
+    checkEvaluation(StringLPad(s1, s2, s3), "h", row2)
+    checkEvaluation(StringLPad(s1, s2, s3), null, row3)
+
+    checkEvaluation(StringRPad(Literal("hi"), Literal(5), Literal("??")), "hi???", row1)
+    checkEvaluation(StringRPad(Literal("hi"), Literal(1), Literal("??")), "h", row1)
+    checkEvaluation(StringRPad(s1, s2, s3), "hi???", row1)
+    checkEvaluation(StringRPad(s1, s2, s3), "h", row2)
+    checkEvaluation(StringRPad(s1, s2, s3), null, row3)
+  }
+
+  test("REPEAT") {
+    val s1 = 'a.string.at(0)
+    val s2 = 'b.int.at(1)
+    val row1 = create_row("hi", 2)
+    val row2 = create_row(null, 1)
+
+    checkEvaluation(StringRepeat(Literal("hi"), Literal(2)), "hihi", row1)
+    checkEvaluation(StringRepeat(Literal("hi"), Literal(-1)), "", row1)
+    checkEvaluation(StringRepeat(s1, s2), "hihi", row1)
+    checkEvaluation(StringRepeat(s1, s2), null, row2)
+  }
+
+  test("REVERSE") {
+    val s = 'a.string.at(0)
+    val row1 = create_row("abccc")
+    checkEvaluation(StringReverse(Literal("abccc")), "cccba", row1)
+    checkEvaluation(StringReverse(s), "cccba", row1)
+  }
+
+  test("SPACE") {
+    val s1 = 'b.int.at(0)
+    val row1 = create_row(2)
+    val row2 = create_row(null)
+
+    checkEvaluation(StringSpace(Literal(2)), "  ", row1)
+    checkEvaluation(StringSpace(Literal(-1)), "", row1)
+    checkEvaluation(StringSpace(Literal(0)), "", row1)
+    checkEvaluation(StringSpace(s1), "  ", row1)
+    checkEvaluation(StringSpace(s1), null, row2)
+  }
+
+  test("SPLIT") {
+    val s1 = 'a.string.at(0)
+    val s2 = 'b.string.at(1)
+    val row1 = create_row("aa2bb3cc", "[1-9]+")
+
+    checkEvaluation(
+      StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1)
+    checkEvaluation(
+      StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 4da9ffc495e17..08bf37a5c223c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1626,6 +1626,179 @@ object functions {
    */
   def ascii(columnName: String): Column = ascii(Column(columnName))
 
+  /**
+   * Trim the spaces from both ends for the specified string value.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def trim(e: Column): Column = StringTrim(e.expr)
+
+  /**
+   * Trim the spaces from both ends for the specified column.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def trim(columnName: String): Column = trim(Column(columnName))
+
+  /**
+   * Trim the spaces from left end for the specified string value.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def ltrim(e: Column): Column = StringTrimLeft(e.expr)
+
+  /**
+   * Trim the spaces from left end for the specified column.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def ltrim(columnName: String): Column = ltrim(Column(columnName))
+
+  /**
+   * Trim the spaces from right end for the specified string value.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def rtrim(e: Column): Column = StringTrimRight(e.expr)
+
+  /**
+   * Trim the spaces from right end for the specified column.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def rtrim(columnName: String): Column = rtrim(Column(columnName))
+
+  /**
+   * Format strings in printf-style.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def formatString(format: Column, arguments: Column*): Column = {
+    StringFormat((format +: arguments).map(_.expr): _*)
+  }
+
+  /**
+   * Format strings in printf-style.
+   * NOTE: `format` is the string value of the formatter, not column name.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def formatString(format: String, arguNames: String*): Column = {
+    StringFormat(lit(format).expr +: arguNames.map(Column(_).expr): _*)
+  }
+
+  /**
+   * Locate the position of the first occurrence of substr value in the given string.
+   * Returns null if either of the arguments are null.
+   *
+   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+   * could not be found in str.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def instr(substr: String, sub: String): Column = instr(Column(substr), Column(sub))
+
+  /**
+   * Locate the position of the first occurrence of substr column in the given string.
+   * Returns null if either of the arguments are null.
+   *
+   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+   * could not be found in str.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def instr(substr: Column, sub: Column): Column = StringInstr(substr.expr, sub.expr)
+
+  /**
+   * Locate the position of the first occurrence of substr.
+   *
+   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+   * could not be found in str.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def locate(substr: String, str: String): Column = {
+    locate(Column(substr), Column(str))
+  }
+
+  /**
+   * Locate the position of the first occurrence of substr.
+   *
+   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+   * could not be found in str.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def locate(substr: Column, str: Column): Column = {
+    new StringLocate(substr.expr, str.expr)
+  }
+
+  /**
+   * Locate the position of the first occurrence of substr in a given string after position pos.
+   *
+   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+   * could not be found in str.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def locate(substr: String, str: String, pos: String): Column = {
+    locate(Column(substr), Column(str), Column(pos))
+  }
+
+  /**
+   * Locate the position of the first occurrence of substr in a given string after position pos.
+   *
+   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+   * could not be found in str.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def locate(substr: Column, str: Column, pos: Column): Column = {
+    StringLocate(substr.expr, str.expr, pos.expr)
+  }
+
+  /**
+   * Locate the position of the first occurrence of substr in a given string after position pos.
+   *
+   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+   * could not be found in str.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def locate(substr: Column, str: Column, pos: Int): Column = {
+    StringLocate(substr.expr, str.expr, lit(pos).expr)
+  }
+
+  /**
+   * Locate the position of the first occurrence of substr in a given string after position pos.
+   *
+   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+   * could not be found in str.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def locate(substr: String, str: String, pos: Int): Column = {
+    locate(Column(substr), Column(str), lit(pos))
+  }
+
   /**
    * Computes the specified value from binary to a base64 string.
    *
@@ -1658,6 +1831,46 @@ object functions {
    */
   def unbase64(columnName: String): Column = unbase64(Column(columnName))
 
+  /**
+   * Left-padded with pad to a length of len.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def lpad(str: String, len: String, pad: String): Column = {
+    lpad(Column(str), Column(len), Column(pad))
+  }
+
+  /**
+   * Left-padded with pad to a length of len.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def lpad(str: Column, len: Column, pad: Column): Column = {
+    StringLPad(str.expr, len.expr, pad.expr)
+  }
+
+  /**
+   * Left-padded with pad to a length of len.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def lpad(str: Column, len: Int, pad: Column): Column = {
+    StringLPad(str.expr, lit(len).expr, pad.expr)
+  }
+
+  /**
+   * Left-padded with pad to a length of len.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def lpad(str: String, len: Int, pad: String): Column = {
+    lpad(Column(str), len, Column(pad))
+  }
+
   /**
    * Computes the first argument into a binary from a string using the provided character set
    * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
@@ -1702,6 +1915,146 @@ object functions {
   def decode(columnName: String, charset: String): Column =
     decode(Column(columnName), charset)
 
+  /**
+   * Right-padded with pad to a length of len.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def rpad(str: String, len: String, pad: String): Column = {
+    rpad(Column(str), Column(len), Column(pad))
+  }
+
+  /**
+   * Right-padded with pad to a length of len.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def rpad(str: Column, len: Column, pad: Column): Column = {
+    StringRPad(str.expr, len.expr, pad.expr)
+  }
+
+  /**
+   * Right-padded with pad to a length of len.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def rpad(str: String, len: Int, pad: String): Column = {
+    rpad(Column(str), len, Column(pad))
+  }
+
+  /**
+   * Right-padded with pad to a length of len.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def rpad(str: Column, len: Int, pad: Column): Column = {
+    StringRPad(str.expr, lit(len).expr, pad.expr)
+  }
+
+  /**
+   * Repeat the string value of the specified column n times.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def repeat(strColumn: String, timesColumn: String): Column = {
+    repeat(Column(strColumn), Column(timesColumn))
+  }
+
+  /**
+   * Repeat the string expression value n times.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def repeat(str: Column, times: Column): Column = {
+    StringRepeat(str.expr, times.expr)
+  }
+
+  /**
+   * Repeat the string value of the specified column n times.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def repeat(strColumn: String, times: Int): Column = {
+    repeat(Column(strColumn), times)
+  }
+
+  /**
+   * Repeat the string expression value n times.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def repeat(str: Column, times: Int): Column = {
+    StringRepeat(str.expr, lit(times).expr)
+  }
+
+  /**
+   * Splits str around pattern (pattern is a regular expression).
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def split(strColumnName: String, pattern: String): Column = {
+    split(Column(strColumnName), pattern)
+  }
+
+  /**
+   * Splits str around pattern (pattern is a regular expression).
+   * NOTE: pattern is a string represent the regular expression.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def split(str: Column, pattern: String): Column = {
+    StringSplit(str.expr, lit(pattern).expr)
+  }
+
+  /**
+   * Reversed the string for the specified column.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def reverse(str: String): Column = {
+    reverse(Column(str))
+  }
+
+  /**
+   * Reversed the string for the specified value.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def reverse(str: Column): Column = {
+    StringReverse(str.expr)
+  }
+
+  /**
+   * Make a n spaces of string.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def space(n: String): Column = {
+    space(Column(n))
+  }
+
+  /**
+   * Make a n spaces of string.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def space(n: Column): Column = {
+    StringSpace(n.expr)
+  }
 
   //////////////////////////////////////////////////////////////////////////////////////////////
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index afba28515e032..173280375c411 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -209,21 +209,14 @@ class DataFrameFunctionsSuite extends QueryTest {
   }
 
   test("string length function") {
+    val df = Seq(("abc", "")).toDF("a", "b")
     checkAnswer(
-      nullStrings.select(strlen($"s"), strlen("s")),
-      nullStrings.collect().toSeq.map { r =>
-        val v = r.getString(1)
-        val l = if (v == null) null else v.length
-        Row(l, l)
-      })
+      df.select(strlen($"a"), strlen("b")),
+      Row(3, 0))
 
     checkAnswer(
-      nullStrings.selectExpr("length(s)"),
-      nullStrings.collect().toSeq.map { r =>
-        val v = r.getString(1)
-        val l = if (v == null) null else v.length
-        Row(l)
-      })
+      df.selectExpr("length(a)", "length(b)"),
+      Row(3, 0))
   }
 
   test("Levenshtein distance") {
@@ -273,4 +266,119 @@ class DataFrameFunctionsSuite extends QueryTest {
       Row(bytes, "大千世界"))
     // scalastyle:on
   }
+
+  test("string trim functions") {
+    val df = Seq(("  example  ", "")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(ltrim($"a"), rtrim($"a"), trim($"a")),
+      Row("example  ", "  example", "example"))
+
+    checkAnswer(
+      df.selectExpr("ltrim(a)", "rtrim(a)", "trim(a)"),
+      Row("example  ", "  example", "example"))
+  }
+
+  test("string formatString function") {
+    val df = Seq(("aa%d%s", 123, "cc")).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(formatString($"a", $"b", $"c"), formatString("aa%d%s", "b", "c")),
+      Row("aa123cc", "aa123cc"))
+
+    checkAnswer(
+      df.selectExpr("printf(a, b, c)"),
+      Row("aa123cc"))
+  }
+
+  test("string instr function") {
+    val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(instr($"a", $"b"), instr("a", "b")),
+      Row(1, 1))
+
+    checkAnswer(
+      df.selectExpr("instr(a, b)"),
+      Row(1))
+  }
+
+  test("string locate function") {
+    val df = Seq(("aaads", "aa", "zz", 1)).toDF("a", "b", "c", "d")
+
+    checkAnswer(
+      df.select(
+        locate($"b", $"a"), locate("b", "a"), locate($"b", $"a", 1),
+        locate("b", "a", 1), locate($"b", $"a", $"d"), locate("b", "a", "d")),
+      Row(1, 1, 2, 2, 2, 2))
+
+    checkAnswer(
+      df.selectExpr("locate(b, a)", "locate(b, a, d)"),
+      Row(1, 2))
+  }
+
+  test("string padding functions") {
+    val df = Seq(("hi", 5, "??")).toDF("a", "b", "c")
+
+    checkAnswer(
+          df.select(
+            lpad($"a", $"b", $"c"), rpad("a", "b", "c"),
+            lpad($"a", 1, $"c"), rpad("a", 1, "c")),
+          Row("???hi", "hi???", "h", "h"))
+
+    checkAnswer(
+      df.selectExpr("lpad(a, b, c)", "rpad(a, b, c)", "lpad(a, 1, c)", "rpad(a, 1, c)"),
+      Row("???hi", "hi???", "h", "h"))
+  }
+
+  test("string repeat function") {
+    val df = Seq(("hi", 2)).toDF("a", "b")
+
+    checkAnswer(
+      df.select(
+        repeat($"a", 2), repeat("a", 2), repeat($"a", $"b"), repeat("a", "b")),
+      Row("hihi", "hihi", "hihi", "hihi"))
+
+    checkAnswer(
+      df.selectExpr("repeat(a, 2)", "repeat(a, b)"),
+      Row("hihi", "hihi"))
+  }
+
+  test("string reverse function") {
+    val df = Seq(("hi", "hhhi")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(reverse($"a"), reverse("b")),
+      Row("ih", "ihhh"))
+
+    checkAnswer(
+      df.selectExpr("reverse(b)"),
+      Row("ihhh"))
+  }
+
+  test("string space function") {
+    val df = Seq((2, 3)).toDF("a", "b")
+
+    checkAnswer(
+      df.select(space($"a"), space("b")),
+      Row("  ", "   "))
+
+    checkAnswer(
+      df.selectExpr("space(b)"),
+      Row("   "))
+  }
+
+  test("string split function") {
+    val df = Seq(("aa2bb3cc", "[1-9]+")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(
+        split($"a", "[1-9]+"),
+        split("a", "[1-9]+")),
+      Row(Seq("aa", "bb", "cc"), Seq("aa", "bb", "cc")))
+
+    checkAnswer(
+      df.selectExpr("split(a, '[1-9]+')"),
+      Row(Seq("aa", "bb", "cc")))
+  }
 }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 847d80ad583f6..60d050b0a0c97 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -25,6 +25,7 @@
 
 import static org.apache.spark.unsafe.PlatformDependent.*;
 
+
 /**
  * A UTF-8 String for internal Spark use.
  * <p>
@@ -204,6 +205,196 @@ public UTF8String toLowerCase() {
     return fromString(toString().toLowerCase());
   }
 
+  /**
+   * Copy the bytes from the current UTF8String, and make a new UTF8String.
+   * @param start the start position of the current UTF8String in bytes.
+   * @param end the end position of the current UTF8String in bytes.
+   * @return a new UTF8String in the position of [start, end] of current UTF8String bytes.
+   */
+  private UTF8String copyUTF8String(int start, int end) {
+    int len = end - start + 1;
+    byte[] newBytes = new byte[len];
+    copyMemory(base, offset + start, newBytes, BYTE_ARRAY_OFFSET, len);
+    return UTF8String.fromBytes(newBytes);
+  }
+
+  public UTF8String trim() {
+    int s = 0;
+    int e = this.numBytes - 1;
+    // skip all of the space (0x20) in the left side
+    while (s < this.numBytes && getByte(s) == 0x20) s++;
+    // skip all of the space (0x20) in the right side
+    while (e >= 0 && getByte(e) == 0x20) e--;
+
+    if (s > e) {
+      // empty string
+      return UTF8String.fromBytes(new byte[0]);
+    } else {
+      return copyUTF8String(s, e);
+    }
+  }
+
+  public UTF8String trimLeft() {
+    int s = 0;
+    // skip all of the space (0x20) in the left side
+    while (s < this.numBytes && getByte(s) == 0x20) s++;
+    if (s == this.numBytes) {
+      // empty string
+      return UTF8String.fromBytes(new byte[0]);
+    } else {
+      return copyUTF8String(s, this.numBytes - 1);
+    }
+  }
+
+  public UTF8String trimRight() {
+    int e = numBytes - 1;
+    // skip all of the space (0x20) in the right side
+    while (e >= 0 && getByte(e) == 0x20) e--;
+
+    if (e < 0) {
+      // empty string
+      return UTF8String.fromBytes(new byte[0]);
+    } else {
+      return copyUTF8String(0, e);
+    }
+  }
+
+  public UTF8String reverse() {
+    byte[] bytes = getBytes();
+    byte[] result = new byte[bytes.length];
+
+    int i = 0; // position in byte
+    while (i < numBytes) {
+      int len = numBytesForFirstByte(getByte(i));
+      System.arraycopy(bytes, i, result, result.length - i - len, len);
+
+      i += len;
+    }
+
+    return UTF8String.fromBytes(result);
+  }
+
+  public UTF8String repeat(int times) {
+    if (times <=0) {
+      return fromBytes(new byte[0]);
+    }
+
+    byte[] newBytes = new byte[numBytes * times];
+    System.arraycopy(getBytes(), 0, newBytes, 0, numBytes);
+
+    int copied = 1;
+    while (copied < times) {
+      int toCopy = Math.min(copied, times - copied);
+      System.arraycopy(newBytes, 0, newBytes, copied * numBytes, numBytes * toCopy);
+      copied += toCopy;
+    }
+
+    return UTF8String.fromBytes(newBytes);
+  }
+
+  /**
+   * Returns the position of the first occurrence of substr in
+   * current string from the specified position (0-based index).
+   *
+   * @param v the string to be searched
+   * @param start the start position of the current string for searching
+   * @return the position of the first occurrence of substr, if not found, -1 returned.
+   */
+  public int indexOf(UTF8String v, int start) {
+    if (v.numBytes() == 0) {
+      return 0;
+    }
+
+    // locate to the start position.
+    int i = 0; // position in byte
+    int c = 0; // position in character
+    while (i < numBytes && c < start) {
+      i += numBytesForFirstByte(getByte(i));
+      c += 1;
+    }
+
+    do {
+      if (i + v.numBytes > numBytes) {
+        return -1;
+      }
+      if (ByteArrayMethods.arrayEquals(base, offset + i, v.base, v.offset, v.numBytes)) {
+        return c;
+      }
+      i += numBytesForFirstByte(getByte(i));
+      c += 1;
+    } while(i < numBytes);
+
+    return -1;
+  }
+
+  /**
+   * Returns str, right-padded with pad to a length of len
+   * For example:
+   *   ('hi', 5, '??') => 'hi???'
+   *   ('hi', 1, '??') => 'h'
+   */
+  public UTF8String rpad(int len, UTF8String pad) {
+    int spaces = len - this.numChars(); // number of char need to pad
+    if (spaces <= 0) {
+      // no padding at all, return the substring of the current string
+      return substring(0, len);
+    } else {
+      int padChars = pad.numChars();
+      int count = spaces / padChars; // how many padding string needed
+      // the partial string of the padding
+      UTF8String remain = pad.substring(0, spaces - padChars * count);
+
+      byte[] data = new byte[this.numBytes + pad.numBytes * count + remain.numBytes];
+      System.arraycopy(getBytes(), 0, data, 0, this.numBytes);
+      int offset = this.numBytes;
+      int idx = 0;
+      byte[] padBytes = pad.getBytes();
+      while (idx < count) {
+        System.arraycopy(padBytes, 0, data, offset, pad.numBytes);
+        ++idx;
+        offset += pad.numBytes;
+      }
+      System.arraycopy(remain.getBytes(), 0, data, offset, remain.numBytes);
+
+      return UTF8String.fromBytes(data);
+    }
+  }
+
+  /**
+   * Returns str, left-padded with pad to a length of len.
+   * For example:
+   *   ('hi', 5, '??') => '???hi'
+   *   ('hi', 1, '??') => 'h'
+   */
+  public UTF8String lpad(int len, UTF8String pad) {
+    int spaces = len - this.numChars(); // number of char need to pad
+    if (spaces <= 0) {
+      // no padding at all, return the substring of the current string
+      return substring(0, len);
+    } else {
+      int padChars = pad.numChars();
+      int count = spaces / padChars; // how many padding string needed
+      // the partial string of the padding
+      UTF8String remain = pad.substring(0, spaces - padChars * count);
+
+      byte[] data = new byte[this.numBytes + pad.numBytes * count + remain.numBytes];
+
+      int offset = 0;
+      int idx = 0;
+      byte[] padBytes = pad.getBytes();
+      while (idx < count) {
+        System.arraycopy(padBytes, 0, data, offset, pad.numBytes);
+        ++idx;
+        offset += pad.numBytes;
+      }
+      System.arraycopy(remain.getBytes(), 0, data, offset, remain.numBytes);
+      offset += remain.numBytes;
+      System.arraycopy(getBytes(), 0, data, offset, numBytes());
+
+      return UTF8String.fromBytes(data);
+    }
+  }
+
   @Override
   public String toString() {
     try {
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index fb463ba17f50b..694bdc29f39d1 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -121,12 +121,94 @@ public void endsWith() {
 
   @Test
   public void substring() {
-    assertEquals(fromString("hello").substring(0, 0), fromString(""));
-    assertEquals(fromString("hello").substring(1, 3), fromString("el"));
-    assertEquals(fromString("数据砖头").substring(0, 1), fromString("数"));
-    assertEquals(fromString("数据砖头").substring(1, 3), fromString("据砖"));
-    assertEquals(fromString("数据砖头").substring(3, 5), fromString("头"));
-    assertEquals(fromString("ߵ梷").substring(0, 2), fromString("ߵ梷"));
+    assertEquals(fromString(""), fromString("hello").substring(0, 0));
+    assertEquals(fromString("el"), fromString("hello").substring(1, 3));
+    assertEquals(fromString("数"), fromString("数据砖头").substring(0, 1));
+    assertEquals(fromString("据砖"), fromString("数据砖头").substring(1, 3));
+    assertEquals(fromString("头"), fromString("数据砖头").substring(3, 5));
+    assertEquals(fromString("ߵ梷"), fromString("ߵ梷").substring(0, 2));
+  }
+
+  @Test
+  public void trims() {
+    assertEquals(fromString("hello"), fromString("  hello ").trim());
+    assertEquals(fromString("hello "), fromString("  hello ").trimLeft());
+    assertEquals(fromString("  hello"), fromString("  hello ").trimRight());
+
+    assertEquals(fromString(""), fromString("  ").trim());
+    assertEquals(fromString(""), fromString("  ").trimLeft());
+    assertEquals(fromString(""), fromString("  ").trimRight());
+
+    assertEquals(fromString("数据砖头"), fromString("  数据砖头 ").trim());
+    assertEquals(fromString("数据砖头 "), fromString("  数据砖头 ").trimLeft());
+    assertEquals(fromString("  数据砖头"), fromString("  数据砖头 ").trimRight());
+
+    assertEquals(fromString("数据砖头"), fromString("数据砖头").trim());
+    assertEquals(fromString("数据砖头"), fromString("数据砖头").trimLeft());
+    assertEquals(fromString("数据砖头"), fromString("数据砖头").trimRight());
+  }
+
+  @Test
+  public void indexOf() {
+    assertEquals(0, fromString("").indexOf(fromString(""), 0));
+    assertEquals(-1, fromString("").indexOf(fromString("l"), 0));
+    assertEquals(0, fromString("hello").indexOf(fromString(""), 0));
+    assertEquals(2, fromString("hello").indexOf(fromString("l"), 0));
+    assertEquals(3, fromString("hello").indexOf(fromString("l"), 3));
+    assertEquals(-1, fromString("hello").indexOf(fromString("a"), 0));
+    assertEquals(2, fromString("hello").indexOf(fromString("ll"), 0));
+    assertEquals(-1, fromString("hello").indexOf(fromString("ll"), 4));
+    assertEquals(1, fromString("数据砖头").indexOf(fromString("据砖"), 0));
+    assertEquals(-1, fromString("数据砖头").indexOf(fromString("数"), 3));
+    assertEquals(0, fromString("数据砖头").indexOf(fromString("数"), 0));
+    assertEquals(3, fromString("数据砖头").indexOf(fromString("头"), 0));
+  }
+
+  @Test
+  public void reverse() {
+    assertEquals(fromString("olleh"), fromString("hello").reverse());
+    assertEquals(fromString(""), fromString("").reverse());
+    assertEquals(fromString("者行孙"), fromString("孙行者").reverse());
+    assertEquals(fromString("者行孙 olleh"), fromString("hello 孙行者").reverse());
+  }
+
+  @Test
+  public void repeat() {
+    assertEquals(fromString("数d数d数d数d数d"), fromString("数d").repeat(5));
+    assertEquals(fromString("数d"), fromString("数d").repeat(1));
+    assertEquals(fromString(""), fromString("数d").repeat(-1));
+  }
+
+  @Test
+  public void pad() {
+    assertEquals(fromString("hel"), fromString("hello").lpad(3, fromString("????")));
+    assertEquals(fromString("hello"), fromString("hello").lpad(5, fromString("????")));
+    assertEquals(fromString("?hello"), fromString("hello").lpad(6, fromString("????")));
+    assertEquals(fromString("???????hello"), fromString("hello").lpad(12, fromString("????")));
+    assertEquals(fromString("?????hello"), fromString("hello").lpad(10, fromString("?????")));
+    assertEquals(fromString("???????"), fromString("").lpad(7, fromString("?????")));
+
+    assertEquals(fromString("hel"), fromString("hello").rpad(3, fromString("????")));
+    assertEquals(fromString("hello"), fromString("hello").rpad(5, fromString("????")));
+    assertEquals(fromString("hello?"), fromString("hello").rpad(6, fromString("????")));
+    assertEquals(fromString("hello???????"), fromString("hello").rpad(12, fromString("????")));
+    assertEquals(fromString("hello?????"), fromString("hello").rpad(10, fromString("?????")));
+    assertEquals(fromString("???????"), fromString("").rpad(7, fromString("?????")));
+
+
+    assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, fromString("????")));
+    assertEquals(fromString("?数据砖头"), fromString("数据砖头").lpad(5, fromString("????")));
+    assertEquals(fromString("??数据砖头"), fromString("数据砖头").lpad(6, fromString("????")));
+    assertEquals(fromString("孙行数据砖头"), fromString("数据砖头").lpad(6, fromString("孙行者")));
+    assertEquals(fromString("孙行者数据砖头"), fromString("数据砖头").lpad(7, fromString("孙行者")));
+    assertEquals(fromString("孙行者孙行者孙行数据砖头"), fromString("数据砖头").lpad(12, fromString("孙行者")));
+
+    assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, fromString("????")));
+    assertEquals(fromString("数据砖头?"), fromString("数据砖头").rpad(5, fromString("????")));
+    assertEquals(fromString("数据砖头??"), fromString("数据砖头").rpad(6, fromString("????")));
+    assertEquals(fromString("数据砖头孙行"), fromString("数据砖头").rpad(6, fromString("孙行者")));
+    assertEquals(fromString("数据砖头孙行者"), fromString("数据砖头").rpad(7, fromString("孙行者")));
+    assertEquals(fromString("数据砖头孙行者孙行者孙行"), fromString("数据砖头").rpad(12, fromString("孙行者")));
   }
   
   @Test

From 7ce3b818fb1ba3f291eda58988e4808e999cae3a Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 9 Jul 2015 13:19:36 -0700
Subject: [PATCH 091/549] [MINOR] [STREAMING] Fix log statements in
 ReceiverSupervisorImpl

Log statements incorrectly showed that the executor was being stopped when receiver was being stopped.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #7328 from tdas/fix-log and squashes the following commits:

9cc6e99 [Tathagata Das] Fix log statements.
---
 .../spark/streaming/receiver/ReceiverSupervisor.scala       | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
index 33be067ebdaf2..eeb14ca3a49e9 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
@@ -182,12 +182,12 @@ private[streaming] abstract class ReceiverSupervisor(
 
   /** Wait the thread until the supervisor is stopped */
   def awaitTermination() {
+    logInfo("Waiting for receiver to be stopped")
     stopLatch.await()
-    logInfo("Waiting for executor stop is over")
     if (stoppingError != null) {
-      logError("Stopped executor with error: " + stoppingError)
+      logError("Stopped receiver with error: " + stoppingError)
     } else {
-      logWarning("Stopped executor without error")
+      logInfo("Stopped receiver without error")
     }
     if (stoppingError != null) {
       throw stoppingError

From 930fe95350f8865e2af2d7afa5b717210933cd43 Mon Sep 17 00:00:00 2001
From: xutingjun <xutingjun@huawei.com>
Date: Thu, 9 Jul 2015 13:21:10 -0700
Subject: [PATCH 092/549] [SPARK-8953] SPARK_EXECUTOR_CORES is not read in
 SparkSubmit

The configuration ```SPARK_EXECUTOR_CORES``` won't put into ```SparkConf```, so it has no effect to the dynamic executor allocation.

Author: xutingjun <xutingjun@huawei.com>

Closes #7322 from XuTingjun/SPARK_EXECUTOR_CORES and squashes the following commits:

2cafa89 [xutingjun] make SPARK_EXECUTOR_CORES has effect to dynamicAllocation
---
 .../scala/org/apache/spark/deploy/SparkSubmitArguments.scala     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 73ab18332feb4..6e3c0b21b33c2 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -162,6 +162,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
       .orNull
     executorCores = Option(executorCores)
       .orElse(sparkProperties.get("spark.executor.cores"))
+      .orElse(env.get("SPARK_EXECUTOR_CORES"))
       .orNull
     totalExecutorCores = Option(totalExecutorCores)
       .orElse(sparkProperties.get("spark.cores.max"))

From 88bf430331eef3c02438ca441616034486e15789 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Thu, 9 Jul 2015 13:22:17 -0700
Subject: [PATCH 093/549] [SPARK-7419] [STREAMING] [TESTS] Fix
 CheckpointSuite.recovery with file input stream

Fix this failure: https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-SBT/2886/AMPLAB_JENKINS_BUILD_PROFILE=hadoop2.3,label=centos/testReport/junit/org.apache.spark.streaming/CheckpointSuite/recovery_with_file_input_stream/

To reproduce this failure, you can add `Thread.sleep(2000)` before this line
https://github.com/apache/spark/blob/a9c4e29950a14e32acaac547e9a0e8879fd37fc9/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala#L477

Author: zsxwing <zsxwing@gmail.com>

Closes #7323 from zsxwing/SPARK-7419 and squashes the following commits:

b3caf58 [zsxwing] Fix CheckpointSuite.recovery with file input stream
---
 .../spark/streaming/CheckpointSuite.scala      | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala
index 6b0a3f91d4d06..6a94928076236 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala
@@ -424,11 +424,11 @@ class CheckpointSuite extends TestSuiteBase {
             }
           }
         }
-        clock.advance(batchDuration.milliseconds)
         eventually(eventuallyTimeout) {
           // Wait until all files have been recorded and all batches have started
           assert(recordedFiles(ssc) === Seq(1, 2, 3) && batchCounter.getNumStartedBatches === 3)
         }
+        clock.advance(batchDuration.milliseconds)
         // Wait for a checkpoint to be written
         eventually(eventuallyTimeout) {
           assert(Checkpoint.getCheckpointFiles(checkpointDir).size === 6)
@@ -454,9 +454,12 @@ class CheckpointSuite extends TestSuiteBase {
       // recorded before failure were saved and successfully recovered
       logInfo("*********** RESTARTING ************")
       withStreamingContext(new StreamingContext(checkpointDir)) { ssc =>
-        // So that the restarted StreamingContext's clock has gone forward in time since failure
-        ssc.conf.set("spark.streaming.manualClock.jump", (batchDuration * 3).milliseconds.toString)
-        val oldClockTime = clock.getTimeMillis()
+        // "batchDuration.milliseconds * 3" has gone before restarting StreamingContext. And because
+        // the recovery time is read from the checkpoint time but the original clock doesn't align
+        // with the batch time, we need to add the offset "batchDuration.milliseconds / 2".
+        ssc.conf.set("spark.streaming.manualClock.jump",
+          (batchDuration.milliseconds / 2 + batchDuration.milliseconds * 3).toString)
+        val oldClockTime = clock.getTimeMillis() // 15000ms
         clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
         val batchCounter = new BatchCounter(ssc)
         val outputStream = ssc.graph.getOutputStreams().head.asInstanceOf[TestOutputStream[Int]]
@@ -467,10 +470,10 @@ class CheckpointSuite extends TestSuiteBase {
         ssc.start()
         // Verify that the clock has traveled forward to the expected time
         eventually(eventuallyTimeout) {
-          clock.getTimeMillis() === oldClockTime
+          assert(clock.getTimeMillis() === oldClockTime)
         }
-        // Wait for pre-failure batch to be recomputed (3 while SSC was down plus last batch)
-        val numBatchesAfterRestart = 4
+        // There are 5 batches between 6000ms and 15000ms (inclusive).
+        val numBatchesAfterRestart = 5
         eventually(eventuallyTimeout) {
           assert(batchCounter.getNumCompletedBatches === numBatchesAfterRestart)
         }
@@ -483,7 +486,6 @@ class CheckpointSuite extends TestSuiteBase {
             assert(batchCounter.getNumCompletedBatches === index + numBatchesAfterRestart + 1)
           }
         }
-        clock.advance(batchDuration.milliseconds)
         logInfo("Output after restart = " + outputStream.output.mkString("[", ", ", "]"))
         assert(outputStream.output.size > 0, "No files processed after restart")
         ssc.stop()

From ebdf58538058e57381c04b6725d4be0c37847ed3 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Thu, 9 Jul 2015 13:25:11 -0700
Subject: [PATCH 094/549] [SPARK-2017] [UI] Stage page hangs with many tasks

(This reopens a patch that was closed in the past: #6248)

When you view the stage page while running the following:
```
sc.parallelize(1 to X, 10000).count()
```
The page never loads, the job is stalled, and you end up running into an OOM:
```
HTTP ERROR 500

Problem accessing /stages/stage/. Reason:
    Server Error
Caused by:
java.lang.OutOfMemoryError: Java heap space
    at java.util.Arrays.copyOf(Arrays.java:2367)
    at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)
```
This patch compresses Jetty responses in gzip. The correct long-term fix is to add pagination.

Author: Andrew Or <andrew@databricks.com>

Closes #7296 from andrewor14/gzip-jetty and squashes the following commits:

a051c64 [Andrew Or] Use GZIP to compress Jetty responses
---
 .../main/scala/org/apache/spark/ui/JettyUtils.scala    | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala
index 06e616220c706..f413c1d37fbb6 100644
--- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala
@@ -210,10 +210,16 @@ private[spark] object JettyUtils extends Logging {
       conf: SparkConf,
       serverName: String = ""): ServerInfo = {
 
-    val collection = new ContextHandlerCollection
-    collection.setHandlers(handlers.toArray)
     addFilters(handlers, conf)
 
+    val collection = new ContextHandlerCollection
+    val gzipHandlers = handlers.map { h =>
+      val gzipHandler = new GzipHandler
+      gzipHandler.setHandler(h)
+      gzipHandler
+    }
+    collection.setHandlers(gzipHandlers.toArray)
+
     // Bind to the given port, or throw a java.net.BindException if the port is occupied
     def connect(currentPort: Int): (Server, Int) = {
       val server = new Server(new InetSocketAddress(hostName, currentPort))

From c4830598b271cc6390d127bd4cf8ab02b28792e0 Mon Sep 17 00:00:00 2001
From: Iulian Dragos <jaguarul@gmail.com>
Date: Thu, 9 Jul 2015 13:26:46 -0700
Subject: [PATCH 095/549] [SPARK-6287] [MESOS] Add dynamic allocation to the
 coarse-grained Mesos scheduler

This is largely based on extracting the dynamic allocation parts from tnachen's #3861.

Author: Iulian Dragos <jaguarul@gmail.com>

Closes #4984 from dragos/issue/mesos-coarse-dynamicAllocation and squashes the following commits:

39df8cd [Iulian Dragos] Update tests to latest changes in core.
9d2c9fa [Iulian Dragos] Remove adjustment of executorLimitOption in doKillExecutors.
8b00f52 [Iulian Dragos] Latest round of reviews.
0cd00e0 [Iulian Dragos] Add persistent shuffle directory
15c45c1 [Iulian Dragos] Add dynamic allocation to the Spark coarse-grained scheduler.
---
 .../scala/org/apache/spark/SparkContext.scala |  19 +-
 .../mesos/CoarseMesosSchedulerBackend.scala   | 136 +++++++++++---
 .../cluster/mesos/MesosSchedulerUtils.scala   |   4 +-
 .../spark/storage/DiskBlockManager.scala      |   8 +-
 .../scala/org/apache/spark/util/Utils.scala   |  45 +++--
 .../CoarseMesosSchedulerBackendSuite.scala    | 175 ++++++++++++++++++
 6 files changed, 331 insertions(+), 56 deletions(-)
 create mode 100644 core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackendSuite.scala

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index d2547eeff2b4e..82704b1ab2189 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -532,7 +532,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     _executorAllocationManager =
       if (dynamicAllocationEnabled) {
         assert(supportDynamicAllocation,
-          "Dynamic allocation of executors is currently only supported in YARN mode")
+          "Dynamic allocation of executors is currently only supported in YARN and Mesos mode")
         Some(new ExecutorAllocationManager(this, listenerBus, _conf))
       } else {
         None
@@ -853,7 +853,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       minPartitions).setName(path)
   }
 
-
   /**
    * :: Experimental ::
    *
@@ -1364,10 +1363,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
 
   /**
    * Return whether dynamically adjusting the amount of resources allocated to
-   * this application is supported. This is currently only available for YARN.
+   * this application is supported. This is currently only available for YARN
+   * and Mesos coarse-grained mode.
    */
-  private[spark] def supportDynamicAllocation =
-    master.contains("yarn") || _conf.getBoolean("spark.dynamicAllocation.testing", false)
+  private[spark] def supportDynamicAllocation: Boolean = {
+    (master.contains("yarn")
+      || master.contains("mesos")
+      || _conf.getBoolean("spark.dynamicAllocation.testing", false))
+  }
 
   /**
    * :: DeveloperApi ::
@@ -1385,7 +1388,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    */
   private[spark] override def requestTotalExecutors(numExecutors: Int): Boolean = {
     assert(supportDynamicAllocation,
-      "Requesting executors is currently only supported in YARN mode")
+      "Requesting executors is currently only supported in YARN and Mesos modes")
     schedulerBackend match {
       case b: CoarseGrainedSchedulerBackend =>
         b.requestTotalExecutors(numExecutors)
@@ -1403,7 +1406,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   @DeveloperApi
   override def requestExecutors(numAdditionalExecutors: Int): Boolean = {
     assert(supportDynamicAllocation,
-      "Requesting executors is currently only supported in YARN mode")
+      "Requesting executors is currently only supported in YARN and Mesos modes")
     schedulerBackend match {
       case b: CoarseGrainedSchedulerBackend =>
         b.requestExecutors(numAdditionalExecutors)
@@ -1421,7 +1424,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   @DeveloperApi
   override def killExecutors(executorIds: Seq[String]): Boolean = {
     assert(supportDynamicAllocation,
-      "Killing executors is currently only supported in YARN mode")
+      "Killing executors is currently only supported in YARN and Mesos modes")
     schedulerBackend match {
       case b: CoarseGrainedSchedulerBackend =>
         b.killExecutors(executorIds)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
index b68f8c7685eba..cbade131494bc 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
@@ -18,11 +18,14 @@
 package org.apache.spark.scheduler.cluster.mesos
 
 import java.io.File
-import java.util.{List => JList}
+import java.util.{List => JList, Collections}
+import java.util.concurrent.locks.ReentrantLock
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable.{HashMap, HashSet}
 
+import com.google.common.collect.HashBiMap
+import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, _}
 import org.apache.mesos.{Scheduler => MScheduler, _}
 import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, _}
 import org.apache.spark.{SparkContext, SparkEnv, SparkException, TaskState}
@@ -60,9 +63,27 @@ private[spark] class CoarseMesosSchedulerBackend(
 
   val slaveIdsWithExecutors = new HashSet[String]
 
-  val taskIdToSlaveId = new HashMap[Int, String]
-  val failuresBySlaveId = new HashMap[String, Int] // How many times tasks on each slave failed
+  val taskIdToSlaveId: HashBiMap[Int, String] = HashBiMap.create[Int, String]
+  // How many times tasks on each slave failed
+  val failuresBySlaveId: HashMap[String, Int] = new HashMap[String, Int]
+
+  /**
+   * The total number of executors we aim to have. Undefined when not using dynamic allocation
+   * and before the ExecutorAllocatorManager calls [[doRequesTotalExecutors]].
+   */
+  private var executorLimitOption: Option[Int] = None
+
+  /**
+   *  Return the current executor limit, which may be [[Int.MaxValue]]
+   *  before properly initialized.
+   */
+  private[mesos] def executorLimit: Int = executorLimitOption.getOrElse(Int.MaxValue)
+
+  private val pendingRemovedSlaveIds = new HashSet[String]
 
+  // private lock object protecting mutable state above. Using the intrinsic lock
+  // may lead to deadlocks since the superclass might also try to lock
+  private val stateLock = new ReentrantLock
 
   val extraCoresPerSlave = conf.getInt("spark.mesos.extra.cores", 0)
 
@@ -86,7 +107,7 @@ private[spark] class CoarseMesosSchedulerBackend(
     startScheduler(master, CoarseMesosSchedulerBackend.this, fwInfo)
   }
 
-  def createCommand(offer: Offer, numCores: Int): CommandInfo = {
+  def createCommand(offer: Offer, numCores: Int, taskId: Int): CommandInfo = {
     val executorSparkHome = conf.getOption("spark.mesos.executor.home")
       .orElse(sc.getSparkHome())
       .getOrElse {
@@ -120,10 +141,6 @@ private[spark] class CoarseMesosSchedulerBackend(
     }
     val command = CommandInfo.newBuilder()
       .setEnvironment(environment)
-    val driverUrl = sc.env.rpcEnv.uriOf(
-      SparkEnv.driverActorSystemName,
-      RpcAddress(conf.get("spark.driver.host"), conf.get("spark.driver.port").toInt),
-      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)
 
     val uri = conf.getOption("spark.executor.uri")
       .orElse(Option(System.getenv("SPARK_EXECUTOR_URI")))
@@ -133,7 +150,7 @@ private[spark] class CoarseMesosSchedulerBackend(
       command.setValue(
         "%s \"%s\" org.apache.spark.executor.CoarseGrainedExecutorBackend"
           .format(prefixEnv, runScript) +
-        s" --driver-url $driverUrl" +
+        s" --driver-url $driverURL" +
         s" --executor-id ${offer.getSlaveId.getValue}" +
         s" --hostname ${offer.getHostname}" +
         s" --cores $numCores" +
@@ -142,11 +159,12 @@ private[spark] class CoarseMesosSchedulerBackend(
       // Grab everything to the first '.'. We'll use that and '*' to
       // glob the directory "correctly".
       val basename = uri.get.split('/').last.split('.').head
+      val executorId = sparkExecutorId(offer.getSlaveId.getValue, taskId.toString)
       command.setValue(
         s"cd $basename*; $prefixEnv " +
          "./bin/spark-class org.apache.spark.executor.CoarseGrainedExecutorBackend" +
-        s" --driver-url $driverUrl" +
-        s" --executor-id ${offer.getSlaveId.getValue}" +
+        s" --driver-url $driverURL" +
+        s" --executor-id $executorId" +
         s" --hostname ${offer.getHostname}" +
         s" --cores $numCores" +
         s" --app-id $appId")
@@ -155,6 +173,17 @@ private[spark] class CoarseMesosSchedulerBackend(
     command.build()
   }
 
+  protected def driverURL: String = {
+    if (conf.contains("spark.testing")) {
+      "driverURL"
+    } else {
+      sc.env.rpcEnv.uriOf(
+        SparkEnv.driverActorSystemName,
+        RpcAddress(conf.get("spark.driver.host"), conf.get("spark.driver.port").toInt),
+        CoarseGrainedSchedulerBackend.ENDPOINT_NAME)
+    }
+  }
+
   override def offerRescinded(d: SchedulerDriver, o: OfferID) {}
 
   override def registered(d: SchedulerDriver, frameworkId: FrameworkID, masterInfo: MasterInfo) {
@@ -172,17 +201,18 @@ private[spark] class CoarseMesosSchedulerBackend(
    * unless we've already launched more than we wanted to.
    */
   override def resourceOffers(d: SchedulerDriver, offers: JList[Offer]) {
-    synchronized {
+    stateLock.synchronized {
       val filters = Filters.newBuilder().setRefuseSeconds(5).build()
       for (offer <- offers) {
         val offerAttributes = toAttributeMap(offer.getAttributesList)
         val meetsConstraints = matchesAttributeRequirements(slaveOfferConstraints, offerAttributes)
-        val slaveId = offer.getSlaveId.toString
+        val slaveId = offer.getSlaveId.getValue
         val mem = getResource(offer.getResourcesList, "mem")
         val cpus = getResource(offer.getResourcesList, "cpus").toInt
         val id = offer.getId.getValue
-        if (meetsConstraints &&
+        if (taskIdToSlaveId.size < executorLimit &&
             totalCoresAcquired < maxCores &&
+            meetsConstraints &&
             mem >= calculateTotalMemory(sc) &&
             cpus >= 1 &&
             failuresBySlaveId.getOrElse(slaveId, 0) < MAX_SLAVE_FAILURES &&
@@ -197,7 +227,7 @@ private[spark] class CoarseMesosSchedulerBackend(
           val task = MesosTaskInfo.newBuilder()
             .setTaskId(TaskID.newBuilder().setValue(taskId.toString).build())
             .setSlaveId(offer.getSlaveId)
-            .setCommand(createCommand(offer, cpusToUse + extraCoresPerSlave))
+            .setCommand(createCommand(offer, cpusToUse + extraCoresPerSlave, taskId))
             .setName("Task " + taskId)
             .addResources(createResource("cpus", cpusToUse))
             .addResources(createResource("mem", calculateTotalMemory(sc)))
@@ -209,7 +239,9 @@ private[spark] class CoarseMesosSchedulerBackend(
 
           // accept the offer and launch the task
           logDebug(s"Accepting offer: $id with attributes: $offerAttributes mem: $mem cpu: $cpus")
-          d.launchTasks(List(offer.getId), List(task.build()), filters)
+          d.launchTasks(
+            Collections.singleton(offer.getId),
+            Collections.singleton(task.build()), filters)
         } else {
           // Decline the offer
           logDebug(s"Declining offer: $id with attributes: $offerAttributes mem: $mem cpu: $cpus")
@@ -224,7 +256,7 @@ private[spark] class CoarseMesosSchedulerBackend(
     val taskId = status.getTaskId.getValue.toInt
     val state = status.getState
     logInfo("Mesos task " + taskId + " is now " + state)
-    synchronized {
+    stateLock.synchronized {
       if (TaskState.isFinished(TaskState.fromMesos(state))) {
         val slaveId = taskIdToSlaveId(taskId)
         slaveIdsWithExecutors -= slaveId
@@ -242,8 +274,9 @@ private[spark] class CoarseMesosSchedulerBackend(
                 "is Spark installed on it?")
           }
         }
+        executorTerminated(d, slaveId, s"Executor finished with state $state")
         // In case we'd rejected everything before but have now lost a node
-        mesosDriver.reviveOffers()
+        d.reviveOffers()
       }
     }
   }
@@ -262,18 +295,39 @@ private[spark] class CoarseMesosSchedulerBackend(
 
   override def frameworkMessage(d: SchedulerDriver, e: ExecutorID, s: SlaveID, b: Array[Byte]) {}
 
-  override def slaveLost(d: SchedulerDriver, slaveId: SlaveID) {
-    logInfo("Mesos slave lost: " + slaveId.getValue)
-    synchronized {
-      if (slaveIdsWithExecutors.contains(slaveId.getValue)) {
-        // Note that the slave ID corresponds to the executor ID on that slave
-        slaveIdsWithExecutors -= slaveId.getValue
-        removeExecutor(slaveId.getValue, "Mesos slave lost")
+  /**
+   * Called when a slave is lost or a Mesos task finished. Update local view on
+   * what tasks are running and remove the terminated slave from the list of pending
+   * slave IDs that we might have asked to be killed. It also notifies the driver
+   * that an executor was removed.
+   */
+  private def executorTerminated(d: SchedulerDriver, slaveId: String, reason: String): Unit = {
+    stateLock.synchronized {
+      if (slaveIdsWithExecutors.contains(slaveId)) {
+        val slaveIdToTaskId = taskIdToSlaveId.inverse()
+        if (slaveIdToTaskId.contains(slaveId)) {
+          val taskId: Int = slaveIdToTaskId.get(slaveId)
+          taskIdToSlaveId.remove(taskId)
+          removeExecutor(sparkExecutorId(slaveId, taskId.toString), reason)
+        }
+        // TODO: This assumes one Spark executor per Mesos slave,
+        // which may no longer be true after SPARK-5095
+        pendingRemovedSlaveIds -= slaveId
+        slaveIdsWithExecutors -= slaveId
       }
     }
   }
 
-  override def executorLost(d: SchedulerDriver, e: ExecutorID, s: SlaveID, status: Int) {
+  private def sparkExecutorId(slaveId: String, taskId: String): String = {
+    s"$slaveId/$taskId"
+  }
+
+  override def slaveLost(d: SchedulerDriver, slaveId: SlaveID): Unit = {
+    logInfo("Mesos slave lost: " + slaveId.getValue)
+    executorTerminated(d, slaveId.getValue, "Mesos slave lost: " + slaveId.getValue)
+  }
+
+  override def executorLost(d: SchedulerDriver, e: ExecutorID, s: SlaveID, status: Int): Unit = {
     logInfo("Executor lost: %s, marking slave %s as lost".format(e.getValue, s.getValue))
     slaveLost(d, s)
   }
@@ -284,4 +338,34 @@ private[spark] class CoarseMesosSchedulerBackend(
       super.applicationId
     }
 
+  override def doRequestTotalExecutors(requestedTotal: Int): Boolean = {
+    // We don't truly know if we can fulfill the full amount of executors
+    // since at coarse grain it depends on the amount of slaves available.
+    logInfo("Capping the total amount of executors to " + requestedTotal)
+    executorLimitOption = Some(requestedTotal)
+    true
+  }
+
+  override def doKillExecutors(executorIds: Seq[String]): Boolean = {
+    if (mesosDriver == null) {
+      logWarning("Asked to kill executors before the Mesos driver was started.")
+      return false
+    }
+
+    val slaveIdToTaskId = taskIdToSlaveId.inverse()
+    for (executorId <- executorIds) {
+      val slaveId = executorId.split("/")(0)
+      if (slaveIdToTaskId.contains(slaveId)) {
+        mesosDriver.killTask(
+          TaskID.newBuilder().setValue(slaveIdToTaskId.get(slaveId).toString).build())
+        pendingRemovedSlaveIds += slaveId
+      } else {
+        logWarning("Unable to find executor Id '" + executorId + "' in Mesos scheduler")
+      }
+    }
+    // no need to adjust `executorLimitOption` since the AllocationManager already communicated
+    // the desired limit through a call to `doRequestTotalExecutors`.
+    // See [[o.a.s.scheduler.cluster.CoarseGrainedSchedulerBackend.killExecutors]]
+    true
+  }
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
index d8a8c848bb4d1..925702e63afd3 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
@@ -24,7 +24,7 @@ import scala.collection.JavaConversions._
 import scala.util.control.NonFatal
 
 import com.google.common.base.Splitter
-import org.apache.mesos.{MesosSchedulerDriver, Protos, Scheduler}
+import org.apache.mesos.{MesosSchedulerDriver, SchedulerDriver, Scheduler, Protos}
 import org.apache.mesos.Protos._
 import org.apache.mesos.protobuf.GeneratedMessage
 import org.apache.spark.{Logging, SparkContext}
@@ -39,7 +39,7 @@ private[mesos] trait MesosSchedulerUtils extends Logging {
   private final val registerLatch = new CountDownLatch(1)
 
   // Driver for talking to Mesos
-  protected var mesosDriver: MesosSchedulerDriver = null
+  protected var mesosDriver: SchedulerDriver = null
 
   /**
    * Starts the MesosSchedulerDriver with the provided information. This method returns
diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
index 91ef86389a0c3..5f537692a16c5 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala
@@ -124,10 +124,16 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon
     (blockId, getFile(blockId))
   }
 
+  /**
+   * Create local directories for storing block data. These directories are
+   * located inside configured local directories and won't
+   * be deleted on JVM exit when using the external shuffle service.
+   */
   private def createLocalDirs(conf: SparkConf): Array[File] = {
-    Utils.getOrCreateLocalRootDirs(conf).flatMap { rootDir =>
+    Utils.getConfiguredLocalDirs(conf).flatMap { rootDir =>
       try {
         val localDir = Utils.createDirectory(rootDir, "blockmgr")
+        Utils.chmod700(localDir)
         logInfo(s"Created local directory at $localDir")
         Some(localDir)
       } catch {
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 944560a91354a..b6b932104a94d 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -733,7 +733,12 @@ private[spark] object Utils extends Logging {
     localRootDirs
   }
 
-  private def getOrCreateLocalRootDirsImpl(conf: SparkConf): Array[String] = {
+  /**
+   * Return the configured local directories where Spark can write files. This
+   * method does not create any directories on its own, it only encapsulates the
+   * logic of locating the local directories according to deployment mode.
+   */
+  def getConfiguredLocalDirs(conf: SparkConf): Array[String] = {
     if (isRunningInYarnContainer(conf)) {
       // If we are in yarn mode, systems can have different disk layouts so we must set it
       // to what Yarn on this system said was available. Note this assumes that Yarn has
@@ -749,27 +754,29 @@ private[spark] object Utils extends Logging {
       Option(conf.getenv("SPARK_LOCAL_DIRS"))
         .getOrElse(conf.get("spark.local.dir", System.getProperty("java.io.tmpdir")))
         .split(",")
-        .flatMap { root =>
-          try {
-            val rootDir = new File(root)
-            if (rootDir.exists || rootDir.mkdirs()) {
-              val dir = createTempDir(root)
-              chmod700(dir)
-              Some(dir.getAbsolutePath)
-            } else {
-              logError(s"Failed to create dir in $root. Ignoring this directory.")
-              None
-            }
-          } catch {
-            case e: IOException =>
-            logError(s"Failed to create local root dir in $root. Ignoring this directory.")
-            None
-          }
-        }
-        .toArray
     }
   }
 
+  private def getOrCreateLocalRootDirsImpl(conf: SparkConf): Array[String] = {
+    getConfiguredLocalDirs(conf).flatMap { root =>
+      try {
+        val rootDir = new File(root)
+        if (rootDir.exists || rootDir.mkdirs()) {
+          val dir = createTempDir(root)
+          chmod700(dir)
+          Some(dir.getAbsolutePath)
+        } else {
+          logError(s"Failed to create dir in $root. Ignoring this directory.")
+          None
+        }
+      } catch {
+        case e: IOException =>
+          logError(s"Failed to create local root dir in $root. Ignoring this directory.")
+          None
+      }
+    }.toArray
+  }
+
   /** Get the Yarn approved local directories. */
   private def getYarnLocalDirs(conf: SparkConf): String = {
     // Hadoop 0.23 and 2.x have different Environment variable names for the
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackendSuite.scala
new file mode 100644
index 0000000000000..3f1692917a357
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackendSuite.scala
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.util
+import java.util.Collections
+
+import org.apache.mesos.Protos.Value.Scalar
+import org.apache.mesos.Protos._
+import org.apache.mesos.SchedulerDriver
+import org.mockito.Matchers._
+import org.mockito.Mockito._
+import org.mockito.Matchers
+import org.scalatest.mock.MockitoSugar
+import org.scalatest.BeforeAndAfter
+
+import org.apache.spark.scheduler.TaskSchedulerImpl
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
+
+class CoarseMesosSchedulerBackendSuite extends SparkFunSuite
+    with LocalSparkContext
+    with MockitoSugar
+    with BeforeAndAfter {
+
+  private def createOffer(offerId: String, slaveId: String, mem: Int, cpu: Int): Offer = {
+    val builder = Offer.newBuilder()
+    builder.addResourcesBuilder()
+      .setName("mem")
+      .setType(Value.Type.SCALAR)
+      .setScalar(Scalar.newBuilder().setValue(mem))
+    builder.addResourcesBuilder()
+      .setName("cpus")
+      .setType(Value.Type.SCALAR)
+      .setScalar(Scalar.newBuilder().setValue(cpu))
+    builder.setId(OfferID.newBuilder()
+      .setValue(offerId).build())
+      .setFrameworkId(FrameworkID.newBuilder()
+        .setValue("f1"))
+      .setSlaveId(SlaveID.newBuilder().setValue(slaveId))
+      .setHostname(s"host${slaveId}")
+      .build()
+  }
+
+  private def createSchedulerBackend(
+      taskScheduler: TaskSchedulerImpl,
+      driver: SchedulerDriver): CoarseMesosSchedulerBackend = {
+    val backend = new CoarseMesosSchedulerBackend(taskScheduler, sc, "master") {
+      mesosDriver = driver
+      markRegistered()
+    }
+    backend.start()
+    backend
+  }
+
+  var sparkConf: SparkConf = _
+
+  before {
+    sparkConf = (new SparkConf)
+      .setMaster("local[*]")
+      .setAppName("test-mesos-dynamic-alloc")
+      .setSparkHome("/path")
+
+    sc = new SparkContext(sparkConf)
+  }
+
+  test("mesos supports killing and limiting executors") {
+    val driver = mock[SchedulerDriver]
+    val taskScheduler = mock[TaskSchedulerImpl]
+    when(taskScheduler.sc).thenReturn(sc)
+
+    sparkConf.set("spark.driver.host", "driverHost")
+    sparkConf.set("spark.driver.port", "1234")
+
+    val backend = createSchedulerBackend(taskScheduler, driver)
+    val minMem = backend.calculateTotalMemory(sc).toInt
+    val minCpu = 4
+
+    val mesosOffers = new java.util.ArrayList[Offer]
+    mesosOffers.add(createOffer("o1", "s1", minMem, minCpu))
+
+    val taskID0 = TaskID.newBuilder().setValue("0").build()
+
+    backend.resourceOffers(driver, mesosOffers)
+    verify(driver, times(1)).launchTasks(
+      Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
+      any[util.Collection[TaskInfo]],
+      any[Filters])
+
+    // simulate the allocation manager down-scaling executors
+    backend.doRequestTotalExecutors(0)
+    assert(backend.doKillExecutors(Seq("s1/0")))
+    verify(driver, times(1)).killTask(taskID0)
+
+    val mesosOffers2 = new java.util.ArrayList[Offer]
+    mesosOffers2.add(createOffer("o2", "s2", minMem, minCpu))
+    backend.resourceOffers(driver, mesosOffers2)
+
+    verify(driver, times(1))
+      .declineOffer(OfferID.newBuilder().setValue("o2").build())
+
+    // Verify we didn't launch any new executor
+    assert(backend.slaveIdsWithExecutors.size === 1)
+
+    backend.doRequestTotalExecutors(2)
+    backend.resourceOffers(driver, mesosOffers2)
+    verify(driver, times(1)).launchTasks(
+      Matchers.eq(Collections.singleton(mesosOffers2.get(0).getId)),
+      any[util.Collection[TaskInfo]],
+      any[Filters])
+
+    assert(backend.slaveIdsWithExecutors.size === 2)
+    backend.slaveLost(driver, SlaveID.newBuilder().setValue("s1").build())
+    assert(backend.slaveIdsWithExecutors.size === 1)
+  }
+
+  test("mesos supports killing and relaunching tasks with executors") {
+    val driver = mock[SchedulerDriver]
+    val taskScheduler = mock[TaskSchedulerImpl]
+    when(taskScheduler.sc).thenReturn(sc)
+
+    val backend = createSchedulerBackend(taskScheduler, driver)
+    val minMem = backend.calculateTotalMemory(sc).toInt + 1024
+    val minCpu = 4
+
+    val mesosOffers = new java.util.ArrayList[Offer]
+    val offer1 = createOffer("o1", "s1", minMem, minCpu)
+    mesosOffers.add(offer1)
+
+    val offer2 = createOffer("o2", "s1", minMem, 1);
+
+    backend.resourceOffers(driver, mesosOffers)
+
+    verify(driver, times(1)).launchTasks(
+      Matchers.eq(Collections.singleton(offer1.getId)),
+      anyObject(),
+      anyObject[Filters])
+
+    // Simulate task killed, executor no longer running
+    val status = TaskStatus.newBuilder()
+      .setTaskId(TaskID.newBuilder().setValue("0").build())
+      .setSlaveId(SlaveID.newBuilder().setValue("s1").build())
+      .setState(TaskState.TASK_KILLED)
+      .build
+
+    backend.statusUpdate(driver, status)
+    assert(!backend.slaveIdsWithExecutors.contains("s1"))
+
+    mesosOffers.clear()
+    mesosOffers.add(offer2)
+    backend.resourceOffers(driver, mesosOffers)
+    assert(backend.slaveIdsWithExecutors.contains("s1"))
+
+    verify(driver, times(1)).launchTasks(
+      Matchers.eq(Collections.singleton(offer2.getId)),
+      anyObject(),
+      anyObject[Filters])
+
+    verify(driver, times(1)).reviveOffers()
+  }
+}

From 1f6b0b1234cc03aa2e07aea7fec2de7563885238 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Thu, 9 Jul 2015 13:48:29 -0700
Subject: [PATCH 096/549] [SPARK-8701] [STREAMING] [WEBUI] Add input metadata
 in the batch page

This PR adds `metadata` to `InputInfo`. `InputDStream` can report its metadata for a batch and it will be shown in the batch page.

For example,

![screen shot](https://cloud.githubusercontent.com/assets/1000778/8403741/d6ffc7e2-1e79-11e5-9888-c78c1575123a.png)

FileInputDStream will display the new files for a batch, and DirectKafkaInputDStream will display its offset ranges.

Author: zsxwing <zsxwing@gmail.com>

Closes #7081 from zsxwing/input-metadata and squashes the following commits:

f7abd9b [zsxwing] Revert the space changes in project/MimaExcludes.scala
d906209 [zsxwing] Merge branch 'master' into input-metadata
74762da [zsxwing] Fix MiMa tests
7903e33 [zsxwing] Merge branch 'master' into input-metadata
450a46c [zsxwing] Address comments
1d94582 [zsxwing] Raname InputInfo to StreamInputInfo and change "metadata" to Map[String, Any]
d496ae9 [zsxwing] Add input metadata in the batch page
---
 .../kafka/DirectKafkaInputDStream.scala       | 23 ++++++++--
 .../spark/streaming/kafka/OffsetRange.scala   |  2 +-
 project/MimaExcludes.scala                    |  6 +++
 .../streaming/dstream/FileInputDStream.scala  | 10 ++++-
 .../dstream/ReceiverInputDStream.scala        |  4 +-
 .../spark/streaming/scheduler/BatchInfo.scala |  9 ++--
 .../scheduler/InputInfoTracker.scala          | 38 +++++++++++++---
 .../streaming/scheduler/JobGenerator.scala    |  3 +-
 .../spark/streaming/scheduler/JobSet.scala    |  4 +-
 .../apache/spark/streaming/ui/BatchPage.scala | 43 +++++++++++++++++--
 .../spark/streaming/ui/BatchUIData.scala      |  8 ++--
 .../ui/StreamingJobProgressListener.scala     |  5 ++-
 .../streaming/StreamingListenerSuite.scala    |  6 +--
 .../spark/streaming/TestSuiteBase.scala       |  2 +-
 .../scheduler/InputInfoTrackerSuite.scala     |  8 ++--
 .../StreamingJobProgressListenerSuite.scala   | 28 ++++++------
 16 files changed, 148 insertions(+), 51 deletions(-)

diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
index 876456c964770..48a1933d92f85 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
@@ -19,7 +19,7 @@ package org.apache.spark.streaming.kafka
 
 import scala.annotation.tailrec
 import scala.collection.mutable
-import scala.reflect.{classTag, ClassTag}
+import scala.reflect.ClassTag
 
 import kafka.common.TopicAndPartition
 import kafka.message.MessageAndMetadata
@@ -29,7 +29,7 @@ import org.apache.spark.{Logging, SparkException}
 import org.apache.spark.streaming.{StreamingContext, Time}
 import org.apache.spark.streaming.dstream._
 import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
-import org.apache.spark.streaming.scheduler.InputInfo
+import org.apache.spark.streaming.scheduler.StreamInputInfo
 
 /**
  *  A stream of {@link org.apache.spark.streaming.kafka.KafkaRDD} where
@@ -119,8 +119,23 @@ class DirectKafkaInputDStream[
     val rdd = KafkaRDD[K, V, U, T, R](
       context.sparkContext, kafkaParams, currentOffsets, untilOffsets, messageHandler)
 
-    // Report the record number of this batch interval to InputInfoTracker.
-    val inputInfo = InputInfo(id, rdd.count)
+    // Report the record number and metadata of this batch interval to InputInfoTracker.
+    val offsetRanges = currentOffsets.map { case (tp, fo) =>
+      val uo = untilOffsets(tp)
+      OffsetRange(tp.topic, tp.partition, fo, uo.offset)
+    }
+    val description = offsetRanges.filter { offsetRange =>
+      // Don't display empty ranges.
+      offsetRange.fromOffset != offsetRange.untilOffset
+    }.map { offsetRange =>
+      s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" +
+        s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}"
+    }.mkString("\n")
+    // Copy offsetRanges to immutable.List to prevent from being modified by the user
+    val metadata = Map(
+      "offsets" -> offsetRanges.toList,
+      StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
+    val inputInfo = StreamInputInfo(id, rdd.count, metadata)
     ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
 
     currentOffsets = untilOffsets.map(kv => kv._1 -> kv._2.offset)
diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala
index 2675042666304..f326e7f1f6f8d 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala
@@ -75,7 +75,7 @@ final class OffsetRange private(
   }
 
   override def toString(): String = {
-    s"OffsetRange(topic: '$topic', partition: $partition, range: [$fromOffset -> $untilOffset]"
+    s"OffsetRange(topic: '$topic', partition: $partition, range: [$fromOffset -> $untilOffset])"
   }
 
   /** this is to avoid ClassNotFoundException during checkpoint restore */
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 821aadd477ef3..79089aae2a37c 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -77,6 +77,12 @@ object MimaExcludes {
             // SPARK-8914 Remove RDDApi
             ProblemFilters.exclude[MissingClassProblem](
             "org.apache.spark.sql.RDDApi")
+          ) ++ Seq(
+            // SPARK-8701 Add input metadata in the batch page.
+            ProblemFilters.exclude[MissingClassProblem](
+              "org.apache.spark.streaming.scheduler.InputInfo$"),
+            ProblemFilters.exclude[MissingClassProblem](
+              "org.apache.spark.streaming.scheduler.InputInfo")
           )
 
         case v if v.startsWith("1.4") =>
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala
index 86a8e2beff57c..dd4da9d9ca6a2 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala
@@ -28,6 +28,7 @@ import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat}
 
 import org.apache.spark.rdd.{RDD, UnionRDD}
 import org.apache.spark.streaming._
+import org.apache.spark.streaming.scheduler.StreamInputInfo
 import org.apache.spark.util.{SerializableConfiguration, TimeStampedHashMap, Utils}
 
 /**
@@ -144,7 +145,14 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]](
     logInfo("New files at time " + validTime + ":\n" + newFiles.mkString("\n"))
     batchTimeToSelectedFiles += ((validTime, newFiles))
     recentlySelectedFiles ++= newFiles
-    Some(filesToRDD(newFiles))
+    val rdds = Some(filesToRDD(newFiles))
+    // Copy newFiles to immutable.List to prevent from being modified by the user
+    val metadata = Map(
+      "files" -> newFiles.toList,
+      StreamInputInfo.METADATA_KEY_DESCRIPTION -> newFiles.mkString("\n"))
+    val inputInfo = StreamInputInfo(id, 0, metadata)
+    ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
+    rdds
   }
 
   /** Clear the old time-to-files mappings along with old RDDs */
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
index e76e7eb0dea19..a50f0efc030ce 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
@@ -24,7 +24,7 @@ import org.apache.spark.storage.BlockId
 import org.apache.spark.streaming._
 import org.apache.spark.streaming.rdd.WriteAheadLogBackedBlockRDD
 import org.apache.spark.streaming.receiver.Receiver
-import org.apache.spark.streaming.scheduler.InputInfo
+import org.apache.spark.streaming.scheduler.StreamInputInfo
 import org.apache.spark.streaming.util.WriteAheadLogUtils
 
 /**
@@ -70,7 +70,7 @@ abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingCont
         val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
 
         // Register the input blocks information into InputInfoTracker
-        val inputInfo = InputInfo(id, blockInfos.flatMap(_.numRecords).sum)
+        val inputInfo = StreamInputInfo(id, blockInfos.flatMap(_.numRecords).sum)
         ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
 
         if (blockInfos.nonEmpty) {
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala
index 5b9bfbf9b01e3..9922b6bc1201b 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala
@@ -24,7 +24,7 @@ import org.apache.spark.streaming.Time
  * :: DeveloperApi ::
  * Class having information on completed batches.
  * @param batchTime   Time of the batch
- * @param streamIdToNumRecords A map of input stream id to record number
+ * @param streamIdToInputInfo A map of input stream id to its input info
  * @param submissionTime  Clock time of when jobs of this batch was submitted to
  *                        the streaming scheduler queue
  * @param processingStartTime Clock time of when the first job of this batch started processing
@@ -33,12 +33,15 @@ import org.apache.spark.streaming.Time
 @DeveloperApi
 case class BatchInfo(
     batchTime: Time,
-    streamIdToNumRecords: Map[Int, Long],
+    streamIdToInputInfo: Map[Int, StreamInputInfo],
     submissionTime: Long,
     processingStartTime: Option[Long],
     processingEndTime: Option[Long]
   ) {
 
+  @deprecated("Use streamIdToInputInfo instead", "1.5.0")
+  def streamIdToNumRecords: Map[Int, Long] = streamIdToInputInfo.mapValues(_.numRecords)
+
   /**
    * Time taken for the first job of this batch to start processing from the time this batch
    * was submitted to the streaming scheduler. Essentially, it is
@@ -63,5 +66,5 @@ case class BatchInfo(
   /**
    * The number of recorders received by the receivers in this batch.
    */
-  def numRecords: Long = streamIdToNumRecords.values.sum
+  def numRecords: Long = streamIdToInputInfo.values.map(_.numRecords).sum
 }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala
index 7c0db8a863c67..363c03d431f04 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala
@@ -20,11 +20,34 @@ package org.apache.spark.streaming.scheduler
 import scala.collection.mutable
 
 import org.apache.spark.Logging
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.streaming.{Time, StreamingContext}
 
-/** To track the information of input stream at specified batch time. */
-private[streaming] case class InputInfo(inputStreamId: Int, numRecords: Long) {
+/**
+ * :: DeveloperApi ::
+ * Track the information of input stream at specified batch time.
+ *
+ * @param inputStreamId the input stream id
+ * @param numRecords the number of records in a batch
+ * @param metadata metadata for this batch. It should contain at least one standard field named
+ *                 "Description" which maps to the content that will be shown in the UI.
+ */
+@DeveloperApi
+case class StreamInputInfo(
+    inputStreamId: Int, numRecords: Long, metadata: Map[String, Any] = Map.empty) {
   require(numRecords >= 0, "numRecords must not be negative")
+
+  def metadataDescription: Option[String] =
+    metadata.get(StreamInputInfo.METADATA_KEY_DESCRIPTION).map(_.toString)
+}
+
+@DeveloperApi
+object StreamInputInfo {
+
+  /**
+   * The key for description in `StreamInputInfo.metadata`.
+   */
+  val METADATA_KEY_DESCRIPTION: String = "Description"
 }
 
 /**
@@ -34,12 +57,13 @@ private[streaming] case class InputInfo(inputStreamId: Int, numRecords: Long) {
 private[streaming] class InputInfoTracker(ssc: StreamingContext) extends Logging {
 
   // Map to track all the InputInfo related to specific batch time and input stream.
-  private val batchTimeToInputInfos = new mutable.HashMap[Time, mutable.HashMap[Int, InputInfo]]
+  private val batchTimeToInputInfos =
+    new mutable.HashMap[Time, mutable.HashMap[Int, StreamInputInfo]]
 
   /** Report the input information with batch time to the tracker */
-  def reportInfo(batchTime: Time, inputInfo: InputInfo): Unit = synchronized {
+  def reportInfo(batchTime: Time, inputInfo: StreamInputInfo): Unit = synchronized {
     val inputInfos = batchTimeToInputInfos.getOrElseUpdate(batchTime,
-      new mutable.HashMap[Int, InputInfo]())
+      new mutable.HashMap[Int, StreamInputInfo]())
 
     if (inputInfos.contains(inputInfo.inputStreamId)) {
       throw new IllegalStateException(s"Input stream ${inputInfo.inputStreamId}} for batch" +
@@ -49,10 +73,10 @@ private[streaming] class InputInfoTracker(ssc: StreamingContext) extends Logging
   }
 
   /** Get the all the input stream's information of specified batch time */
-  def getInfo(batchTime: Time): Map[Int, InputInfo] = synchronized {
+  def getInfo(batchTime: Time): Map[Int, StreamInputInfo] = synchronized {
     val inputInfos = batchTimeToInputInfos.get(batchTime)
     // Convert mutable HashMap to immutable Map for the caller
-    inputInfos.map(_.toMap).getOrElse(Map[Int, InputInfo]())
+    inputInfos.map(_.toMap).getOrElse(Map[Int, StreamInputInfo]())
   }
 
   /** Cleanup the tracked input information older than threshold batch time */
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala
index 9f93d6cbc3c20..f5d41858646e4 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala
@@ -244,8 +244,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging {
     } match {
       case Success(jobs) =>
         val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
-        val streamIdToNumRecords = streamIdToInputInfos.mapValues(_.numRecords)
-        jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToNumRecords))
+        jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
       case Failure(e) =>
         jobScheduler.reportError("Error generating jobs for time " + time, e)
     }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala
index e6be63b2ddbdc..95833efc9417f 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala
@@ -28,7 +28,7 @@ private[streaming]
 case class JobSet(
     time: Time,
     jobs: Seq[Job],
-    streamIdToNumRecords: Map[Int, Long] = Map.empty) {
+    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {
 
   private val incompleteJobs = new HashSet[Job]()
   private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
@@ -64,7 +64,7 @@ case class JobSet(
   def toBatchInfo: BatchInfo = {
     new BatchInfo(
       time,
-      streamIdToNumRecords,
+      streamIdToInputInfo,
       submissionTime,
       if (processingStartTime >= 0 ) Some(processingStartTime) else None,
       if (processingEndTime >= 0 ) Some(processingEndTime) else None
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala
index f75067669abe5..0c891662c264f 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala
@@ -17,11 +17,9 @@
 
 package org.apache.spark.streaming.ui
 
-import java.text.SimpleDateFormat
-import java.util.Date
 import javax.servlet.http.HttpServletRequest
 
-import scala.xml.{NodeSeq, Node, Text}
+import scala.xml.{NodeSeq, Node, Text, Unparsed}
 
 import org.apache.commons.lang3.StringEscapeUtils
 
@@ -303,6 +301,9 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") {
       batchUIData.processingDelay.map(SparkUIUtils.formatDuration).getOrElse("-")
     val formattedTotalDelay = batchUIData.totalDelay.map(SparkUIUtils.formatDuration).getOrElse("-")
 
+    val inputMetadatas = batchUIData.streamIdToInputInfo.values.flatMap { inputInfo =>
+      inputInfo.metadataDescription.map(desc => inputInfo.inputStreamId -> desc)
+    }.toSeq
     val summary: NodeSeq =
       <div>
         <ul class="unstyled">
@@ -326,6 +327,13 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") {
             <strong>Total delay: </strong>
             {formattedTotalDelay}
           </li>
+          {
+            if (inputMetadatas.nonEmpty) {
+              <li>
+                <strong>Input Metadata:</strong>{generateInputMetadataTable(inputMetadatas)}
+              </li>
+            }
+          }
         </ul>
       </div>
 
@@ -340,4 +348,33 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") {
 
     SparkUIUtils.headerSparkPage(s"Details of batch at $formattedBatchTime", content, parent)
   }
+
+  def generateInputMetadataTable(inputMetadatas: Seq[(Int, String)]): Seq[Node] = {
+    <table class={SparkUIUtils.TABLE_CLASS_STRIPED}>
+      <thead>
+        <tr>
+          <th>Input</th>
+          <th>Metadata</th>
+        </tr>
+      </thead>
+      <tbody>
+        {inputMetadatas.flatMap(generateInputMetadataRow)}
+      </tbody>
+    </table>
+  }
+
+  def generateInputMetadataRow(inputMetadata: (Int, String)): Seq[Node] = {
+    val streamId = inputMetadata._1
+
+    <tr>
+      <td>{streamingListener.streamName(streamId).getOrElse(s"Stream-$streamId")}</td>
+      <td>{metadataDescriptionToHTML(inputMetadata._2)}</td>
+    </tr>
+  }
+
+  private def metadataDescriptionToHTML(metadataDescription: String): Seq[Node] = {
+    // tab to 4 spaces and "\n" to "<br/>"
+    Unparsed(StringEscapeUtils.escapeHtml4(metadataDescription).
+      replaceAllLiterally("\t", "&nbsp;&nbsp;&nbsp;&nbsp;").replaceAllLiterally("\n", "<br/>"))
+  }
 }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala
index a5514dfd71c9f..ae508c0e9577b 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala
@@ -19,14 +19,14 @@
 package org.apache.spark.streaming.ui
 
 import org.apache.spark.streaming.Time
-import org.apache.spark.streaming.scheduler.BatchInfo
+import org.apache.spark.streaming.scheduler.{BatchInfo, StreamInputInfo}
 import org.apache.spark.streaming.ui.StreamingJobProgressListener._
 
 private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId)
 
 private[ui] case class BatchUIData(
     val batchTime: Time,
-    val streamIdToNumRecords: Map[Int, Long],
+    val streamIdToInputInfo: Map[Int, StreamInputInfo],
     val submissionTime: Long,
     val processingStartTime: Option[Long],
     val processingEndTime: Option[Long],
@@ -58,7 +58,7 @@ private[ui] case class BatchUIData(
   /**
    * The number of recorders received by the receivers in this batch.
    */
-  def numRecords: Long = streamIdToNumRecords.values.sum
+  def numRecords: Long = streamIdToInputInfo.values.map(_.numRecords).sum
 }
 
 private[ui] object BatchUIData {
@@ -66,7 +66,7 @@ private[ui] object BatchUIData {
   def apply(batchInfo: BatchInfo): BatchUIData = {
     new BatchUIData(
       batchInfo.batchTime,
-      batchInfo.streamIdToNumRecords,
+      batchInfo.streamIdToInputInfo,
       batchInfo.submissionTime,
       batchInfo.processingStartTime,
       batchInfo.processingEndTime
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala
index 68e8ce98945e0..b77c555c68b8b 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala
@@ -192,7 +192,7 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext)
   def receivedEventRateWithBatchTime: Map[Int, Seq[(Long, Double)]] = synchronized {
     val _retainedBatches = retainedBatches
     val latestBatches = _retainedBatches.map { batchUIData =>
-      (batchUIData.batchTime.milliseconds, batchUIData.streamIdToNumRecords)
+      (batchUIData.batchTime.milliseconds, batchUIData.streamIdToInputInfo.mapValues(_.numRecords))
     }
     streamIds.map { streamId =>
       val eventRates = latestBatches.map {
@@ -205,7 +205,8 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext)
   }
 
   def lastReceivedBatchRecords: Map[Int, Long] = synchronized {
-    val lastReceivedBlockInfoOption = lastReceivedBatch.map(_.streamIdToNumRecords)
+    val lastReceivedBlockInfoOption =
+      lastReceivedBatch.map(_.streamIdToInputInfo.mapValues(_.numRecords))
     lastReceivedBlockInfoOption.map { lastReceivedBlockInfo =>
       streamIds.map { streamId =>
         (streamId, lastReceivedBlockInfo.getOrElse(streamId, 0L))
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala
index 7bc7727a9fbe4..4bc1dd4a30fc4 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala
@@ -59,7 +59,7 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers {
 
     batchInfosSubmitted.foreach { info =>
       info.numRecords should be (1L)
-      info.streamIdToNumRecords should be (Map(0 -> 1L))
+      info.streamIdToInputInfo should be (Map(0 -> StreamInputInfo(0, 1L)))
     }
 
     isInIncreasingOrder(batchInfosSubmitted.map(_.submissionTime)) should be (true)
@@ -77,7 +77,7 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers {
 
     batchInfosStarted.foreach { info =>
       info.numRecords should be (1L)
-      info.streamIdToNumRecords should be (Map(0 -> 1L))
+      info.streamIdToInputInfo should be (Map(0 -> StreamInputInfo(0, 1L)))
     }
 
     isInIncreasingOrder(batchInfosStarted.map(_.submissionTime)) should be (true)
@@ -98,7 +98,7 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers {
 
     batchInfosCompleted.foreach { info =>
       info.numRecords should be (1L)
-      info.streamIdToNumRecords should be (Map(0 -> 1L))
+      info.streamIdToInputInfo should be (Map(0 -> StreamInputInfo(0, 1L)))
     }
 
     isInIncreasingOrder(batchInfosCompleted.map(_.submissionTime)) should be (true)
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
index 31b1aebf6a8ec..0d58a7b54412f 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala
@@ -76,7 +76,7 @@ class TestInputStream[T: ClassTag](ssc_ : StreamingContext, input: Seq[Seq[T]],
     }
 
     // Report the input data's information to InputInfoTracker for testing
-    val inputInfo = InputInfo(id, selectedInput.length.toLong)
+    val inputInfo = StreamInputInfo(id, selectedInput.length.toLong)
     ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
 
     val rdd = ssc.sc.makeRDD(selectedInput, numPartitions)
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala
index 2e210397fe7c7..f5248acf712b9 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala
@@ -46,8 +46,8 @@ class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter {
     val streamId1 = 0
     val streamId2 = 1
     val time = Time(0L)
-    val inputInfo1 = InputInfo(streamId1, 100L)
-    val inputInfo2 = InputInfo(streamId2, 300L)
+    val inputInfo1 = StreamInputInfo(streamId1, 100L)
+    val inputInfo2 = StreamInputInfo(streamId2, 300L)
     inputInfoTracker.reportInfo(time, inputInfo1)
     inputInfoTracker.reportInfo(time, inputInfo2)
 
@@ -63,8 +63,8 @@ class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter {
     val inputInfoTracker = new InputInfoTracker(ssc)
 
     val streamId1 = 0
-    val inputInfo1 = InputInfo(streamId1, 100L)
-    val inputInfo2 = InputInfo(streamId1, 300L)
+    val inputInfo1 = StreamInputInfo(streamId1, 100L)
+    val inputInfo2 = StreamInputInfo(streamId1, 300L)
     inputInfoTracker.reportInfo(Time(0), inputInfo1)
     inputInfoTracker.reportInfo(Time(1), inputInfo2)
 
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
index c9175d61b1f49..40dc1fb601bd0 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
@@ -49,10 +49,12 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
     val ssc = setupStreams(input, operation)
     val listener = new StreamingJobProgressListener(ssc)
 
-    val streamIdToNumRecords = Map(0 -> 300L, 1 -> 300L)
+    val streamIdToInputInfo = Map(
+      0 -> StreamInputInfo(0, 300L),
+      1 -> StreamInputInfo(1, 300L, Map(StreamInputInfo.METADATA_KEY_DESCRIPTION -> "test")))
 
     // onBatchSubmitted
-    val batchInfoSubmitted = BatchInfo(Time(1000), streamIdToNumRecords, 1000, None, None)
+    val batchInfoSubmitted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, None, None)
     listener.onBatchSubmitted(StreamingListenerBatchSubmitted(batchInfoSubmitted))
     listener.waitingBatches should be (List(BatchUIData(batchInfoSubmitted)))
     listener.runningBatches should be (Nil)
@@ -64,7 +66,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
     listener.numTotalReceivedRecords should be (0)
 
     // onBatchStarted
-    val batchInfoStarted = BatchInfo(Time(1000), streamIdToNumRecords, 1000, Some(2000), None)
+    val batchInfoStarted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None)
     listener.onBatchStarted(StreamingListenerBatchStarted(batchInfoStarted))
     listener.waitingBatches should be (Nil)
     listener.runningBatches should be (List(BatchUIData(batchInfoStarted)))
@@ -94,7 +96,9 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
     batchUIData.get.schedulingDelay should be (batchInfoStarted.schedulingDelay)
     batchUIData.get.processingDelay should be (batchInfoStarted.processingDelay)
     batchUIData.get.totalDelay should be (batchInfoStarted.totalDelay)
-    batchUIData.get.streamIdToNumRecords should be (Map(0 -> 300L, 1 -> 300L))
+    batchUIData.get.streamIdToInputInfo should be (Map(
+      0 -> StreamInputInfo(0, 300L),
+      1 -> StreamInputInfo(1, 300L, Map(StreamInputInfo.METADATA_KEY_DESCRIPTION -> "test"))))
     batchUIData.get.numRecords should be(600)
     batchUIData.get.outputOpIdSparkJobIdPairs should be
       Seq(OutputOpIdAndSparkJobId(0, 0),
@@ -103,7 +107,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
         OutputOpIdAndSparkJobId(1, 1))
 
     // onBatchCompleted
-    val batchInfoCompleted = BatchInfo(Time(1000), streamIdToNumRecords, 1000, Some(2000), None)
+    val batchInfoCompleted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None)
     listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted))
     listener.waitingBatches should be (Nil)
     listener.runningBatches should be (Nil)
@@ -141,9 +145,9 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
     val limit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 1000)
     val listener = new StreamingJobProgressListener(ssc)
 
-    val streamIdToNumRecords = Map(0 -> 300L, 1 -> 300L)
+    val streamIdToInputInfo = Map(0 -> StreamInputInfo(0, 300L), 1 -> StreamInputInfo(1, 300L))
 
-    val batchInfoCompleted = BatchInfo(Time(1000), streamIdToNumRecords, 1000, Some(2000), None)
+    val batchInfoCompleted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None)
 
     for(_ <- 0 until (limit + 10)) {
       listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted))
@@ -182,7 +186,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
     batchUIData.get.schedulingDelay should be (batchInfoSubmitted.schedulingDelay)
     batchUIData.get.processingDelay should be (batchInfoSubmitted.processingDelay)
     batchUIData.get.totalDelay should be (batchInfoSubmitted.totalDelay)
-    batchUIData.get.streamIdToNumRecords should be (Map.empty)
+    batchUIData.get.streamIdToInputInfo should be (Map.empty)
     batchUIData.get.numRecords should be (0)
     batchUIData.get.outputOpIdSparkJobIdPairs should be (Seq(OutputOpIdAndSparkJobId(0, 0)))
 
@@ -211,14 +215,14 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
     val limit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 1000)
 
     for (_ <- 0 until 2 * limit) {
-      val streamIdToNumRecords = Map(0 -> 300L, 1 -> 300L)
+      val streamIdToInputInfo = Map(0 -> StreamInputInfo(0, 300L), 1 -> StreamInputInfo(1, 300L))
 
       // onBatchSubmitted
-      val batchInfoSubmitted = BatchInfo(Time(1000), streamIdToNumRecords, 1000, None, None)
+      val batchInfoSubmitted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, None, None)
       listener.onBatchSubmitted(StreamingListenerBatchSubmitted(batchInfoSubmitted))
 
       // onBatchStarted
-      val batchInfoStarted = BatchInfo(Time(1000), streamIdToNumRecords, 1000, Some(2000), None)
+      val batchInfoStarted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None)
       listener.onBatchStarted(StreamingListenerBatchStarted(batchInfoStarted))
 
       // onJobStart
@@ -235,7 +239,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
       listener.onJobStart(jobStart4)
 
       // onBatchCompleted
-      val batchInfoCompleted = BatchInfo(Time(1000), streamIdToNumRecords, 1000, Some(2000), None)
+      val batchInfoCompleted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None)
       listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted))
     }
 

From 3ccebf36c5abe04702d4cf223552a94034d980fb Mon Sep 17 00:00:00 2001
From: jerryshao <saisai.shao@intel.com>
Date: Thu, 9 Jul 2015 13:54:44 -0700
Subject: [PATCH 097/549] [SPARK-8389] [STREAMING] [PYSPARK] Expose KafkaRDDs
 offsetRange in Python

This PR propose a simple way to expose OffsetRange in Python code, also the usage of offsetRanges is similar to Scala/Java way, here in Python we could get OffsetRange like:

```
dstream.foreachRDD(lambda r: KafkaUtils.offsetRanges(r))
```

Reason I didn't follow the way what SPARK-8389 suggested is that: Python Kafka API has one more step to decode the message compared to Scala/Java, Which makes Python API return a transformed RDD/DStream, not directly wrapped so-called JavaKafkaRDD, so it is hard to backtrack to the original RDD to get the offsetRange.

Author: jerryshao <saisai.shao@intel.com>

Closes #7185 from jerryshao/SPARK-8389 and squashes the following commits:

4c6d320 [jerryshao] Another way to fix subclass deserialization issue
e6a8011 [jerryshao] Address the comments
fd13937 [jerryshao] Fix serialization bug
7debf1c [jerryshao] bug fix
cff3893 [jerryshao] refactor the code according to the comments
2aabf9e [jerryshao] Style fix
848c708 [jerryshao] Add HasOffsetRanges for Python
---
 .../spark/streaming/kafka/KafkaUtils.scala    |  13 ++
 python/pyspark/streaming/kafka.py             | 123 ++++++++++++++++--
 python/pyspark/streaming/tests.py             |  64 +++++++++
 python/pyspark/streaming/util.py              |   7 +-
 4 files changed, 196 insertions(+), 11 deletions(-)

diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
index 0e33362d34acd..f3b01bd60b178 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
@@ -670,4 +670,17 @@ private class KafkaUtilsPythonHelper {
     TopicAndPartition(topic, partition)
 
   def createBroker(host: String, port: JInt): Broker = Broker(host, port)
+
+  def offsetRangesOfKafkaRDD(rdd: RDD[_]): JList[OffsetRange] = {
+    val parentRDDs = rdd.getNarrowAncestors
+    val kafkaRDDs = parentRDDs.filter(rdd => rdd.isInstanceOf[KafkaRDD[_, _, _, _, _]])
+
+    require(
+      kafkaRDDs.length == 1,
+      "Cannot get offset ranges, as there may be multiple Kafka RDDs or no Kafka RDD associated" +
+        "with this RDD, please call this method only on a Kafka RDD.")
+
+    val kafkaRDD = kafkaRDDs.head.asInstanceOf[KafkaRDD[_, _, _, _, _]]
+    kafkaRDD.offsetRanges.toSeq
+  }
 }
diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py
index 10a859a532e28..33dd596335b47 100644
--- a/python/pyspark/streaming/kafka.py
+++ b/python/pyspark/streaming/kafka.py
@@ -21,6 +21,8 @@
 from pyspark.storagelevel import StorageLevel
 from pyspark.serializers import PairDeserializer, NoOpSerializer
 from pyspark.streaming import DStream
+from pyspark.streaming.dstream import TransformedDStream
+from pyspark.streaming.util import TransformFunction
 
 __all__ = ['Broker', 'KafkaUtils', 'OffsetRange', 'TopicAndPartition', 'utf8_decoder']
 
@@ -122,8 +124,9 @@ def createDirectStream(ssc, topics, kafkaParams, fromOffsets={},
             raise e
 
         ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
-        stream = DStream(jstream, ssc, ser)
-        return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
+        stream = DStream(jstream, ssc, ser) \
+            .map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
+        return KafkaDStream(stream._jdstream, ssc, stream._jrdd_deserializer)
 
     @staticmethod
     def createRDD(sc, kafkaParams, offsetRanges, leaders={},
@@ -161,8 +164,8 @@ def createRDD(sc, kafkaParams, offsetRanges, leaders={},
             raise e
 
         ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
-        rdd = RDD(jrdd, sc, ser)
-        return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
+        rdd = RDD(jrdd, sc, ser).map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
+        return KafkaRDD(rdd._jrdd, rdd.ctx, rdd._jrdd_deserializer)
 
     @staticmethod
     def _printErrorMsg(sc):
@@ -200,14 +203,30 @@ def __init__(self, topic, partition, fromOffset, untilOffset):
         :param fromOffset: Inclusive starting offset.
         :param untilOffset: Exclusive ending offset.
         """
-        self._topic = topic
-        self._partition = partition
-        self._fromOffset = fromOffset
-        self._untilOffset = untilOffset
+        self.topic = topic
+        self.partition = partition
+        self.fromOffset = fromOffset
+        self.untilOffset = untilOffset
+
+    def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            return (self.topic == other.topic
+                    and self.partition == other.partition
+                    and self.fromOffset == other.fromOffset
+                    and self.untilOffset == other.untilOffset)
+        else:
+            return False
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __str__(self):
+        return "OffsetRange(topic: %s, partition: %d, range: [%d -> %d]" \
+               % (self.topic, self.partition, self.fromOffset, self.untilOffset)
 
     def _jOffsetRange(self, helper):
-        return helper.createOffsetRange(self._topic, self._partition, self._fromOffset,
-                                        self._untilOffset)
+        return helper.createOffsetRange(self.topic, self.partition, self.fromOffset,
+                                        self.untilOffset)
 
 
 class TopicAndPartition(object):
@@ -244,3 +263,87 @@ def __init__(self, host, port):
 
     def _jBroker(self, helper):
         return helper.createBroker(self._host, self._port)
+
+
+class KafkaRDD(RDD):
+    """
+    A Python wrapper of KafkaRDD, to provide additional information on normal RDD.
+    """
+
+    def __init__(self, jrdd, ctx, jrdd_deserializer):
+        RDD.__init__(self, jrdd, ctx, jrdd_deserializer)
+
+    def offsetRanges(self):
+        """
+        Get the OffsetRange of specific KafkaRDD.
+        :return: A list of OffsetRange
+        """
+        try:
+            helperClass = self.ctx._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
+                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
+            helper = helperClass.newInstance()
+            joffsetRanges = helper.offsetRangesOfKafkaRDD(self._jrdd.rdd())
+        except Py4JJavaError as e:
+            if 'ClassNotFoundException' in str(e.java_exception):
+                KafkaUtils._printErrorMsg(self.ctx)
+            raise e
+
+        ranges = [OffsetRange(o.topic(), o.partition(), o.fromOffset(), o.untilOffset())
+                  for o in joffsetRanges]
+        return ranges
+
+
+class KafkaDStream(DStream):
+    """
+    A Python wrapper of KafkaDStream
+    """
+
+    def __init__(self, jdstream, ssc, jrdd_deserializer):
+        DStream.__init__(self, jdstream, ssc, jrdd_deserializer)
+
+    def foreachRDD(self, func):
+        """
+        Apply a function to each RDD in this DStream.
+        """
+        if func.__code__.co_argcount == 1:
+            old_func = func
+            func = lambda r, rdd: old_func(rdd)
+        jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer) \
+            .rdd_wrapper(lambda jrdd, ctx, ser: KafkaRDD(jrdd, ctx, ser))
+        api = self._ssc._jvm.PythonDStream
+        api.callForeachRDD(self._jdstream, jfunc)
+
+    def transform(self, func):
+        """
+        Return a new DStream in which each RDD is generated by applying a function
+        on each RDD of this DStream.
+
+        `func` can have one argument of `rdd`, or have two arguments of
+        (`time`, `rdd`)
+        """
+        if func.__code__.co_argcount == 1:
+            oldfunc = func
+            func = lambda t, rdd: oldfunc(rdd)
+        assert func.__code__.co_argcount == 2, "func should take one or two arguments"
+
+        return KafkaTransformedDStream(self, func)
+
+
+class KafkaTransformedDStream(TransformedDStream):
+    """
+    Kafka specific wrapper of TransformedDStream to transform on Kafka RDD.
+    """
+
+    def __init__(self, prev, func):
+        TransformedDStream.__init__(self, prev, func)
+
+    @property
+    def _jdstream(self):
+        if self._jdstream_val is not None:
+            return self._jdstream_val
+
+        jfunc = TransformFunction(self._sc, self.func, self.prev._jrdd_deserializer) \
+            .rdd_wrapper(lambda jrdd, ctx, ser: KafkaRDD(jrdd, ctx, ser))
+        dstream = self._sc._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc)
+        self._jdstream_val = dstream.asJavaDStream()
+        return self._jdstream_val
diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py
index 188c8ff12067e..4ecae1e4bf282 100644
--- a/python/pyspark/streaming/tests.py
+++ b/python/pyspark/streaming/tests.py
@@ -678,6 +678,70 @@ def test_kafka_rdd_with_leaders(self):
         rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders)
         self._validateRddResult(sendData, rdd)
 
+    @unittest.skipIf(sys.version >= "3", "long type not support")
+    def test_kafka_rdd_get_offsetRanges(self):
+        """Test Python direct Kafka RDD get OffsetRanges."""
+        topic = self._randomTopic()
+        sendData = {"a": 3, "b": 4, "c": 5}
+        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
+        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}
+
+        self._kafkaTestUtils.createTopic(topic)
+        self._kafkaTestUtils.sendMessages(topic, sendData)
+        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
+        self.assertEqual(offsetRanges, rdd.offsetRanges())
+
+    @unittest.skipIf(sys.version >= "3", "long type not support")
+    def test_kafka_direct_stream_foreach_get_offsetRanges(self):
+        """Test the Python direct Kafka stream foreachRDD get offsetRanges."""
+        topic = self._randomTopic()
+        sendData = {"a": 1, "b": 2, "c": 3}
+        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
+                       "auto.offset.reset": "smallest"}
+
+        self._kafkaTestUtils.createTopic(topic)
+        self._kafkaTestUtils.sendMessages(topic, sendData)
+
+        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
+
+        offsetRanges = []
+
+        def getOffsetRanges(_, rdd):
+            for o in rdd.offsetRanges():
+                offsetRanges.append(o)
+
+        stream.foreachRDD(getOffsetRanges)
+        self.ssc.start()
+        self.wait_for(offsetRanges, 1)
+
+        self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
+
+    @unittest.skipIf(sys.version >= "3", "long type not support")
+    def test_kafka_direct_stream_transform_get_offsetRanges(self):
+        """Test the Python direct Kafka stream transform get offsetRanges."""
+        topic = self._randomTopic()
+        sendData = {"a": 1, "b": 2, "c": 3}
+        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
+                       "auto.offset.reset": "smallest"}
+
+        self._kafkaTestUtils.createTopic(topic)
+        self._kafkaTestUtils.sendMessages(topic, sendData)
+
+        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
+
+        offsetRanges = []
+
+        def transformWithOffsetRanges(rdd):
+            for o in rdd.offsetRanges():
+                offsetRanges.append(o)
+            return rdd
+
+        stream.transform(transformWithOffsetRanges).foreachRDD(lambda rdd: rdd.count())
+        self.ssc.start()
+        self.wait_for(offsetRanges, 1)
+
+        self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
+
 
 class FlumeStreamTests(PySparkStreamingTestCase):
     timeout = 20  # seconds
diff --git a/python/pyspark/streaming/util.py b/python/pyspark/streaming/util.py
index a9bfec2aab8fc..b20613b1283bd 100644
--- a/python/pyspark/streaming/util.py
+++ b/python/pyspark/streaming/util.py
@@ -37,6 +37,11 @@ def __init__(self, ctx, func, *deserializers):
         self.ctx = ctx
         self.func = func
         self.deserializers = deserializers
+        self._rdd_wrapper = lambda jrdd, ctx, ser: RDD(jrdd, ctx, ser)
+
+    def rdd_wrapper(self, func):
+        self._rdd_wrapper = func
+        return self
 
     def call(self, milliseconds, jrdds):
         try:
@@ -51,7 +56,7 @@ def call(self, milliseconds, jrdds):
             if len(sers) < len(jrdds):
                 sers += (sers[0],) * (len(jrdds) - len(sers))
 
-            rdds = [RDD(jrdd, self.ctx, ser) if jrdd else None
+            rdds = [self._rdd_wrapper(jrdd, self.ctx, ser) if jrdd else None
                     for jrdd, ser in zip(jrdds, sers)]
             t = datetime.fromtimestamp(milliseconds / 1000.0)
             r = self.func(t, *rdds)

From c9e2ef52bb54f35a904427389dc492d61f29b018 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 9 Jul 2015 14:43:38 -0700
Subject: [PATCH 098/549] [SPARK-7902] [SPARK-6289] [SPARK-8685] [SQL]
 [PYSPARK] Refactor of serialization for Python DataFrame

This PR fix the long standing issue of serialization between Python RDD and DataFrame, it change to using a customized Pickler for InternalRow to enable customized unpickling (type conversion, especially for UDT), now we can support UDT for UDF, cc mengxr .

There is no generated `Row` anymore.

Author: Davies Liu <davies@databricks.com>

Closes #7301 from davies/sql_ser and squashes the following commits:

81bef71 [Davies Liu] address comments
e9217bd [Davies Liu] add regression tests
db34167 [Davies Liu] Refactor of serialization for Python DataFrame
---
 python/pyspark/sql/context.py                 |   5 +-
 python/pyspark/sql/dataframe.py               |  16 +-
 python/pyspark/sql/tests.py                   |  28 +-
 python/pyspark/sql/types.py                   | 419 ++++++------------
 .../spark/sql/catalyst/expressions/rows.scala |  12 +
 .../org/apache/spark/sql/DataFrame.scala      |   5 +-
 .../spark/sql/execution/pythonUDFs.scala      | 122 ++++-
 7 files changed, 292 insertions(+), 315 deletions(-)

diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 309c11faf9319..c93a15badae29 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -30,7 +30,7 @@
 from pyspark.serializers import AutoBatchedSerializer, PickleSerializer
 from pyspark.sql import since
 from pyspark.sql.types import Row, StringType, StructType, _verify_type, \
-    _infer_schema, _has_nulltype, _merge_type, _create_converter, _python_to_sql_converter
+    _infer_schema, _has_nulltype, _merge_type, _create_converter
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.readwriter import DataFrameReader
 from pyspark.sql.utils import install_exception_handler
@@ -388,8 +388,7 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
             raise TypeError("schema should be StructType or list or None")
 
         # convert python objects to sql data
-        converter = _python_to_sql_converter(schema)
-        rdd = rdd.map(converter)
+        rdd = rdd.map(schema.toInternal)
 
         jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
         df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 1e9c657cf81b3..83e02b85f06f1 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -31,7 +31,7 @@
 from pyspark.storagelevel import StorageLevel
 from pyspark.traceback_utils import SCCallSiteSync
 from pyspark.sql import since
-from pyspark.sql.types import _create_cls, _parse_datatype_json_string
+from pyspark.sql.types import _parse_datatype_json_string
 from pyspark.sql.column import Column, _to_seq, _to_java_column
 from pyspark.sql.readwriter import DataFrameWriter
 from pyspark.sql.types import *
@@ -83,15 +83,7 @@ def rdd(self):
         """
         if self._lazy_rdd is None:
             jrdd = self._jdf.javaToPython()
-            rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
-            schema = self.schema
-
-            def applySchema(it):
-                cls = _create_cls(schema)
-                return map(cls, it)
-
-            self._lazy_rdd = rdd.mapPartitions(applySchema)
-
+            self._lazy_rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))
         return self._lazy_rdd
 
     @property
@@ -287,9 +279,7 @@ def collect(self):
         """
         with SCCallSiteSync(self._sc) as css:
             port = self._sc._jvm.PythonRDD.collectAndServe(self._jdf.javaToPython().rdd())
-        rs = list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
-        cls = _create_cls(self.schema)
-        return [cls(r) for r in rs]
+        return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
 
     @ignore_unicode_prefix
     @since(1.3)
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 66827d48850d9..4d7cad5a1ab88 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -151,6 +151,17 @@ def test_range(self):
         self.assertEqual(self.sqlCtx.range(-2).count(), 0)
         self.assertEqual(self.sqlCtx.range(3).count(), 3)
 
+    def test_duplicated_column_names(self):
+        df = self.sqlCtx.createDataFrame([(1, 2)], ["c", "c"])
+        row = df.select('*').first()
+        self.assertEqual(1, row[0])
+        self.assertEqual(2, row[1])
+        self.assertEqual("Row(c=1, c=2)", str(row))
+        # Cannot access columns
+        self.assertRaises(AnalysisException, lambda: df.select(df[0]).first())
+        self.assertRaises(AnalysisException, lambda: df.select(df.c).first())
+        self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first())
+
     def test_explode(self):
         from pyspark.sql.functions import explode
         d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})]
@@ -401,6 +412,14 @@ def test_apply_schema_with_udt(self):
         point = df.head().point
         self.assertEquals(point, ExamplePoint(1.0, 2.0))
 
+    def test_udf_with_udt(self):
+        from pyspark.sql.tests import ExamplePoint
+        row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
+        df = self.sc.parallelize([row]).toDF()
+        self.assertEqual(1.0, df.map(lambda r: r.point.x).first())
+        udf = UserDefinedFunction(lambda p: p.y, DoubleType())
+        self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
+
     def test_parquet_with_udt(self):
         from pyspark.sql.tests import ExamplePoint
         row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
@@ -693,12 +712,9 @@ def test_time_with_timezone(self):
         utcnow = datetime.datetime.fromtimestamp(ts, utc)
         df = self.sqlCtx.createDataFrame([(day, now, utcnow)])
         day1, now1, utcnow1 = df.first()
-        # Pyrolite serialize java.sql.Date as datetime, will be fixed in new version
-        self.assertEqual(day1.date(), day)
-        # Pyrolite does not support microsecond, the error should be
-        # less than 1 millisecond
-        self.assertTrue(now - now1 < datetime.timedelta(0.001))
-        self.assertTrue(now - utcnow1 < datetime.timedelta(0.001))
+        self.assertEqual(day1, day)
+        self.assertEqual(now, now1)
+        self.assertEqual(now, utcnow1)
 
     def test_decimal(self):
         from decimal import Decimal
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index fecfe6d71e9a7..d63857691675a 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -20,13 +20,9 @@
 import time
 import datetime
 import calendar
-import keyword
-import warnings
 import json
 import re
-import weakref
 from array import array
-from operator import itemgetter
 
 if sys.version >= "3":
     long = int
@@ -71,6 +67,26 @@ def json(self):
                           separators=(',', ':'),
                           sort_keys=True)
 
+    def needConversion(self):
+        """
+        Does this type need to conversion between Python object and internal SQL object.
+
+        This is used to avoid the unnecessary conversion for ArrayType/MapType/StructType.
+        """
+        return False
+
+    def toInternal(self, obj):
+        """
+        Converts a Python object into an internal SQL object.
+        """
+        return obj
+
+    def fromInternal(self, obj):
+        """
+        Converts an internal SQL object into a native Python object.
+        """
+        return obj
+
 
 # This singleton pattern does not work with pickle, you will get
 # another object after pickle and unpickle
@@ -143,6 +159,17 @@ class DateType(AtomicType):
 
     __metaclass__ = DataTypeSingleton
 
+    EPOCH_ORDINAL = datetime.datetime(1970, 1, 1).toordinal()
+
+    def needConversion(self):
+        return True
+
+    def toInternal(self, d):
+        return d and d.toordinal() - self.EPOCH_ORDINAL
+
+    def fromInternal(self, v):
+        return v and datetime.date.fromordinal(v + self.EPOCH_ORDINAL)
+
 
 class TimestampType(AtomicType):
     """Timestamp (datetime.datetime) data type.
@@ -150,6 +177,19 @@ class TimestampType(AtomicType):
 
     __metaclass__ = DataTypeSingleton
 
+    def needConversion(self):
+        return True
+
+    def toInternal(self, dt):
+        if dt is not None:
+            seconds = (calendar.timegm(dt.utctimetuple()) if dt.tzinfo
+                       else time.mktime(dt.timetuple()))
+            return int(seconds * 1e6 + dt.microsecond)
+
+    def fromInternal(self, ts):
+        if ts is not None:
+            return datetime.datetime.fromtimestamp(ts / 1e6)
+
 
 class DecimalType(FractionalType):
     """Decimal (decimal.Decimal) data type.
@@ -259,6 +299,19 @@ def fromJson(cls, json):
         return ArrayType(_parse_datatype_json_value(json["elementType"]),
                          json["containsNull"])
 
+    def needConversion(self):
+        return self.elementType.needConversion()
+
+    def toInternal(self, obj):
+        if not self.needConversion():
+            return obj
+        return obj and [self.elementType.toInternal(v) for v in obj]
+
+    def fromInternal(self, obj):
+        if not self.needConversion():
+            return obj
+        return obj and [self.elementType.fromInternal(v) for v in obj]
+
 
 class MapType(DataType):
     """Map data type.
@@ -304,6 +357,21 @@ def fromJson(cls, json):
                        _parse_datatype_json_value(json["valueType"]),
                        json["valueContainsNull"])
 
+    def needConversion(self):
+        return self.keyType.needConversion() or self.valueType.needConversion()
+
+    def toInternal(self, obj):
+        if not self.needConversion():
+            return obj
+        return obj and dict((self.keyType.toInternal(k), self.valueType.toInternal(v))
+                            for k, v in obj.items())
+
+    def fromInternal(self, obj):
+        if not self.needConversion():
+            return obj
+        return obj and dict((self.keyType.fromInternal(k), self.valueType.fromInternal(v))
+                            for k, v in obj.items())
+
 
 class StructField(DataType):
     """A field in :class:`StructType`.
@@ -311,7 +379,7 @@ class StructField(DataType):
     :param name: string, name of the field.
     :param dataType: :class:`DataType` of the field.
     :param nullable: boolean, whether the field can be null (None) or not.
-    :param metadata: a dict from string to simple type that can be serialized to JSON automatically
+    :param metadata: a dict from string to simple type that can be toInternald to JSON automatically
     """
 
     def __init__(self, name, dataType, nullable=True, metadata=None):
@@ -351,6 +419,15 @@ def fromJson(cls, json):
                            json["nullable"],
                            json["metadata"])
 
+    def needConversion(self):
+        return self.dataType.needConversion()
+
+    def toInternal(self, obj):
+        return self.dataType.toInternal(obj)
+
+    def fromInternal(self, obj):
+        return self.dataType.fromInternal(obj)
+
 
 class StructType(DataType):
     """Struct type, consisting of a list of :class:`StructField`.
@@ -371,10 +448,13 @@ def __init__(self, fields=None):
         """
         if not fields:
             self.fields = []
+            self.names = []
         else:
             self.fields = fields
+            self.names = [f.name for f in fields]
             assert all(isinstance(f, StructField) for f in fields),\
                 "fields should be a list of StructField"
+        self._needSerializeFields = None
 
     def add(self, field, data_type=None, nullable=True, metadata=None):
         """
@@ -406,6 +486,7 @@ def add(self, field, data_type=None, nullable=True, metadata=None):
         """
         if isinstance(field, StructField):
             self.fields.append(field)
+            self.names.append(field.name)
         else:
             if isinstance(field, str) and data_type is None:
                 raise ValueError("Must specify DataType if passing name of struct_field to create.")
@@ -415,6 +496,7 @@ def add(self, field, data_type=None, nullable=True, metadata=None):
             else:
                 data_type_f = data_type
             self.fields.append(StructField(field, data_type_f, nullable, metadata))
+            self.names.append(field)
         return self
 
     def simpleString(self):
@@ -432,6 +514,41 @@ def jsonValue(self):
     def fromJson(cls, json):
         return StructType([StructField.fromJson(f) for f in json["fields"]])
 
+    def needConversion(self):
+        # We need convert Row()/namedtuple into tuple()
+        return True
+
+    def toInternal(self, obj):
+        if obj is None:
+            return
+
+        if self._needSerializeFields is None:
+            self._needSerializeFields = any(f.needConversion() for f in self.fields)
+
+        if self._needSerializeFields:
+            if isinstance(obj, dict):
+                return tuple(f.toInternal(obj.get(n)) for n, f in zip(names, self.fields))
+            elif isinstance(obj, (tuple, list)):
+                return tuple(f.toInternal(v) for f, v in zip(self.fields, obj))
+            else:
+                raise ValueError("Unexpected tuple %r with StructType" % obj)
+        else:
+            if isinstance(obj, dict):
+                return tuple(obj.get(n) for n in self.names)
+            elif isinstance(obj, (list, tuple)):
+                return tuple(obj)
+            else:
+                raise ValueError("Unexpected tuple %r with StructType" % obj)
+
+    def fromInternal(self, obj):
+        if obj is None:
+            return
+        if isinstance(obj, Row):
+            # it's already converted by pickler
+            return obj
+        values = [f.dataType.fromInternal(v) for f, v in zip(self.fields, obj)]
+        return _create_row(self.names, values)
+
 
 class UserDefinedType(DataType):
     """User-defined type (UDT).
@@ -464,17 +581,35 @@ def scalaUDT(cls):
         """
         raise NotImplementedError("UDT must have a paired Scala UDT.")
 
+    def needConversion(self):
+        return True
+
+    @classmethod
+    def _cachedSqlType(cls):
+        """
+        Cache the sqlType() into class, because it's heavy used in `toInternal`.
+        """
+        if not hasattr(cls, "_cached_sql_type"):
+            cls._cached_sql_type = cls.sqlType()
+        return cls._cached_sql_type
+
+    def toInternal(self, obj):
+        return self._cachedSqlType().toInternal(self.serialize(obj))
+
+    def fromInternal(self, obj):
+        return self.deserialize(self._cachedSqlType().fromInternal(obj))
+
     def serialize(self, obj):
         """
         Converts the a user-type object into a SQL datum.
         """
-        raise NotImplementedError("UDT must implement serialize().")
+        raise NotImplementedError("UDT must implement toInternal().")
 
     def deserialize(self, datum):
         """
         Converts a SQL datum into a user-type object.
         """
-        raise NotImplementedError("UDT must implement deserialize().")
+        raise NotImplementedError("UDT must implement fromInternal().")
 
     def simpleString(self):
         return 'udt'
@@ -671,117 +806,6 @@ def _infer_schema(row):
     return StructType(fields)
 
 
-def _need_python_to_sql_conversion(dataType):
-    """
-    Checks whether we need python to sql conversion for the given type.
-    For now, only UDTs need this conversion.
-
-    >>> _need_python_to_sql_conversion(DoubleType())
-    False
-    >>> schema0 = StructType([StructField("indices", ArrayType(IntegerType(), False), False),
-    ...                       StructField("values", ArrayType(DoubleType(), False), False)])
-    >>> _need_python_to_sql_conversion(schema0)
-    True
-    >>> _need_python_to_sql_conversion(ExamplePointUDT())
-    True
-    >>> schema1 = ArrayType(ExamplePointUDT(), False)
-    >>> _need_python_to_sql_conversion(schema1)
-    True
-    >>> schema2 = StructType([StructField("label", DoubleType(), False),
-    ...                       StructField("point", ExamplePointUDT(), False)])
-    >>> _need_python_to_sql_conversion(schema2)
-    True
-    """
-    if isinstance(dataType, StructType):
-        # convert namedtuple or Row into tuple
-        return True
-    elif isinstance(dataType, ArrayType):
-        return _need_python_to_sql_conversion(dataType.elementType)
-    elif isinstance(dataType, MapType):
-        return _need_python_to_sql_conversion(dataType.keyType) or \
-            _need_python_to_sql_conversion(dataType.valueType)
-    elif isinstance(dataType, UserDefinedType):
-        return True
-    elif isinstance(dataType, (DateType, TimestampType)):
-        return True
-    else:
-        return False
-
-
-EPOCH_ORDINAL = datetime.datetime(1970, 1, 1).toordinal()
-
-
-def _python_to_sql_converter(dataType):
-    """
-    Returns a converter that converts a Python object into a SQL datum for the given type.
-
-    >>> conv = _python_to_sql_converter(DoubleType())
-    >>> conv(1.0)
-    1.0
-    >>> conv = _python_to_sql_converter(ArrayType(DoubleType(), False))
-    >>> conv([1.0, 2.0])
-    [1.0, 2.0]
-    >>> conv = _python_to_sql_converter(ExamplePointUDT())
-    >>> conv(ExamplePoint(1.0, 2.0))
-    [1.0, 2.0]
-    >>> schema = StructType([StructField("label", DoubleType(), False),
-    ...                      StructField("point", ExamplePointUDT(), False)])
-    >>> conv = _python_to_sql_converter(schema)
-    >>> conv((1.0, ExamplePoint(1.0, 2.0)))
-    (1.0, [1.0, 2.0])
-    """
-    if not _need_python_to_sql_conversion(dataType):
-        return lambda x: x
-
-    if isinstance(dataType, StructType):
-        names, types = zip(*[(f.name, f.dataType) for f in dataType.fields])
-        if any(_need_python_to_sql_conversion(t) for t in types):
-            converters = [_python_to_sql_converter(t) for t in types]
-
-            def converter(obj):
-                if isinstance(obj, dict):
-                    return tuple(c(obj.get(n)) for n, c in zip(names, converters))
-                elif isinstance(obj, tuple):
-                    if hasattr(obj, "__fields__") or hasattr(obj, "_fields"):
-                        return tuple(c(v) for c, v in zip(converters, obj))
-                    else:
-                        return tuple(c(v) for c, v in zip(converters, obj))
-                elif obj is not None:
-                    raise ValueError("Unexpected tuple %r with type %r" % (obj, dataType))
-        else:
-            def converter(obj):
-                if isinstance(obj, dict):
-                    return tuple(obj.get(n) for n in names)
-                else:
-                    return tuple(obj)
-        return converter
-    elif isinstance(dataType, ArrayType):
-        element_converter = _python_to_sql_converter(dataType.elementType)
-        return lambda a: a and [element_converter(v) for v in a]
-    elif isinstance(dataType, MapType):
-        key_converter = _python_to_sql_converter(dataType.keyType)
-        value_converter = _python_to_sql_converter(dataType.valueType)
-        return lambda m: m and dict([(key_converter(k), value_converter(v)) for k, v in m.items()])
-
-    elif isinstance(dataType, UserDefinedType):
-        return lambda obj: obj and dataType.serialize(obj)
-
-    elif isinstance(dataType, DateType):
-        return lambda d: d and d.toordinal() - EPOCH_ORDINAL
-
-    elif isinstance(dataType, TimestampType):
-
-        def to_posix_timstamp(dt):
-            if dt:
-                seconds = (calendar.timegm(dt.utctimetuple()) if dt.tzinfo
-                           else time.mktime(dt.timetuple()))
-                return int(seconds * 1e6 + dt.microsecond)
-        return to_posix_timstamp
-
-    else:
-        raise ValueError("Unexpected type %r" % dataType)
-
-
 def _has_nulltype(dt):
     """ Return whether there is NullType in `dt` or not """
     if isinstance(dt, StructType):
@@ -1076,7 +1100,7 @@ def _verify_type(obj, dataType):
     if isinstance(dataType, UserDefinedType):
         if not (hasattr(obj, '__UDT__') and obj.__UDT__ == dataType):
             raise ValueError("%r is not an instance of type %r" % (obj, dataType))
-        _verify_type(dataType.serialize(obj), dataType.sqlType())
+        _verify_type(dataType.toInternal(obj), dataType.sqlType())
         return
 
     _type = type(dataType)
@@ -1086,7 +1110,7 @@ def _verify_type(obj, dataType):
         if not isinstance(obj, (tuple, list)):
             raise TypeError("StructType can not accept object in type %s" % type(obj))
     else:
-        # subclass of them can not be deserialized in JVM
+        # subclass of them can not be fromInternald in JVM
         if type(obj) not in _acceptable_types[_type]:
             raise TypeError("%s can not accept object in type %s" % (dataType, type(obj)))
 
@@ -1106,159 +1130,10 @@ def _verify_type(obj, dataType):
         for v, f in zip(obj, dataType.fields):
             _verify_type(v, f.dataType)
 
-_cached_cls = weakref.WeakValueDictionary()
-
-
-def _restore_object(dataType, obj):
-    """ Restore object during unpickling. """
-    # use id(dataType) as key to speed up lookup in dict
-    # Because of batched pickling, dataType will be the
-    # same object in most cases.
-    k = id(dataType)
-    cls = _cached_cls.get(k)
-    if cls is None or cls.__datatype is not dataType:
-        # use dataType as key to avoid create multiple class
-        cls = _cached_cls.get(dataType)
-        if cls is None:
-            cls = _create_cls(dataType)
-            _cached_cls[dataType] = cls
-        cls.__datatype = dataType
-        _cached_cls[k] = cls
-    return cls(obj)
-
-
-def _create_object(cls, v):
-    """ Create an customized object with class `cls`. """
-    # datetime.date would be deserialized as datetime.datetime
-    # from java type, so we need to set it back.
-    if cls is datetime.date and isinstance(v, datetime.datetime):
-        return v.date()
-    return cls(v) if v is not None else v
-
-
-def _create_getter(dt, i):
-    """ Create a getter for item `i` with schema """
-    cls = _create_cls(dt)
-
-    def getter(self):
-        return _create_object(cls, self[i])
-
-    return getter
-
-
-def _has_struct_or_date(dt):
-    """Return whether `dt` is or has StructType/DateType in it"""
-    if isinstance(dt, StructType):
-        return True
-    elif isinstance(dt, ArrayType):
-        return _has_struct_or_date(dt.elementType)
-    elif isinstance(dt, MapType):
-        return _has_struct_or_date(dt.keyType) or _has_struct_or_date(dt.valueType)
-    elif isinstance(dt, DateType):
-        return True
-    elif isinstance(dt, UserDefinedType):
-        return True
-    return False
-
-
-def _create_properties(fields):
-    """Create properties according to fields"""
-    ps = {}
-    for i, f in enumerate(fields):
-        name = f.name
-        if (name.startswith("__") and name.endswith("__")
-                or keyword.iskeyword(name)):
-            warnings.warn("field name %s can not be accessed in Python,"
-                          "use position to access it instead" % name)
-        if _has_struct_or_date(f.dataType):
-            # delay creating object until accessing it
-            getter = _create_getter(f.dataType, i)
-        else:
-            getter = itemgetter(i)
-        ps[name] = property(getter)
-    return ps
-
-
-def _create_cls(dataType):
-    """
-    Create an class by dataType
-
-    The created class is similar to namedtuple, but can have nested schema.
-
-    >>> schema = _parse_schema_abstract("a b c")
-    >>> row = (1, 1.0, "str")
-    >>> schema = _infer_schema_type(row, schema)
-    >>> obj = _create_cls(schema)(row)
-    >>> import pickle
-    >>> pickle.loads(pickle.dumps(obj))
-    Row(a=1, b=1.0, c='str')
-
-    >>> row = [[1], {"key": (1, 2.0)}]
-    >>> schema = _parse_schema_abstract("a[] b{c d}")
-    >>> schema = _infer_schema_type(row, schema)
-    >>> obj = _create_cls(schema)(row)
-    >>> pickle.loads(pickle.dumps(obj))
-    Row(a=[1], b={'key': Row(c=1, d=2.0)})
-    >>> pickle.loads(pickle.dumps(obj.a))
-    [1]
-    >>> pickle.loads(pickle.dumps(obj.b))
-    {'key': Row(c=1, d=2.0)}
-    """
-
-    if isinstance(dataType, ArrayType):
-        cls = _create_cls(dataType.elementType)
-
-        def List(l):
-            if l is None:
-                return
-            return [_create_object(cls, v) for v in l]
-
-        return List
-
-    elif isinstance(dataType, MapType):
-        kcls = _create_cls(dataType.keyType)
-        vcls = _create_cls(dataType.valueType)
-
-        def Dict(d):
-            if d is None:
-                return
-            return dict((_create_object(kcls, k), _create_object(vcls, v)) for k, v in d.items())
-
-        return Dict
-
-    elif isinstance(dataType, DateType):
-        return datetime.date
-
-    elif isinstance(dataType, UserDefinedType):
-        return lambda datum: dataType.deserialize(datum)
-
-    elif not isinstance(dataType, StructType):
-        # no wrapper for atomic types
-        return lambda x: x
-
-    class Row(tuple):
-
-        """ Row in DataFrame """
-        __datatype = dataType
-        __fields__ = tuple(f.name for f in dataType.fields)
-        __slots__ = ()
-
-        # create property for fast access
-        locals().update(_create_properties(dataType.fields))
-
-        def asDict(self):
-            """ Return as a dict """
-            return dict((n, getattr(self, n)) for n in self.__fields__)
-
-        def __repr__(self):
-            # call collect __repr__ for nested objects
-            return ("Row(%s)" % ", ".join("%s=%r" % (n, getattr(self, n))
-                                          for n in self.__fields__))
-
-        def __reduce__(self):
-            return (_restore_object, (self.__datatype, tuple(self)))
 
-    return Row
+# This is used to unpickle a Row from JVM
+def _create_row_inbound_converter(dataType):
+    return lambda *a: dataType.fromInternal(a)
 
 
 def _create_row(fields, values):
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index 8b472a529e5c9..094904bbf9c15 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -132,6 +132,18 @@ class GenericInternalRow(protected[sql] val values: Array[Any])
   override def copy(): InternalRow = this
 }
 
+/**
+ * This is used for serialization of Python DataFrame
+ */
+class GenericInternalRowWithSchema(values: Array[Any], override val schema: StructType)
+  extends GenericInternalRow(values) {
+
+  /** No-arg constructor for serialization. */
+  protected def this() = this(null, null)
+
+  override def fieldIndex(name: String): Int = schema.fieldIndex(name)
+}
+
 class GenericMutableRow(val values: Array[Any]) extends MutableRow with ArrayBackedRow {
   /** No-arg constructor for serialization. */
   protected def this() = this(null)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index d9f987ae0252f..d7966651b1948 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -30,7 +30,6 @@ import org.apache.commons.lang3.StringUtils
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaRDD
-import org.apache.spark.api.python.SerDeUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis._
@@ -1550,8 +1549,8 @@ class DataFrame private[sql](
    */
   protected[sql] def javaToPython: JavaRDD[Array[Byte]] = {
     val structType = schema  // capture it for closure
-    val jrdd = queryExecution.toRdd.map(EvaluatePython.toJava(_, structType)).toJavaRDD()
-    SerDeUtil.javaToPython(jrdd)
+    val rdd = queryExecution.toRdd.map(EvaluatePython.toJava(_, structType))
+    EvaluatePython.javaToPython(rdd)
   }
 
   ////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index 1c8130b07c7fb..6d6e67dace177 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -17,15 +17,16 @@
 
 package org.apache.spark.sql.execution
 
+import java.io.OutputStream
 import java.util.{List => JList, Map => JMap}
 
 import scala.collection.JavaConversions._
 import scala.collection.JavaConverters._
 
-import net.razorvine.pickle.{Pickler, Unpickler}
+import net.razorvine.pickle._
 
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.api.python.{PythonBroadcast, PythonRDD}
+import org.apache.spark.api.python.{PythonBroadcast, PythonRDD, SerDeUtil}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
@@ -33,7 +34,6 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.{Accumulator, Logging => SparkLogging}
@@ -130,8 +130,13 @@ object EvaluatePython {
     case (null, _) => null
 
     case (row: InternalRow, struct: StructType) =>
-      val fields = struct.fields.map(field => field.dataType)
-      rowToArray(row, fields)
+      val values = new Array[Any](row.size)
+      var i = 0
+      while (i < row.size) {
+        values(i) = toJava(row(i), struct.fields(i).dataType)
+        i += 1
+      }
+      new GenericInternalRowWithSchema(values, struct)
 
     case (seq: Seq[Any], array: ArrayType) =>
       seq.map(x => toJava(x, array.elementType)).asJava
@@ -142,9 +147,6 @@ object EvaluatePython {
 
     case (ud, udt: UserDefinedType[_]) => toJava(ud, udt.sqlType)
 
-    case (date: Int, DateType) => DateTimeUtils.toJavaDate(date)
-    case (t: Long, TimestampType) => DateTimeUtils.toJavaTimestamp(t)
-
     case (d: Decimal, _) => d.toJavaBigDecimal
 
     case (s: UTF8String, StringType) => s.toString
@@ -152,14 +154,6 @@ object EvaluatePython {
     case (other, _) => other
   }
 
-  /**
-   * Convert Row into Java Array (for pickled into Python)
-   */
-  def rowToArray(row: InternalRow, fields: Seq[DataType]): Array[Any] = {
-    // TODO: this is slow!
-    row.toSeq.zip(fields).map {case (obj, dt) => toJava(obj, dt)}.toArray
-  }
-
   /**
    * Converts `obj` to the type specified by the data type, or returns null if the type of obj is
    * unexpected. Because Python doesn't enforce the type.
@@ -220,6 +214,96 @@ object EvaluatePython {
     // TODO(davies): we could improve this by try to cast the object to expected type
     case (c, _) => null
   }
+
+
+  private val module = "pyspark.sql.types"
+
+  /**
+   * Pickler for StructType
+   */
+  private class StructTypePickler extends IObjectPickler {
+
+    private val cls = classOf[StructType]
+
+    def register(): Unit = {
+      Pickler.registerCustomPickler(cls, this)
+    }
+
+    def pickle(obj: Object, out: OutputStream, pickler: Pickler): Unit = {
+      out.write(Opcodes.GLOBAL)
+      out.write((module + "\n" + "_parse_datatype_json_string" + "\n").getBytes("utf-8"))
+      val schema = obj.asInstanceOf[StructType]
+      pickler.save(schema.json)
+      out.write(Opcodes.TUPLE1)
+      out.write(Opcodes.REDUCE)
+    }
+  }
+
+  /**
+   * Pickler for InternalRow
+   */
+  private class RowPickler extends IObjectPickler {
+
+    private val cls = classOf[GenericInternalRowWithSchema]
+
+    // register this to Pickler and Unpickler
+    def register(): Unit = {
+      Pickler.registerCustomPickler(this.getClass, this)
+      Pickler.registerCustomPickler(cls, this)
+    }
+
+    def pickle(obj: Object, out: OutputStream, pickler: Pickler): Unit = {
+      if (obj == this) {
+        out.write(Opcodes.GLOBAL)
+        out.write((module + "\n" + "_create_row_inbound_converter" + "\n").getBytes("utf-8"))
+      } else {
+        // it will be memorized by Pickler to save some bytes
+        pickler.save(this)
+        val row = obj.asInstanceOf[GenericInternalRowWithSchema]
+        // schema should always be same object for memoization
+        pickler.save(row.schema)
+        out.write(Opcodes.TUPLE1)
+        out.write(Opcodes.REDUCE)
+
+        out.write(Opcodes.MARK)
+        var i = 0
+        while (i < row.values.size) {
+          pickler.save(row.values(i))
+          i += 1
+        }
+        row.values.foreach(pickler.save)
+        out.write(Opcodes.TUPLE)
+        out.write(Opcodes.REDUCE)
+      }
+    }
+  }
+
+  private[this] var registered = false
+  /**
+   * This should be called before trying to serialize any above classes un cluster mode,
+   * this should be put in the closure
+   */
+  def registerPicklers(): Unit = {
+    synchronized {
+      if (!registered) {
+        SerDeUtil.initialize()
+        new StructTypePickler().register()
+        new RowPickler().register()
+        registered = true
+      }
+    }
+  }
+
+  /**
+   * Convert an RDD of Java objects to an RDD of serialized Python objects, that is usable by
+   * PySpark.
+   */
+  def javaToPython(rdd: RDD[Any]): RDD[Array[Byte]] = {
+    rdd.mapPartitions { iter =>
+      registerPicklers()  // let it called in executor
+      new SerDeUtil.AutoBatchedPickler(iter)
+    }
+  }
 }
 
 /**
@@ -254,12 +338,14 @@ case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child:
     val childResults = child.execute().map(_.copy())
 
     val parent = childResults.mapPartitions { iter =>
+      EvaluatePython.registerPicklers()  // register pickler for Row
       val pickle = new Pickler
       val currentRow = newMutableProjection(udf.children, child.output)()
       val fields = udf.children.map(_.dataType)
-      iter.grouped(1000).map { inputRows =>
+      val schema = new StructType(fields.map(t => new StructField("", t, true)).toArray)
+      iter.grouped(100).map { inputRows =>
         val toBePickled = inputRows.map { row =>
-          EvaluatePython.rowToArray(currentRow(row), fields)
+          EvaluatePython.toJava(currentRow(row), schema)
         }.toArray
         pickle.dumps(toBePickled)
       }

From 897700369f3aedf1a8fdb0984dd3d6d8e498e3af Mon Sep 17 00:00:00 2001
From: guowei2 <guowei@growingio.com>
Date: Thu, 9 Jul 2015 15:01:53 -0700
Subject: [PATCH 099/549] [SPARK-8865] [STREAMING] FIX BUG: check key in kafka
 params

Author: guowei2 <guowei@growingio.com>

Closes #7254 from guowei2/spark-8865 and squashes the following commits:

48ca17a [guowei2] fix contains key
---
 .../scala/org/apache/spark/streaming/kafka/KafkaCluster.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
index 3e6b937af57b0..8465432c5850f 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala
@@ -410,7 +410,7 @@ object KafkaCluster {
       }
 
       Seq("zookeeper.connect", "group.id").foreach { s =>
-        if (!props.contains(s)) {
+        if (!props.containsKey(s)) {
           props.setProperty(s, "")
         }
       }

From 69165330303a71ea1da748eca7a780ec172b326f Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 9 Jul 2015 15:14:14 -0700
Subject: [PATCH 100/549] Closes #6837 Closes #7321 Closes #2634 Closes #4963
 Closes #2137


From e29ce319fa6ffb9c8e5110814d4923d433aa1b76 Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Thu, 9 Jul 2015 15:49:30 -0700
Subject: [PATCH 101/549] [SPARK-8963][ML] cleanup tests in linear regression
 suite

Simplify model weight assertions to use vector comparision, switch to using absTol when comparing with 0.0 intercepts

Author: Holden Karau <holden@pigscanfly.ca>

Closes #7327 from holdenk/SPARK-8913-cleanup-tests-from-SPARK-8700-logistic-regression and squashes the following commits:

5bac185 [Holden Karau] Simplify model weight assertions to use vector comparision, switch to using absTol when comparing with 0.0 intercepts
---
 .../ml/regression/LinearRegressionSuite.scala | 57 ++++++++-----------
 1 file changed, 24 insertions(+), 33 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 5f39d44f37352..4f6a57739558b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.regression
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.mllib.linalg.DenseVector
+import org.apache.spark.mllib.linalg.{DenseVector, Vectors}
 import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.{DataFrame, Row}
@@ -75,11 +75,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        as.numeric.data.V3. 7.198257
      */
     val interceptR = 6.298698
-    val weightsR = Array(4.700706, 7.199082)
+    val weightsR = Vectors.dense(4.700706, 7.199082)
 
     assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model.weights ~= weightsR relTol 1E-3)
 
     model.transform(dataset).select("features", "prediction").collect().foreach {
       case Row(features: DenseVector, prediction1: Double) =>
@@ -104,11 +103,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        as.numeric.data.V2. 6.995908
        as.numeric.data.V3. 5.275131
      */
-    val weightsR = Array(6.995908, 5.275131)
+    val weightsR = Vectors.dense(6.995908, 5.275131)
 
-    assert(model.intercept ~== 0 relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model.intercept ~== 0 absTol 1E-3)
+    assert(model.weights ~= weightsR relTol 1E-3)
     /*
        Then again with the data with no intercept:
        > weightsWithoutIntercept
@@ -118,11 +116,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        as.numeric.data3.V2. 4.70011
        as.numeric.data3.V3. 7.19943
      */
-    val weightsWithoutInterceptR = Array(4.70011, 7.19943)
+    val weightsWithoutInterceptR = Vectors.dense(4.70011, 7.19943)
 
-    assert(modelWithoutIntercept.intercept ~== 0 relTol 1E-3)
-    assert(modelWithoutIntercept.weights(0) ~== weightsWithoutInterceptR(0) relTol 1E-3)
-    assert(modelWithoutIntercept.weights(1) ~== weightsWithoutInterceptR(1) relTol 1E-3)
+    assert(modelWithoutIntercept.intercept ~== 0 absTol 1E-3)
+    assert(modelWithoutIntercept.weights ~= weightsWithoutInterceptR relTol 1E-3)
   }
 
   test("linear regression with intercept with L1 regularization") {
@@ -139,11 +136,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        as.numeric.data.V3. 6.679841
      */
     val interceptR = 6.24300
-    val weightsR = Array(4.024821, 6.679841)
+    val weightsR = Vectors.dense(4.024821, 6.679841)
 
     assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model.weights ~= weightsR relTol 1E-3)
 
     model.transform(dataset).select("features", "prediction").collect().foreach {
       case Row(features: DenseVector, prediction1: Double) =>
@@ -169,11 +165,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        as.numeric.data.V3. 4.772913
      */
     val interceptR = 0.0
-    val weightsR = Array(6.299752, 4.772913)
+    val weightsR = Vectors.dense(6.299752, 4.772913)
 
-    assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model.intercept ~== interceptR absTol 1E-5)
+    assert(model.weights ~= weightsR relTol 1E-3)
 
     model.transform(dataset).select("features", "prediction").collect().foreach {
       case Row(features: DenseVector, prediction1: Double) =>
@@ -197,11 +192,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        as.numeric.data.V3. 4.926260
      */
     val interceptR = 5.269376
-    val weightsR = Array(3.736216, 5.712356)
+    val weightsR = Vectors.dense(3.736216, 5.712356)
 
     assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model.weights ~= weightsR relTol 1E-3)
 
     model.transform(dataset).select("features", "prediction").collect().foreach {
       case Row(features: DenseVector, prediction1: Double) =>
@@ -227,11 +221,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        as.numeric.data.V3. 4.214502
      */
     val interceptR = 0.0
-    val weightsR = Array(5.522875, 4.214502)
+    val weightsR = Vectors.dense(5.522875, 4.214502)
 
-    assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model.intercept ~== interceptR absTol 1E-3)
+    assert(model.weights ~== weightsR relTol 1E-3)
 
     model.transform(dataset).select("features", "prediction").collect().foreach {
       case Row(features: DenseVector, prediction1: Double) =>
@@ -255,11 +248,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        as.numeric.data.V3. 5.200403
      */
     val interceptR = 5.696056
-    val weightsR = Array(3.670489, 6.001122)
+    val weightsR = Vectors.dense(3.670489, 6.001122)
 
     assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model.weights ~== weightsR relTol 1E-3)
 
     model.transform(dataset).select("features", "prediction").collect().foreach {
       case Row(features: DenseVector, prediction1: Double) =>
@@ -285,11 +277,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        as.numeric.dataM.V3. 4.322251
      */
     val interceptR = 0.0
-    val weightsR = Array(5.673348, 4.322251)
+    val weightsR = Vectors.dense(5.673348, 4.322251)
 
-    assert(model.intercept ~== interceptR relTol 1E-3)
-    assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
+    assert(model.intercept ~== interceptR absTol 1E-3)
+    assert(model.weights ~= weightsR relTol 1E-3)
 
     model.transform(dataset).select("features", "prediction").collect().foreach {
       case Row(features: DenseVector, prediction1: Double) =>

From a0cc3e5aa3fcfd0fce6813c520152657d327aaf2 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Thu, 9 Jul 2015 16:21:21 -0700
Subject: [PATCH 102/549] [SPARK-8538] [SPARK-8539] [ML] Linear Regression
 Training and Testing Results

Adds results (e.g. objective value at each iteration, residuals) on training and user-specified test sets for LinearRegressionModel.

Notes to Reviewers:
 * Are the `*TrainingResults` and `Results` classes too specialized for `LinearRegressionModel`? Where would be an appropriate level of abstraction?
 * Please check `transient` annotations are correct; the datasets should not be copied and kept during serialization.
 * Any thoughts on `RDD`s versus `DataFrame`s? If using `DataFrame`s, suggested schemas for each intermediate step? Also, how to create a "local DataFrame" without a `sqlContext`?

Author: Feynman Liang <fliang@databricks.com>

Closes #7099 from feynmanliang/SPARK-8538 and squashes the following commits:

d219fa4 [Feynman Liang] Update docs
4a42680 [Feynman Liang] Change Summary to hold values, move transient annotations down to metrics and predictions DF
6300031 [Feynman Liang] Code review changes
0a5e762 [Feynman Liang] Fix build error
e71102d [Feynman Liang] Merge branch 'master' into SPARK-8538
3367489 [Feynman Liang] Merge branch 'master' into SPARK-8538
70f267c [Feynman Liang] Make TrainingSummary transient and remove Serializable from *Summary and RegressionMetrics
1d9ea42 [Feynman Liang] Fix failing Java test
a65dfda [Feynman Liang] Make TrainingSummary and metrics serializable, prediction dataframe transient
0a605d8 [Feynman Liang] Replace Params from LinearRegression*Summary with private constructor vals
c2fe835 [Feynman Liang] Optimize imports
02d8a70 [Feynman Liang] Add Params to LinearModel*Summary, refactor tests and add test for evaluate()
8f999f4 [Feynman Liang] Refactor from jkbradley code review
072e948 [Feynman Liang] Style
509ae36 [Feynman Liang] Use DFs and localize serialization to LinearRegressionModel
9509c79 [Feynman Liang] Fix imports
b2bbaa3 [Feynman Liang] Refactored LinearRegressionResults API to be more private
ffceaec [Feynman Liang] Merge branch 'master' into SPARK-8538
1cedb2b [Feynman Liang] Add test for decreasing objective trace
dab0aff [Feynman Liang] Add LinearRegressionTrainingResults tests, make test suite code copy+pasteable
97b0a81 [Feynman Liang] Add LinearRegressionModel.evaluate() to get results on test sets
dc51bce [Feynman Liang] Style guide fixes
521f397 [Feynman Liang] Use RDD[(Double, Double)] instead of DF
2ff5710 [Feynman Liang] Add training results and model summary to ML LinearRegression
---
 .../ml/regression/LinearRegression.scala      | 139 +++++++++++++++++-
 .../ml/regression/LinearRegressionSuite.scala |  59 ++++++++
 2 files changed, 192 insertions(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index f672c96576a33..8fc986056657d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -22,18 +22,20 @@ import scala.collection.mutable
 import breeze.linalg.{DenseVector => BDV, norm => brzNorm}
 import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
 
-import org.apache.spark.{SparkException, Logging}
+import org.apache.spark.{Logging, SparkException}
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.mllib.evaluation.RegressionMetrics
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.linalg.BLAS._
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.StatCounter
 
@@ -139,7 +141,16 @@ class LinearRegression(override val uid: String)
       logWarning(s"The standard deviation of the label is zero, so the weights will be zeros " +
         s"and the intercept will be the mean of the label; as a result, training is not needed.")
       if (handlePersistence) instances.unpersist()
-      return new LinearRegressionModel(uid, Vectors.sparse(numFeatures, Seq()), yMean)
+      val weights = Vectors.sparse(numFeatures, Seq())
+      val intercept = yMean
+
+      val model = new LinearRegressionModel(uid, weights, intercept)
+      val trainingSummary = new LinearRegressionTrainingSummary(
+        model.transform(dataset).select($(predictionCol), $(labelCol)),
+        $(predictionCol),
+        $(labelCol),
+        Array(0D))
+      return copyValues(model.setSummary(trainingSummary))
     }
 
     val featuresMean = summarizer.mean.toArray
@@ -178,7 +189,6 @@ class LinearRegression(override val uid: String)
         state = states.next()
         arrayBuilder += state.adjustedValue
       }
-
       if (state == null) {
         val msg = s"${optimizer.getClass.getName} failed."
         logError(msg)
@@ -209,7 +219,13 @@ class LinearRegression(override val uid: String)
 
     if (handlePersistence) instances.unpersist()
 
-    copyValues(new LinearRegressionModel(uid, weights, intercept))
+    val model = copyValues(new LinearRegressionModel(uid, weights, intercept))
+    val trainingSummary = new LinearRegressionTrainingSummary(
+      model.transform(dataset).select($(predictionCol), $(labelCol)),
+      $(predictionCol),
+      $(labelCol),
+      objectiveHistory)
+    model.setSummary(trainingSummary)
   }
 
   override def copy(extra: ParamMap): LinearRegression = defaultCopy(extra)
@@ -227,13 +243,124 @@ class LinearRegressionModel private[ml] (
   extends RegressionModel[Vector, LinearRegressionModel]
   with LinearRegressionParams {
 
+  private var trainingSummary: Option[LinearRegressionTrainingSummary] = None
+
+  /**
+   * Gets summary (e.g. residuals, mse, r-squared ) of model on training set. An exception is
+   * thrown if `trainingSummary == None`.
+   */
+  def summary: LinearRegressionTrainingSummary = trainingSummary match {
+    case Some(summ) => summ
+    case None =>
+      throw new SparkException(
+        "No training summary available for this LinearRegressionModel",
+        new NullPointerException())
+  }
+
+  private[regression] def setSummary(summary: LinearRegressionTrainingSummary): this.type = {
+    this.trainingSummary = Some(summary)
+    this
+  }
+
+  /** Indicates whether a training summary exists for this model instance. */
+  def hasSummary: Boolean = trainingSummary.isDefined
+
+  /**
+   * Evaluates the model on a testset.
+   * @param dataset Test dataset to evaluate model on.
+   */
+  // TODO: decide on a good name before exposing to public API
+  private[regression] def evaluate(dataset: DataFrame): LinearRegressionSummary = {
+    val t = udf { features: Vector => predict(features) }
+    val predictionAndObservations = dataset
+      .select(col($(labelCol)), t(col($(featuresCol))).as($(predictionCol)))
+
+    new LinearRegressionSummary(predictionAndObservations, $(predictionCol), $(labelCol))
+  }
+
   override protected def predict(features: Vector): Double = {
     dot(features, weights) + intercept
   }
 
   override def copy(extra: ParamMap): LinearRegressionModel = {
-    copyValues(new LinearRegressionModel(uid, weights, intercept), extra)
+    val newModel = copyValues(new LinearRegressionModel(uid, weights, intercept))
+    if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get)
+    newModel
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Linear regression training results.
+ * @param predictions predictions outputted by the model's `transform` method.
+ * @param objectiveHistory objective function (scaled loss + regularization) at each iteration.
+ */
+@Experimental
+class LinearRegressionTrainingSummary private[regression] (
+    predictions: DataFrame,
+    predictionCol: String,
+    labelCol: String,
+    val objectiveHistory: Array[Double])
+  extends LinearRegressionSummary(predictions, predictionCol, labelCol) {
+
+  /** Number of training iterations until termination */
+  val totalIterations = objectiveHistory.length
+
+}
+
+/**
+ * :: Experimental ::
+ * Linear regression results evaluated on a dataset.
+ * @param predictions predictions outputted by the model's `transform` method.
+ */
+@Experimental
+class LinearRegressionSummary private[regression] (
+    @transient val predictions: DataFrame,
+    val predictionCol: String,
+    val labelCol: String) extends Serializable {
+
+  @transient private val metrics = new RegressionMetrics(
+    predictions
+      .select(predictionCol, labelCol)
+      .map { case Row(pred: Double, label: Double) => (pred, label) } )
+
+  /**
+   * Returns the explained variance regression score.
+   * explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
+   * Reference: [[http://en.wikipedia.org/wiki/Explained_variation]]
+   */
+  val explainedVariance: Double = metrics.explainedVariance
+
+  /**
+   * Returns the mean absolute error, which is a risk function corresponding to the
+   * expected value of the absolute error loss or l1-norm loss.
+   */
+  val meanAbsoluteError: Double = metrics.meanAbsoluteError
+
+  /**
+   * Returns the mean squared error, which is a risk function corresponding to the
+   * expected value of the squared error loss or quadratic loss.
+   */
+  val meanSquaredError: Double = metrics.meanSquaredError
+
+  /**
+   * Returns the root mean squared error, which is defined as the square root of
+   * the mean squared error.
+   */
+  val rootMeanSquaredError: Double = metrics.rootMeanSquaredError
+
+  /**
+   * Returns R^2^, the coefficient of determination.
+   * Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
+   */
+  val r2: Double = metrics.r2
+
+  /** Residuals (predicted value - label value) */
+  @transient lazy val residuals: DataFrame = {
+    val t = udf { (pred: Double, label: Double) => pred - label}
+    predictions.select(t(col(predictionCol), col(labelCol)).as("residuals"))
   }
+
 }
 
 /**
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 4f6a57739558b..cf120cf2a4b47 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -289,4 +289,63 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
         assert(prediction1 ~== prediction2 relTol 1E-5)
     }
   }
+
+  test("linear regression model training summary") {
+    val trainer = new LinearRegression
+    val model = trainer.fit(dataset)
+
+    // Training results for the model should be available
+    assert(model.hasSummary)
+
+    // Residuals in [[LinearRegressionResults]] should equal those manually computed
+    val expectedResiduals = dataset.select("features", "label")
+      .map { case Row(features: DenseVector, label: Double) =>
+      val prediction =
+        features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept
+      prediction - label
+    }
+      .zip(model.summary.residuals.map(_.getDouble(0)))
+      .collect()
+      .foreach { case (manualResidual: Double, resultResidual: Double) =>
+      assert(manualResidual ~== resultResidual relTol 1E-5)
+    }
+
+    /*
+       Use the following R code to generate model training results.
+
+       predictions <- predict(fit, newx=features)
+       residuals <- predictions - label
+       > mean(residuals^2) # MSE
+       [1] 0.009720325
+       > mean(abs(residuals)) # MAD
+       [1] 0.07863206
+       > cor(predictions, label)^2# r^2
+               [,1]
+       s0 0.9998749
+     */
+    assert(model.summary.meanSquaredError ~== 0.00972035 relTol 1E-5)
+    assert(model.summary.meanAbsoluteError ~== 0.07863206  relTol 1E-5)
+    assert(model.summary.r2 ~== 0.9998749 relTol 1E-5)
+
+    // Objective function should be monotonically decreasing for linear regression
+    assert(
+      model.summary
+        .objectiveHistory
+        .sliding(2)
+        .forall(x => x(0) >= x(1)))
+  }
+
+  test("linear regression model testset evaluation summary") {
+    val trainer = new LinearRegression
+    val model = trainer.fit(dataset)
+
+    // Evaluating on training dataset should yield results summary equal to training summary
+    val testSummary = model.evaluate(dataset)
+    assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5)
+    assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5)
+    model.summary.residuals.select("residuals").collect()
+      .zip(testSummary.residuals.select("residuals").collect())
+      .forall { case (Row(r1: Double), Row(r2: Double)) => r1 ~== r2 relTol 1E-5 }
+  }
+
 }

From 2d45571fcb002cc9f03056c5a3f14493b83315a4 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Thu, 9 Jul 2015 17:09:16 -0700
Subject: [PATCH 103/549] [SPARK-8959] [SQL] [HOTFIX] Removes parquet-thrift
 and libthrift dependencies

These two dependencies were introduced in #7231 to help testing Parquet compatibility with `parquet-thrift`. However, they somehow crash the Scala compiler in Maven builds.

This PR fixes this issue by:

1. Removing these two dependencies, and
2. Instead of generating the testing Parquet file programmatically, checking in an actual testing Parquet file generated by `parquet-thrift` as a test resource.

This is just a quick fix to bring back Maven builds. Need to figure out the root case as binary Parquet files are harder to maintain.

Author: Cheng Lian <lian@databricks.com>

Closes #7330 from liancheng/spark-8959 and squashes the following commits:

cf69512 [Cheng Lian] Brings back Maven builds
---
 pom.xml                                       |   14 -
 sql/core/pom.xml                              |   10 -
 .../spark/sql/parquet/test/thrift/Nested.java |  541 ----
 .../test/thrift/ParquetThriftCompat.java      | 2808 -----------------
 .../spark/sql/parquet/test/thrift/Suit.java   |   51 -
 .../parquet-thrift-compat.snappy.parquet      |  Bin 0 -> 10550 bytes
 .../ParquetThriftCompatibilitySuite.scala     |   78 +-
 7 files changed, 8 insertions(+), 3494 deletions(-)
 delete mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java
 delete mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java
 delete mode 100644 sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java
 create mode 100755 sql/core/src/test/resources/parquet-thrift-compat.snappy.parquet

diff --git a/pom.xml b/pom.xml
index 529e47f8b5253..1eda108dc065b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -161,7 +161,6 @@
     <fasterxml.jackson.version>2.4.4</fasterxml.jackson.version>
     <snappy.version>1.1.1.7</snappy.version>
     <netlib.java.version>1.1.2</netlib.java.version>
-    <thrift.version>0.9.2</thrift.version>
     <!-- For maven shade plugin (see SPARK-8819) -->
     <create.dependency.reduced.pom>false</create.dependency.reduced.pom>
 
@@ -181,7 +180,6 @@
     <hive.deps.scope>compile</hive.deps.scope>
     <parquet.deps.scope>compile</parquet.deps.scope>
     <parquet.test.deps.scope>test</parquet.test.deps.scope>
-    <thrift.test.deps.scope>test</thrift.test.deps.scope>
 
     <!--
       Overridable test home. So that you can call individual pom files directly without
@@ -1122,18 +1120,6 @@
         <version>${parquet.version}</version>
         <scope>${parquet.test.deps.scope}</scope>
       </dependency>
-      <dependency>
-        <groupId>org.apache.parquet</groupId>
-        <artifactId>parquet-thrift</artifactId>
-        <version>${parquet.version}</version>
-        <scope>${parquet.test.deps.scope}</scope>
-      </dependency>
-      <dependency>
-        <groupId>org.apache.thrift</groupId>
-        <artifactId>libthrift</artifactId>
-        <version>${thrift.version}</version>
-        <scope>${thrift.test.deps.scope}</scope>
-      </dependency>
       <dependency>
         <groupId>org.apache.flume</groupId>
         <artifactId>flume-ng-core</artifactId>
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index f90099f22d4bd..2c03fee9dbd71 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -106,16 +106,6 @@
       <artifactId>parquet-avro</artifactId>
       <scope>test</scope>
     </dependency>
-    <dependency>
-      <groupId>org.apache.parquet</groupId>
-      <artifactId>parquet-thrift</artifactId>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.thrift</groupId>
-      <artifactId>libthrift</artifactId>
-      <scope>test</scope>
-    </dependency>
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java
deleted file mode 100644
index 281e60cc3ae34..0000000000000
--- a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Nested.java
+++ /dev/null
@@ -1,541 +0,0 @@
-/**
- * Autogenerated by Thrift Compiler (0.9.2)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- *  @generated
- */
-package org.apache.spark.sql.parquet.test.thrift;
-
-import org.apache.thrift.scheme.IScheme;
-import org.apache.thrift.scheme.SchemeFactory;
-import org.apache.thrift.scheme.StandardScheme;
-
-import org.apache.thrift.scheme.TupleScheme;
-import org.apache.thrift.protocol.TTupleProtocol;
-import org.apache.thrift.protocol.TProtocolException;
-import org.apache.thrift.EncodingUtils;
-import org.apache.thrift.TException;
-import org.apache.thrift.async.AsyncMethodCallback;
-import org.apache.thrift.server.AbstractNonblockingServer.*;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Map;
-import java.util.HashMap;
-import java.util.EnumMap;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.EnumSet;
-import java.util.Collections;
-import java.util.BitSet;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import javax.annotation.Generated;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-@SuppressWarnings({"cast", "rawtypes", "serial", "unchecked"})
-@Generated(value = "Autogenerated by Thrift Compiler (0.9.2)", date = "2015-7-7")
-public class Nested implements org.apache.thrift.TBase<Nested, Nested._Fields>, java.io.Serializable, Cloneable, Comparable<Nested> {
-  private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("Nested");
-
-  private static final org.apache.thrift.protocol.TField NESTED_INTS_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("nestedIntsColumn", org.apache.thrift.protocol.TType.LIST, (short)1);
-  private static final org.apache.thrift.protocol.TField NESTED_STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("nestedStringColumn", org.apache.thrift.protocol.TType.STRING, (short)2);
-
-  private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
-  static {
-    schemes.put(StandardScheme.class, new NestedStandardSchemeFactory());
-    schemes.put(TupleScheme.class, new NestedTupleSchemeFactory());
-  }
-
-  public List<Integer> nestedIntsColumn; // required
-  public String nestedStringColumn; // required
-
-  /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
-  public enum _Fields implements org.apache.thrift.TFieldIdEnum {
-    NESTED_INTS_COLUMN((short)1, "nestedIntsColumn"),
-    NESTED_STRING_COLUMN((short)2, "nestedStringColumn");
-
-    private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
-
-    static {
-      for (_Fields field : EnumSet.allOf(_Fields.class)) {
-        byName.put(field.getFieldName(), field);
-      }
-    }
-
-    /**
-     * Find the _Fields constant that matches fieldId, or null if its not found.
-     */
-    public static _Fields findByThriftId(int fieldId) {
-      switch(fieldId) {
-        case 1: // NESTED_INTS_COLUMN
-          return NESTED_INTS_COLUMN;
-        case 2: // NESTED_STRING_COLUMN
-          return NESTED_STRING_COLUMN;
-        default:
-          return null;
-      }
-    }
-
-    /**
-     * Find the _Fields constant that matches fieldId, throwing an exception
-     * if it is not found.
-     */
-    public static _Fields findByThriftIdOrThrow(int fieldId) {
-      _Fields fields = findByThriftId(fieldId);
-      if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!");
-      return fields;
-    }
-
-    /**
-     * Find the _Fields constant that matches name, or null if its not found.
-     */
-    public static _Fields findByName(String name) {
-      return byName.get(name);
-    }
-
-    private final short _thriftId;
-    private final String _fieldName;
-
-    _Fields(short thriftId, String fieldName) {
-      _thriftId = thriftId;
-      _fieldName = fieldName;
-    }
-
-    public short getThriftFieldId() {
-      return _thriftId;
-    }
-
-    public String getFieldName() {
-      return _fieldName;
-    }
-  }
-
-  // isset id assignments
-  public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
-  static {
-    Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
-    tmpMap.put(_Fields.NESTED_INTS_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("nestedIntsColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, 
-            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32))));
-    tmpMap.put(_Fields.NESTED_STRING_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("nestedStringColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
-    metaDataMap = Collections.unmodifiableMap(tmpMap);
-    org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(Nested.class, metaDataMap);
-  }
-
-  public Nested() {
-  }
-
-  public Nested(
-    List<Integer> nestedIntsColumn,
-    String nestedStringColumn)
-  {
-    this();
-    this.nestedIntsColumn = nestedIntsColumn;
-    this.nestedStringColumn = nestedStringColumn;
-  }
-
-  /**
-   * Performs a deep copy on <i>other</i>.
-   */
-  public Nested(Nested other) {
-    if (other.isSetNestedIntsColumn()) {
-      List<Integer> __this__nestedIntsColumn = new ArrayList<Integer>(other.nestedIntsColumn);
-      this.nestedIntsColumn = __this__nestedIntsColumn;
-    }
-    if (other.isSetNestedStringColumn()) {
-      this.nestedStringColumn = other.nestedStringColumn;
-    }
-  }
-
-  public Nested deepCopy() {
-    return new Nested(this);
-  }
-
-  @Override
-  public void clear() {
-    this.nestedIntsColumn = null;
-    this.nestedStringColumn = null;
-  }
-
-  public int getNestedIntsColumnSize() {
-    return (this.nestedIntsColumn == null) ? 0 : this.nestedIntsColumn.size();
-  }
-
-  public java.util.Iterator<Integer> getNestedIntsColumnIterator() {
-    return (this.nestedIntsColumn == null) ? null : this.nestedIntsColumn.iterator();
-  }
-
-  public void addToNestedIntsColumn(int elem) {
-    if (this.nestedIntsColumn == null) {
-      this.nestedIntsColumn = new ArrayList<Integer>();
-    }
-    this.nestedIntsColumn.add(elem);
-  }
-
-  public List<Integer> getNestedIntsColumn() {
-    return this.nestedIntsColumn;
-  }
-
-  public Nested setNestedIntsColumn(List<Integer> nestedIntsColumn) {
-    this.nestedIntsColumn = nestedIntsColumn;
-    return this;
-  }
-
-  public void unsetNestedIntsColumn() {
-    this.nestedIntsColumn = null;
-  }
-
-  /** Returns true if field nestedIntsColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetNestedIntsColumn() {
-    return this.nestedIntsColumn != null;
-  }
-
-  public void setNestedIntsColumnIsSet(boolean value) {
-    if (!value) {
-      this.nestedIntsColumn = null;
-    }
-  }
-
-  public String getNestedStringColumn() {
-    return this.nestedStringColumn;
-  }
-
-  public Nested setNestedStringColumn(String nestedStringColumn) {
-    this.nestedStringColumn = nestedStringColumn;
-    return this;
-  }
-
-  public void unsetNestedStringColumn() {
-    this.nestedStringColumn = null;
-  }
-
-  /** Returns true if field nestedStringColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetNestedStringColumn() {
-    return this.nestedStringColumn != null;
-  }
-
-  public void setNestedStringColumnIsSet(boolean value) {
-    if (!value) {
-      this.nestedStringColumn = null;
-    }
-  }
-
-  public void setFieldValue(_Fields field, Object value) {
-    switch (field) {
-    case NESTED_INTS_COLUMN:
-      if (value == null) {
-        unsetNestedIntsColumn();
-      } else {
-        setNestedIntsColumn((List<Integer>)value);
-      }
-      break;
-
-    case NESTED_STRING_COLUMN:
-      if (value == null) {
-        unsetNestedStringColumn();
-      } else {
-        setNestedStringColumn((String)value);
-      }
-      break;
-
-    }
-  }
-
-  public Object getFieldValue(_Fields field) {
-    switch (field) {
-    case NESTED_INTS_COLUMN:
-      return getNestedIntsColumn();
-
-    case NESTED_STRING_COLUMN:
-      return getNestedStringColumn();
-
-    }
-    throw new IllegalStateException();
-  }
-
-  /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */
-  public boolean isSet(_Fields field) {
-    if (field == null) {
-      throw new IllegalArgumentException();
-    }
-
-    switch (field) {
-    case NESTED_INTS_COLUMN:
-      return isSetNestedIntsColumn();
-    case NESTED_STRING_COLUMN:
-      return isSetNestedStringColumn();
-    }
-    throw new IllegalStateException();
-  }
-
-  @Override
-  public boolean equals(Object that) {
-    if (that == null)
-      return false;
-    if (that instanceof Nested)
-      return this.equals((Nested)that);
-    return false;
-  }
-
-  public boolean equals(Nested that) {
-    if (that == null)
-      return false;
-
-    boolean this_present_nestedIntsColumn = true && this.isSetNestedIntsColumn();
-    boolean that_present_nestedIntsColumn = true && that.isSetNestedIntsColumn();
-    if (this_present_nestedIntsColumn || that_present_nestedIntsColumn) {
-      if (!(this_present_nestedIntsColumn && that_present_nestedIntsColumn))
-        return false;
-      if (!this.nestedIntsColumn.equals(that.nestedIntsColumn))
-        return false;
-    }
-
-    boolean this_present_nestedStringColumn = true && this.isSetNestedStringColumn();
-    boolean that_present_nestedStringColumn = true && that.isSetNestedStringColumn();
-    if (this_present_nestedStringColumn || that_present_nestedStringColumn) {
-      if (!(this_present_nestedStringColumn && that_present_nestedStringColumn))
-        return false;
-      if (!this.nestedStringColumn.equals(that.nestedStringColumn))
-        return false;
-    }
-
-    return true;
-  }
-
-  @Override
-  public int hashCode() {
-    List<Object> list = new ArrayList<Object>();
-
-    boolean present_nestedIntsColumn = true && (isSetNestedIntsColumn());
-    list.add(present_nestedIntsColumn);
-    if (present_nestedIntsColumn)
-      list.add(nestedIntsColumn);
-
-    boolean present_nestedStringColumn = true && (isSetNestedStringColumn());
-    list.add(present_nestedStringColumn);
-    if (present_nestedStringColumn)
-      list.add(nestedStringColumn);
-
-    return list.hashCode();
-  }
-
-  @Override
-  public int compareTo(Nested other) {
-    if (!getClass().equals(other.getClass())) {
-      return getClass().getName().compareTo(other.getClass().getName());
-    }
-
-    int lastComparison = 0;
-
-    lastComparison = Boolean.valueOf(isSetNestedIntsColumn()).compareTo(other.isSetNestedIntsColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetNestedIntsColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.nestedIntsColumn, other.nestedIntsColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetNestedStringColumn()).compareTo(other.isSetNestedStringColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetNestedStringColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.nestedStringColumn, other.nestedStringColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    return 0;
-  }
-
-  public _Fields fieldForId(int fieldId) {
-    return _Fields.findByThriftId(fieldId);
-  }
-
-  public void read(org.apache.thrift.protocol.TProtocol iprot) throws org.apache.thrift.TException {
-    schemes.get(iprot.getScheme()).getScheme().read(iprot, this);
-  }
-
-  public void write(org.apache.thrift.protocol.TProtocol oprot) throws org.apache.thrift.TException {
-    schemes.get(oprot.getScheme()).getScheme().write(oprot, this);
-  }
-
-  @Override
-  public String toString() {
-    StringBuilder sb = new StringBuilder("Nested(");
-    boolean first = true;
-
-    sb.append("nestedIntsColumn:");
-    if (this.nestedIntsColumn == null) {
-      sb.append("null");
-    } else {
-      sb.append(this.nestedIntsColumn);
-    }
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("nestedStringColumn:");
-    if (this.nestedStringColumn == null) {
-      sb.append("null");
-    } else {
-      sb.append(this.nestedStringColumn);
-    }
-    first = false;
-    sb.append(")");
-    return sb.toString();
-  }
-
-  public void validate() throws org.apache.thrift.TException {
-    // check for required fields
-    if (nestedIntsColumn == null) {
-      throw new org.apache.thrift.protocol.TProtocolException("Required field 'nestedIntsColumn' was not present! Struct: " + toString());
-    }
-    if (nestedStringColumn == null) {
-      throw new org.apache.thrift.protocol.TProtocolException("Required field 'nestedStringColumn' was not present! Struct: " + toString());
-    }
-    // check for sub-struct validity
-  }
-
-  private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
-    try {
-      write(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(out)));
-    } catch (org.apache.thrift.TException te) {
-      throw new java.io.IOException(te);
-    }
-  }
-
-  private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
-    try {
-      read(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(in)));
-    } catch (org.apache.thrift.TException te) {
-      throw new java.io.IOException(te);
-    }
-  }
-
-  private static class NestedStandardSchemeFactory implements SchemeFactory {
-    public NestedStandardScheme getScheme() {
-      return new NestedStandardScheme();
-    }
-  }
-
-  private static class NestedStandardScheme extends StandardScheme<Nested> {
-
-    public void read(org.apache.thrift.protocol.TProtocol iprot, Nested struct) throws org.apache.thrift.TException {
-      org.apache.thrift.protocol.TField schemeField;
-      iprot.readStructBegin();
-      while (true)
-      {
-        schemeField = iprot.readFieldBegin();
-        if (schemeField.type == org.apache.thrift.protocol.TType.STOP) { 
-          break;
-        }
-        switch (schemeField.id) {
-          case 1: // NESTED_INTS_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
-              {
-                org.apache.thrift.protocol.TList _list0 = iprot.readListBegin();
-                struct.nestedIntsColumn = new ArrayList<Integer>(_list0.size);
-                int _elem1;
-                for (int _i2 = 0; _i2 < _list0.size; ++_i2)
-                {
-                  _elem1 = iprot.readI32();
-                  struct.nestedIntsColumn.add(_elem1);
-                }
-                iprot.readListEnd();
-              }
-              struct.setNestedIntsColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 2: // NESTED_STRING_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
-              struct.nestedStringColumn = iprot.readString();
-              struct.setNestedStringColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          default:
-            org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-        }
-        iprot.readFieldEnd();
-      }
-      iprot.readStructEnd();
-
-      // check for required fields of primitive type, which can't be checked in the validate method
-      struct.validate();
-    }
-
-    public void write(org.apache.thrift.protocol.TProtocol oprot, Nested struct) throws org.apache.thrift.TException {
-      struct.validate();
-
-      oprot.writeStructBegin(STRUCT_DESC);
-      if (struct.nestedIntsColumn != null) {
-        oprot.writeFieldBegin(NESTED_INTS_COLUMN_FIELD_DESC);
-        {
-          oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.I32, struct.nestedIntsColumn.size()));
-          for (int _iter3 : struct.nestedIntsColumn)
-          {
-            oprot.writeI32(_iter3);
-          }
-          oprot.writeListEnd();
-        }
-        oprot.writeFieldEnd();
-      }
-      if (struct.nestedStringColumn != null) {
-        oprot.writeFieldBegin(NESTED_STRING_COLUMN_FIELD_DESC);
-        oprot.writeString(struct.nestedStringColumn);
-        oprot.writeFieldEnd();
-      }
-      oprot.writeFieldStop();
-      oprot.writeStructEnd();
-    }
-
-  }
-
-  private static class NestedTupleSchemeFactory implements SchemeFactory {
-    public NestedTupleScheme getScheme() {
-      return new NestedTupleScheme();
-    }
-  }
-
-  private static class NestedTupleScheme extends TupleScheme<Nested> {
-
-    @Override
-    public void write(org.apache.thrift.protocol.TProtocol prot, Nested struct) throws org.apache.thrift.TException {
-      TTupleProtocol oprot = (TTupleProtocol) prot;
-      {
-        oprot.writeI32(struct.nestedIntsColumn.size());
-        for (int _iter4 : struct.nestedIntsColumn)
-        {
-          oprot.writeI32(_iter4);
-        }
-      }
-      oprot.writeString(struct.nestedStringColumn);
-    }
-
-    @Override
-    public void read(org.apache.thrift.protocol.TProtocol prot, Nested struct) throws org.apache.thrift.TException {
-      TTupleProtocol iprot = (TTupleProtocol) prot;
-      {
-        org.apache.thrift.protocol.TList _list5 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.I32, iprot.readI32());
-        struct.nestedIntsColumn = new ArrayList<Integer>(_list5.size);
-        int _elem6;
-        for (int _i7 = 0; _i7 < _list5.size; ++_i7)
-        {
-          _elem6 = iprot.readI32();
-          struct.nestedIntsColumn.add(_elem6);
-        }
-      }
-      struct.setNestedIntsColumnIsSet(true);
-      struct.nestedStringColumn = iprot.readString();
-      struct.setNestedStringColumnIsSet(true);
-    }
-  }
-
-}
-
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java
deleted file mode 100644
index 326ae9dbaa0d1..0000000000000
--- a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java
+++ /dev/null
@@ -1,2808 +0,0 @@
-/**
- * Autogenerated by Thrift Compiler (0.9.2)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- *  @generated
- */
-package org.apache.spark.sql.parquet.test.thrift;
-
-import org.apache.thrift.scheme.IScheme;
-import org.apache.thrift.scheme.SchemeFactory;
-import org.apache.thrift.scheme.StandardScheme;
-
-import org.apache.thrift.scheme.TupleScheme;
-import org.apache.thrift.protocol.TTupleProtocol;
-import org.apache.thrift.protocol.TProtocolException;
-import org.apache.thrift.EncodingUtils;
-import org.apache.thrift.TException;
-import org.apache.thrift.async.AsyncMethodCallback;
-import org.apache.thrift.server.AbstractNonblockingServer.*;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Map;
-import java.util.HashMap;
-import java.util.EnumMap;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.EnumSet;
-import java.util.Collections;
-import java.util.BitSet;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import javax.annotation.Generated;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-@SuppressWarnings({"cast", "rawtypes", "serial", "unchecked"})
-/**
- * This is a test struct for testing parquet-thrift compatibility.
- */
-@Generated(value = "Autogenerated by Thrift Compiler (0.9.2)", date = "2015-7-7")
-public class ParquetThriftCompat implements org.apache.thrift.TBase<ParquetThriftCompat, ParquetThriftCompat._Fields>, java.io.Serializable, Cloneable, Comparable<ParquetThriftCompat> {
-  private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct("ParquetThriftCompat");
-
-  private static final org.apache.thrift.protocol.TField BOOL_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("boolColumn", org.apache.thrift.protocol.TType.BOOL, (short)1);
-  private static final org.apache.thrift.protocol.TField BYTE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("byteColumn", org.apache.thrift.protocol.TType.BYTE, (short)2);
-  private static final org.apache.thrift.protocol.TField SHORT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("shortColumn", org.apache.thrift.protocol.TType.I16, (short)3);
-  private static final org.apache.thrift.protocol.TField INT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("intColumn", org.apache.thrift.protocol.TType.I32, (short)4);
-  private static final org.apache.thrift.protocol.TField LONG_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("longColumn", org.apache.thrift.protocol.TType.I64, (short)5);
-  private static final org.apache.thrift.protocol.TField DOUBLE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("doubleColumn", org.apache.thrift.protocol.TType.DOUBLE, (short)6);
-  private static final org.apache.thrift.protocol.TField BINARY_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("binaryColumn", org.apache.thrift.protocol.TType.STRING, (short)7);
-  private static final org.apache.thrift.protocol.TField STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("stringColumn", org.apache.thrift.protocol.TType.STRING, (short)8);
-  private static final org.apache.thrift.protocol.TField ENUM_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("enumColumn", org.apache.thrift.protocol.TType.I32, (short)9);
-  private static final org.apache.thrift.protocol.TField MAYBE_BOOL_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeBoolColumn", org.apache.thrift.protocol.TType.BOOL, (short)10);
-  private static final org.apache.thrift.protocol.TField MAYBE_BYTE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeByteColumn", org.apache.thrift.protocol.TType.BYTE, (short)11);
-  private static final org.apache.thrift.protocol.TField MAYBE_SHORT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeShortColumn", org.apache.thrift.protocol.TType.I16, (short)12);
-  private static final org.apache.thrift.protocol.TField MAYBE_INT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeIntColumn", org.apache.thrift.protocol.TType.I32, (short)13);
-  private static final org.apache.thrift.protocol.TField MAYBE_LONG_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeLongColumn", org.apache.thrift.protocol.TType.I64, (short)14);
-  private static final org.apache.thrift.protocol.TField MAYBE_DOUBLE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeDoubleColumn", org.apache.thrift.protocol.TType.DOUBLE, (short)15);
-  private static final org.apache.thrift.protocol.TField MAYBE_BINARY_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeBinaryColumn", org.apache.thrift.protocol.TType.STRING, (short)16);
-  private static final org.apache.thrift.protocol.TField MAYBE_STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeStringColumn", org.apache.thrift.protocol.TType.STRING, (short)17);
-  private static final org.apache.thrift.protocol.TField MAYBE_ENUM_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("maybeEnumColumn", org.apache.thrift.protocol.TType.I32, (short)18);
-  private static final org.apache.thrift.protocol.TField STRINGS_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("stringsColumn", org.apache.thrift.protocol.TType.LIST, (short)19);
-  private static final org.apache.thrift.protocol.TField INT_SET_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("intSetColumn", org.apache.thrift.protocol.TType.SET, (short)20);
-  private static final org.apache.thrift.protocol.TField INT_TO_STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("intToStringColumn", org.apache.thrift.protocol.TType.MAP, (short)21);
-  private static final org.apache.thrift.protocol.TField COMPLEX_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField("complexColumn", org.apache.thrift.protocol.TType.MAP, (short)22);
-
-  private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
-  static {
-    schemes.put(StandardScheme.class, new ParquetThriftCompatStandardSchemeFactory());
-    schemes.put(TupleScheme.class, new ParquetThriftCompatTupleSchemeFactory());
-  }
-
-  public boolean boolColumn; // required
-  public byte byteColumn; // required
-  public short shortColumn; // required
-  public int intColumn; // required
-  public long longColumn; // required
-  public double doubleColumn; // required
-  public ByteBuffer binaryColumn; // required
-  public String stringColumn; // required
-  /**
-   * 
-   * @see Suit
-   */
-  public Suit enumColumn; // required
-  public boolean maybeBoolColumn; // optional
-  public byte maybeByteColumn; // optional
-  public short maybeShortColumn; // optional
-  public int maybeIntColumn; // optional
-  public long maybeLongColumn; // optional
-  public double maybeDoubleColumn; // optional
-  public ByteBuffer maybeBinaryColumn; // optional
-  public String maybeStringColumn; // optional
-  /**
-   * 
-   * @see Suit
-   */
-  public Suit maybeEnumColumn; // optional
-  public List<String> stringsColumn; // required
-  public Set<Integer> intSetColumn; // required
-  public Map<Integer,String> intToStringColumn; // required
-  public Map<Integer,List<Nested>> complexColumn; // required
-
-  /** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
-  public enum _Fields implements org.apache.thrift.TFieldIdEnum {
-    BOOL_COLUMN((short)1, "boolColumn"),
-    BYTE_COLUMN((short)2, "byteColumn"),
-    SHORT_COLUMN((short)3, "shortColumn"),
-    INT_COLUMN((short)4, "intColumn"),
-    LONG_COLUMN((short)5, "longColumn"),
-    DOUBLE_COLUMN((short)6, "doubleColumn"),
-    BINARY_COLUMN((short)7, "binaryColumn"),
-    STRING_COLUMN((short)8, "stringColumn"),
-    /**
-     * 
-     * @see Suit
-     */
-    ENUM_COLUMN((short)9, "enumColumn"),
-    MAYBE_BOOL_COLUMN((short)10, "maybeBoolColumn"),
-    MAYBE_BYTE_COLUMN((short)11, "maybeByteColumn"),
-    MAYBE_SHORT_COLUMN((short)12, "maybeShortColumn"),
-    MAYBE_INT_COLUMN((short)13, "maybeIntColumn"),
-    MAYBE_LONG_COLUMN((short)14, "maybeLongColumn"),
-    MAYBE_DOUBLE_COLUMN((short)15, "maybeDoubleColumn"),
-    MAYBE_BINARY_COLUMN((short)16, "maybeBinaryColumn"),
-    MAYBE_STRING_COLUMN((short)17, "maybeStringColumn"),
-    /**
-     * 
-     * @see Suit
-     */
-    MAYBE_ENUM_COLUMN((short)18, "maybeEnumColumn"),
-    STRINGS_COLUMN((short)19, "stringsColumn"),
-    INT_SET_COLUMN((short)20, "intSetColumn"),
-    INT_TO_STRING_COLUMN((short)21, "intToStringColumn"),
-    COMPLEX_COLUMN((short)22, "complexColumn");
-
-    private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
-
-    static {
-      for (_Fields field : EnumSet.allOf(_Fields.class)) {
-        byName.put(field.getFieldName(), field);
-      }
-    }
-
-    /**
-     * Find the _Fields constant that matches fieldId, or null if its not found.
-     */
-    public static _Fields findByThriftId(int fieldId) {
-      switch(fieldId) {
-        case 1: // BOOL_COLUMN
-          return BOOL_COLUMN;
-        case 2: // BYTE_COLUMN
-          return BYTE_COLUMN;
-        case 3: // SHORT_COLUMN
-          return SHORT_COLUMN;
-        case 4: // INT_COLUMN
-          return INT_COLUMN;
-        case 5: // LONG_COLUMN
-          return LONG_COLUMN;
-        case 6: // DOUBLE_COLUMN
-          return DOUBLE_COLUMN;
-        case 7: // BINARY_COLUMN
-          return BINARY_COLUMN;
-        case 8: // STRING_COLUMN
-          return STRING_COLUMN;
-        case 9: // ENUM_COLUMN
-          return ENUM_COLUMN;
-        case 10: // MAYBE_BOOL_COLUMN
-          return MAYBE_BOOL_COLUMN;
-        case 11: // MAYBE_BYTE_COLUMN
-          return MAYBE_BYTE_COLUMN;
-        case 12: // MAYBE_SHORT_COLUMN
-          return MAYBE_SHORT_COLUMN;
-        case 13: // MAYBE_INT_COLUMN
-          return MAYBE_INT_COLUMN;
-        case 14: // MAYBE_LONG_COLUMN
-          return MAYBE_LONG_COLUMN;
-        case 15: // MAYBE_DOUBLE_COLUMN
-          return MAYBE_DOUBLE_COLUMN;
-        case 16: // MAYBE_BINARY_COLUMN
-          return MAYBE_BINARY_COLUMN;
-        case 17: // MAYBE_STRING_COLUMN
-          return MAYBE_STRING_COLUMN;
-        case 18: // MAYBE_ENUM_COLUMN
-          return MAYBE_ENUM_COLUMN;
-        case 19: // STRINGS_COLUMN
-          return STRINGS_COLUMN;
-        case 20: // INT_SET_COLUMN
-          return INT_SET_COLUMN;
-        case 21: // INT_TO_STRING_COLUMN
-          return INT_TO_STRING_COLUMN;
-        case 22: // COMPLEX_COLUMN
-          return COMPLEX_COLUMN;
-        default:
-          return null;
-      }
-    }
-
-    /**
-     * Find the _Fields constant that matches fieldId, throwing an exception
-     * if it is not found.
-     */
-    public static _Fields findByThriftIdOrThrow(int fieldId) {
-      _Fields fields = findByThriftId(fieldId);
-      if (fields == null) throw new IllegalArgumentException("Field " + fieldId + " doesn't exist!");
-      return fields;
-    }
-
-    /**
-     * Find the _Fields constant that matches name, or null if its not found.
-     */
-    public static _Fields findByName(String name) {
-      return byName.get(name);
-    }
-
-    private final short _thriftId;
-    private final String _fieldName;
-
-    _Fields(short thriftId, String fieldName) {
-      _thriftId = thriftId;
-      _fieldName = fieldName;
-    }
-
-    public short getThriftFieldId() {
-      return _thriftId;
-    }
-
-    public String getFieldName() {
-      return _fieldName;
-    }
-  }
-
-  // isset id assignments
-  private static final int __BOOLCOLUMN_ISSET_ID = 0;
-  private static final int __BYTECOLUMN_ISSET_ID = 1;
-  private static final int __SHORTCOLUMN_ISSET_ID = 2;
-  private static final int __INTCOLUMN_ISSET_ID = 3;
-  private static final int __LONGCOLUMN_ISSET_ID = 4;
-  private static final int __DOUBLECOLUMN_ISSET_ID = 5;
-  private static final int __MAYBEBOOLCOLUMN_ISSET_ID = 6;
-  private static final int __MAYBEBYTECOLUMN_ISSET_ID = 7;
-  private static final int __MAYBESHORTCOLUMN_ISSET_ID = 8;
-  private static final int __MAYBEINTCOLUMN_ISSET_ID = 9;
-  private static final int __MAYBELONGCOLUMN_ISSET_ID = 10;
-  private static final int __MAYBEDOUBLECOLUMN_ISSET_ID = 11;
-  private short __isset_bitfield = 0;
-  private static final _Fields optionals[] = {_Fields.MAYBE_BOOL_COLUMN,_Fields.MAYBE_BYTE_COLUMN,_Fields.MAYBE_SHORT_COLUMN,_Fields.MAYBE_INT_COLUMN,_Fields.MAYBE_LONG_COLUMN,_Fields.MAYBE_DOUBLE_COLUMN,_Fields.MAYBE_BINARY_COLUMN,_Fields.MAYBE_STRING_COLUMN,_Fields.MAYBE_ENUM_COLUMN};
-  public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
-  static {
-    Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> tmpMap = new EnumMap<_Fields, org.apache.thrift.meta_data.FieldMetaData>(_Fields.class);
-    tmpMap.put(_Fields.BOOL_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("boolColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BOOL)));
-    tmpMap.put(_Fields.BYTE_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("byteColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BYTE)));
-    tmpMap.put(_Fields.SHORT_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("shortColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I16)));
-    tmpMap.put(_Fields.INT_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("intColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
-    tmpMap.put(_Fields.LONG_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("longColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I64)));
-    tmpMap.put(_Fields.DOUBLE_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("doubleColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.DOUBLE)));
-    tmpMap.put(_Fields.BINARY_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("binaryColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING        , true)));
-    tmpMap.put(_Fields.STRING_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("stringColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
-    tmpMap.put(_Fields.ENUM_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("enumColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.EnumMetaData(org.apache.thrift.protocol.TType.ENUM, Suit.class)));
-    tmpMap.put(_Fields.MAYBE_BOOL_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeBoolColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BOOL)));
-    tmpMap.put(_Fields.MAYBE_BYTE_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeByteColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.BYTE)));
-    tmpMap.put(_Fields.MAYBE_SHORT_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeShortColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I16)));
-    tmpMap.put(_Fields.MAYBE_INT_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeIntColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
-    tmpMap.put(_Fields.MAYBE_LONG_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeLongColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I64)));
-    tmpMap.put(_Fields.MAYBE_DOUBLE_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeDoubleColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.DOUBLE)));
-    tmpMap.put(_Fields.MAYBE_BINARY_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeBinaryColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING        , true)));
-    tmpMap.put(_Fields.MAYBE_STRING_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeStringColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING)));
-    tmpMap.put(_Fields.MAYBE_ENUM_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("maybeEnumColumn", org.apache.thrift.TFieldRequirementType.OPTIONAL, 
-        new org.apache.thrift.meta_data.EnumMetaData(org.apache.thrift.protocol.TType.ENUM, Suit.class)));
-    tmpMap.put(_Fields.STRINGS_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("stringsColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, 
-            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING))));
-    tmpMap.put(_Fields.INT_SET_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("intSetColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.SetMetaData(org.apache.thrift.protocol.TType.SET, 
-            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32))));
-    tmpMap.put(_Fields.INT_TO_STRING_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("intToStringColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.MapMetaData(org.apache.thrift.protocol.TType.MAP, 
-            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32), 
-            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.STRING))));
-    tmpMap.put(_Fields.COMPLEX_COLUMN, new org.apache.thrift.meta_data.FieldMetaData("complexColumn", org.apache.thrift.TFieldRequirementType.REQUIRED, 
-        new org.apache.thrift.meta_data.MapMetaData(org.apache.thrift.protocol.TType.MAP, 
-            new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32), 
-            new org.apache.thrift.meta_data.ListMetaData(org.apache.thrift.protocol.TType.LIST, 
-                new org.apache.thrift.meta_data.StructMetaData(org.apache.thrift.protocol.TType.STRUCT, Nested.class)))));
-    metaDataMap = Collections.unmodifiableMap(tmpMap);
-    org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(ParquetThriftCompat.class, metaDataMap);
-  }
-
-  public ParquetThriftCompat() {
-  }
-
-  public ParquetThriftCompat(
-    boolean boolColumn,
-    byte byteColumn,
-    short shortColumn,
-    int intColumn,
-    long longColumn,
-    double doubleColumn,
-    ByteBuffer binaryColumn,
-    String stringColumn,
-    Suit enumColumn,
-    List<String> stringsColumn,
-    Set<Integer> intSetColumn,
-    Map<Integer,String> intToStringColumn,
-    Map<Integer,List<Nested>> complexColumn)
-  {
-    this();
-    this.boolColumn = boolColumn;
-    setBoolColumnIsSet(true);
-    this.byteColumn = byteColumn;
-    setByteColumnIsSet(true);
-    this.shortColumn = shortColumn;
-    setShortColumnIsSet(true);
-    this.intColumn = intColumn;
-    setIntColumnIsSet(true);
-    this.longColumn = longColumn;
-    setLongColumnIsSet(true);
-    this.doubleColumn = doubleColumn;
-    setDoubleColumnIsSet(true);
-    this.binaryColumn = org.apache.thrift.TBaseHelper.copyBinary(binaryColumn);
-    this.stringColumn = stringColumn;
-    this.enumColumn = enumColumn;
-    this.stringsColumn = stringsColumn;
-    this.intSetColumn = intSetColumn;
-    this.intToStringColumn = intToStringColumn;
-    this.complexColumn = complexColumn;
-  }
-
-  /**
-   * Performs a deep copy on <i>other</i>.
-   */
-  public ParquetThriftCompat(ParquetThriftCompat other) {
-    __isset_bitfield = other.__isset_bitfield;
-    this.boolColumn = other.boolColumn;
-    this.byteColumn = other.byteColumn;
-    this.shortColumn = other.shortColumn;
-    this.intColumn = other.intColumn;
-    this.longColumn = other.longColumn;
-    this.doubleColumn = other.doubleColumn;
-    if (other.isSetBinaryColumn()) {
-      this.binaryColumn = org.apache.thrift.TBaseHelper.copyBinary(other.binaryColumn);
-    }
-    if (other.isSetStringColumn()) {
-      this.stringColumn = other.stringColumn;
-    }
-    if (other.isSetEnumColumn()) {
-      this.enumColumn = other.enumColumn;
-    }
-    this.maybeBoolColumn = other.maybeBoolColumn;
-    this.maybeByteColumn = other.maybeByteColumn;
-    this.maybeShortColumn = other.maybeShortColumn;
-    this.maybeIntColumn = other.maybeIntColumn;
-    this.maybeLongColumn = other.maybeLongColumn;
-    this.maybeDoubleColumn = other.maybeDoubleColumn;
-    if (other.isSetMaybeBinaryColumn()) {
-      this.maybeBinaryColumn = org.apache.thrift.TBaseHelper.copyBinary(other.maybeBinaryColumn);
-    }
-    if (other.isSetMaybeStringColumn()) {
-      this.maybeStringColumn = other.maybeStringColumn;
-    }
-    if (other.isSetMaybeEnumColumn()) {
-      this.maybeEnumColumn = other.maybeEnumColumn;
-    }
-    if (other.isSetStringsColumn()) {
-      List<String> __this__stringsColumn = new ArrayList<String>(other.stringsColumn);
-      this.stringsColumn = __this__stringsColumn;
-    }
-    if (other.isSetIntSetColumn()) {
-      Set<Integer> __this__intSetColumn = new HashSet<Integer>(other.intSetColumn);
-      this.intSetColumn = __this__intSetColumn;
-    }
-    if (other.isSetIntToStringColumn()) {
-      Map<Integer,String> __this__intToStringColumn = new HashMap<Integer,String>(other.intToStringColumn);
-      this.intToStringColumn = __this__intToStringColumn;
-    }
-    if (other.isSetComplexColumn()) {
-      Map<Integer,List<Nested>> __this__complexColumn = new HashMap<Integer,List<Nested>>(other.complexColumn.size());
-      for (Map.Entry<Integer, List<Nested>> other_element : other.complexColumn.entrySet()) {
-
-        Integer other_element_key = other_element.getKey();
-        List<Nested> other_element_value = other_element.getValue();
-
-        Integer __this__complexColumn_copy_key = other_element_key;
-
-        List<Nested> __this__complexColumn_copy_value = new ArrayList<Nested>(other_element_value.size());
-        for (Nested other_element_value_element : other_element_value) {
-          __this__complexColumn_copy_value.add(new Nested(other_element_value_element));
-        }
-
-        __this__complexColumn.put(__this__complexColumn_copy_key, __this__complexColumn_copy_value);
-      }
-      this.complexColumn = __this__complexColumn;
-    }
-  }
-
-  public ParquetThriftCompat deepCopy() {
-    return new ParquetThriftCompat(this);
-  }
-
-  @Override
-  public void clear() {
-    setBoolColumnIsSet(false);
-    this.boolColumn = false;
-    setByteColumnIsSet(false);
-    this.byteColumn = 0;
-    setShortColumnIsSet(false);
-    this.shortColumn = 0;
-    setIntColumnIsSet(false);
-    this.intColumn = 0;
-    setLongColumnIsSet(false);
-    this.longColumn = 0;
-    setDoubleColumnIsSet(false);
-    this.doubleColumn = 0.0;
-    this.binaryColumn = null;
-    this.stringColumn = null;
-    this.enumColumn = null;
-    setMaybeBoolColumnIsSet(false);
-    this.maybeBoolColumn = false;
-    setMaybeByteColumnIsSet(false);
-    this.maybeByteColumn = 0;
-    setMaybeShortColumnIsSet(false);
-    this.maybeShortColumn = 0;
-    setMaybeIntColumnIsSet(false);
-    this.maybeIntColumn = 0;
-    setMaybeLongColumnIsSet(false);
-    this.maybeLongColumn = 0;
-    setMaybeDoubleColumnIsSet(false);
-    this.maybeDoubleColumn = 0.0;
-    this.maybeBinaryColumn = null;
-    this.maybeStringColumn = null;
-    this.maybeEnumColumn = null;
-    this.stringsColumn = null;
-    this.intSetColumn = null;
-    this.intToStringColumn = null;
-    this.complexColumn = null;
-  }
-
-  public boolean isBoolColumn() {
-    return this.boolColumn;
-  }
-
-  public ParquetThriftCompat setBoolColumn(boolean boolColumn) {
-    this.boolColumn = boolColumn;
-    setBoolColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetBoolColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __BOOLCOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field boolColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetBoolColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __BOOLCOLUMN_ISSET_ID);
-  }
-
-  public void setBoolColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __BOOLCOLUMN_ISSET_ID, value);
-  }
-
-  public byte getByteColumn() {
-    return this.byteColumn;
-  }
-
-  public ParquetThriftCompat setByteColumn(byte byteColumn) {
-    this.byteColumn = byteColumn;
-    setByteColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetByteColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __BYTECOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field byteColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetByteColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __BYTECOLUMN_ISSET_ID);
-  }
-
-  public void setByteColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __BYTECOLUMN_ISSET_ID, value);
-  }
-
-  public short getShortColumn() {
-    return this.shortColumn;
-  }
-
-  public ParquetThriftCompat setShortColumn(short shortColumn) {
-    this.shortColumn = shortColumn;
-    setShortColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetShortColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __SHORTCOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field shortColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetShortColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __SHORTCOLUMN_ISSET_ID);
-  }
-
-  public void setShortColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __SHORTCOLUMN_ISSET_ID, value);
-  }
-
-  public int getIntColumn() {
-    return this.intColumn;
-  }
-
-  public ParquetThriftCompat setIntColumn(int intColumn) {
-    this.intColumn = intColumn;
-    setIntColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetIntColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __INTCOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field intColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetIntColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __INTCOLUMN_ISSET_ID);
-  }
-
-  public void setIntColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __INTCOLUMN_ISSET_ID, value);
-  }
-
-  public long getLongColumn() {
-    return this.longColumn;
-  }
-
-  public ParquetThriftCompat setLongColumn(long longColumn) {
-    this.longColumn = longColumn;
-    setLongColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetLongColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __LONGCOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field longColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetLongColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __LONGCOLUMN_ISSET_ID);
-  }
-
-  public void setLongColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __LONGCOLUMN_ISSET_ID, value);
-  }
-
-  public double getDoubleColumn() {
-    return this.doubleColumn;
-  }
-
-  public ParquetThriftCompat setDoubleColumn(double doubleColumn) {
-    this.doubleColumn = doubleColumn;
-    setDoubleColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetDoubleColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __DOUBLECOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field doubleColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetDoubleColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __DOUBLECOLUMN_ISSET_ID);
-  }
-
-  public void setDoubleColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __DOUBLECOLUMN_ISSET_ID, value);
-  }
-
-  public byte[] getBinaryColumn() {
-    setBinaryColumn(org.apache.thrift.TBaseHelper.rightSize(binaryColumn));
-    return binaryColumn == null ? null : binaryColumn.array();
-  }
-
-  public ByteBuffer bufferForBinaryColumn() {
-    return org.apache.thrift.TBaseHelper.copyBinary(binaryColumn);
-  }
-
-  public ParquetThriftCompat setBinaryColumn(byte[] binaryColumn) {
-    this.binaryColumn = binaryColumn == null ? (ByteBuffer)null : ByteBuffer.wrap(Arrays.copyOf(binaryColumn, binaryColumn.length));
-    return this;
-  }
-
-  public ParquetThriftCompat setBinaryColumn(ByteBuffer binaryColumn) {
-    this.binaryColumn = org.apache.thrift.TBaseHelper.copyBinary(binaryColumn);
-    return this;
-  }
-
-  public void unsetBinaryColumn() {
-    this.binaryColumn = null;
-  }
-
-  /** Returns true if field binaryColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetBinaryColumn() {
-    return this.binaryColumn != null;
-  }
-
-  public void setBinaryColumnIsSet(boolean value) {
-    if (!value) {
-      this.binaryColumn = null;
-    }
-  }
-
-  public String getStringColumn() {
-    return this.stringColumn;
-  }
-
-  public ParquetThriftCompat setStringColumn(String stringColumn) {
-    this.stringColumn = stringColumn;
-    return this;
-  }
-
-  public void unsetStringColumn() {
-    this.stringColumn = null;
-  }
-
-  /** Returns true if field stringColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetStringColumn() {
-    return this.stringColumn != null;
-  }
-
-  public void setStringColumnIsSet(boolean value) {
-    if (!value) {
-      this.stringColumn = null;
-    }
-  }
-
-  /**
-   * 
-   * @see Suit
-   */
-  public Suit getEnumColumn() {
-    return this.enumColumn;
-  }
-
-  /**
-   * 
-   * @see Suit
-   */
-  public ParquetThriftCompat setEnumColumn(Suit enumColumn) {
-    this.enumColumn = enumColumn;
-    return this;
-  }
-
-  public void unsetEnumColumn() {
-    this.enumColumn = null;
-  }
-
-  /** Returns true if field enumColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetEnumColumn() {
-    return this.enumColumn != null;
-  }
-
-  public void setEnumColumnIsSet(boolean value) {
-    if (!value) {
-      this.enumColumn = null;
-    }
-  }
-
-  public boolean isMaybeBoolColumn() {
-    return this.maybeBoolColumn;
-  }
-
-  public ParquetThriftCompat setMaybeBoolColumn(boolean maybeBoolColumn) {
-    this.maybeBoolColumn = maybeBoolColumn;
-    setMaybeBoolColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetMaybeBoolColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBEBOOLCOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field maybeBoolColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetMaybeBoolColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __MAYBEBOOLCOLUMN_ISSET_ID);
-  }
-
-  public void setMaybeBoolColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBEBOOLCOLUMN_ISSET_ID, value);
-  }
-
-  public byte getMaybeByteColumn() {
-    return this.maybeByteColumn;
-  }
-
-  public ParquetThriftCompat setMaybeByteColumn(byte maybeByteColumn) {
-    this.maybeByteColumn = maybeByteColumn;
-    setMaybeByteColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetMaybeByteColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBEBYTECOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field maybeByteColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetMaybeByteColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __MAYBEBYTECOLUMN_ISSET_ID);
-  }
-
-  public void setMaybeByteColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBEBYTECOLUMN_ISSET_ID, value);
-  }
-
-  public short getMaybeShortColumn() {
-    return this.maybeShortColumn;
-  }
-
-  public ParquetThriftCompat setMaybeShortColumn(short maybeShortColumn) {
-    this.maybeShortColumn = maybeShortColumn;
-    setMaybeShortColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetMaybeShortColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBESHORTCOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field maybeShortColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetMaybeShortColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __MAYBESHORTCOLUMN_ISSET_ID);
-  }
-
-  public void setMaybeShortColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBESHORTCOLUMN_ISSET_ID, value);
-  }
-
-  public int getMaybeIntColumn() {
-    return this.maybeIntColumn;
-  }
-
-  public ParquetThriftCompat setMaybeIntColumn(int maybeIntColumn) {
-    this.maybeIntColumn = maybeIntColumn;
-    setMaybeIntColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetMaybeIntColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBEINTCOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field maybeIntColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetMaybeIntColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __MAYBEINTCOLUMN_ISSET_ID);
-  }
-
-  public void setMaybeIntColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBEINTCOLUMN_ISSET_ID, value);
-  }
-
-  public long getMaybeLongColumn() {
-    return this.maybeLongColumn;
-  }
-
-  public ParquetThriftCompat setMaybeLongColumn(long maybeLongColumn) {
-    this.maybeLongColumn = maybeLongColumn;
-    setMaybeLongColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetMaybeLongColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBELONGCOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field maybeLongColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetMaybeLongColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __MAYBELONGCOLUMN_ISSET_ID);
-  }
-
-  public void setMaybeLongColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBELONGCOLUMN_ISSET_ID, value);
-  }
-
-  public double getMaybeDoubleColumn() {
-    return this.maybeDoubleColumn;
-  }
-
-  public ParquetThriftCompat setMaybeDoubleColumn(double maybeDoubleColumn) {
-    this.maybeDoubleColumn = maybeDoubleColumn;
-    setMaybeDoubleColumnIsSet(true);
-    return this;
-  }
-
-  public void unsetMaybeDoubleColumn() {
-    __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __MAYBEDOUBLECOLUMN_ISSET_ID);
-  }
-
-  /** Returns true if field maybeDoubleColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetMaybeDoubleColumn() {
-    return EncodingUtils.testBit(__isset_bitfield, __MAYBEDOUBLECOLUMN_ISSET_ID);
-  }
-
-  public void setMaybeDoubleColumnIsSet(boolean value) {
-    __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __MAYBEDOUBLECOLUMN_ISSET_ID, value);
-  }
-
-  public byte[] getMaybeBinaryColumn() {
-    setMaybeBinaryColumn(org.apache.thrift.TBaseHelper.rightSize(maybeBinaryColumn));
-    return maybeBinaryColumn == null ? null : maybeBinaryColumn.array();
-  }
-
-  public ByteBuffer bufferForMaybeBinaryColumn() {
-    return org.apache.thrift.TBaseHelper.copyBinary(maybeBinaryColumn);
-  }
-
-  public ParquetThriftCompat setMaybeBinaryColumn(byte[] maybeBinaryColumn) {
-    this.maybeBinaryColumn = maybeBinaryColumn == null ? (ByteBuffer)null : ByteBuffer.wrap(Arrays.copyOf(maybeBinaryColumn, maybeBinaryColumn.length));
-    return this;
-  }
-
-  public ParquetThriftCompat setMaybeBinaryColumn(ByteBuffer maybeBinaryColumn) {
-    this.maybeBinaryColumn = org.apache.thrift.TBaseHelper.copyBinary(maybeBinaryColumn);
-    return this;
-  }
-
-  public void unsetMaybeBinaryColumn() {
-    this.maybeBinaryColumn = null;
-  }
-
-  /** Returns true if field maybeBinaryColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetMaybeBinaryColumn() {
-    return this.maybeBinaryColumn != null;
-  }
-
-  public void setMaybeBinaryColumnIsSet(boolean value) {
-    if (!value) {
-      this.maybeBinaryColumn = null;
-    }
-  }
-
-  public String getMaybeStringColumn() {
-    return this.maybeStringColumn;
-  }
-
-  public ParquetThriftCompat setMaybeStringColumn(String maybeStringColumn) {
-    this.maybeStringColumn = maybeStringColumn;
-    return this;
-  }
-
-  public void unsetMaybeStringColumn() {
-    this.maybeStringColumn = null;
-  }
-
-  /** Returns true if field maybeStringColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetMaybeStringColumn() {
-    return this.maybeStringColumn != null;
-  }
-
-  public void setMaybeStringColumnIsSet(boolean value) {
-    if (!value) {
-      this.maybeStringColumn = null;
-    }
-  }
-
-  /**
-   * 
-   * @see Suit
-   */
-  public Suit getMaybeEnumColumn() {
-    return this.maybeEnumColumn;
-  }
-
-  /**
-   * 
-   * @see Suit
-   */
-  public ParquetThriftCompat setMaybeEnumColumn(Suit maybeEnumColumn) {
-    this.maybeEnumColumn = maybeEnumColumn;
-    return this;
-  }
-
-  public void unsetMaybeEnumColumn() {
-    this.maybeEnumColumn = null;
-  }
-
-  /** Returns true if field maybeEnumColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetMaybeEnumColumn() {
-    return this.maybeEnumColumn != null;
-  }
-
-  public void setMaybeEnumColumnIsSet(boolean value) {
-    if (!value) {
-      this.maybeEnumColumn = null;
-    }
-  }
-
-  public int getStringsColumnSize() {
-    return (this.stringsColumn == null) ? 0 : this.stringsColumn.size();
-  }
-
-  public java.util.Iterator<String> getStringsColumnIterator() {
-    return (this.stringsColumn == null) ? null : this.stringsColumn.iterator();
-  }
-
-  public void addToStringsColumn(String elem) {
-    if (this.stringsColumn == null) {
-      this.stringsColumn = new ArrayList<String>();
-    }
-    this.stringsColumn.add(elem);
-  }
-
-  public List<String> getStringsColumn() {
-    return this.stringsColumn;
-  }
-
-  public ParquetThriftCompat setStringsColumn(List<String> stringsColumn) {
-    this.stringsColumn = stringsColumn;
-    return this;
-  }
-
-  public void unsetStringsColumn() {
-    this.stringsColumn = null;
-  }
-
-  /** Returns true if field stringsColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetStringsColumn() {
-    return this.stringsColumn != null;
-  }
-
-  public void setStringsColumnIsSet(boolean value) {
-    if (!value) {
-      this.stringsColumn = null;
-    }
-  }
-
-  public int getIntSetColumnSize() {
-    return (this.intSetColumn == null) ? 0 : this.intSetColumn.size();
-  }
-
-  public java.util.Iterator<Integer> getIntSetColumnIterator() {
-    return (this.intSetColumn == null) ? null : this.intSetColumn.iterator();
-  }
-
-  public void addToIntSetColumn(int elem) {
-    if (this.intSetColumn == null) {
-      this.intSetColumn = new HashSet<Integer>();
-    }
-    this.intSetColumn.add(elem);
-  }
-
-  public Set<Integer> getIntSetColumn() {
-    return this.intSetColumn;
-  }
-
-  public ParquetThriftCompat setIntSetColumn(Set<Integer> intSetColumn) {
-    this.intSetColumn = intSetColumn;
-    return this;
-  }
-
-  public void unsetIntSetColumn() {
-    this.intSetColumn = null;
-  }
-
-  /** Returns true if field intSetColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetIntSetColumn() {
-    return this.intSetColumn != null;
-  }
-
-  public void setIntSetColumnIsSet(boolean value) {
-    if (!value) {
-      this.intSetColumn = null;
-    }
-  }
-
-  public int getIntToStringColumnSize() {
-    return (this.intToStringColumn == null) ? 0 : this.intToStringColumn.size();
-  }
-
-  public void putToIntToStringColumn(int key, String val) {
-    if (this.intToStringColumn == null) {
-      this.intToStringColumn = new HashMap<Integer,String>();
-    }
-    this.intToStringColumn.put(key, val);
-  }
-
-  public Map<Integer,String> getIntToStringColumn() {
-    return this.intToStringColumn;
-  }
-
-  public ParquetThriftCompat setIntToStringColumn(Map<Integer,String> intToStringColumn) {
-    this.intToStringColumn = intToStringColumn;
-    return this;
-  }
-
-  public void unsetIntToStringColumn() {
-    this.intToStringColumn = null;
-  }
-
-  /** Returns true if field intToStringColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetIntToStringColumn() {
-    return this.intToStringColumn != null;
-  }
-
-  public void setIntToStringColumnIsSet(boolean value) {
-    if (!value) {
-      this.intToStringColumn = null;
-    }
-  }
-
-  public int getComplexColumnSize() {
-    return (this.complexColumn == null) ? 0 : this.complexColumn.size();
-  }
-
-  public void putToComplexColumn(int key, List<Nested> val) {
-    if (this.complexColumn == null) {
-      this.complexColumn = new HashMap<Integer,List<Nested>>();
-    }
-    this.complexColumn.put(key, val);
-  }
-
-  public Map<Integer,List<Nested>> getComplexColumn() {
-    return this.complexColumn;
-  }
-
-  public ParquetThriftCompat setComplexColumn(Map<Integer,List<Nested>> complexColumn) {
-    this.complexColumn = complexColumn;
-    return this;
-  }
-
-  public void unsetComplexColumn() {
-    this.complexColumn = null;
-  }
-
-  /** Returns true if field complexColumn is set (has been assigned a value) and false otherwise */
-  public boolean isSetComplexColumn() {
-    return this.complexColumn != null;
-  }
-
-  public void setComplexColumnIsSet(boolean value) {
-    if (!value) {
-      this.complexColumn = null;
-    }
-  }
-
-  public void setFieldValue(_Fields field, Object value) {
-    switch (field) {
-    case BOOL_COLUMN:
-      if (value == null) {
-        unsetBoolColumn();
-      } else {
-        setBoolColumn((Boolean)value);
-      }
-      break;
-
-    case BYTE_COLUMN:
-      if (value == null) {
-        unsetByteColumn();
-      } else {
-        setByteColumn((Byte)value);
-      }
-      break;
-
-    case SHORT_COLUMN:
-      if (value == null) {
-        unsetShortColumn();
-      } else {
-        setShortColumn((Short)value);
-      }
-      break;
-
-    case INT_COLUMN:
-      if (value == null) {
-        unsetIntColumn();
-      } else {
-        setIntColumn((Integer)value);
-      }
-      break;
-
-    case LONG_COLUMN:
-      if (value == null) {
-        unsetLongColumn();
-      } else {
-        setLongColumn((Long)value);
-      }
-      break;
-
-    case DOUBLE_COLUMN:
-      if (value == null) {
-        unsetDoubleColumn();
-      } else {
-        setDoubleColumn((Double)value);
-      }
-      break;
-
-    case BINARY_COLUMN:
-      if (value == null) {
-        unsetBinaryColumn();
-      } else {
-        setBinaryColumn((ByteBuffer)value);
-      }
-      break;
-
-    case STRING_COLUMN:
-      if (value == null) {
-        unsetStringColumn();
-      } else {
-        setStringColumn((String)value);
-      }
-      break;
-
-    case ENUM_COLUMN:
-      if (value == null) {
-        unsetEnumColumn();
-      } else {
-        setEnumColumn((Suit)value);
-      }
-      break;
-
-    case MAYBE_BOOL_COLUMN:
-      if (value == null) {
-        unsetMaybeBoolColumn();
-      } else {
-        setMaybeBoolColumn((Boolean)value);
-      }
-      break;
-
-    case MAYBE_BYTE_COLUMN:
-      if (value == null) {
-        unsetMaybeByteColumn();
-      } else {
-        setMaybeByteColumn((Byte)value);
-      }
-      break;
-
-    case MAYBE_SHORT_COLUMN:
-      if (value == null) {
-        unsetMaybeShortColumn();
-      } else {
-        setMaybeShortColumn((Short)value);
-      }
-      break;
-
-    case MAYBE_INT_COLUMN:
-      if (value == null) {
-        unsetMaybeIntColumn();
-      } else {
-        setMaybeIntColumn((Integer)value);
-      }
-      break;
-
-    case MAYBE_LONG_COLUMN:
-      if (value == null) {
-        unsetMaybeLongColumn();
-      } else {
-        setMaybeLongColumn((Long)value);
-      }
-      break;
-
-    case MAYBE_DOUBLE_COLUMN:
-      if (value == null) {
-        unsetMaybeDoubleColumn();
-      } else {
-        setMaybeDoubleColumn((Double)value);
-      }
-      break;
-
-    case MAYBE_BINARY_COLUMN:
-      if (value == null) {
-        unsetMaybeBinaryColumn();
-      } else {
-        setMaybeBinaryColumn((ByteBuffer)value);
-      }
-      break;
-
-    case MAYBE_STRING_COLUMN:
-      if (value == null) {
-        unsetMaybeStringColumn();
-      } else {
-        setMaybeStringColumn((String)value);
-      }
-      break;
-
-    case MAYBE_ENUM_COLUMN:
-      if (value == null) {
-        unsetMaybeEnumColumn();
-      } else {
-        setMaybeEnumColumn((Suit)value);
-      }
-      break;
-
-    case STRINGS_COLUMN:
-      if (value == null) {
-        unsetStringsColumn();
-      } else {
-        setStringsColumn((List<String>)value);
-      }
-      break;
-
-    case INT_SET_COLUMN:
-      if (value == null) {
-        unsetIntSetColumn();
-      } else {
-        setIntSetColumn((Set<Integer>)value);
-      }
-      break;
-
-    case INT_TO_STRING_COLUMN:
-      if (value == null) {
-        unsetIntToStringColumn();
-      } else {
-        setIntToStringColumn((Map<Integer,String>)value);
-      }
-      break;
-
-    case COMPLEX_COLUMN:
-      if (value == null) {
-        unsetComplexColumn();
-      } else {
-        setComplexColumn((Map<Integer,List<Nested>>)value);
-      }
-      break;
-
-    }
-  }
-
-  public Object getFieldValue(_Fields field) {
-    switch (field) {
-    case BOOL_COLUMN:
-      return Boolean.valueOf(isBoolColumn());
-
-    case BYTE_COLUMN:
-      return Byte.valueOf(getByteColumn());
-
-    case SHORT_COLUMN:
-      return Short.valueOf(getShortColumn());
-
-    case INT_COLUMN:
-      return Integer.valueOf(getIntColumn());
-
-    case LONG_COLUMN:
-      return Long.valueOf(getLongColumn());
-
-    case DOUBLE_COLUMN:
-      return Double.valueOf(getDoubleColumn());
-
-    case BINARY_COLUMN:
-      return getBinaryColumn();
-
-    case STRING_COLUMN:
-      return getStringColumn();
-
-    case ENUM_COLUMN:
-      return getEnumColumn();
-
-    case MAYBE_BOOL_COLUMN:
-      return Boolean.valueOf(isMaybeBoolColumn());
-
-    case MAYBE_BYTE_COLUMN:
-      return Byte.valueOf(getMaybeByteColumn());
-
-    case MAYBE_SHORT_COLUMN:
-      return Short.valueOf(getMaybeShortColumn());
-
-    case MAYBE_INT_COLUMN:
-      return Integer.valueOf(getMaybeIntColumn());
-
-    case MAYBE_LONG_COLUMN:
-      return Long.valueOf(getMaybeLongColumn());
-
-    case MAYBE_DOUBLE_COLUMN:
-      return Double.valueOf(getMaybeDoubleColumn());
-
-    case MAYBE_BINARY_COLUMN:
-      return getMaybeBinaryColumn();
-
-    case MAYBE_STRING_COLUMN:
-      return getMaybeStringColumn();
-
-    case MAYBE_ENUM_COLUMN:
-      return getMaybeEnumColumn();
-
-    case STRINGS_COLUMN:
-      return getStringsColumn();
-
-    case INT_SET_COLUMN:
-      return getIntSetColumn();
-
-    case INT_TO_STRING_COLUMN:
-      return getIntToStringColumn();
-
-    case COMPLEX_COLUMN:
-      return getComplexColumn();
-
-    }
-    throw new IllegalStateException();
-  }
-
-  /** Returns true if field corresponding to fieldID is set (has been assigned a value) and false otherwise */
-  public boolean isSet(_Fields field) {
-    if (field == null) {
-      throw new IllegalArgumentException();
-    }
-
-    switch (field) {
-    case BOOL_COLUMN:
-      return isSetBoolColumn();
-    case BYTE_COLUMN:
-      return isSetByteColumn();
-    case SHORT_COLUMN:
-      return isSetShortColumn();
-    case INT_COLUMN:
-      return isSetIntColumn();
-    case LONG_COLUMN:
-      return isSetLongColumn();
-    case DOUBLE_COLUMN:
-      return isSetDoubleColumn();
-    case BINARY_COLUMN:
-      return isSetBinaryColumn();
-    case STRING_COLUMN:
-      return isSetStringColumn();
-    case ENUM_COLUMN:
-      return isSetEnumColumn();
-    case MAYBE_BOOL_COLUMN:
-      return isSetMaybeBoolColumn();
-    case MAYBE_BYTE_COLUMN:
-      return isSetMaybeByteColumn();
-    case MAYBE_SHORT_COLUMN:
-      return isSetMaybeShortColumn();
-    case MAYBE_INT_COLUMN:
-      return isSetMaybeIntColumn();
-    case MAYBE_LONG_COLUMN:
-      return isSetMaybeLongColumn();
-    case MAYBE_DOUBLE_COLUMN:
-      return isSetMaybeDoubleColumn();
-    case MAYBE_BINARY_COLUMN:
-      return isSetMaybeBinaryColumn();
-    case MAYBE_STRING_COLUMN:
-      return isSetMaybeStringColumn();
-    case MAYBE_ENUM_COLUMN:
-      return isSetMaybeEnumColumn();
-    case STRINGS_COLUMN:
-      return isSetStringsColumn();
-    case INT_SET_COLUMN:
-      return isSetIntSetColumn();
-    case INT_TO_STRING_COLUMN:
-      return isSetIntToStringColumn();
-    case COMPLEX_COLUMN:
-      return isSetComplexColumn();
-    }
-    throw new IllegalStateException();
-  }
-
-  @Override
-  public boolean equals(Object that) {
-    if (that == null)
-      return false;
-    if (that instanceof ParquetThriftCompat)
-      return this.equals((ParquetThriftCompat)that);
-    return false;
-  }
-
-  public boolean equals(ParquetThriftCompat that) {
-    if (that == null)
-      return false;
-
-    boolean this_present_boolColumn = true;
-    boolean that_present_boolColumn = true;
-    if (this_present_boolColumn || that_present_boolColumn) {
-      if (!(this_present_boolColumn && that_present_boolColumn))
-        return false;
-      if (this.boolColumn != that.boolColumn)
-        return false;
-    }
-
-    boolean this_present_byteColumn = true;
-    boolean that_present_byteColumn = true;
-    if (this_present_byteColumn || that_present_byteColumn) {
-      if (!(this_present_byteColumn && that_present_byteColumn))
-        return false;
-      if (this.byteColumn != that.byteColumn)
-        return false;
-    }
-
-    boolean this_present_shortColumn = true;
-    boolean that_present_shortColumn = true;
-    if (this_present_shortColumn || that_present_shortColumn) {
-      if (!(this_present_shortColumn && that_present_shortColumn))
-        return false;
-      if (this.shortColumn != that.shortColumn)
-        return false;
-    }
-
-    boolean this_present_intColumn = true;
-    boolean that_present_intColumn = true;
-    if (this_present_intColumn || that_present_intColumn) {
-      if (!(this_present_intColumn && that_present_intColumn))
-        return false;
-      if (this.intColumn != that.intColumn)
-        return false;
-    }
-
-    boolean this_present_longColumn = true;
-    boolean that_present_longColumn = true;
-    if (this_present_longColumn || that_present_longColumn) {
-      if (!(this_present_longColumn && that_present_longColumn))
-        return false;
-      if (this.longColumn != that.longColumn)
-        return false;
-    }
-
-    boolean this_present_doubleColumn = true;
-    boolean that_present_doubleColumn = true;
-    if (this_present_doubleColumn || that_present_doubleColumn) {
-      if (!(this_present_doubleColumn && that_present_doubleColumn))
-        return false;
-      if (this.doubleColumn != that.doubleColumn)
-        return false;
-    }
-
-    boolean this_present_binaryColumn = true && this.isSetBinaryColumn();
-    boolean that_present_binaryColumn = true && that.isSetBinaryColumn();
-    if (this_present_binaryColumn || that_present_binaryColumn) {
-      if (!(this_present_binaryColumn && that_present_binaryColumn))
-        return false;
-      if (!this.binaryColumn.equals(that.binaryColumn))
-        return false;
-    }
-
-    boolean this_present_stringColumn = true && this.isSetStringColumn();
-    boolean that_present_stringColumn = true && that.isSetStringColumn();
-    if (this_present_stringColumn || that_present_stringColumn) {
-      if (!(this_present_stringColumn && that_present_stringColumn))
-        return false;
-      if (!this.stringColumn.equals(that.stringColumn))
-        return false;
-    }
-
-    boolean this_present_enumColumn = true && this.isSetEnumColumn();
-    boolean that_present_enumColumn = true && that.isSetEnumColumn();
-    if (this_present_enumColumn || that_present_enumColumn) {
-      if (!(this_present_enumColumn && that_present_enumColumn))
-        return false;
-      if (!this.enumColumn.equals(that.enumColumn))
-        return false;
-    }
-
-    boolean this_present_maybeBoolColumn = true && this.isSetMaybeBoolColumn();
-    boolean that_present_maybeBoolColumn = true && that.isSetMaybeBoolColumn();
-    if (this_present_maybeBoolColumn || that_present_maybeBoolColumn) {
-      if (!(this_present_maybeBoolColumn && that_present_maybeBoolColumn))
-        return false;
-      if (this.maybeBoolColumn != that.maybeBoolColumn)
-        return false;
-    }
-
-    boolean this_present_maybeByteColumn = true && this.isSetMaybeByteColumn();
-    boolean that_present_maybeByteColumn = true && that.isSetMaybeByteColumn();
-    if (this_present_maybeByteColumn || that_present_maybeByteColumn) {
-      if (!(this_present_maybeByteColumn && that_present_maybeByteColumn))
-        return false;
-      if (this.maybeByteColumn != that.maybeByteColumn)
-        return false;
-    }
-
-    boolean this_present_maybeShortColumn = true && this.isSetMaybeShortColumn();
-    boolean that_present_maybeShortColumn = true && that.isSetMaybeShortColumn();
-    if (this_present_maybeShortColumn || that_present_maybeShortColumn) {
-      if (!(this_present_maybeShortColumn && that_present_maybeShortColumn))
-        return false;
-      if (this.maybeShortColumn != that.maybeShortColumn)
-        return false;
-    }
-
-    boolean this_present_maybeIntColumn = true && this.isSetMaybeIntColumn();
-    boolean that_present_maybeIntColumn = true && that.isSetMaybeIntColumn();
-    if (this_present_maybeIntColumn || that_present_maybeIntColumn) {
-      if (!(this_present_maybeIntColumn && that_present_maybeIntColumn))
-        return false;
-      if (this.maybeIntColumn != that.maybeIntColumn)
-        return false;
-    }
-
-    boolean this_present_maybeLongColumn = true && this.isSetMaybeLongColumn();
-    boolean that_present_maybeLongColumn = true && that.isSetMaybeLongColumn();
-    if (this_present_maybeLongColumn || that_present_maybeLongColumn) {
-      if (!(this_present_maybeLongColumn && that_present_maybeLongColumn))
-        return false;
-      if (this.maybeLongColumn != that.maybeLongColumn)
-        return false;
-    }
-
-    boolean this_present_maybeDoubleColumn = true && this.isSetMaybeDoubleColumn();
-    boolean that_present_maybeDoubleColumn = true && that.isSetMaybeDoubleColumn();
-    if (this_present_maybeDoubleColumn || that_present_maybeDoubleColumn) {
-      if (!(this_present_maybeDoubleColumn && that_present_maybeDoubleColumn))
-        return false;
-      if (this.maybeDoubleColumn != that.maybeDoubleColumn)
-        return false;
-    }
-
-    boolean this_present_maybeBinaryColumn = true && this.isSetMaybeBinaryColumn();
-    boolean that_present_maybeBinaryColumn = true && that.isSetMaybeBinaryColumn();
-    if (this_present_maybeBinaryColumn || that_present_maybeBinaryColumn) {
-      if (!(this_present_maybeBinaryColumn && that_present_maybeBinaryColumn))
-        return false;
-      if (!this.maybeBinaryColumn.equals(that.maybeBinaryColumn))
-        return false;
-    }
-
-    boolean this_present_maybeStringColumn = true && this.isSetMaybeStringColumn();
-    boolean that_present_maybeStringColumn = true && that.isSetMaybeStringColumn();
-    if (this_present_maybeStringColumn || that_present_maybeStringColumn) {
-      if (!(this_present_maybeStringColumn && that_present_maybeStringColumn))
-        return false;
-      if (!this.maybeStringColumn.equals(that.maybeStringColumn))
-        return false;
-    }
-
-    boolean this_present_maybeEnumColumn = true && this.isSetMaybeEnumColumn();
-    boolean that_present_maybeEnumColumn = true && that.isSetMaybeEnumColumn();
-    if (this_present_maybeEnumColumn || that_present_maybeEnumColumn) {
-      if (!(this_present_maybeEnumColumn && that_present_maybeEnumColumn))
-        return false;
-      if (!this.maybeEnumColumn.equals(that.maybeEnumColumn))
-        return false;
-    }
-
-    boolean this_present_stringsColumn = true && this.isSetStringsColumn();
-    boolean that_present_stringsColumn = true && that.isSetStringsColumn();
-    if (this_present_stringsColumn || that_present_stringsColumn) {
-      if (!(this_present_stringsColumn && that_present_stringsColumn))
-        return false;
-      if (!this.stringsColumn.equals(that.stringsColumn))
-        return false;
-    }
-
-    boolean this_present_intSetColumn = true && this.isSetIntSetColumn();
-    boolean that_present_intSetColumn = true && that.isSetIntSetColumn();
-    if (this_present_intSetColumn || that_present_intSetColumn) {
-      if (!(this_present_intSetColumn && that_present_intSetColumn))
-        return false;
-      if (!this.intSetColumn.equals(that.intSetColumn))
-        return false;
-    }
-
-    boolean this_present_intToStringColumn = true && this.isSetIntToStringColumn();
-    boolean that_present_intToStringColumn = true && that.isSetIntToStringColumn();
-    if (this_present_intToStringColumn || that_present_intToStringColumn) {
-      if (!(this_present_intToStringColumn && that_present_intToStringColumn))
-        return false;
-      if (!this.intToStringColumn.equals(that.intToStringColumn))
-        return false;
-    }
-
-    boolean this_present_complexColumn = true && this.isSetComplexColumn();
-    boolean that_present_complexColumn = true && that.isSetComplexColumn();
-    if (this_present_complexColumn || that_present_complexColumn) {
-      if (!(this_present_complexColumn && that_present_complexColumn))
-        return false;
-      if (!this.complexColumn.equals(that.complexColumn))
-        return false;
-    }
-
-    return true;
-  }
-
-  @Override
-  public int hashCode() {
-    List<Object> list = new ArrayList<Object>();
-
-    boolean present_boolColumn = true;
-    list.add(present_boolColumn);
-    if (present_boolColumn)
-      list.add(boolColumn);
-
-    boolean present_byteColumn = true;
-    list.add(present_byteColumn);
-    if (present_byteColumn)
-      list.add(byteColumn);
-
-    boolean present_shortColumn = true;
-    list.add(present_shortColumn);
-    if (present_shortColumn)
-      list.add(shortColumn);
-
-    boolean present_intColumn = true;
-    list.add(present_intColumn);
-    if (present_intColumn)
-      list.add(intColumn);
-
-    boolean present_longColumn = true;
-    list.add(present_longColumn);
-    if (present_longColumn)
-      list.add(longColumn);
-
-    boolean present_doubleColumn = true;
-    list.add(present_doubleColumn);
-    if (present_doubleColumn)
-      list.add(doubleColumn);
-
-    boolean present_binaryColumn = true && (isSetBinaryColumn());
-    list.add(present_binaryColumn);
-    if (present_binaryColumn)
-      list.add(binaryColumn);
-
-    boolean present_stringColumn = true && (isSetStringColumn());
-    list.add(present_stringColumn);
-    if (present_stringColumn)
-      list.add(stringColumn);
-
-    boolean present_enumColumn = true && (isSetEnumColumn());
-    list.add(present_enumColumn);
-    if (present_enumColumn)
-      list.add(enumColumn.getValue());
-
-    boolean present_maybeBoolColumn = true && (isSetMaybeBoolColumn());
-    list.add(present_maybeBoolColumn);
-    if (present_maybeBoolColumn)
-      list.add(maybeBoolColumn);
-
-    boolean present_maybeByteColumn = true && (isSetMaybeByteColumn());
-    list.add(present_maybeByteColumn);
-    if (present_maybeByteColumn)
-      list.add(maybeByteColumn);
-
-    boolean present_maybeShortColumn = true && (isSetMaybeShortColumn());
-    list.add(present_maybeShortColumn);
-    if (present_maybeShortColumn)
-      list.add(maybeShortColumn);
-
-    boolean present_maybeIntColumn = true && (isSetMaybeIntColumn());
-    list.add(present_maybeIntColumn);
-    if (present_maybeIntColumn)
-      list.add(maybeIntColumn);
-
-    boolean present_maybeLongColumn = true && (isSetMaybeLongColumn());
-    list.add(present_maybeLongColumn);
-    if (present_maybeLongColumn)
-      list.add(maybeLongColumn);
-
-    boolean present_maybeDoubleColumn = true && (isSetMaybeDoubleColumn());
-    list.add(present_maybeDoubleColumn);
-    if (present_maybeDoubleColumn)
-      list.add(maybeDoubleColumn);
-
-    boolean present_maybeBinaryColumn = true && (isSetMaybeBinaryColumn());
-    list.add(present_maybeBinaryColumn);
-    if (present_maybeBinaryColumn)
-      list.add(maybeBinaryColumn);
-
-    boolean present_maybeStringColumn = true && (isSetMaybeStringColumn());
-    list.add(present_maybeStringColumn);
-    if (present_maybeStringColumn)
-      list.add(maybeStringColumn);
-
-    boolean present_maybeEnumColumn = true && (isSetMaybeEnumColumn());
-    list.add(present_maybeEnumColumn);
-    if (present_maybeEnumColumn)
-      list.add(maybeEnumColumn.getValue());
-
-    boolean present_stringsColumn = true && (isSetStringsColumn());
-    list.add(present_stringsColumn);
-    if (present_stringsColumn)
-      list.add(stringsColumn);
-
-    boolean present_intSetColumn = true && (isSetIntSetColumn());
-    list.add(present_intSetColumn);
-    if (present_intSetColumn)
-      list.add(intSetColumn);
-
-    boolean present_intToStringColumn = true && (isSetIntToStringColumn());
-    list.add(present_intToStringColumn);
-    if (present_intToStringColumn)
-      list.add(intToStringColumn);
-
-    boolean present_complexColumn = true && (isSetComplexColumn());
-    list.add(present_complexColumn);
-    if (present_complexColumn)
-      list.add(complexColumn);
-
-    return list.hashCode();
-  }
-
-  @Override
-  public int compareTo(ParquetThriftCompat other) {
-    if (!getClass().equals(other.getClass())) {
-      return getClass().getName().compareTo(other.getClass().getName());
-    }
-
-    int lastComparison = 0;
-
-    lastComparison = Boolean.valueOf(isSetBoolColumn()).compareTo(other.isSetBoolColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetBoolColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.boolColumn, other.boolColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetByteColumn()).compareTo(other.isSetByteColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetByteColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.byteColumn, other.byteColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetShortColumn()).compareTo(other.isSetShortColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetShortColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.shortColumn, other.shortColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetIntColumn()).compareTo(other.isSetIntColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetIntColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.intColumn, other.intColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetLongColumn()).compareTo(other.isSetLongColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetLongColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.longColumn, other.longColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetDoubleColumn()).compareTo(other.isSetDoubleColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetDoubleColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.doubleColumn, other.doubleColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetBinaryColumn()).compareTo(other.isSetBinaryColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetBinaryColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.binaryColumn, other.binaryColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetStringColumn()).compareTo(other.isSetStringColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetStringColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.stringColumn, other.stringColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetEnumColumn()).compareTo(other.isSetEnumColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetEnumColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.enumColumn, other.enumColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetMaybeBoolColumn()).compareTo(other.isSetMaybeBoolColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetMaybeBoolColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeBoolColumn, other.maybeBoolColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetMaybeByteColumn()).compareTo(other.isSetMaybeByteColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetMaybeByteColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeByteColumn, other.maybeByteColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetMaybeShortColumn()).compareTo(other.isSetMaybeShortColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetMaybeShortColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeShortColumn, other.maybeShortColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetMaybeIntColumn()).compareTo(other.isSetMaybeIntColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetMaybeIntColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeIntColumn, other.maybeIntColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetMaybeLongColumn()).compareTo(other.isSetMaybeLongColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetMaybeLongColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeLongColumn, other.maybeLongColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetMaybeDoubleColumn()).compareTo(other.isSetMaybeDoubleColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetMaybeDoubleColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeDoubleColumn, other.maybeDoubleColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetMaybeBinaryColumn()).compareTo(other.isSetMaybeBinaryColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetMaybeBinaryColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeBinaryColumn, other.maybeBinaryColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetMaybeStringColumn()).compareTo(other.isSetMaybeStringColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetMaybeStringColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeStringColumn, other.maybeStringColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetMaybeEnumColumn()).compareTo(other.isSetMaybeEnumColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetMaybeEnumColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.maybeEnumColumn, other.maybeEnumColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetStringsColumn()).compareTo(other.isSetStringsColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetStringsColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.stringsColumn, other.stringsColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetIntSetColumn()).compareTo(other.isSetIntSetColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetIntSetColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.intSetColumn, other.intSetColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetIntToStringColumn()).compareTo(other.isSetIntToStringColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetIntToStringColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.intToStringColumn, other.intToStringColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    lastComparison = Boolean.valueOf(isSetComplexColumn()).compareTo(other.isSetComplexColumn());
-    if (lastComparison != 0) {
-      return lastComparison;
-    }
-    if (isSetComplexColumn()) {
-      lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.complexColumn, other.complexColumn);
-      if (lastComparison != 0) {
-        return lastComparison;
-      }
-    }
-    return 0;
-  }
-
-  public _Fields fieldForId(int fieldId) {
-    return _Fields.findByThriftId(fieldId);
-  }
-
-  public void read(org.apache.thrift.protocol.TProtocol iprot) throws org.apache.thrift.TException {
-    schemes.get(iprot.getScheme()).getScheme().read(iprot, this);
-  }
-
-  public void write(org.apache.thrift.protocol.TProtocol oprot) throws org.apache.thrift.TException {
-    schemes.get(oprot.getScheme()).getScheme().write(oprot, this);
-  }
-
-  @Override
-  public String toString() {
-    StringBuilder sb = new StringBuilder("ParquetThriftCompat(");
-    boolean first = true;
-
-    sb.append("boolColumn:");
-    sb.append(this.boolColumn);
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("byteColumn:");
-    sb.append(this.byteColumn);
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("shortColumn:");
-    sb.append(this.shortColumn);
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("intColumn:");
-    sb.append(this.intColumn);
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("longColumn:");
-    sb.append(this.longColumn);
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("doubleColumn:");
-    sb.append(this.doubleColumn);
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("binaryColumn:");
-    if (this.binaryColumn == null) {
-      sb.append("null");
-    } else {
-      org.apache.thrift.TBaseHelper.toString(this.binaryColumn, sb);
-    }
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("stringColumn:");
-    if (this.stringColumn == null) {
-      sb.append("null");
-    } else {
-      sb.append(this.stringColumn);
-    }
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("enumColumn:");
-    if (this.enumColumn == null) {
-      sb.append("null");
-    } else {
-      sb.append(this.enumColumn);
-    }
-    first = false;
-    if (isSetMaybeBoolColumn()) {
-      if (!first) sb.append(", ");
-      sb.append("maybeBoolColumn:");
-      sb.append(this.maybeBoolColumn);
-      first = false;
-    }
-    if (isSetMaybeByteColumn()) {
-      if (!first) sb.append(", ");
-      sb.append("maybeByteColumn:");
-      sb.append(this.maybeByteColumn);
-      first = false;
-    }
-    if (isSetMaybeShortColumn()) {
-      if (!first) sb.append(", ");
-      sb.append("maybeShortColumn:");
-      sb.append(this.maybeShortColumn);
-      first = false;
-    }
-    if (isSetMaybeIntColumn()) {
-      if (!first) sb.append(", ");
-      sb.append("maybeIntColumn:");
-      sb.append(this.maybeIntColumn);
-      first = false;
-    }
-    if (isSetMaybeLongColumn()) {
-      if (!first) sb.append(", ");
-      sb.append("maybeLongColumn:");
-      sb.append(this.maybeLongColumn);
-      first = false;
-    }
-    if (isSetMaybeDoubleColumn()) {
-      if (!first) sb.append(", ");
-      sb.append("maybeDoubleColumn:");
-      sb.append(this.maybeDoubleColumn);
-      first = false;
-    }
-    if (isSetMaybeBinaryColumn()) {
-      if (!first) sb.append(", ");
-      sb.append("maybeBinaryColumn:");
-      if (this.maybeBinaryColumn == null) {
-        sb.append("null");
-      } else {
-        org.apache.thrift.TBaseHelper.toString(this.maybeBinaryColumn, sb);
-      }
-      first = false;
-    }
-    if (isSetMaybeStringColumn()) {
-      if (!first) sb.append(", ");
-      sb.append("maybeStringColumn:");
-      if (this.maybeStringColumn == null) {
-        sb.append("null");
-      } else {
-        sb.append(this.maybeStringColumn);
-      }
-      first = false;
-    }
-    if (isSetMaybeEnumColumn()) {
-      if (!first) sb.append(", ");
-      sb.append("maybeEnumColumn:");
-      if (this.maybeEnumColumn == null) {
-        sb.append("null");
-      } else {
-        sb.append(this.maybeEnumColumn);
-      }
-      first = false;
-    }
-    if (!first) sb.append(", ");
-    sb.append("stringsColumn:");
-    if (this.stringsColumn == null) {
-      sb.append("null");
-    } else {
-      sb.append(this.stringsColumn);
-    }
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("intSetColumn:");
-    if (this.intSetColumn == null) {
-      sb.append("null");
-    } else {
-      sb.append(this.intSetColumn);
-    }
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("intToStringColumn:");
-    if (this.intToStringColumn == null) {
-      sb.append("null");
-    } else {
-      sb.append(this.intToStringColumn);
-    }
-    first = false;
-    if (!first) sb.append(", ");
-    sb.append("complexColumn:");
-    if (this.complexColumn == null) {
-      sb.append("null");
-    } else {
-      sb.append(this.complexColumn);
-    }
-    first = false;
-    sb.append(")");
-    return sb.toString();
-  }
-
-  public void validate() throws org.apache.thrift.TException {
-    // check for required fields
-    // alas, we cannot check 'boolColumn' because it's a primitive and you chose the non-beans generator.
-    // alas, we cannot check 'byteColumn' because it's a primitive and you chose the non-beans generator.
-    // alas, we cannot check 'shortColumn' because it's a primitive and you chose the non-beans generator.
-    // alas, we cannot check 'intColumn' because it's a primitive and you chose the non-beans generator.
-    // alas, we cannot check 'longColumn' because it's a primitive and you chose the non-beans generator.
-    // alas, we cannot check 'doubleColumn' because it's a primitive and you chose the non-beans generator.
-    if (binaryColumn == null) {
-      throw new org.apache.thrift.protocol.TProtocolException("Required field 'binaryColumn' was not present! Struct: " + toString());
-    }
-    if (stringColumn == null) {
-      throw new org.apache.thrift.protocol.TProtocolException("Required field 'stringColumn' was not present! Struct: " + toString());
-    }
-    if (enumColumn == null) {
-      throw new org.apache.thrift.protocol.TProtocolException("Required field 'enumColumn' was not present! Struct: " + toString());
-    }
-    if (stringsColumn == null) {
-      throw new org.apache.thrift.protocol.TProtocolException("Required field 'stringsColumn' was not present! Struct: " + toString());
-    }
-    if (intSetColumn == null) {
-      throw new org.apache.thrift.protocol.TProtocolException("Required field 'intSetColumn' was not present! Struct: " + toString());
-    }
-    if (intToStringColumn == null) {
-      throw new org.apache.thrift.protocol.TProtocolException("Required field 'intToStringColumn' was not present! Struct: " + toString());
-    }
-    if (complexColumn == null) {
-      throw new org.apache.thrift.protocol.TProtocolException("Required field 'complexColumn' was not present! Struct: " + toString());
-    }
-    // check for sub-struct validity
-  }
-
-  private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException {
-    try {
-      write(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(out)));
-    } catch (org.apache.thrift.TException te) {
-      throw new java.io.IOException(te);
-    }
-  }
-
-  private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
-    try {
-      // it doesn't seem like you should have to do this, but java serialization is wacky, and doesn't call the default constructor.
-      __isset_bitfield = 0;
-      read(new org.apache.thrift.protocol.TCompactProtocol(new org.apache.thrift.transport.TIOStreamTransport(in)));
-    } catch (org.apache.thrift.TException te) {
-      throw new java.io.IOException(te);
-    }
-  }
-
-  private static class ParquetThriftCompatStandardSchemeFactory implements SchemeFactory {
-    public ParquetThriftCompatStandardScheme getScheme() {
-      return new ParquetThriftCompatStandardScheme();
-    }
-  }
-
-  private static class ParquetThriftCompatStandardScheme extends StandardScheme<ParquetThriftCompat> {
-
-    public void read(org.apache.thrift.protocol.TProtocol iprot, ParquetThriftCompat struct) throws org.apache.thrift.TException {
-      org.apache.thrift.protocol.TField schemeField;
-      iprot.readStructBegin();
-      while (true)
-      {
-        schemeField = iprot.readFieldBegin();
-        if (schemeField.type == org.apache.thrift.protocol.TType.STOP) { 
-          break;
-        }
-        switch (schemeField.id) {
-          case 1: // BOOL_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.BOOL) {
-              struct.boolColumn = iprot.readBool();
-              struct.setBoolColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 2: // BYTE_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.BYTE) {
-              struct.byteColumn = iprot.readByte();
-              struct.setByteColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 3: // SHORT_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.I16) {
-              struct.shortColumn = iprot.readI16();
-              struct.setShortColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 4: // INT_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
-              struct.intColumn = iprot.readI32();
-              struct.setIntColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 5: // LONG_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.I64) {
-              struct.longColumn = iprot.readI64();
-              struct.setLongColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 6: // DOUBLE_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.DOUBLE) {
-              struct.doubleColumn = iprot.readDouble();
-              struct.setDoubleColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 7: // BINARY_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
-              struct.binaryColumn = iprot.readBinary();
-              struct.setBinaryColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 8: // STRING_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
-              struct.stringColumn = iprot.readString();
-              struct.setStringColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 9: // ENUM_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
-              struct.enumColumn = org.apache.spark.sql.parquet.test.thrift.Suit.findByValue(iprot.readI32());
-              struct.setEnumColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 10: // MAYBE_BOOL_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.BOOL) {
-              struct.maybeBoolColumn = iprot.readBool();
-              struct.setMaybeBoolColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 11: // MAYBE_BYTE_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.BYTE) {
-              struct.maybeByteColumn = iprot.readByte();
-              struct.setMaybeByteColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 12: // MAYBE_SHORT_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.I16) {
-              struct.maybeShortColumn = iprot.readI16();
-              struct.setMaybeShortColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 13: // MAYBE_INT_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
-              struct.maybeIntColumn = iprot.readI32();
-              struct.setMaybeIntColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 14: // MAYBE_LONG_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.I64) {
-              struct.maybeLongColumn = iprot.readI64();
-              struct.setMaybeLongColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 15: // MAYBE_DOUBLE_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.DOUBLE) {
-              struct.maybeDoubleColumn = iprot.readDouble();
-              struct.setMaybeDoubleColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 16: // MAYBE_BINARY_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
-              struct.maybeBinaryColumn = iprot.readBinary();
-              struct.setMaybeBinaryColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 17: // MAYBE_STRING_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.STRING) {
-              struct.maybeStringColumn = iprot.readString();
-              struct.setMaybeStringColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 18: // MAYBE_ENUM_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
-              struct.maybeEnumColumn = org.apache.spark.sql.parquet.test.thrift.Suit.findByValue(iprot.readI32());
-              struct.setMaybeEnumColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 19: // STRINGS_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.LIST) {
-              {
-                org.apache.thrift.protocol.TList _list8 = iprot.readListBegin();
-                struct.stringsColumn = new ArrayList<String>(_list8.size);
-                String _elem9;
-                for (int _i10 = 0; _i10 < _list8.size; ++_i10)
-                {
-                  _elem9 = iprot.readString();
-                  struct.stringsColumn.add(_elem9);
-                }
-                iprot.readListEnd();
-              }
-              struct.setStringsColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 20: // INT_SET_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.SET) {
-              {
-                org.apache.thrift.protocol.TSet _set11 = iprot.readSetBegin();
-                struct.intSetColumn = new HashSet<Integer>(2*_set11.size);
-                int _elem12;
-                for (int _i13 = 0; _i13 < _set11.size; ++_i13)
-                {
-                  _elem12 = iprot.readI32();
-                  struct.intSetColumn.add(_elem12);
-                }
-                iprot.readSetEnd();
-              }
-              struct.setIntSetColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 21: // INT_TO_STRING_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
-              {
-                org.apache.thrift.protocol.TMap _map14 = iprot.readMapBegin();
-                struct.intToStringColumn = new HashMap<Integer,String>(2*_map14.size);
-                int _key15;
-                String _val16;
-                for (int _i17 = 0; _i17 < _map14.size; ++_i17)
-                {
-                  _key15 = iprot.readI32();
-                  _val16 = iprot.readString();
-                  struct.intToStringColumn.put(_key15, _val16);
-                }
-                iprot.readMapEnd();
-              }
-              struct.setIntToStringColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          case 22: // COMPLEX_COLUMN
-            if (schemeField.type == org.apache.thrift.protocol.TType.MAP) {
-              {
-                org.apache.thrift.protocol.TMap _map18 = iprot.readMapBegin();
-                struct.complexColumn = new HashMap<Integer,List<Nested>>(2*_map18.size);
-                int _key19;
-                List<Nested> _val20;
-                for (int _i21 = 0; _i21 < _map18.size; ++_i21)
-                {
-                  _key19 = iprot.readI32();
-                  {
-                    org.apache.thrift.protocol.TList _list22 = iprot.readListBegin();
-                    _val20 = new ArrayList<Nested>(_list22.size);
-                    Nested _elem23;
-                    for (int _i24 = 0; _i24 < _list22.size; ++_i24)
-                    {
-                      _elem23 = new Nested();
-                      _elem23.read(iprot);
-                      _val20.add(_elem23);
-                    }
-                    iprot.readListEnd();
-                  }
-                  struct.complexColumn.put(_key19, _val20);
-                }
-                iprot.readMapEnd();
-              }
-              struct.setComplexColumnIsSet(true);
-            } else { 
-              org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-            }
-            break;
-          default:
-            org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
-        }
-        iprot.readFieldEnd();
-      }
-      iprot.readStructEnd();
-
-      // check for required fields of primitive type, which can't be checked in the validate method
-      if (!struct.isSetBoolColumn()) {
-        throw new org.apache.thrift.protocol.TProtocolException("Required field 'boolColumn' was not found in serialized data! Struct: " + toString());
-      }
-      if (!struct.isSetByteColumn()) {
-        throw new org.apache.thrift.protocol.TProtocolException("Required field 'byteColumn' was not found in serialized data! Struct: " + toString());
-      }
-      if (!struct.isSetShortColumn()) {
-        throw new org.apache.thrift.protocol.TProtocolException("Required field 'shortColumn' was not found in serialized data! Struct: " + toString());
-      }
-      if (!struct.isSetIntColumn()) {
-        throw new org.apache.thrift.protocol.TProtocolException("Required field 'intColumn' was not found in serialized data! Struct: " + toString());
-      }
-      if (!struct.isSetLongColumn()) {
-        throw new org.apache.thrift.protocol.TProtocolException("Required field 'longColumn' was not found in serialized data! Struct: " + toString());
-      }
-      if (!struct.isSetDoubleColumn()) {
-        throw new org.apache.thrift.protocol.TProtocolException("Required field 'doubleColumn' was not found in serialized data! Struct: " + toString());
-      }
-      struct.validate();
-    }
-
-    public void write(org.apache.thrift.protocol.TProtocol oprot, ParquetThriftCompat struct) throws org.apache.thrift.TException {
-      struct.validate();
-
-      oprot.writeStructBegin(STRUCT_DESC);
-      oprot.writeFieldBegin(BOOL_COLUMN_FIELD_DESC);
-      oprot.writeBool(struct.boolColumn);
-      oprot.writeFieldEnd();
-      oprot.writeFieldBegin(BYTE_COLUMN_FIELD_DESC);
-      oprot.writeByte(struct.byteColumn);
-      oprot.writeFieldEnd();
-      oprot.writeFieldBegin(SHORT_COLUMN_FIELD_DESC);
-      oprot.writeI16(struct.shortColumn);
-      oprot.writeFieldEnd();
-      oprot.writeFieldBegin(INT_COLUMN_FIELD_DESC);
-      oprot.writeI32(struct.intColumn);
-      oprot.writeFieldEnd();
-      oprot.writeFieldBegin(LONG_COLUMN_FIELD_DESC);
-      oprot.writeI64(struct.longColumn);
-      oprot.writeFieldEnd();
-      oprot.writeFieldBegin(DOUBLE_COLUMN_FIELD_DESC);
-      oprot.writeDouble(struct.doubleColumn);
-      oprot.writeFieldEnd();
-      if (struct.binaryColumn != null) {
-        oprot.writeFieldBegin(BINARY_COLUMN_FIELD_DESC);
-        oprot.writeBinary(struct.binaryColumn);
-        oprot.writeFieldEnd();
-      }
-      if (struct.stringColumn != null) {
-        oprot.writeFieldBegin(STRING_COLUMN_FIELD_DESC);
-        oprot.writeString(struct.stringColumn);
-        oprot.writeFieldEnd();
-      }
-      if (struct.enumColumn != null) {
-        oprot.writeFieldBegin(ENUM_COLUMN_FIELD_DESC);
-        oprot.writeI32(struct.enumColumn.getValue());
-        oprot.writeFieldEnd();
-      }
-      if (struct.isSetMaybeBoolColumn()) {
-        oprot.writeFieldBegin(MAYBE_BOOL_COLUMN_FIELD_DESC);
-        oprot.writeBool(struct.maybeBoolColumn);
-        oprot.writeFieldEnd();
-      }
-      if (struct.isSetMaybeByteColumn()) {
-        oprot.writeFieldBegin(MAYBE_BYTE_COLUMN_FIELD_DESC);
-        oprot.writeByte(struct.maybeByteColumn);
-        oprot.writeFieldEnd();
-      }
-      if (struct.isSetMaybeShortColumn()) {
-        oprot.writeFieldBegin(MAYBE_SHORT_COLUMN_FIELD_DESC);
-        oprot.writeI16(struct.maybeShortColumn);
-        oprot.writeFieldEnd();
-      }
-      if (struct.isSetMaybeIntColumn()) {
-        oprot.writeFieldBegin(MAYBE_INT_COLUMN_FIELD_DESC);
-        oprot.writeI32(struct.maybeIntColumn);
-        oprot.writeFieldEnd();
-      }
-      if (struct.isSetMaybeLongColumn()) {
-        oprot.writeFieldBegin(MAYBE_LONG_COLUMN_FIELD_DESC);
-        oprot.writeI64(struct.maybeLongColumn);
-        oprot.writeFieldEnd();
-      }
-      if (struct.isSetMaybeDoubleColumn()) {
-        oprot.writeFieldBegin(MAYBE_DOUBLE_COLUMN_FIELD_DESC);
-        oprot.writeDouble(struct.maybeDoubleColumn);
-        oprot.writeFieldEnd();
-      }
-      if (struct.maybeBinaryColumn != null) {
-        if (struct.isSetMaybeBinaryColumn()) {
-          oprot.writeFieldBegin(MAYBE_BINARY_COLUMN_FIELD_DESC);
-          oprot.writeBinary(struct.maybeBinaryColumn);
-          oprot.writeFieldEnd();
-        }
-      }
-      if (struct.maybeStringColumn != null) {
-        if (struct.isSetMaybeStringColumn()) {
-          oprot.writeFieldBegin(MAYBE_STRING_COLUMN_FIELD_DESC);
-          oprot.writeString(struct.maybeStringColumn);
-          oprot.writeFieldEnd();
-        }
-      }
-      if (struct.maybeEnumColumn != null) {
-        if (struct.isSetMaybeEnumColumn()) {
-          oprot.writeFieldBegin(MAYBE_ENUM_COLUMN_FIELD_DESC);
-          oprot.writeI32(struct.maybeEnumColumn.getValue());
-          oprot.writeFieldEnd();
-        }
-      }
-      if (struct.stringsColumn != null) {
-        oprot.writeFieldBegin(STRINGS_COLUMN_FIELD_DESC);
-        {
-          oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, struct.stringsColumn.size()));
-          for (String _iter25 : struct.stringsColumn)
-          {
-            oprot.writeString(_iter25);
-          }
-          oprot.writeListEnd();
-        }
-        oprot.writeFieldEnd();
-      }
-      if (struct.intSetColumn != null) {
-        oprot.writeFieldBegin(INT_SET_COLUMN_FIELD_DESC);
-        {
-          oprot.writeSetBegin(new org.apache.thrift.protocol.TSet(org.apache.thrift.protocol.TType.I32, struct.intSetColumn.size()));
-          for (int _iter26 : struct.intSetColumn)
-          {
-            oprot.writeI32(_iter26);
-          }
-          oprot.writeSetEnd();
-        }
-        oprot.writeFieldEnd();
-      }
-      if (struct.intToStringColumn != null) {
-        oprot.writeFieldBegin(INT_TO_STRING_COLUMN_FIELD_DESC);
-        {
-          oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.STRING, struct.intToStringColumn.size()));
-          for (Map.Entry<Integer, String> _iter27 : struct.intToStringColumn.entrySet())
-          {
-            oprot.writeI32(_iter27.getKey());
-            oprot.writeString(_iter27.getValue());
-          }
-          oprot.writeMapEnd();
-        }
-        oprot.writeFieldEnd();
-      }
-      if (struct.complexColumn != null) {
-        oprot.writeFieldBegin(COMPLEX_COLUMN_FIELD_DESC);
-        {
-          oprot.writeMapBegin(new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.LIST, struct.complexColumn.size()));
-          for (Map.Entry<Integer, List<Nested>> _iter28 : struct.complexColumn.entrySet())
-          {
-            oprot.writeI32(_iter28.getKey());
-            {
-              oprot.writeListBegin(new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, _iter28.getValue().size()));
-              for (Nested _iter29 : _iter28.getValue())
-              {
-                _iter29.write(oprot);
-              }
-              oprot.writeListEnd();
-            }
-          }
-          oprot.writeMapEnd();
-        }
-        oprot.writeFieldEnd();
-      }
-      oprot.writeFieldStop();
-      oprot.writeStructEnd();
-    }
-
-  }
-
-  private static class ParquetThriftCompatTupleSchemeFactory implements SchemeFactory {
-    public ParquetThriftCompatTupleScheme getScheme() {
-      return new ParquetThriftCompatTupleScheme();
-    }
-  }
-
-  private static class ParquetThriftCompatTupleScheme extends TupleScheme<ParquetThriftCompat> {
-
-    @Override
-    public void write(org.apache.thrift.protocol.TProtocol prot, ParquetThriftCompat struct) throws org.apache.thrift.TException {
-      TTupleProtocol oprot = (TTupleProtocol) prot;
-      oprot.writeBool(struct.boolColumn);
-      oprot.writeByte(struct.byteColumn);
-      oprot.writeI16(struct.shortColumn);
-      oprot.writeI32(struct.intColumn);
-      oprot.writeI64(struct.longColumn);
-      oprot.writeDouble(struct.doubleColumn);
-      oprot.writeBinary(struct.binaryColumn);
-      oprot.writeString(struct.stringColumn);
-      oprot.writeI32(struct.enumColumn.getValue());
-      {
-        oprot.writeI32(struct.stringsColumn.size());
-        for (String _iter30 : struct.stringsColumn)
-        {
-          oprot.writeString(_iter30);
-        }
-      }
-      {
-        oprot.writeI32(struct.intSetColumn.size());
-        for (int _iter31 : struct.intSetColumn)
-        {
-          oprot.writeI32(_iter31);
-        }
-      }
-      {
-        oprot.writeI32(struct.intToStringColumn.size());
-        for (Map.Entry<Integer, String> _iter32 : struct.intToStringColumn.entrySet())
-        {
-          oprot.writeI32(_iter32.getKey());
-          oprot.writeString(_iter32.getValue());
-        }
-      }
-      {
-        oprot.writeI32(struct.complexColumn.size());
-        for (Map.Entry<Integer, List<Nested>> _iter33 : struct.complexColumn.entrySet())
-        {
-          oprot.writeI32(_iter33.getKey());
-          {
-            oprot.writeI32(_iter33.getValue().size());
-            for (Nested _iter34 : _iter33.getValue())
-            {
-              _iter34.write(oprot);
-            }
-          }
-        }
-      }
-      BitSet optionals = new BitSet();
-      if (struct.isSetMaybeBoolColumn()) {
-        optionals.set(0);
-      }
-      if (struct.isSetMaybeByteColumn()) {
-        optionals.set(1);
-      }
-      if (struct.isSetMaybeShortColumn()) {
-        optionals.set(2);
-      }
-      if (struct.isSetMaybeIntColumn()) {
-        optionals.set(3);
-      }
-      if (struct.isSetMaybeLongColumn()) {
-        optionals.set(4);
-      }
-      if (struct.isSetMaybeDoubleColumn()) {
-        optionals.set(5);
-      }
-      if (struct.isSetMaybeBinaryColumn()) {
-        optionals.set(6);
-      }
-      if (struct.isSetMaybeStringColumn()) {
-        optionals.set(7);
-      }
-      if (struct.isSetMaybeEnumColumn()) {
-        optionals.set(8);
-      }
-      oprot.writeBitSet(optionals, 9);
-      if (struct.isSetMaybeBoolColumn()) {
-        oprot.writeBool(struct.maybeBoolColumn);
-      }
-      if (struct.isSetMaybeByteColumn()) {
-        oprot.writeByte(struct.maybeByteColumn);
-      }
-      if (struct.isSetMaybeShortColumn()) {
-        oprot.writeI16(struct.maybeShortColumn);
-      }
-      if (struct.isSetMaybeIntColumn()) {
-        oprot.writeI32(struct.maybeIntColumn);
-      }
-      if (struct.isSetMaybeLongColumn()) {
-        oprot.writeI64(struct.maybeLongColumn);
-      }
-      if (struct.isSetMaybeDoubleColumn()) {
-        oprot.writeDouble(struct.maybeDoubleColumn);
-      }
-      if (struct.isSetMaybeBinaryColumn()) {
-        oprot.writeBinary(struct.maybeBinaryColumn);
-      }
-      if (struct.isSetMaybeStringColumn()) {
-        oprot.writeString(struct.maybeStringColumn);
-      }
-      if (struct.isSetMaybeEnumColumn()) {
-        oprot.writeI32(struct.maybeEnumColumn.getValue());
-      }
-    }
-
-    @Override
-    public void read(org.apache.thrift.protocol.TProtocol prot, ParquetThriftCompat struct) throws org.apache.thrift.TException {
-      TTupleProtocol iprot = (TTupleProtocol) prot;
-      struct.boolColumn = iprot.readBool();
-      struct.setBoolColumnIsSet(true);
-      struct.byteColumn = iprot.readByte();
-      struct.setByteColumnIsSet(true);
-      struct.shortColumn = iprot.readI16();
-      struct.setShortColumnIsSet(true);
-      struct.intColumn = iprot.readI32();
-      struct.setIntColumnIsSet(true);
-      struct.longColumn = iprot.readI64();
-      struct.setLongColumnIsSet(true);
-      struct.doubleColumn = iprot.readDouble();
-      struct.setDoubleColumnIsSet(true);
-      struct.binaryColumn = iprot.readBinary();
-      struct.setBinaryColumnIsSet(true);
-      struct.stringColumn = iprot.readString();
-      struct.setStringColumnIsSet(true);
-      struct.enumColumn = org.apache.spark.sql.parquet.test.thrift.Suit.findByValue(iprot.readI32());
-      struct.setEnumColumnIsSet(true);
-      {
-        org.apache.thrift.protocol.TList _list35 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRING, iprot.readI32());
-        struct.stringsColumn = new ArrayList<String>(_list35.size);
-        String _elem36;
-        for (int _i37 = 0; _i37 < _list35.size; ++_i37)
-        {
-          _elem36 = iprot.readString();
-          struct.stringsColumn.add(_elem36);
-        }
-      }
-      struct.setStringsColumnIsSet(true);
-      {
-        org.apache.thrift.protocol.TSet _set38 = new org.apache.thrift.protocol.TSet(org.apache.thrift.protocol.TType.I32, iprot.readI32());
-        struct.intSetColumn = new HashSet<Integer>(2*_set38.size);
-        int _elem39;
-        for (int _i40 = 0; _i40 < _set38.size; ++_i40)
-        {
-          _elem39 = iprot.readI32();
-          struct.intSetColumn.add(_elem39);
-        }
-      }
-      struct.setIntSetColumnIsSet(true);
-      {
-        org.apache.thrift.protocol.TMap _map41 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.STRING, iprot.readI32());
-        struct.intToStringColumn = new HashMap<Integer,String>(2*_map41.size);
-        int _key42;
-        String _val43;
-        for (int _i44 = 0; _i44 < _map41.size; ++_i44)
-        {
-          _key42 = iprot.readI32();
-          _val43 = iprot.readString();
-          struct.intToStringColumn.put(_key42, _val43);
-        }
-      }
-      struct.setIntToStringColumnIsSet(true);
-      {
-        org.apache.thrift.protocol.TMap _map45 = new org.apache.thrift.protocol.TMap(org.apache.thrift.protocol.TType.I32, org.apache.thrift.protocol.TType.LIST, iprot.readI32());
-        struct.complexColumn = new HashMap<Integer,List<Nested>>(2*_map45.size);
-        int _key46;
-        List<Nested> _val47;
-        for (int _i48 = 0; _i48 < _map45.size; ++_i48)
-        {
-          _key46 = iprot.readI32();
-          {
-            org.apache.thrift.protocol.TList _list49 = new org.apache.thrift.protocol.TList(org.apache.thrift.protocol.TType.STRUCT, iprot.readI32());
-            _val47 = new ArrayList<Nested>(_list49.size);
-            Nested _elem50;
-            for (int _i51 = 0; _i51 < _list49.size; ++_i51)
-            {
-              _elem50 = new Nested();
-              _elem50.read(iprot);
-              _val47.add(_elem50);
-            }
-          }
-          struct.complexColumn.put(_key46, _val47);
-        }
-      }
-      struct.setComplexColumnIsSet(true);
-      BitSet incoming = iprot.readBitSet(9);
-      if (incoming.get(0)) {
-        struct.maybeBoolColumn = iprot.readBool();
-        struct.setMaybeBoolColumnIsSet(true);
-      }
-      if (incoming.get(1)) {
-        struct.maybeByteColumn = iprot.readByte();
-        struct.setMaybeByteColumnIsSet(true);
-      }
-      if (incoming.get(2)) {
-        struct.maybeShortColumn = iprot.readI16();
-        struct.setMaybeShortColumnIsSet(true);
-      }
-      if (incoming.get(3)) {
-        struct.maybeIntColumn = iprot.readI32();
-        struct.setMaybeIntColumnIsSet(true);
-      }
-      if (incoming.get(4)) {
-        struct.maybeLongColumn = iprot.readI64();
-        struct.setMaybeLongColumnIsSet(true);
-      }
-      if (incoming.get(5)) {
-        struct.maybeDoubleColumn = iprot.readDouble();
-        struct.setMaybeDoubleColumnIsSet(true);
-      }
-      if (incoming.get(6)) {
-        struct.maybeBinaryColumn = iprot.readBinary();
-        struct.setMaybeBinaryColumnIsSet(true);
-      }
-      if (incoming.get(7)) {
-        struct.maybeStringColumn = iprot.readString();
-        struct.setMaybeStringColumnIsSet(true);
-      }
-      if (incoming.get(8)) {
-        struct.maybeEnumColumn = org.apache.spark.sql.parquet.test.thrift.Suit.findByValue(iprot.readI32());
-        struct.setMaybeEnumColumnIsSet(true);
-      }
-    }
-  }
-
-}
-
diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java
deleted file mode 100644
index 5315c6aae9372..0000000000000
--- a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Autogenerated by Thrift Compiler (0.9.2)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- *  @generated
- */
-package org.apache.spark.sql.parquet.test.thrift;
-
-
-import java.util.Map;
-import java.util.HashMap;
-import org.apache.thrift.TEnum;
-
-public enum Suit implements org.apache.thrift.TEnum {
-  SPADES(0),
-  HEARTS(1),
-  DIAMONDS(2),
-  CLUBS(3);
-
-  private final int value;
-
-  private Suit(int value) {
-    this.value = value;
-  }
-
-  /**
-   * Get the integer value of this enum value, as defined in the Thrift IDL.
-   */
-  public int getValue() {
-    return value;
-  }
-
-  /**
-   * Find a the enum type by its integer value, as defined in the Thrift IDL.
-   * @return null if the value is not found.
-   */
-  public static Suit findByValue(int value) { 
-    switch (value) {
-      case 0:
-        return SPADES;
-      case 1:
-        return HEARTS;
-      case 2:
-        return DIAMONDS;
-      case 3:
-        return CLUBS;
-      default:
-        return null;
-    }
-  }
-}
diff --git a/sql/core/src/test/resources/parquet-thrift-compat.snappy.parquet b/sql/core/src/test/resources/parquet-thrift-compat.snappy.parquet
new file mode 100755
index 0000000000000000000000000000000000000000..837e4876eea6385092610bb59016c3a7b97e1c72
GIT binary patch
literal 10550
zcmeHNO>7&-72ctiG^rn3a=43St<<z4FXTcML`szCl(qFo)hc956)87y5x5k!w%AaV
zOp%6_z=2>GQ5yt4<WM-OS`<ZLSZNRxg%K2n8st<UO^Wsq^xzuKC5Ijgq_7M0V7KUd
zvpdUK?k>etcF+PPFyeaVeeb<_^Ua&LLr)LSL?l5HC4aXRga^OmZ&wIWh#(8_@rbho
zdZ(o{H!KEVL=kbE3UzJbBZ20D)&LF1ywH5m_CWJPLy%onBlIdFVhvUGs6mijRTEW(
zE*PqI@&?r)2vq%)Nb}qb{b!&XKMHzIKUC+mAhZynvBfV0p?w)z7qlp}1hnxMe*z@v
zhbho?U*dJ2qo0l;#lP_;em*#^>lQcOB*g#0@jfpR9~=*~iexC@Zw?Io{v$$!R?v?&
zl@|Y4`;e|8Y<zbNft6cCTHvgBFJ|&*k1F)5m+C(g#udTt-nM`cffj|<2dy7k4B7y+
z!L~qWoCzL%VtPesMoLct+kh!YI*}%+>EW?tiZm{qNDj}WQyBNLiQ$t|r^Zs~&gkUq
zNDAC(mRfO^z*i)yPe&<UI!veO3wpetek$Ukwn{-gBEAZ#QLn!pIqH3vs33Gq8sUUQ
z95?ugj|VRqqKuY|*^Jdx!!cfizBwojsfI2?1NL~E;_w%zi$lhaI3^A8czs78NTzs&
zd9Y^^jk7I<U}GN?q|=-NKck@H$8r2nMT22OQKJwbLNKsLWaLeZ>tX^!!lVk)8D1wN
zoZ|1Jp9jZvoh*{T+2#kwk-*KwLfFm<SwZVd7EdWT<2pwpRNvPVhL4PajHU)fI|tc7
z{4r~@5HgKHw)2q4ah-2GA!X@H7${MqB&#AME?|>MfQHT{kh@EyS0*I|nO~q=GE*$>
zv?YajTR=SOYdO;$=|$fV62ZJZB8_)T32NQfu6VH@Ha*oycf+AI^fmDA;-3eQe+eQ6
zTK7kz{ps}Six)3lzrJxBH_C!E#}*s3RHyg;AVPzJ*Vl@>Kp<YSH;?PY_5!1%$!Ssg
zt~77Vv3Hwd;16o41>@4Oby}jH*-=m3lX8f29jUK6&Ioaevn{z+$Jyf$=l_bREK8+H
zsU67|zl#B7d99v>X$JWgXuz0T4bnYNx3noT>WMhO)dY9QbP{@!+n@d8y-yw<`|ROw
z;NcGsUm*eigZDlm0bzUNfN)|Tn%xvdF^Q}c4;^}Dlmt41NKkkjl}I3rTvbO$lNt;R
zAZ_7M8-xL5@Ci*jzYYV>A(`QE{dpKbUVm?W{Rdkx@DfrS9=BeG0eHOq`qtJjJ|uzo
z=Iz_tcj58FAL9cSK(>9k8@{|vq@R#(Jd`mp;MTNJ=5&7uCLdbZ_*w3?8%H*7L<7QK
z8tDePMH<9VtLG5J<j_;mXy+)zU^JFak83bz?b{bGF08{G*H^D!e{BoqwDqIgph)tB
z9zAoSWja&5P}WMR?1HwKk-A~64hu@-+|p8hbSYn6EPx#CSS@LMN7Kr}QnAGM1abxQ
z-y%ZtONH}Hpb<Kn=a<TJc~eU>cFpAqnc}K0br^i2ohzkcjzL*xqgE&{8ghPg(xdEK
z%&g97BNk81UW>aPrIpI2tUh~`y|jCRcbsXMw0Q4P+NkJQ)f8C&h;1HhBxRcnvykLx
z6oW!dpZAJBOUWboFdm}vl%@Y{sl1%mAS{4rhLEQ;BkDT91AzT}w=R#;ZP1-w;{5@L
zM~I0^!(wI`cn38Qu64tU+A3Q1P!5(2TP@k8MKFBH^v(?#V>s#%Xm0{0mo?oFumPU5
z7POU;HV<*POcs5YtA`B#DKqx#gZvaA5OOfo9(oi~{q2$vIxP4@JbfjP9F(7xkE#9y
z_m-lLw*&nRS(7SM_6;Kckf`Gc+o`7BB_@bxKZm+Y0H&8?m?r7jqu%$Ju*w$UK=(ZW
zs(KYszb`_{H5{z$BZ~Svk$+7<AfdRubg1up+oeVwZP|1T{}BNZz`MLysITv7mzoVc
zze6eh5s`mMKqTRiqSS3z1cuaU%lHDzpd+_oCG5#f{cC_CMO?nhUZ)n{C-QFz2qc`!
zclB3Ixb86@E_`)Lpzv||n;^uNA8QWv5nw!4jFkmzM7~Nu43-7~T!F8ywN$XSv_cI6
zc#VLZ$ssIcXt}Dc?X7^Tk`IGk0q9<nM|j@p&@20p68t<XDT;;ZCQ+~KgN0>vc~wQP
zzSWAv&~?}o8q@MN0YQ@<VR-Mf)bQ$`+Hk3OEqwZ8S~dv?8cS`d+12~|tMUAf=J5CT
zJL0bebv$xKU-3#RrRd)g`Hw`s7PJwDF8)0zXMm$VG1O&ExlV}|^d;oA2Kk0Zy(fV;
zwt`u`s?h!I6GB~<eC&{gsdH};`R4>sCOcq-jZ|?nq^lA`aeK1HB#o(=dh{<Mzv@wM
zhCoSGt*gFsfKAUu32wf!#!c*<1CBfL<EKq;tjXWOlaHTv@Z`=xreMLYK=*<_8*Hb7
z%UJ-PZ;JA(Uip@&-Z=<%t13zLS|^LZbM@GDI90&JEh7J|LB84m+MJ>zZ+FW<A=It3
zl@^M*bETenZ6#aGEti&x%0IvIa$`8$k(=)bKO62yrDtYG(;eN|m0if?=Zjha2fh<V
zDd;R@7BwnaDY~dM44unq`S}T|CxQx~tEgQl=Ze~*Rwz->O!8Z^6En#%1}UvBQ*F9(
z02pqJOifL8P!*Tq@8xd0bu$ApRh;$cZmmC`PS&)}m6HzU`*vx5A~I0ZI#fpbIptux
zf48>#qBU(-j&}#+vE3RU=&xy<FUcLu5A4$X*wpOEWbK9L>k9`D26yQJ?7oRp->4Y_
zzD{xQApYb%sMI?S_K)t;e)81pNkc7U1Dg9+`20h&ik>8TbyKv*%u4zVmL5Saad0pr
zHCbW$ZCEXskyETNc7<CDXk-H`kY4DTsPGH5QeD-HqeIXmMks9Qc4|63F?DKql41Vi
zbgTPNE9XYsQhwb&?38q=58O*Jn-rJ$BW{_$UMF@+z0?PI6LihL5x3-Dw+A~UVCq4o
zmUaq)+Zj-|3p=D?>H>ea;N*ncSx~PNbtGfz2VcxP`Qdgb)ayqbDVh4gzZ5w6;dV4U
zZa*q77fxQd9S`+-@&EK}t3>q!*|-{ViurDj#L0=2UXarxq;e#(ni<YX^Z0XehQjzH
z$(@@!)aMjheQ40m%IktzuwXw{>y>Yp+8ydw^B&M4U{SZztlpQ0RI=`PIk<xEJG{!!
z?%jrKf+hD#j9ARvV`aSsbqbi<(OYl8P7Y5SYGLZ|(vAg}6T2W<R%gz_TmD%DaFlvN
zOmhw$v#VE%pU-c)lfD4Y`4gn>b+_DKJFU4}rSrJg+~Uap$a>p-{-V~}lB;RTaLU(o
zz|7e@drRe=_Z1zgBP~%mr%X-<D%qS}{$MxzI9>#?LHw?qKh?j7aNCOkTym>9T<2t{
zUAGC*L1R+3B{40zEsZH)E>RuRq9)zf$kh2crbkuQnj#f2rfJEK%o={PBe`~8wUl|~
z#>1-D*`emOh;!lKUZ0tW?QOLxbul^o?x+<8c4=6hXqf`r+`=!gbAyR=K;cx*mZplc
zh1^mhZQM$-+fbvZ(S><!m1p*(f8*cFx|JO?UxwdBjXT+FKC`l-B$kTjdos%z_=US?
zWjRxPv1jE%z6T!gckiAO{1n+^$Q!?fr|FIEDE!`;DGAEg49Tx97Q<b0Ww^i%_eHb)
v0|Whi@&1^WiDw3TW7)pge7}~BpUV!!`m^wTE*=?}KO$%c1z{Tg$K<~OQ+>wm

literal 0
HcmV?d00001

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala
index d22066cabc567..1c532d78790d2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetThriftCompatibilitySuite.scala
@@ -17,16 +17,6 @@
 
 package org.apache.spark.sql.parquet
 
-import java.nio.ByteBuffer
-import java.util.{List => JList, Map => JMap}
-
-import scala.collection.JavaConversions._
-
-import org.apache.hadoop.fs.Path
-import org.apache.parquet.hadoop.metadata.CompressionCodecName
-import org.apache.parquet.thrift.ThriftParquetWriter
-
-import org.apache.spark.sql.parquet.test.thrift.{Nested, ParquetThriftCompat, Suit}
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.{Row, SQLContext}
 
@@ -35,28 +25,20 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest {
 
   override val sqlContext: SQLContext = TestSQLContext
 
-  override protected def beforeAll(): Unit = {
-    super.beforeAll()
-
-    val writer =
-      new ThriftParquetWriter[ParquetThriftCompat](
-        new Path(parquetStore.getCanonicalPath),
-        classOf[ParquetThriftCompat],
-        CompressionCodecName.SNAPPY)
-
-    (0 until 10).foreach(i => writer.write(makeParquetThriftCompat(i)))
-    writer.close()
-  }
+  private val parquetFilePath =
+    Thread.currentThread().getContextClassLoader.getResource("parquet-thrift-compat.snappy.parquet")
 
   test("Read Parquet file generated by parquet-thrift") {
     logInfo(
       s"""Schema of the Parquet file written by parquet-thrift:
-         |${readParquetSchema(parquetStore.getCanonicalPath)}
+         |${readParquetSchema(parquetFilePath.toString)}
        """.stripMargin)
 
-    checkAnswer(sqlContext.read.parquet(parquetStore.getCanonicalPath), (0 until 10).map { i =>
+    checkAnswer(sqlContext.read.parquet(parquetFilePath.toString), (0 until 10).map { i =>
       def nullable[T <: AnyRef]: ( => T) => T = makeNullable[T](i)
 
+      val suits = Array("SPADES", "HEARTS", "DIAMONDS", "CLUBS")
+
       Row(
         i % 2 == 0,
         i.toByte,
@@ -70,7 +52,7 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest {
         s"val_$i",
         s"val_$i",
         // Thrift ENUM values are converted to Parquet binaries containing UTF-8 strings
-        Suit.values()(i % 4).name(),
+        suits(i % 4),
 
         nullable(i % 2 == 0: java.lang.Boolean),
         nullable(i.toByte: java.lang.Byte),
@@ -80,7 +62,7 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest {
         nullable(i.toDouble + 0.2d: java.lang.Double),
         nullable(s"val_$i"),
         nullable(s"val_$i"),
-        nullable(Suit.values()(i % 4).name()),
+        nullable(suits(i % 4)),
 
         Seq.tabulate(3)(n => s"arr_${i + n}"),
         // Thrift `SET`s are converted to Parquet `LIST`s
@@ -93,48 +75,4 @@ class ParquetThriftCompatibilitySuite extends ParquetCompatibilityTest {
         }.toMap)
     })
   }
-
-  def makeParquetThriftCompat(i: Int): ParquetThriftCompat = {
-    def makeComplexColumn(i: Int): JMap[Integer, JList[Nested]] = {
-      mapAsJavaMap(Seq.tabulate(3) { n =>
-        (i + n: Integer) -> seqAsJavaList(Seq.tabulate(3) { m =>
-          new Nested(
-            seqAsJavaList(Seq.tabulate(3)(j => i + j + m)),
-            s"val_${i + m}")
-        })
-      }.toMap)
-    }
-
-    val value =
-      new ParquetThriftCompat(
-        i % 2 == 0,
-        i.toByte,
-        (i + 1).toShort,
-        i + 2,
-        i.toLong * 10,
-        i.toDouble + 0.2d,
-        ByteBuffer.wrap(s"val_$i".getBytes),
-        s"val_$i",
-        Suit.values()(i % 4),
-
-        seqAsJavaList(Seq.tabulate(3)(n => s"arr_${i + n}")),
-        setAsJavaSet(Set(i)),
-        mapAsJavaMap(Seq.tabulate(3)(n => (i + n: Integer) -> s"val_${i + n}").toMap),
-        makeComplexColumn(i))
-
-    if (i % 3 == 0) {
-      value
-    } else {
-      value
-        .setMaybeBoolColumn(i % 2 == 0)
-        .setMaybeByteColumn(i.toByte)
-        .setMaybeShortColumn((i + 1).toShort)
-        .setMaybeIntColumn(i + 2)
-        .setMaybeLongColumn(i.toLong * 10)
-        .setMaybeDoubleColumn(i.toDouble + 0.2d)
-        .setMaybeBinaryColumn(ByteBuffer.wrap(s"val_$i".getBytes))
-        .setMaybeStringColumn(s"val_$i")
-        .setMaybeEnumColumn(Suit.values()(i % 4))
-    }
-  }
 }

From 0e78e40c0b221e09d9ea22177a99f3bea4b2b4fb Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Thu, 9 Jul 2015 18:23:06 -0700
Subject: [PATCH 104/549] [SPARK-8852] [FLUME] Trim dependencies in flume
 assembly.

Also, add support for the *-provided profiles. This avoids repackaging
things that are already in the Spark assembly, or, in the case of the
*-provided profiles, are provided by the distribution.

The flume-ng-auth dependency was also excluded since it's not really
used by Spark.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #7247 from vanzin/SPARK-8852 and squashes the following commits:

298a7d5 [Marcelo Vanzin] Feedback.
c962082 [Marcelo Vanzin] [SPARK-8852] [flume] Trim dependencies in flume assembly.
---
 external/flume-assembly/pom.xml | 169 ++++++++++++++++++--------------
 pom.xml                         |   4 +
 2 files changed, 100 insertions(+), 73 deletions(-)

diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml
index 8565cd83edfa2..13189595d1d6c 100644
--- a/external/flume-assembly/pom.xml
+++ b/external/flume-assembly/pom.xml
@@ -32,6 +32,7 @@
   <url>http://spark.apache.org/</url>
 
   <properties>
+    <hadoop.deps.scope>provided</hadoop.deps.scope>
     <sbt.project.name>streaming-flume-assembly</sbt.project.name>
   </properties>
 
@@ -40,6 +41,16 @@
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-streaming-flume_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.mortbay.jetty</groupId>
+          <artifactId>jetty-util</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -47,89 +58,101 @@
       <version>${project.version}</version>
       <scope>provided</scope>
     </dependency>
+    <!--
+      Demote already included in the Spark assembly. These are transitive dependencies of flume
+      or spark-streaming-flume, and this need to be explicitly included even through the parent
+      pom may declare them with ${hadoop.deps.scope}.
+    -->
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-net</groupId>
+      <artifactId>commons-net</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.google.protobuf</groupId>
+      <artifactId>protobuf-java</artifactId>
+      <scope>provided</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.avro</groupId>
       <artifactId>avro</artifactId>
-      <version>${avro.version}</version>
+      <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.avro</groupId>
       <artifactId>avro-ipc</artifactId>
-      <version>${avro.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>io.netty</groupId>
-          <artifactId>netty</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jetty</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jetty-util</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>servlet-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.velocity</groupId>
-          <artifactId>velocity</artifactId>
-        </exclusion>
-      </exclusions>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <scope>provided</scope>
     </dependency>
   </dependencies>
 
   <build>
-  <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
-  <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-  <plugins>
-    <plugin>
-      <groupId>org.apache.maven.plugins</groupId>
-      <artifactId>maven-shade-plugin</artifactId>
-      <configuration>
-        <shadedArtifactAttached>false</shadedArtifactAttached>
-        <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-flume-assembly-${project.version}.jar</outputFile>
-        <artifactSet>
-          <includes>
-            <include>*:*</include>
-          </includes>
-        </artifactSet>
-        <filters>
-          <filter>
-            <artifact>*:*</artifact>
-            <excludes>
-              <exclude>META-INF/*.SF</exclude>
-              <exclude>META-INF/*.DSA</exclude>
-              <exclude>META-INF/*.RSA</exclude>
-            </excludes>
-          </filter>
-        </filters>
-      </configuration>
-      <executions>
-        <execution>
-          <phase>package</phase>
-          <goals>
-            <goal>shade</goal>
-          </goals>
-          <configuration>
-            <transformers>
-              <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
-              <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
-                <resource>reference.conf</resource>
-              </transformer>
-              <transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
-                <resource>log4j.properties</resource>
-              </transformer>
-              <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"/>
-              <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"/>
-            </transformers>
-          </configuration>
-        </execution>
-      </executions>
-    </plugin>
-  </plugins>
-</build>
+    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
+    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <configuration>
+          <shadedArtifactAttached>false</shadedArtifactAttached>
+          <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-flume-assembly-${project.version}.jar</outputFile>
+          <artifactSet>
+            <includes>
+              <include>*:*</include>
+            </includes>
+          </artifactSet>
+          <filters>
+            <filter>
+              <artifact>*:*</artifact>
+              <excludes>
+                <exclude>META-INF/*.SF</exclude>
+                <exclude>META-INF/*.DSA</exclude>
+                <exclude>META-INF/*.RSA</exclude>
+              </excludes>
+            </filter>
+          </filters>
+        </configuration>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+                  <resource>reference.conf</resource>
+                </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
+                  <resource>log4j.properties</resource>
+                </transformer>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"/>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"/>
+              </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+  <profiles>
+    <profile>
+      <id>flume-provided</id>
+      <properties>
+        <flume.deps.scope>provided</flume.deps.scope>
+      </properties>
+    </profile>
+  </profiles>
 </project>
 
diff --git a/pom.xml b/pom.xml
index 1eda108dc065b..172fdef4c73da 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1130,6 +1130,10 @@
             <groupId>io.netty</groupId>
             <artifactId>netty</artifactId>
           </exclusion>
+          <exclusion>
+            <groupId>org.apache.flume</groupId>
+            <artifactId>flume-ng-auth</artifactId>
+          </exclusion>
           <exclusion>
             <groupId>org.apache.thrift</groupId>
             <artifactId>libthrift</artifactId>

From 2727304660663fcf1e41f7b666978c1443262e4e Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Thu, 9 Jul 2015 19:08:33 -0700
Subject: [PATCH 105/549] [SPARK-8913] [ML] Simplify LogisticRegression suite
 to use Vector Vector comparision

Cleanup tests from SPARK 8700.

Author: Holden Karau <holden@pigscanfly.ca>

Closes #7335 from holdenk/SPARK-8913-cleanup-tests-from-SPARK-8700-logistic-regression-r2-really-logistic-regression-this-time and squashes the following commits:

e5e2c5f [Holden Karau] Simplify LogisticRegression suite to use Vector <-> Vector comparisions instead of comparing element by element
---
 .../LogisticRegressionSuite.scala             | 135 +++++-------------
 1 file changed, 39 insertions(+), 96 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 27253c1db2fff..b7dd44753896a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -234,20 +234,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.7996864
      */
     val interceptR = 2.8366423
-    val weightsR = Array(-0.5895848, 0.8931147, -0.3925051, -0.7996864)
+    val weightsR = Vectors.dense(-0.5895848, 0.8931147, -0.3925051, -0.7996864)
 
     assert(model1.intercept ~== interceptR relTol 1E-3)
-    assert(model1.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model1.weights(1) ~== weightsR(1) relTol 1E-3)
-    assert(model1.weights(2) ~== weightsR(2) relTol 1E-3)
-    assert(model1.weights(3) ~== weightsR(3) relTol 1E-3)
+    assert(model1.weights ~= weightsR relTol 1E-3)
 
     // Without regularization, with or without standardization will converge to the same solution.
     assert(model2.intercept ~== interceptR relTol 1E-3)
-    assert(model2.weights(0) ~== weightsR(0) relTol 1E-3)
-    assert(model2.weights(1) ~== weightsR(1) relTol 1E-3)
-    assert(model2.weights(2) ~== weightsR(2) relTol 1E-3)
-    assert(model2.weights(3) ~== weightsR(3) relTol 1E-3)
+    assert(model2.weights ~= weightsR relTol 1E-3)
   }
 
   test("binary logistic regression without intercept without regularization") {
@@ -277,20 +271,14 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.7407946
      */
     val interceptR = 0.0
-    val weightsR = Array(-0.3534996, 1.2964482, -0.3571741, -0.7407946)
+    val weightsR = Vectors.dense(-0.3534996, 1.2964482, -0.3571741, -0.7407946)
 
     assert(model1.intercept ~== interceptR relTol 1E-3)
-    assert(model1.weights(0) ~== weightsR(0) relTol 1E-2)
-    assert(model1.weights(1) ~== weightsR(1) relTol 1E-2)
-    assert(model1.weights(2) ~== weightsR(2) relTol 1E-3)
-    assert(model1.weights(3) ~== weightsR(3) relTol 1E-3)
+    assert(model1.weights ~= weightsR relTol 1E-2)
 
     // Without regularization, with or without standardization should converge to the same solution.
     assert(model2.intercept ~== interceptR relTol 1E-3)
-    assert(model2.weights(0) ~== weightsR(0) relTol 1E-2)
-    assert(model2.weights(1) ~== weightsR(1) relTol 1E-2)
-    assert(model2.weights(2) ~== weightsR(2) relTol 1E-3)
-    assert(model2.weights(3) ~== weightsR(3) relTol 1E-3)
+    assert(model2.weights ~= weightsR relTol 1E-2)
   }
 
   test("binary logistic regression with intercept with L1 regularization") {
@@ -321,13 +309,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.02481551
      */
     val interceptR1 = -0.05627428
-    val weightsR1 = Array(0.0, 0.0, -0.04325749, -0.02481551)
+    val weightsR1 = Vectors.dense(0.0, 0.0, -0.04325749, -0.02481551)
 
     assert(model1.intercept ~== interceptR1 relTol 1E-2)
-    assert(model1.weights(0) ~== weightsR1(0) absTol 1E-3)
-    assert(model1.weights(1) ~== weightsR1(1) absTol 1E-3)
-    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-2)
-    assert(model1.weights(3) ~== weightsR1(3) relTol 2E-2)
+    assert(model1.weights ~= weightsR1 absTol 2E-2)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -349,13 +334,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5       .
      */
     val interceptR2 = 0.3722152
-    val weightsR2 = Array(0.0, 0.0, -0.1665453, 0.0)
+    val weightsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0)
 
     assert(model2.intercept ~== interceptR2 relTol 1E-2)
-    assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3)
-    assert(model2.weights(1) ~== weightsR2(1) absTol 1E-3)
-    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-2)
-    assert(model2.weights(3) ~== weightsR2(3) absTol 1E-3)
+    assert(model2.weights ~= weightsR2 absTol 1E-3)
   }
 
   test("binary logistic regression without intercept with L1 regularization") {
@@ -387,13 +369,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.03891782
      */
     val interceptR1 = 0.0
-    val weightsR1 = Array(0.0, 0.0, -0.05189203, -0.03891782)
+    val weightsR1 = Vectors.dense(0.0, 0.0, -0.05189203, -0.03891782)
 
     assert(model1.intercept ~== interceptR1 relTol 1E-3)
-    assert(model1.weights(0) ~== weightsR1(0) absTol 1E-3)
-    assert(model1.weights(1) ~== weightsR1(1) absTol 1E-3)
-    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-2)
-    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-2)
+    assert(model1.weights ~= weightsR1 absTol 1E-3)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -415,13 +394,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5       .
      */
     val interceptR2 = 0.0
-    val weightsR2 = Array(0.0, 0.0, -0.08420782, 0.0)
+    val weightsR2 = Vectors.dense(0.0, 0.0, -0.08420782, 0.0)
 
-    assert(model2.intercept ~== interceptR2 relTol 1E-3)
-    assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3)
-    assert(model2.weights(1) ~== weightsR2(1) absTol 1E-3)
-    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-2)
-    assert(model2.weights(3) ~== weightsR2(3) absTol 1E-3)
+    assert(model2.intercept ~== interceptR2 absTol 1E-3)
+    assert(model2.weights ~= weightsR2 absTol 1E-3)
   }
 
   test("binary logistic regression with intercept with L2 regularization") {
@@ -452,13 +428,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.10062872
      */
     val interceptR1 = 0.15021751
-    val weightsR1 = Array(-0.07251837, 0.10724191, -0.04865309, -0.10062872)
+    val weightsR1 = Vectors.dense(-0.07251837, 0.10724191, -0.04865309, -0.10062872)
 
     assert(model1.intercept ~== interceptR1 relTol 1E-3)
-    assert(model1.weights(0) ~== weightsR1(0) relTol 1E-3)
-    assert(model1.weights(1) ~== weightsR1(1) relTol 1E-3)
-    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-3)
-    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-3)
+    assert(model1.weights ~= weightsR1 relTol 1E-3)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -480,13 +453,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.06266838
      */
     val interceptR2 = 0.48657516
-    val weightsR2 = Array(-0.05155371, 0.02301057, -0.11482896, -0.06266838)
+    val weightsR2 = Vectors.dense(-0.05155371, 0.02301057, -0.11482896, -0.06266838)
 
     assert(model2.intercept ~== interceptR2 relTol 1E-3)
-    assert(model2.weights(0) ~== weightsR2(0) relTol 1E-3)
-    assert(model2.weights(1) ~== weightsR2(1) relTol 1E-3)
-    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-3)
-    assert(model2.weights(3) ~== weightsR2(3) relTol 1E-3)
+    assert(model2.weights ~= weightsR2 relTol 1E-3)
   }
 
   test("binary logistic regression without intercept with L2 regularization") {
@@ -518,13 +488,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.09799775
      */
     val interceptR1 = 0.0
-    val weightsR1 = Array(-0.06099165, 0.12857058, -0.04708770, -0.09799775)
+    val weightsR1 = Vectors.dense(-0.06099165, 0.12857058, -0.04708770, -0.09799775)
 
-    assert(model1.intercept ~== interceptR1 relTol 1E-3)
-    assert(model1.weights(0) ~== weightsR1(0) relTol 1E-2)
-    assert(model1.weights(1) ~== weightsR1(1) relTol 1E-2)
-    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-3)
-    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-3)
+    assert(model1.intercept ~== interceptR1 absTol 1E-3)
+    assert(model1.weights ~= weightsR1 relTol 1E-2)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -546,13 +513,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.053314311
      */
     val interceptR2 = 0.0
-    val weightsR2 = Array(-0.005679651, 0.048967094, -0.093714016, -0.053314311)
+    val weightsR2 = Vectors.dense(-0.005679651, 0.048967094, -0.093714016, -0.053314311)
 
-    assert(model2.intercept ~== interceptR2 relTol 1E-3)
-    assert(model2.weights(0) ~== weightsR2(0) relTol 1E-2)
-    assert(model2.weights(1) ~== weightsR2(1) relTol 1E-2)
-    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-3)
-    assert(model2.weights(3) ~== weightsR2(3) relTol 1E-3)
+    assert(model2.intercept ~== interceptR2 absTol 1E-3)
+    assert(model2.weights ~= weightsR2 relTol 1E-2)
   }
 
   test("binary logistic regression with intercept with ElasticNet regularization") {
@@ -583,13 +547,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.15458796
      */
     val interceptR1 = 0.57734851
-    val weightsR1 = Array(-0.05310287, 0.0, -0.08849250, -0.15458796)
+    val weightsR1 = Vectors.dense(-0.05310287, 0.0, -0.08849250, -0.15458796)
 
     assert(model1.intercept ~== interceptR1 relTol 6E-3)
-    assert(model1.weights(0) ~== weightsR1(0) relTol 5E-3)
-    assert(model1.weights(1) ~== weightsR1(1) absTol 1E-3)
-    assert(model1.weights(2) ~== weightsR1(2) relTol 5E-3)
-    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-3)
+    assert(model1.weights ~== weightsR1 absTol 5E-3)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -611,13 +572,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.05350074
      */
     val interceptR2 = 0.51555993
-    val weightsR2 = Array(0.0, 0.0, -0.18807395, -0.05350074)
+    val weightsR2 = Vectors.dense(0.0, 0.0, -0.18807395, -0.05350074)
 
     assert(model2.intercept ~== interceptR2 relTol 6E-3)
-    assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3)
-    assert(model2.weights(1) ~== weightsR2(1) absTol 1E-3)
-    assert(model2.weights(2) ~== weightsR2(2) relTol 5E-3)
-    assert(model2.weights(3) ~== weightsR2(3) relTol 1E-2)
+    assert(model2.weights ~= weightsR2 absTol 1E-3)
   }
 
   test("binary logistic regression without intercept with ElasticNet regularization") {
@@ -649,13 +607,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5     -0.142534158
      */
     val interceptR1 = 0.0
-    val weightsR1 = Array(-0.001005743, 0.072577857, -0.081203769, -0.142534158)
+    val weightsR1 = Vectors.dense(-0.001005743, 0.072577857, -0.081203769, -0.142534158)
 
     assert(model1.intercept ~== interceptR1 relTol 1E-3)
-    assert(model1.weights(0) ~== weightsR1(0) absTol 1E-2)
-    assert(model1.weights(1) ~== weightsR1(1) absTol 1E-2)
-    assert(model1.weights(2) ~== weightsR1(2) relTol 1E-3)
-    assert(model1.weights(3) ~== weightsR1(3) relTol 1E-2)
+    assert(model1.weights ~= weightsR1 absTol 1E-2)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -677,13 +632,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5       .
      */
     val interceptR2 = 0.0
-    val weightsR2 = Array(0.0, 0.03345223, -0.11304532, 0.0)
+    val weightsR2 = Vectors.dense(0.0, 0.03345223, -0.11304532, 0.0)
 
-    assert(model2.intercept ~== interceptR2 relTol 1E-3)
-    assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3)
-    assert(model2.weights(1) ~== weightsR2(1) relTol 1E-2)
-    assert(model2.weights(2) ~== weightsR2(2) relTol 1E-2)
-    assert(model2.weights(3) ~== weightsR2(3) absTol 1E-3)
+    assert(model2.intercept ~== interceptR2 absTol 1E-3)
+    assert(model2.weights ~= weightsR2 absTol 1E-3)
   }
 
   test("binary logistic regression with intercept with strong L1 regularization") {
@@ -717,19 +669,13 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        }}}
      */
     val interceptTheory = math.log(histogram(1).toDouble / histogram(0).toDouble)
-    val weightsTheory = Array(0.0, 0.0, 0.0, 0.0)
+    val weightsTheory = Vectors.dense(0.0, 0.0, 0.0, 0.0)
 
     assert(model1.intercept ~== interceptTheory relTol 1E-5)
-    assert(model1.weights(0) ~== weightsTheory(0) absTol 1E-6)
-    assert(model1.weights(1) ~== weightsTheory(1) absTol 1E-6)
-    assert(model1.weights(2) ~== weightsTheory(2) absTol 1E-6)
-    assert(model1.weights(3) ~== weightsTheory(3) absTol 1E-6)
+    assert(model1.weights ~= weightsTheory absTol 1E-6)
 
     assert(model2.intercept ~== interceptTheory relTol 1E-5)
-    assert(model2.weights(0) ~== weightsTheory(0) absTol 1E-6)
-    assert(model2.weights(1) ~== weightsTheory(1) absTol 1E-6)
-    assert(model2.weights(2) ~== weightsTheory(2) absTol 1E-6)
-    assert(model2.weights(3) ~== weightsTheory(3) absTol 1E-6)
+    assert(model2.weights ~= weightsTheory absTol 1E-6)
 
     /*
        Using the following R code to load the data and train the model using glmnet package.
@@ -750,12 +696,9 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        data.V5       .
      */
     val interceptR = -0.248065
-    val weightsR = Array(0.0, 0.0, 0.0, 0.0)
+    val weightsR = Vectors.dense(0.0, 0.0, 0.0, 0.0)
 
     assert(model1.intercept ~== interceptR relTol 1E-5)
-    assert(model1.weights(0) ~== weightsR(0) absTol 1E-6)
-    assert(model1.weights(1) ~== weightsR(1) absTol 1E-6)
-    assert(model1.weights(2) ~== weightsR(2) absTol 1E-6)
-    assert(model1.weights(3) ~== weightsR(3) absTol 1E-6)
+    assert(model1.weights ~= weightsR absTol 1E-6)
   }
 }

From 1903641e68ce7e7e657584bf45e91db6df357e41 Mon Sep 17 00:00:00 2001
From: huangzhaowei <carlmartinmax@gmail.com>
Date: Thu, 9 Jul 2015 19:31:31 -0700
Subject: [PATCH 106/549] [SPARK-8839] [SQL] ThriftServer2 will remove session
 and execution no matter it's finished or not.

In my test, `sessions` and `executions` in ThriftServer2 is not the same number as the connection number.
For example, if there are 200 clients connecting to the server,  but it will have more than 200 `sessions` and `executions`.
So if it reaches the `retainedStatements`, it has to remove some object which is not finished.
So it may cause the exception described in [Jira Address](https://issues.apache.org/jira/browse/SPARK-8839)

Author: huangzhaowei <carlmartinmax@gmail.com>

Closes #7239 from SaintBacchus/SPARK-8839 and squashes the following commits:

cf7ef40 [huangzhaowei] Remove the a meanless funciton call
3e9a5a6 [huangzhaowei] Add a filter before take
9d5ceb8 [huangzhaowei] [SPARK-8839][SQL]ThriftServer2 will remove session and execution no matter it's finished or not.
---
 .../spark/sql/hive/thriftserver/HiveThriftServer2.scala    | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
index 700d994bb6a83..b7db80d93f852 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
@@ -179,6 +179,7 @@ object HiveThriftServer2 extends Logging {
     def onSessionClosed(sessionId: String): Unit = {
       sessionList(sessionId).finishTimestamp = System.currentTimeMillis
       onlineSessionNum -= 1
+      trimSessionIfNecessary()
     }
 
     def onStatementStart(
@@ -206,18 +207,20 @@ object HiveThriftServer2 extends Logging {
       executionList(id).detail = errorMessage
       executionList(id).state = ExecutionState.FAILED
       totalRunning -= 1
+      trimExecutionIfNecessary()
     }
 
     def onStatementFinish(id: String): Unit = {
       executionList(id).finishTimestamp = System.currentTimeMillis
       executionList(id).state = ExecutionState.FINISHED
       totalRunning -= 1
+      trimExecutionIfNecessary()
     }
 
     private def trimExecutionIfNecessary() = synchronized {
       if (executionList.size > retainedStatements) {
         val toRemove = math.max(retainedStatements / 10, 1)
-        executionList.take(toRemove).foreach { s =>
+        executionList.filter(_._2.finishTimestamp != 0).take(toRemove).foreach { s =>
           executionList.remove(s._1)
         }
       }
@@ -226,7 +229,7 @@ object HiveThriftServer2 extends Logging {
     private def trimSessionIfNecessary() = synchronized {
       if (sessionList.size > retainedSessions) {
         val toRemove = math.max(retainedSessions / 10, 1)
-        sessionList.take(toRemove).foreach { s =>
+        sessionList.filter(_._2.finishTimestamp != 0).take(toRemove).foreach { s =>
           sessionList.remove(s._1)
         }
       }

From d538919cc4fd3ab940d478c62dce1bae0270cfeb Mon Sep 17 00:00:00 2001
From: Michael Vogiatzis <michaelvogiatzis@gmail.com>
Date: Thu, 9 Jul 2015 19:53:23 -0700
Subject: [PATCH 107/549] [DOCS] Added important updateStateByKey details

Runs for *all* existing keys and returning "None" will remove the key-value pair.

Author: Michael Vogiatzis <michaelvogiatzis@gmail.com>

Closes #7229 from mvogiatzis/patch-1 and squashes the following commits:

e7a2946 [Michael Vogiatzis] Updated updateStateByKey text
00283ed [Michael Vogiatzis] Removed space
c2656f9 [Michael Vogiatzis] Moved description farther up
0a42551 [Michael Vogiatzis] Added important updateStateByKey details
---
 docs/streaming-programming-guide.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md
index e72d5580dae55..2f3013b533eb0 100644
--- a/docs/streaming-programming-guide.md
+++ b/docs/streaming-programming-guide.md
@@ -854,6 +854,8 @@ it with new information. To use this, you will have to do two steps.
 1. Define the state update function - Specify with a function how to update the state using the
 previous state and the new values from an input stream.
 
+In every batch, Spark will apply the state  update function for all existing keys, regardless of whether they have new data in a batch or not. If the update function returns `None` then the key-value pair will be eliminated.
+
 Let's illustrate this with an example. Say you want to maintain a running count of each word
 seen in a text data stream. Here, the running count is the state and it is an integer. We
 define the update function as:

From e14b545d2dcbc4587688b4c46718d3680b0a2f67 Mon Sep 17 00:00:00 2001
From: Jonathan Alter <jonalter@users.noreply.github.com>
Date: Fri, 10 Jul 2015 11:34:01 +0100
Subject: [PATCH 108/549] [SPARK-7977] [BUILD] Disallowing println

Author: Jonathan Alter <jonalter@users.noreply.github.com>

Closes #7093 from jonalter/SPARK-7977 and squashes the following commits:

ccd44cc [Jonathan Alter] Changed println to log in ThreadingSuite
7fcac3e [Jonathan Alter] Reverting to println in ThreadingSuite
10724b6 [Jonathan Alter] Changing some printlns to logs in tests
eeec1e7 [Jonathan Alter] Merge branch 'master' of github.com:apache/spark into SPARK-7977
0b1dcb4 [Jonathan Alter] More println cleanup
aedaf80 [Jonathan Alter] Merge branch 'master' of github.com:apache/spark into SPARK-7977
925fd98 [Jonathan Alter] Merge branch 'master' of github.com:apache/spark into SPARK-7977
0c16fa3 [Jonathan Alter] Replacing some printlns with logs
45c7e05 [Jonathan Alter] Merge branch 'master' of github.com:apache/spark into SPARK-7977
5c8e283 [Jonathan Alter] Allowing println in audit-release examples
5b50da1 [Jonathan Alter] Allowing printlns in example files
ca4b477 [Jonathan Alter] Merge branch 'master' of github.com:apache/spark into SPARK-7977
83ab635 [Jonathan Alter] Fixing new printlns
54b131f [Jonathan Alter] Merge branch 'master' of github.com:apache/spark into SPARK-7977
1cd8a81 [Jonathan Alter] Removing some unnecessary comments and printlns
b837c3a [Jonathan Alter] Disallowing println
---
 .../main/scala/org/apache/spark/Logging.scala |  2 ++
 .../org/apache/spark/api/r/RBackend.scala     |  2 ++
 .../scala/org/apache/spark/api/r/RRDD.scala   |  2 ++
 .../org/apache/spark/deploy/Client.scala      | 30 ++++++++--------
 .../apache/spark/deploy/ClientArguments.scala |  4 +++
 .../org/apache/spark/deploy/RRunner.scala     |  2 ++
 .../org/apache/spark/deploy/SparkSubmit.scala | 18 ++++++++++
 .../spark/deploy/SparkSubmitArguments.scala   |  4 +++
 .../spark/deploy/client/TestExecutor.scala    |  2 ++
 .../history/HistoryServerArguments.scala      |  2 ++
 .../spark/deploy/master/MasterArguments.scala |  2 ++
 .../MesosClusterDispatcherArguments.scala     |  6 ++++
 .../spark/deploy/worker/DriverWrapper.scala   |  2 ++
 .../spark/deploy/worker/WorkerArguments.scala |  4 +++
 .../CoarseGrainedExecutorBackend.scala        |  4 +++
 .../input/FixedLengthBinaryInputFormat.scala  |  7 ++--
 .../spark/network/nio/BlockMessage.scala      | 22 ------------
 .../spark/network/nio/BlockMessageArray.scala | 34 ++++---------------
 .../spark/network/nio/ConnectionManager.scala |  4 +++
 .../scala/org/apache/spark/rdd/PipedRDD.scala |  4 +++
 .../scheduler/EventLoggingListener.scala      |  2 ++
 .../apache/spark/scheduler/JobLogger.scala    |  2 ++
 .../org/apache/spark/ui/JettyUtils.scala      |  2 ++
 .../apache/spark/ui/UIWorkloadGenerator.scala |  6 +++-
 .../org/apache/spark/util/Distribution.scala  |  6 ++++
 .../spark/util/random/XORShiftRandom.scala    |  2 ++
 .../org/apache/spark/DistributedSuite.scala   |  2 ++
 .../scala/org/apache/spark/FailureSuite.scala |  2 ++
 .../org/apache/spark/FileServerSuite.scala    |  2 ++
 .../org/apache/spark/ThreadingSuite.scala     |  6 ++--
 .../spark/deploy/SparkSubmitSuite.scala       |  4 +++
 .../spark/deploy/SparkSubmitUtilsSuite.scala  |  2 ++
 .../WholeTextFileRecordReaderSuite.scala      |  8 ++---
 .../metrics/InputOutputMetricsSuite.scala     |  2 ++
 .../spark/scheduler/ReplayListenerSuite.scala |  2 ++
 .../spark/util/ClosureCleanerSuite.scala      |  2 ++
 .../org/apache/spark/util/UtilsSuite.scala    |  2 ++
 .../util/collection/SizeTrackerSuite.scala    |  4 +++
 .../spark/util/collection/SorterSuite.scala   | 10 +++---
 .../src/main/scala/SparkApp.scala             |  2 ++
 .../src/main/scala/SparkApp.scala             |  2 ++
 .../src/main/scala/GraphxApp.scala            |  2 ++
 .../sbt_app_hive/src/main/scala/HiveApp.scala |  2 ++
 .../src/main/scala/SparkApp.scala             |  2 ++
 .../sbt_app_sql/src/main/scala/SqlApp.scala   |  2 ++
 .../src/main/scala/StreamingApp.scala         |  2 ++
 .../apache/spark/examples/BroadcastTest.scala |  2 ++
 .../spark/examples/CassandraCQLTest.scala     |  2 ++
 .../apache/spark/examples/CassandraTest.scala |  2 ++
 .../spark/examples/DFSReadWriteTest.scala     |  2 ++
 .../spark/examples/DriverSubmissionTest.scala |  2 ++
 .../apache/spark/examples/GroupByTest.scala   |  2 ++
 .../org/apache/spark/examples/HBaseTest.scala |  2 ++
 .../org/apache/spark/examples/HdfsTest.scala  |  2 ++
 .../org/apache/spark/examples/LocalALS.scala  |  2 ++
 .../apache/spark/examples/LocalFileLR.scala   |  2 ++
 .../apache/spark/examples/LocalKMeans.scala   |  2 ++
 .../org/apache/spark/examples/LocalLR.scala   |  2 ++
 .../org/apache/spark/examples/LocalPi.scala   |  2 ++
 .../org/apache/spark/examples/LogQuery.scala  |  2 ++
 .../spark/examples/MultiBroadcastTest.scala   |  2 ++
 .../examples/SimpleSkewedGroupByTest.scala    |  2 ++
 .../spark/examples/SkewedGroupByTest.scala    |  2 ++
 .../org/apache/spark/examples/SparkALS.scala  |  2 ++
 .../apache/spark/examples/SparkHdfsLR.scala   |  2 ++
 .../apache/spark/examples/SparkKMeans.scala   |  2 ++
 .../org/apache/spark/examples/SparkLR.scala   |  2 ++
 .../apache/spark/examples/SparkPageRank.scala |  2 ++
 .../org/apache/spark/examples/SparkPi.scala   |  2 ++
 .../org/apache/spark/examples/SparkTC.scala   |  2 ++
 .../spark/examples/SparkTachyonHdfsLR.scala   |  2 ++
 .../spark/examples/SparkTachyonPi.scala       |  2 ++
 .../spark/examples/graphx/Analytics.scala     |  2 ++
 .../examples/graphx/LiveJournalPageRank.scala |  2 ++
 .../examples/graphx/SynthBenchmark.scala      |  2 ++
 .../examples/ml/CrossValidatorExample.scala   |  2 ++
 .../examples/ml/DecisionTreeExample.scala     |  2 ++
 .../examples/ml/DeveloperApiExample.scala     |  2 ++
 .../apache/spark/examples/ml/GBTExample.scala |  2 ++
 .../examples/ml/LinearRegressionExample.scala |  2 ++
 .../ml/LogisticRegressionExample.scala        |  2 ++
 .../spark/examples/ml/MovieLensALS.scala      |  2 ++
 .../spark/examples/ml/OneVsRestExample.scala  |  2 ++
 .../examples/ml/RandomForestExample.scala     |  2 ++
 .../examples/ml/SimpleParamsExample.scala     |  2 ++
 .../ml/SimpleTextClassificationPipeline.scala |  2 ++
 .../examples/mllib/BinaryClassification.scala |  2 ++
 .../spark/examples/mllib/Correlations.scala   |  2 ++
 .../examples/mllib/CosineSimilarity.scala     |  2 ++
 .../spark/examples/mllib/DatasetExample.scala |  2 ++
 .../examples/mllib/DecisionTreeRunner.scala   |  2 ++
 .../examples/mllib/DenseGaussianMixture.scala |  2 ++
 .../spark/examples/mllib/DenseKMeans.scala    |  2 ++
 .../examples/mllib/FPGrowthExample.scala      |  2 ++
 .../mllib/GradientBoostedTreesRunner.scala    |  2 ++
 .../spark/examples/mllib/LDAExample.scala     |  2 ++
 .../examples/mllib/LinearRegression.scala     |  2 ++
 .../spark/examples/mllib/MovieLensALS.scala   |  2 ++
 .../mllib/MultivariateSummarizer.scala        |  2 ++
 .../PowerIterationClusteringExample.scala     |  3 +-
 .../examples/mllib/RandomRDDGeneration.scala  |  2 ++
 .../spark/examples/mllib/SampledRDDs.scala    |  2 ++
 .../examples/mllib/SparseNaiveBayes.scala     |  2 ++
 .../mllib/StreamingKMeansExample.scala        |  2 ++
 .../mllib/StreamingLinearRegression.scala     |  2 ++
 .../mllib/StreamingLogisticRegression.scala   |  2 ++
 .../spark/examples/mllib/TallSkinnyPCA.scala  |  2 ++
 .../spark/examples/mllib/TallSkinnySVD.scala  |  2 ++
 .../spark/examples/sql/RDDRelation.scala      |  2 ++
 .../examples/sql/hive/HiveFromSpark.scala     |  2 ++
 .../examples/streaming/ActorWordCount.scala   |  2 ++
 .../examples/streaming/CustomReceiver.scala   |  2 ++
 .../streaming/DirectKafkaWordCount.scala      |  2 ++
 .../examples/streaming/FlumeEventCount.scala  |  2 ++
 .../streaming/FlumePollingEventCount.scala    |  2 ++
 .../examples/streaming/HdfsWordCount.scala    |  2 ++
 .../examples/streaming/KafkaWordCount.scala   |  2 ++
 .../examples/streaming/MQTTWordCount.scala    |  4 +++
 .../examples/streaming/NetworkWordCount.scala |  2 ++
 .../examples/streaming/RawNetworkGrep.scala   |  2 ++
 .../RecoverableNetworkWordCount.scala         |  2 ++
 .../streaming/SqlNetworkWordCount.scala       |  2 ++
 .../streaming/StatefulNetworkWordCount.scala  |  2 ++
 .../streaming/TwitterAlgebirdCMS.scala        |  2 ++
 .../streaming/TwitterAlgebirdHLL.scala        |  2 ++
 .../streaming/TwitterPopularTags.scala        |  2 ++
 .../examples/streaming/ZeroMQWordCount.scala  |  2 ++
 .../clickstream/PageViewGenerator.scala       |  2 ++
 .../clickstream/PageViewStream.scala          |  2 ++
 .../kafka/DirectKafkaStreamSuite.scala        |  2 +-
 .../streaming/KinesisWordCountASL.scala       |  2 ++
 .../spark/graphx/util/BytecodeUtils.scala     |  1 -
 .../spark/graphx/util/GraphGenerators.scala   |  4 +--
 .../graphx/util/BytecodeUtilsSuite.scala      |  2 ++
 .../mllib/util/KMeansDataGenerator.scala      |  2 ++
 .../mllib/util/LinearDataGenerator.scala      |  2 ++
 .../LogisticRegressionDataGenerator.scala     |  2 ++
 .../spark/mllib/util/MFDataGenerator.scala    |  2 ++
 .../spark/mllib/util/SVMDataGenerator.scala   |  2 ++
 .../spark/ml/feature/VectorIndexerSuite.scala | 10 +++---
 .../spark/mllib/linalg/VectorsSuite.scala     |  6 ++--
 .../spark/mllib/stat/CorrelationSuite.scala   |  6 ++--
 .../tree/GradientBoostedTreesSuite.scala      | 10 +++---
 .../spark/mllib/util/NumericParserSuite.scala |  2 +-
 project/SparkBuild.scala                      |  4 +++
 .../apache/spark/repl/SparkCommandLine.scala  |  2 ++
 .../org/apache/spark/repl/SparkILoop.scala    |  2 ++
 .../apache/spark/repl/SparkILoopInit.scala    |  2 ++
 .../org/apache/spark/repl/SparkIMain.scala    |  2 ++
 .../org/apache/spark/repl/SparkILoop.scala    |  2 ++
 .../org/apache/spark/repl/SparkIMain.scala    |  4 +++
 .../apache/spark/repl/SparkReplReporter.scala |  2 ++
 scalastyle-config.xml                         | 12 +++----
 .../expressions/codegen/package.scala         |  2 ++
 .../spark/sql/catalyst/plans/QueryPlan.scala  |  2 ++
 .../spark/sql/catalyst/util/package.scala     |  2 ++
 .../apache/spark/sql/types/StructType.scala   |  2 ++
 .../scala/org/apache/spark/sql/Column.scala   |  2 ++
 .../org/apache/spark/sql/DataFrame.scala      |  6 ++++
 .../spark/sql/execution/debug/package.scala   | 16 ++++-----
 .../hive/thriftserver/SparkSQLCLIDriver.scala | 12 ++++---
 .../apache/spark/sql/hive/HiveContext.scala   |  5 +--
 .../org/apache/spark/sql/hive/HiveQl.scala    |  5 +--
 .../spark/sql/hive/client/ClientWrapper.scala |  2 ++
 .../regression-test-SPARK-8489/Main.scala     |  2 ++
 .../sql/hive/HiveMetastoreCatalogSuite.scala  |  6 ++--
 .../spark/sql/hive/HiveSparkSubmitSuite.scala |  2 ++
 .../sql/hive/InsertIntoHiveTableSuite.scala   |  2 --
 .../sql/hive/MetastoreDataSourcesSuite.scala  |  6 ++--
 .../sql/hive/execution/HiveUDFSuite.scala     |  1 -
 .../spark/streaming/dstream/DStream.scala     |  2 ++
 .../spark/streaming/util/RawTextSender.scala  |  2 ++
 .../spark/streaming/util/RecurringTimer.scala |  4 +--
 .../spark/streaming/MasterFailureTest.scala   |  4 +++
 .../scheduler/JobGeneratorSuite.scala         |  1 -
 .../spark/tools/GenerateMIMAIgnore.scala      |  8 +++++
 .../tools/JavaAPICompletenessChecker.scala    |  4 +++
 .../spark/tools/StoragePerfTester.scala       |  4 +++
 .../yarn/ApplicationMasterArguments.scala     |  4 +++
 .../org/apache/spark/deploy/yarn/Client.scala |  2 +-
 .../spark/deploy/yarn/ClientArguments.scala   |  4 +++
 .../spark/deploy/yarn/YarnClusterSuite.scala  |  4 +++
 182 files changed, 478 insertions(+), 135 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/Logging.scala b/core/src/main/scala/org/apache/spark/Logging.scala
index 7fcb7830e7b0b..87ab099267b2f 100644
--- a/core/src/main/scala/org/apache/spark/Logging.scala
+++ b/core/src/main/scala/org/apache/spark/Logging.scala
@@ -121,6 +121,7 @@ trait Logging {
     if (usingLog4j12) {
       val log4j12Initialized = LogManager.getRootLogger.getAllAppenders.hasMoreElements
       if (!log4j12Initialized) {
+        // scalastyle:off println
         if (Utils.isInInterpreter) {
           val replDefaultLogProps = "org/apache/spark/log4j-defaults-repl.properties"
           Option(Utils.getSparkClassLoader.getResource(replDefaultLogProps)) match {
@@ -141,6 +142,7 @@ trait Logging {
               System.err.println(s"Spark was unable to load $defaultLogProps")
           }
         }
+        // scalastyle:on println
       }
     }
     Logging.initialized = true
diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala
index 1a5f2bca26c2b..b7e72d4d0ed0b 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala
@@ -95,7 +95,9 @@ private[spark] class RBackend {
 private[spark] object RBackend extends Logging {
   def main(args: Array[String]): Unit = {
     if (args.length < 1) {
+      // scalastyle:off println
       System.err.println("Usage: RBackend <tempFilePath>")
+      // scalastyle:on println
       System.exit(-1)
     }
     val sparkRBackend = new RBackend()
diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
index 524676544d6f5..ff1702f7dea48 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
@@ -161,7 +161,9 @@ private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
               dataOut.write(elem.asInstanceOf[Array[Byte]])
             } else if (deserializer == SerializationFormats.STRING) {
               // write string(for StringRRDD)
+              // scalastyle:off println
               printOut.println(elem)
+              // scalastyle:on println
             }
           }
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala
index 71f7e2129116f..f03875a3e8c89 100644
--- a/core/src/main/scala/org/apache/spark/deploy/Client.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala
@@ -118,26 +118,26 @@ private class ClientEndpoint(
   def pollAndReportStatus(driverId: String) {
     // Since ClientEndpoint is the only RpcEndpoint in the process, blocking the event loop thread
     // is fine.
-    println("... waiting before polling master for driver state")
+    logInfo("... waiting before polling master for driver state")
     Thread.sleep(5000)
-    println("... polling master for driver state")
+    logInfo("... polling master for driver state")
     val statusResponse =
       activeMasterEndpoint.askWithRetry[DriverStatusResponse](RequestDriverStatus(driverId))
     statusResponse.found match {
       case false =>
-        println(s"ERROR: Cluster master did not recognize $driverId")
+        logError(s"ERROR: Cluster master did not recognize $driverId")
         System.exit(-1)
       case true =>
-        println(s"State of $driverId is ${statusResponse.state.get}")
+        logInfo(s"State of $driverId is ${statusResponse.state.get}")
         // Worker node, if present
         (statusResponse.workerId, statusResponse.workerHostPort, statusResponse.state) match {
           case (Some(id), Some(hostPort), Some(DriverState.RUNNING)) =>
-            println(s"Driver running on $hostPort ($id)")
+            logInfo(s"Driver running on $hostPort ($id)")
           case _ =>
         }
         // Exception, if present
         statusResponse.exception.map { e =>
-          println(s"Exception from cluster was: $e")
+          logError(s"Exception from cluster was: $e")
           e.printStackTrace()
           System.exit(-1)
         }
@@ -148,7 +148,7 @@ private class ClientEndpoint(
   override def receive: PartialFunction[Any, Unit] = {
 
     case SubmitDriverResponse(master, success, driverId, message) =>
-      println(message)
+      logInfo(message)
       if (success) {
         activeMasterEndpoint = master
         pollAndReportStatus(driverId.get)
@@ -158,7 +158,7 @@ private class ClientEndpoint(
 
 
     case KillDriverResponse(master, driverId, success, message) =>
-      println(message)
+      logInfo(message)
       if (success) {
         activeMasterEndpoint = master
         pollAndReportStatus(driverId)
@@ -169,13 +169,13 @@ private class ClientEndpoint(
 
   override def onDisconnected(remoteAddress: RpcAddress): Unit = {
     if (!lostMasters.contains(remoteAddress)) {
-      println(s"Error connecting to master $remoteAddress.")
+      logError(s"Error connecting to master $remoteAddress.")
       lostMasters += remoteAddress
       // Note that this heuristic does not account for the fact that a Master can recover within
       // the lifetime of this client. Thus, once a Master is lost it is lost to us forever. This
       // is not currently a concern, however, because this client does not retry submissions.
       if (lostMasters.size >= masterEndpoints.size) {
-        println("No master is available, exiting.")
+        logError("No master is available, exiting.")
         System.exit(-1)
       }
     }
@@ -183,18 +183,18 @@ private class ClientEndpoint(
 
   override def onNetworkError(cause: Throwable, remoteAddress: RpcAddress): Unit = {
     if (!lostMasters.contains(remoteAddress)) {
-      println(s"Error connecting to master ($remoteAddress).")
-      println(s"Cause was: $cause")
+      logError(s"Error connecting to master ($remoteAddress).")
+      logError(s"Cause was: $cause")
       lostMasters += remoteAddress
       if (lostMasters.size >= masterEndpoints.size) {
-        println("No master is available, exiting.")
+        logError("No master is available, exiting.")
         System.exit(-1)
       }
     }
   }
 
   override def onError(cause: Throwable): Unit = {
-    println(s"Error processing messages, exiting.")
+    logError(s"Error processing messages, exiting.")
     cause.printStackTrace()
     System.exit(-1)
   }
@@ -209,10 +209,12 @@ private class ClientEndpoint(
  */
 object Client {
   def main(args: Array[String]) {
+    // scalastyle:off println
     if (!sys.props.contains("SPARK_SUBMIT")) {
       println("WARNING: This client is deprecated and will be removed in a future version of Spark")
       println("Use ./bin/spark-submit with \"--master spark://host:port\"")
     }
+    // scalastyle:on println
 
     val conf = new SparkConf()
     val driverArgs = new ClientArguments(args)
diff --git a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala
index 42d3296062e6d..72cc330a398da 100644
--- a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala
@@ -72,9 +72,11 @@ private[deploy] class ClientArguments(args: Array[String]) {
       cmd = "launch"
 
       if (!ClientArguments.isValidJarUrl(_jarUrl)) {
+        // scalastyle:off println
         println(s"Jar url '${_jarUrl}' is not in valid format.")
         println(s"Must be a jar file path in URL format " +
           "(e.g. hdfs://host:port/XX.jar, file:///XX.jar)")
+        // scalastyle:on println
         printUsageAndExit(-1)
       }
 
@@ -110,7 +112,9 @@ private[deploy] class ClientArguments(args: Array[String]) {
       |                                  (default: $DEFAULT_SUPERVISE)
       |   -v, --verbose                  Print more debugging output
      """.stripMargin
+    // scalastyle:off println
     System.err.println(usage)
+    // scalastyle:on println
     System.exit(exitCode)
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala
index e99779f299785..4165740312e03 100644
--- a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala
@@ -85,7 +85,9 @@ object RRunner {
       }
       System.exit(returnCode)
     } else {
+      // scalastyle:off println
       System.err.println("SparkR backend did not initialize in " + backendTimeout + " seconds")
+      // scalastyle:on println
       System.exit(-1)
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index b1d6ec209d62b..4cec9017b8adb 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -82,6 +82,7 @@ object SparkSubmit {
 
   private val CLASS_NOT_FOUND_EXIT_STATUS = 101
 
+  // scalastyle:off println
   // Exposed for testing
   private[spark] var exitFn: Int => Unit = (exitCode: Int) => System.exit(exitCode)
   private[spark] var printStream: PrintStream = System.err
@@ -102,11 +103,14 @@ object SparkSubmit {
     printStream.println("Type --help for more information.")
     exitFn(0)
   }
+  // scalastyle:on println
 
   def main(args: Array[String]): Unit = {
     val appArgs = new SparkSubmitArguments(args)
     if (appArgs.verbose) {
+      // scalastyle:off println
       printStream.println(appArgs)
+      // scalastyle:on println
     }
     appArgs.action match {
       case SparkSubmitAction.SUBMIT => submit(appArgs)
@@ -160,7 +164,9 @@ object SparkSubmit {
             // makes the message printed to the output by the JVM not very helpful. Instead,
             // detect exceptions with empty stack traces here, and treat them differently.
             if (e.getStackTrace().length == 0) {
+              // scalastyle:off println
               printStream.println(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}")
+              // scalastyle:on println
               exitFn(1)
             } else {
               throw e
@@ -178,7 +184,9 @@ object SparkSubmit {
      // to use the legacy gateway if the master endpoint turns out to be not a REST server.
     if (args.isStandaloneCluster && args.useRest) {
       try {
+        // scalastyle:off println
         printStream.println("Running Spark using the REST application submission protocol.")
+        // scalastyle:on println
         doRunMain()
       } catch {
         // Fail over to use the legacy submission gateway
@@ -558,6 +566,7 @@ object SparkSubmit {
       sysProps: Map[String, String],
       childMainClass: String,
       verbose: Boolean): Unit = {
+    // scalastyle:off println
     if (verbose) {
       printStream.println(s"Main class:\n$childMainClass")
       printStream.println(s"Arguments:\n${childArgs.mkString("\n")}")
@@ -565,6 +574,7 @@ object SparkSubmit {
       printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}")
       printStream.println("\n")
     }
+    // scalastyle:on println
 
     val loader =
       if (sysProps.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) {
@@ -592,8 +602,10 @@ object SparkSubmit {
       case e: ClassNotFoundException =>
         e.printStackTrace(printStream)
         if (childMainClass.contains("thriftserver")) {
+          // scalastyle:off println
           printStream.println(s"Failed to load main class $childMainClass.")
           printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
+          // scalastyle:on println
         }
         System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
     }
@@ -766,7 +778,9 @@ private[spark] object SparkSubmitUtils {
         brr.setRoot(repo)
         brr.setName(s"repo-${i + 1}")
         cr.add(brr)
+        // scalastyle:off println
         printStream.println(s"$repo added as a remote repository with the name: ${brr.getName}")
+        // scalastyle:on println
       }
     }
 
@@ -829,7 +843,9 @@ private[spark] object SparkSubmitUtils {
       val ri = ModuleRevisionId.newInstance(mvn.groupId, mvn.artifactId, mvn.version)
       val dd = new DefaultDependencyDescriptor(ri, false, false)
       dd.addDependencyConfiguration(ivyConfName, ivyConfName)
+      // scalastyle:off println
       printStream.println(s"${dd.getDependencyId} added as a dependency")
+      // scalastyle:on println
       md.addDependency(dd)
     }
   }
@@ -896,9 +912,11 @@ private[spark] object SparkSubmitUtils {
             ivySettings.setDefaultCache(new File(alternateIvyCache, "cache"))
             new File(alternateIvyCache, "jars")
           }
+        // scalastyle:off println
         printStream.println(
           s"Ivy Default Cache set to: ${ivySettings.getDefaultCache.getAbsolutePath}")
         printStream.println(s"The jars for the packages stored in: $packagesDirectory")
+        // scalastyle:on println
         // create a pattern matcher
         ivySettings.addMatcher(new GlobPatternMatcher)
         // create the dependency resolvers
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 6e3c0b21b33c2..ebb39c354dff1 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -79,6 +79,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
   /** Default properties present in the currently defined defaults file. */
   lazy val defaultSparkProperties: HashMap[String, String] = {
     val defaultProperties = new HashMap[String, String]()
+    // scalastyle:off println
     if (verbose) SparkSubmit.printStream.println(s"Using properties file: $propertiesFile")
     Option(propertiesFile).foreach { filename =>
       Utils.getPropertiesFromFile(filename).foreach { case (k, v) =>
@@ -86,6 +87,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
         if (verbose) SparkSubmit.printStream.println(s"Adding default property: $k=$v")
       }
     }
+    // scalastyle:on println
     defaultProperties
   }
 
@@ -452,6 +454,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
   }
 
   private def printUsageAndExit(exitCode: Int, unknownParam: Any = null): Unit = {
+    // scalastyle:off println
     val outStream = SparkSubmit.printStream
     if (unknownParam != null) {
       outStream.println("Unknown/unsupported param " + unknownParam)
@@ -541,6 +544,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
       outStream.println("CLI options:")
       outStream.println(getSqlShellOptions())
     }
+    // scalastyle:on println
 
     SparkSubmit.exitFn(exitCode)
   }
diff --git a/core/src/main/scala/org/apache/spark/deploy/client/TestExecutor.scala b/core/src/main/scala/org/apache/spark/deploy/client/TestExecutor.scala
index c5ac45c6730d3..a98b1fa8f83a1 100644
--- a/core/src/main/scala/org/apache/spark/deploy/client/TestExecutor.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/client/TestExecutor.scala
@@ -19,7 +19,9 @@ package org.apache.spark.deploy.client
 
 private[spark] object TestExecutor {
   def main(args: Array[String]) {
+    // scalastyle:off println
     println("Hello world!")
+    // scalastyle:on println
     while (true) {
       Thread.sleep(1000)
     }
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
index 4692d22651c93..18265df9faa2c 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
@@ -56,6 +56,7 @@ private[history] class HistoryServerArguments(conf: SparkConf, args: Array[Strin
    Utils.loadDefaultSparkProperties(conf, propertiesFile)
 
   private def printUsageAndExit(exitCode: Int) {
+    // scalastyle:off println
     System.err.println(
       """
       |Usage: HistoryServer [options]
@@ -84,6 +85,7 @@ private[history] class HistoryServerArguments(conf: SparkConf, args: Array[Strin
       |  spark.history.fs.updateInterval    How often to reload log data from storage
       |                                     (in seconds, default: 10)
       |""".stripMargin)
+    // scalastyle:on println
     System.exit(exitCode)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala b/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala
index 435b9b12f83b8..44cefbc77f08e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala
@@ -85,6 +85,7 @@ private[master] class MasterArguments(args: Array[String], conf: SparkConf) {
    * Print usage and exit JVM with the given exit code.
    */
   private def printUsageAndExit(exitCode: Int) {
+    // scalastyle:off println
     System.err.println(
       "Usage: Master [options]\n" +
       "\n" +
@@ -95,6 +96,7 @@ private[master] class MasterArguments(args: Array[String], conf: SparkConf) {
       "  --webui-port PORT      Port for web UI (default: 8080)\n" +
       "  --properties-file FILE Path to a custom Spark properties file.\n" +
       "                         Default is conf/spark-defaults.conf.")
+    // scalastyle:on println
     System.exit(exitCode)
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala b/core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
index 894cb78d8591a..5accaf78d0a51 100644
--- a/core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
@@ -54,7 +54,9 @@ private[mesos] class MesosClusterDispatcherArguments(args: Array[String], conf:
 
     case ("--master" | "-m") :: value :: tail =>
       if (!value.startsWith("mesos://")) {
+        // scalastyle:off println
         System.err.println("Cluster dispatcher only supports mesos (uri begins with mesos://)")
+        // scalastyle:on println
         System.exit(1)
       }
       masterUrl = value.stripPrefix("mesos://")
@@ -73,7 +75,9 @@ private[mesos] class MesosClusterDispatcherArguments(args: Array[String], conf:
 
     case Nil => {
       if (masterUrl == null) {
+        // scalastyle:off println
         System.err.println("--master is required")
+        // scalastyle:on println
         printUsageAndExit(1)
       }
     }
@@ -83,6 +87,7 @@ private[mesos] class MesosClusterDispatcherArguments(args: Array[String], conf:
   }
 
   private def printUsageAndExit(exitCode: Int): Unit = {
+    // scalastyle:off println
     System.err.println(
       "Usage: MesosClusterDispatcher [options]\n" +
         "\n" +
@@ -96,6 +101,7 @@ private[mesos] class MesosClusterDispatcherArguments(args: Array[String], conf:
         "                          Zookeeper for persistence\n" +
         "  --properties-file FILE  Path to a custom Spark properties file.\n" +
         "                          Default is conf/spark-defaults.conf.")
+    // scalastyle:on println
     System.exit(exitCode)
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala
index d1a12b01e78f7..2d6be3042c905 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala
@@ -60,7 +60,9 @@ object DriverWrapper {
         rpcEnv.shutdown()
 
       case _ =>
+        // scalastyle:off println
         System.err.println("Usage: DriverWrapper <workerUrl> <userJar> <driverMainClass> [options]")
+        // scalastyle:on println
         System.exit(-1)
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala
index 1d2ecab517613..e89d076802215 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala
@@ -121,6 +121,7 @@ private[worker] class WorkerArguments(args: Array[String], conf: SparkConf) {
    * Print usage and exit JVM with the given exit code.
    */
   def printUsageAndExit(exitCode: Int) {
+    // scalastyle:off println
     System.err.println(
       "Usage: Worker [options] <master>\n" +
       "\n" +
@@ -136,6 +137,7 @@ private[worker] class WorkerArguments(args: Array[String], conf: SparkConf) {
       "  --webui-port PORT        Port for web UI (default: 8081)\n" +
       "  --properties-file FILE   Path to a custom Spark properties file.\n" +
       "                           Default is conf/spark-defaults.conf.")
+    // scalastyle:on println
     System.exit(exitCode)
   }
 
@@ -160,7 +162,9 @@ private[worker] class WorkerArguments(args: Array[String], conf: SparkConf) {
     } catch {
       case e: Exception => {
         totalMb = 2*1024
+        // scalastyle:off println
         System.out.println("Failed to get total physical memory. Using " + totalMb + " MB")
+        // scalastyle:on println
       }
     }
     // Leave out 1 GB for the operating system, but don't return a negative memory size
diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
index 34d4cfdca7732..fcd76ec52742a 100644
--- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
+++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
@@ -235,7 +235,9 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging {
           argv = tail
         case Nil =>
         case tail =>
+          // scalastyle:off println
           System.err.println(s"Unrecognized options: ${tail.mkString(" ")}")
+          // scalastyle:on println
           printUsageAndExit()
       }
     }
@@ -249,6 +251,7 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging {
   }
 
   private def printUsageAndExit() = {
+    // scalastyle:off println
     System.err.println(
       """
       |"Usage: CoarseGrainedExecutorBackend [options]
@@ -262,6 +265,7 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging {
       |   --worker-url <workerUrl>
       |   --user-class-path <url>
       |""".stripMargin)
+    // scalastyle:on println
     System.exit(1)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala
index c219d21fbefa9..532850dd57716 100644
--- a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala
+++ b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala
@@ -21,6 +21,8 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.io.{BytesWritable, LongWritable}
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
+
+import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
 
 /**
@@ -39,7 +41,8 @@ private[spark] object FixedLengthBinaryInputFormat {
 }
 
 private[spark] class FixedLengthBinaryInputFormat
-  extends FileInputFormat[LongWritable, BytesWritable] {
+  extends FileInputFormat[LongWritable, BytesWritable]
+  with Logging {
 
   private var recordLength = -1
 
@@ -51,7 +54,7 @@ private[spark] class FixedLengthBinaryInputFormat
       recordLength = FixedLengthBinaryInputFormat.getRecordLength(context)
     }
     if (recordLength <= 0) {
-      println("record length is less than 0, file cannot be split")
+      logDebug("record length is less than 0, file cannot be split")
       false
     } else {
       true
diff --git a/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala b/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala
index 67a376102994c..79cb0640c8672 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/BlockMessage.scala
@@ -57,16 +57,6 @@ private[nio] class BlockMessage() {
   }
 
   def set(buffer: ByteBuffer) {
-    /*
-    println()
-    println("BlockMessage: ")
-    while(buffer.remaining > 0) {
-      print(buffer.get())
-    }
-    buffer.rewind()
-    println()
-    println()
-    */
     typ = buffer.getInt()
     val idLength = buffer.getInt()
     val idBuilder = new StringBuilder(idLength)
@@ -138,18 +128,6 @@ private[nio] class BlockMessage() {
       buffers += data
     }
 
-    /*
-    println()
-    println("BlockMessage: ")
-    buffers.foreach(b => {
-      while(b.remaining > 0) {
-        print(b.get())
-      }
-      b.rewind()
-    })
-    println()
-    println()
-    */
     Message.createBufferMessage(buffers)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala b/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala
index 7d0806f0c2580..f1c9ea8b64ca3 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/BlockMessageArray.scala
@@ -43,16 +43,6 @@ class BlockMessageArray(var blockMessages: Seq[BlockMessage])
     val newBlockMessages = new ArrayBuffer[BlockMessage]()
     val buffer = bufferMessage.buffers(0)
     buffer.clear()
-    /*
-    println()
-    println("BlockMessageArray: ")
-    while(buffer.remaining > 0) {
-      print(buffer.get())
-    }
-    buffer.rewind()
-    println()
-    println()
-    */
     while (buffer.remaining() > 0) {
       val size = buffer.getInt()
       logDebug("Creating block message of size " + size + " bytes")
@@ -86,23 +76,11 @@ class BlockMessageArray(var blockMessages: Seq[BlockMessage])
 
     logDebug("Buffer list:")
     buffers.foreach((x: ByteBuffer) => logDebug("" + x))
-    /*
-    println()
-    println("BlockMessageArray: ")
-    buffers.foreach(b => {
-      while(b.remaining > 0) {
-        print(b.get())
-      }
-      b.rewind()
-    })
-    println()
-    println()
-    */
     Message.createBufferMessage(buffers)
   }
 }
 
-private[nio] object BlockMessageArray {
+private[nio] object BlockMessageArray extends Logging {
 
   def fromBufferMessage(bufferMessage: BufferMessage): BlockMessageArray = {
     val newBlockMessageArray = new BlockMessageArray()
@@ -123,10 +101,10 @@ private[nio] object BlockMessageArray {
         }
       }
     val blockMessageArray = new BlockMessageArray(blockMessages)
-    println("Block message array created")
+    logDebug("Block message array created")
 
     val bufferMessage = blockMessageArray.toBufferMessage
-    println("Converted to buffer message")
+    logDebug("Converted to buffer message")
 
     val totalSize = bufferMessage.size
     val newBuffer = ByteBuffer.allocate(totalSize)
@@ -138,10 +116,11 @@ private[nio] object BlockMessageArray {
     })
     newBuffer.flip
     val newBufferMessage = Message.createBufferMessage(newBuffer)
-    println("Copied to new buffer message, size = " + newBufferMessage.size)
+    logDebug("Copied to new buffer message, size = " + newBufferMessage.size)
 
     val newBlockMessageArray = BlockMessageArray.fromBufferMessage(newBufferMessage)
-    println("Converted back to block message array")
+    logDebug("Converted back to block message array")
+    // scalastyle:off println
     newBlockMessageArray.foreach(blockMessage => {
       blockMessage.getType match {
         case BlockMessage.TYPE_PUT_BLOCK => {
@@ -154,6 +133,7 @@ private[nio] object BlockMessageArray {
         }
       }
     })
+    // scalastyle:on println
   }
 }
 
diff --git a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala
index c0bca2c4bc994..9143918790381 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala
@@ -1016,7 +1016,9 @@ private[spark] object ConnectionManager {
     val conf = new SparkConf
     val manager = new ConnectionManager(9999, conf, new SecurityManager(conf))
     manager.onReceiveMessage((msg: Message, id: ConnectionManagerId) => {
+      // scalastyle:off println
       println("Received [" + msg + "] from [" + id + "]")
+      // scalastyle:on println
       None
     })
 
@@ -1033,6 +1035,7 @@ private[spark] object ConnectionManager {
     System.gc()
   }
 
+  // scalastyle:off println
   def testSequentialSending(manager: ConnectionManager) {
     println("--------------------------")
     println("Sequential Sending")
@@ -1150,4 +1153,5 @@ private[spark] object ConnectionManager {
       println()
     }
   }
+  // scalastyle:on println
 }
diff --git a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala
index dc60d48927624..defdabf95ac4b 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala
@@ -123,7 +123,9 @@ private[spark] class PipedRDD[T: ClassTag](
     new Thread("stderr reader for " + command) {
       override def run() {
         for (line <- Source.fromInputStream(proc.getErrorStream).getLines) {
+          // scalastyle:off println
           System.err.println(line)
+          // scalastyle:on println
         }
       }
     }.start()
@@ -133,6 +135,7 @@ private[spark] class PipedRDD[T: ClassTag](
       override def run() {
         val out = new PrintWriter(proc.getOutputStream)
 
+        // scalastyle:off println
         // input the pipe context firstly
         if (printPipeContext != null) {
           printPipeContext(out.println(_))
@@ -144,6 +147,7 @@ private[spark] class PipedRDD[T: ClassTag](
             out.println(elem)
           }
         }
+        // scalastyle:on println
         out.close()
       }
     }.start()
diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
index 529a5b2bf1a0d..62b05033a9281 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
@@ -140,7 +140,9 @@ private[spark] class EventLoggingListener(
   /** Log the event as JSON. */
   private def logEvent(event: SparkListenerEvent, flushLogger: Boolean = false) {
     val eventJson = JsonProtocol.sparkEventToJson(event)
+    // scalastyle:off println
     writer.foreach(_.println(compact(render(eventJson))))
+    // scalastyle:on println
     if (flushLogger) {
       writer.foreach(_.flush())
       hadoopDataStream.foreach(hadoopFlushMethod.invoke(_))
diff --git a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala
index e55b76c36cc5f..f96eb8ca0ae00 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala
@@ -125,7 +125,9 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener
       val date = new Date(System.currentTimeMillis())
       writeInfo = dateFormat.get.format(date) + ": " + info
     }
+    // scalastyle:off println
     jobIdToPrintWriter.get(jobId).foreach(_.println(writeInfo))
+    // scalastyle:on println
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala
index f413c1d37fbb6..c8356467fab87 100644
--- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala
@@ -68,7 +68,9 @@ private[spark] object JettyUtils extends Logging {
             response.setStatus(HttpServletResponse.SC_OK)
             val result = servletParams.responder(request)
             response.setHeader("Cache-Control", "no-cache, no-store, must-revalidate")
+            // scalastyle:off println
             response.getWriter.println(servletParams.extractFn(result))
+            // scalastyle:on println
           } else {
             response.setStatus(HttpServletResponse.SC_UNAUTHORIZED)
             response.setHeader("Cache-Control", "no-cache, no-store, must-revalidate")
diff --git a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala
index ba03acdb38cc5..5a8c2914314c2 100644
--- a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala
+++ b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala
@@ -38,9 +38,11 @@ private[spark] object UIWorkloadGenerator {
 
   def main(args: Array[String]) {
     if (args.length < 3) {
+      // scalastyle:off println
       println(
-        "usage: ./bin/spark-class org.apache.spark.ui.UIWorkloadGenerator " +
+        "Usage: ./bin/spark-class org.apache.spark.ui.UIWorkloadGenerator " +
           "[master] [FIFO|FAIR] [#job set (4 jobs per set)]")
+      // scalastyle:on println
       System.exit(1)
     }
 
@@ -96,6 +98,7 @@ private[spark] object UIWorkloadGenerator {
       for ((desc, job) <- jobs) {
         new Thread {
           override def run() {
+            // scalastyle:off println
             try {
               setProperties(desc)
               job()
@@ -106,6 +109,7 @@ private[spark] object UIWorkloadGenerator {
             } finally {
               barrier.release()
             }
+            // scalastyle:on println
           }
         }.start
         Thread.sleep(INTER_JOB_WAIT_MS)
diff --git a/core/src/main/scala/org/apache/spark/util/Distribution.scala b/core/src/main/scala/org/apache/spark/util/Distribution.scala
index 1bab707235b89..950b69f7db641 100644
--- a/core/src/main/scala/org/apache/spark/util/Distribution.scala
+++ b/core/src/main/scala/org/apache/spark/util/Distribution.scala
@@ -52,9 +52,11 @@ private[spark] class Distribution(val data: Array[Double], val startIdx: Int, va
   }
 
   def showQuantiles(out: PrintStream = System.out): Unit = {
+    // scalastyle:off println
     out.println("min\t25%\t50%\t75%\tmax")
     getQuantiles(defaultProbabilities).foreach{q => out.print(q + "\t")}
     out.println
+    // scalastyle:on println
   }
 
   def statCounter: StatCounter = StatCounter(data.slice(startIdx, endIdx))
@@ -64,8 +66,10 @@ private[spark] class Distribution(val data: Array[Double], val startIdx: Int, va
    * @param out
    */
   def summary(out: PrintStream = System.out) {
+    // scalastyle:off println
     out.println(statCounter)
     showQuantiles(out)
+    // scalastyle:on println
   }
 }
 
@@ -80,8 +84,10 @@ private[spark] object Distribution {
   }
 
   def showQuantiles(out: PrintStream = System.out, quantiles: Traversable[Double]) {
+    // scalastyle:off println
     out.println("min\t25%\t50%\t75%\tmax")
     quantiles.foreach{q => out.print(q + "\t")}
     out.println
+    // scalastyle:on println
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala
index c4a7b4441c85c..85fb923cd9bc7 100644
--- a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala
+++ b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala
@@ -70,12 +70,14 @@ private[spark] object XORShiftRandom {
    * @param args takes one argument - the number of random numbers to generate
    */
   def main(args: Array[String]): Unit = {
+    // scalastyle:off println
     if (args.length != 1) {
       println("Benchmark of XORShiftRandom vis-a-vis java.util.Random")
       println("Usage: XORShiftRandom number_of_random_numbers_to_generate")
       System.exit(1)
     }
     println(benchmark(args(0).toInt))
+    // scalastyle:on println
   }
 
   /**
diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala
index 9c191ed52206d..2300bcff4f118 100644
--- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala
+++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala
@@ -107,7 +107,9 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex
     sc = new SparkContext(clusterUrl, "test")
     val accum = sc.accumulator(0)
     val thrown = intercept[SparkException] {
+      // scalastyle:off println
       sc.parallelize(1 to 10, 10).foreach(x => println(x / 0))
+      // scalastyle:on println
     }
     assert(thrown.getClass === classOf[SparkException])
     assert(thrown.getMessage.contains("failed 4 times"))
diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala
index a8c8c6f73fb5a..b099cd3fb7965 100644
--- a/core/src/test/scala/org/apache/spark/FailureSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala
@@ -130,7 +130,9 @@ class FailureSuite extends SparkFunSuite with LocalSparkContext {
 
     // Non-serializable closure in foreach function
     val thrown2 = intercept[SparkException] {
+      // scalastyle:off println
       sc.parallelize(1 to 10, 2).foreach(x => println(a))
+      // scalastyle:on println
     }
     assert(thrown2.getClass === classOf[SparkException])
     assert(thrown2.getMessage.contains("NotSerializableException") ||
diff --git a/core/src/test/scala/org/apache/spark/FileServerSuite.scala b/core/src/test/scala/org/apache/spark/FileServerSuite.scala
index 6e65b0a8f6c76..876418aa13029 100644
--- a/core/src/test/scala/org/apache/spark/FileServerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FileServerSuite.scala
@@ -51,7 +51,9 @@ class FileServerSuite extends SparkFunSuite with LocalSparkContext {
 
     val textFile = new File(testTempDir, "FileServerSuite.txt")
     val pw = new PrintWriter(textFile)
+    // scalastyle:off println
     pw.println("100")
+    // scalastyle:on println
     pw.close()
 
     val jarFile = new File(testTempDir, "test.jar")
diff --git a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala
index 6580139df6c60..48509f0759a3b 100644
--- a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala
@@ -36,7 +36,7 @@ object ThreadingSuiteState {
   }
 }
 
-class ThreadingSuite extends SparkFunSuite with LocalSparkContext {
+class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging {
 
   test("accessing SparkContext form a different thread") {
     sc = new SparkContext("local", "test")
@@ -130,8 +130,6 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext {
               Thread.sleep(100)
             }
             if (running.get() != 4) {
-              println("Waited 1 second without seeing runningThreads = 4 (it was " +
-                running.get() + "); failing test")
               ThreadingSuiteState.failed.set(true)
             }
             number
@@ -143,6 +141,8 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext {
     }
     sem.acquire(2)
     if (ThreadingSuiteState.failed.get()) {
+      logError("Waited 1 second without seeing runningThreads = 4 (it was " +
+                ThreadingSuiteState.runningThreads.get() + "); failing test")
       fail("One or more threads didn't see runningThreads = 4")
     }
   }
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 2e05dec99b6bf..1b64c329b5d4b 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -51,9 +51,11 @@ class SparkSubmitSuite
   /** Simple PrintStream that reads data into a buffer */
   private class BufferPrintStream extends PrintStream(noOpOutputStream) {
     var lineBuffer = ArrayBuffer[String]()
+    // scalastyle:off println
     override def println(line: String) {
       lineBuffer += line
     }
+    // scalastyle:on println
   }
 
   /** Returns true if the script exits and the given search string is printed. */
@@ -81,6 +83,7 @@ class SparkSubmitSuite
     }
   }
 
+  // scalastyle:off println
   test("prints usage on empty input") {
     testPrematureExit(Array[String](), "Usage: spark-submit")
   }
@@ -491,6 +494,7 @@ class SparkSubmitSuite
       appArgs.executorMemory should be ("2.3g")
     }
   }
+  // scalastyle:on println
 
   // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.
   private def runSparkSubmit(args: Seq[String]): Unit = {
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
index c9b435a9228d3..01ece1a10f46d 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
@@ -41,9 +41,11 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
   /** Simple PrintStream that reads data into a buffer */
   private class BufferPrintStream extends PrintStream(noOpOutputStream) {
     var lineBuffer = ArrayBuffer[String]()
+    // scalastyle:off println
     override def println(line: String) {
       lineBuffer += line
     }
+    // scalastyle:on println
   }
 
   override def beforeAll() {
diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala
index 63947df3d43a2..8a199459c1ddf 100644
--- a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala
@@ -27,7 +27,7 @@ import org.scalatest.BeforeAndAfterAll
 
 import org.apache.hadoop.io.Text
 
-import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
+import org.apache.spark.{Logging, SparkConf, SparkContext, SparkFunSuite}
 import org.apache.spark.util.Utils
 import org.apache.hadoop.io.compress.{DefaultCodec, CompressionCodecFactory, GzipCodec}
 
@@ -36,7 +36,7 @@ import org.apache.hadoop.io.compress.{DefaultCodec, CompressionCodecFactory, Gzi
  * [[org.apache.spark.input.WholeTextFileRecordReader WholeTextFileRecordReader]]. A temporary
  * directory is created as fake input. Temporal storage would be deleted in the end.
  */
-class WholeTextFileRecordReaderSuite extends SparkFunSuite with BeforeAndAfterAll {
+class WholeTextFileRecordReaderSuite extends SparkFunSuite with BeforeAndAfterAll with Logging {
   private var sc: SparkContext = _
   private var factory: CompressionCodecFactory = _
 
@@ -85,7 +85,7 @@ class WholeTextFileRecordReaderSuite extends SparkFunSuite with BeforeAndAfterAl
    */
   test("Correctness of WholeTextFileRecordReader.") {
     val dir = Utils.createTempDir()
-    println(s"Local disk address is ${dir.toString}.")
+    logInfo(s"Local disk address is ${dir.toString}.")
 
     WholeTextFileRecordReaderSuite.files.foreach { case (filename, contents) =>
       createNativeFile(dir, filename, contents, false)
@@ -109,7 +109,7 @@ class WholeTextFileRecordReaderSuite extends SparkFunSuite with BeforeAndAfterAl
 
   test("Correctness of WholeTextFileRecordReader with GzipCodec.") {
     val dir = Utils.createTempDir()
-    println(s"Local disk address is ${dir.toString}.")
+    logInfo(s"Local disk address is ${dir.toString}.")
 
     WholeTextFileRecordReaderSuite.files.foreach { case (filename, contents) =>
       createNativeFile(dir, filename, contents, true)
diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
index 9e4d34fb7d382..d3218a548efc7 100644
--- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala
@@ -60,7 +60,9 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext
     tmpFile = new File(testTempDir, getClass.getSimpleName + ".txt")
     val pw = new PrintWriter(new FileWriter(tmpFile))
     for (x <- 1 to numRecords) {
+      // scalastyle:off println
       pw.println(RandomUtils.nextInt(0, numBuckets))
+      // scalastyle:on println
     }
     pw.close()
 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
index ff3fa95ec32ae..4e3defb43a021 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
@@ -52,8 +52,10 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter {
     val applicationStart = SparkListenerApplicationStart("Greatest App (N)ever", None,
       125L, "Mickey", None)
     val applicationEnd = SparkListenerApplicationEnd(1000L)
+    // scalastyle:off println
     writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationStart))))
     writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationEnd))))
+    // scalastyle:on println
     writer.close()
 
     val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath)
diff --git a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
index 1053c6caf7718..480722a5ac182 100644
--- a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala
@@ -375,6 +375,7 @@ class TestCreateNullValue {
     // parameters of the closure constructor. This allows us to test whether
     // null values are created correctly for each type.
     val nestedClosure = () => {
+      // scalastyle:off println
       if (s.toString == "123") { // Don't really output them to avoid noisy
         println(bo)
         println(c)
@@ -389,6 +390,7 @@ class TestCreateNullValue {
       val closure = () => {
         println(getX)
       }
+      // scalastyle:on println
       ClosureCleaner.clean(closure)
     }
     nestedClosure()
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 251a797dc28a2..c7638507c88c6 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -684,7 +684,9 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     val buffer = new CircularBuffer(25)
     val stream = new java.io.PrintStream(buffer, true, "UTF-8")
 
+    // scalastyle:off println
     stream.println("test circular test circular test circular test circular test circular")
+    // scalastyle:on println
     assert(buffer.toString === "t circular test circular\n")
   }
 }
diff --git a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala
index 5a5919fca2469..4f382414a8dd7 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala
@@ -103,7 +103,9 @@ private object SizeTrackerSuite {
    */
   def main(args: Array[String]): Unit = {
     if (args.size < 1) {
+      // scalastyle:off println
       println("Usage: SizeTrackerSuite [num elements]")
+      // scalastyle:on println
       System.exit(1)
     }
     val numElements = args(0).toInt
@@ -180,11 +182,13 @@ private object SizeTrackerSuite {
       baseTimes: Seq[Long],
       sampledTimes: Seq[Long],
       unsampledTimes: Seq[Long]): Unit = {
+    // scalastyle:off println
     println(s"Average times for $testName (ms):")
     println("  Base - " + averageTime(baseTimes))
     println("  SizeTracker (sampled) - " + averageTime(sampledTimes))
     println("  SizeEstimator (unsampled) - " + averageTime(unsampledTimes))
     println()
+    // scalastyle:on println
   }
 
   def time(f: => Unit): Long = {
diff --git a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
index b2f5d9009ee5d..fefa5165db197 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
@@ -20,10 +20,10 @@ package org.apache.spark.util.collection
 import java.lang.{Float => JFloat, Integer => JInteger}
 import java.util.{Arrays, Comparator}
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.util.random.XORShiftRandom
 
-class SorterSuite extends SparkFunSuite {
+class SorterSuite extends SparkFunSuite with Logging {
 
   test("equivalent to Arrays.sort") {
     val rand = new XORShiftRandom(123)
@@ -74,7 +74,7 @@ class SorterSuite extends SparkFunSuite {
   /** Runs an experiment several times. */
   def runExperiment(name: String, skip: Boolean = false)(f: => Unit, prepare: () => Unit): Unit = {
     if (skip) {
-      println(s"Skipped experiment $name.")
+      logInfo(s"Skipped experiment $name.")
       return
     }
 
@@ -86,11 +86,11 @@ class SorterSuite extends SparkFunSuite {
     while (i < 10) {
       val time = org.apache.spark.util.Utils.timeIt(1)(f, Some(prepare))
       next10 += time
-      println(s"$name: Took $time ms")
+      logInfo(s"$name: Took $time ms")
       i += 1
     }
 
-    println(s"$name: ($firstTry ms first try, ${next10 / 10} ms average)")
+    logInfo(s"$name: ($firstTry ms first try, ${next10 / 10} ms average)")
   }
 
   /**
diff --git a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala
index fc03fec9866a6..61d91c70e9709 100644
--- a/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala
+++ b/dev/audit-release/sbt_app_core/src/main/scala/SparkApp.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package main.scala
 
 import scala.util.Try
@@ -59,3 +60,4 @@ object SimpleApp {
     }
   }
 }
+// scalastyle:on println
diff --git a/dev/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala
index 0be8e64fbfabd..9f7ae75d0b477 100644
--- a/dev/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala
+++ b/dev/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package main.scala
 
 import scala.util.Try
@@ -37,3 +38,4 @@ object SimpleApp {
     }
   }
 }
+// scalastyle:on println
diff --git a/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala b/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala
index 24c7f8d667296..2f0b6ef9a5672 100644
--- a/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala
+++ b/dev/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package main.scala
 
 import org.apache.spark.{SparkContext, SparkConf}
@@ -51,3 +52,4 @@ object GraphXApp {
     println("Test succeeded")
   }
 }
+// scalastyle:on println
diff --git a/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala b/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala
index 5111bc0adb772..4a980ec071ae4 100644
--- a/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala
+++ b/dev/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package main.scala
 
 import scala.collection.mutable.{ListBuffer, Queue}
@@ -55,3 +56,4 @@ object SparkSqlExample {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala b/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala
index 9f85066501472..adc25b57d6aa5 100644
--- a/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala
+++ b/dev/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package main.scala
 
 import scala.util.Try
@@ -31,3 +32,4 @@ object SimpleApp {
     }
   }
 }
+// scalastyle:on println
diff --git a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala b/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
index cc86ef45858c9..69c1154dc0955 100644
--- a/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
+++ b/dev/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package main.scala
 
 import scala.collection.mutable.{ListBuffer, Queue}
@@ -57,3 +58,4 @@ object SparkSqlExample {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala b/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala
index 58a662bd9b2e8..d6a074687f4a1 100644
--- a/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala
+++ b/dev/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package main.scala
 
 import scala.collection.mutable.{ListBuffer, Queue}
@@ -61,3 +62,4 @@ object SparkStreamingExample {
     ssc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala
index 4c129dbe2d12d..d812262fd87dc 100644
--- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -52,3 +53,4 @@ object BroadcastTest {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala b/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala
index 023bb3ee2d108..36832f51d2ad4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/CassandraCQLTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+ // scalastyle:off println
 package org.apache.spark.examples
 
 import java.nio.ByteBuffer
@@ -140,3 +141,4 @@ object CassandraCQLTest {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala b/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala
index ec689474aecb0..96ef3e198e380 100644
--- a/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/CassandraTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.nio.ByteBuffer
@@ -130,6 +131,7 @@ object CassandraTest {
     sc.stop()
   }
 }
+// scalastyle:on println
 
 /*
 create keyspace casDemo;
diff --git a/examples/src/main/scala/org/apache/spark/examples/DFSReadWriteTest.scala b/examples/src/main/scala/org/apache/spark/examples/DFSReadWriteTest.scala
index 1f12034ce0f57..d651fe4d6ee75 100644
--- a/examples/src/main/scala/org/apache/spark/examples/DFSReadWriteTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/DFSReadWriteTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.io.File
@@ -136,3 +137,4 @@ object DFSReadWriteTest {
 
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala b/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala
index e757283823fc3..c42df2b8845d2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import scala.collection.JavaConversions._
@@ -46,3 +47,4 @@ object DriverSubmissionTest {
     }
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala b/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala
index 15f6678648b29..fa4a3afeecd19 100644
--- a/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/GroupByTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.util.Random
@@ -53,3 +54,4 @@ object GroupByTest {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala b/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala
index 95c96111c9b1f..244742327a907 100644
--- a/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/HBaseTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import org.apache.hadoop.hbase.client.HBaseAdmin
@@ -62,3 +63,4 @@ object HBaseTest {
     admin.close()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala b/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala
index ed2b38e2ca6f8..124dc9af6390f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import org.apache.spark._
@@ -41,3 +42,4 @@ object HdfsTest {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalALS.scala b/examples/src/main/scala/org/apache/spark/examples/LocalALS.scala
index 3d5259463003d..af5f216f28ba4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/LocalALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/LocalALS.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import org.apache.commons.math3.linear._
@@ -142,3 +143,4 @@ object LocalALS {
     new Array2DRowRealMatrix(Array.fill(rows, cols)(math.random))
 
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala b/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala
index ac2ea35bbd0e0..9c8aae53cf48d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.util.Random
@@ -73,3 +74,4 @@ object LocalFileLR {
     println("Final w: " + w)
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala b/examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala
index 04fc0a033014a..e7b28d38bdfc6 100644
--- a/examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.util.Random
@@ -119,3 +120,4 @@ object LocalKMeans {
     println("Final centers: " + kPoints)
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala b/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala
index c3fc74a116c0a..4f6b092a59ca5 100644
--- a/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.util.Random
@@ -77,3 +78,4 @@ object LocalLR {
     println("Final w: " + w)
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala b/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala
index ee6b3ee34aeb2..3d923625f11b6 100644
--- a/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import scala.math.random
@@ -33,3 +34,4 @@ object LocalPi {
     println("Pi is roughly " + 4 * count / 100000.0)
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/LogQuery.scala b/examples/src/main/scala/org/apache/spark/examples/LogQuery.scala
index 75c82117cbad2..a80de10f4610a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/LogQuery.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/LogQuery.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -83,3 +84,4 @@ object LogQuery {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/MultiBroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/MultiBroadcastTest.scala
index 2a5c0c0defe13..61ce9db914f9f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/MultiBroadcastTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/MultiBroadcastTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import org.apache.spark.rdd.RDD
@@ -53,3 +54,4 @@ object MultiBroadcastTest {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala
index 5291ab81f459e..3b0b00fe4dd0a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.util.Random
@@ -67,3 +68,4 @@ object SimpleSkewedGroupByTest {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala b/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala
index 017d4e1e5ce13..719e2176fed3f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.util.Random
@@ -57,3 +58,4 @@ object SkewedGroupByTest {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala b/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala
index 30c4261551837..69799b7c2bb30 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import org.apache.commons.math3.linear._
@@ -144,3 +145,4 @@ object SparkALS {
     new Array2DRowRealMatrix(Array.fill(rows, cols)(math.random))
 
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala b/examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala
index 9099c2fcc90b3..505ea5a4c7a85 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.util.Random
@@ -97,3 +98,4 @@ object SparkHdfsLR {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala b/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala
index b514d9123f5e7..c56e1124ad415 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import breeze.linalg.{Vector, DenseVector, squaredDistance}
@@ -100,3 +101,4 @@ object SparkKMeans {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala b/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala
index 1e6b4fb0c7514..d265c227f4ed2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.util.Random
@@ -86,3 +87,4 @@ object SparkLR {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
index bd7894f184c4c..0fd79660dd196 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import org.apache.spark.SparkContext._
@@ -74,3 +75,4 @@ object SparkPageRank {
     ctx.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala
index 35b8dd6c29b66..818d4f2b81f82 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import scala.math.random
@@ -37,3 +38,4 @@ object SparkPi {
     spark.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala b/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala
index 772cd897f5140..95072071ccddb 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import scala.util.Random
@@ -70,3 +71,4 @@ object SparkTC {
     spark.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala b/examples/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala
index 4393b99e636b6..cfbdae02212a5 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import java.util.Random
@@ -94,3 +95,4 @@ object SparkTachyonHdfsLR {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala b/examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala
index 7743f7968b100..e46ac655beb58 100644
--- a/examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples
 
 import scala.math.random
@@ -46,3 +47,4 @@ object SparkTachyonPi {
     spark.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala
index 409721b01c8fd..8dd6c9706e7df 100644
--- a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.graphx
 
 import scala.collection.mutable
@@ -151,3 +152,4 @@ object Analytics extends Logging {
     }
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/LiveJournalPageRank.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/LiveJournalPageRank.scala
index f6f8d9f90c275..da3ffca1a6f2a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/graphx/LiveJournalPageRank.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/graphx/LiveJournalPageRank.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.graphx
 
 /**
@@ -42,3 +43,4 @@ object LiveJournalPageRank {
     Analytics.main(args.patch(0, List("pagerank"), 0))
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala
index 3ec20d594b784..46e52aacd90bb 100644
--- a/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.graphx
 
 import org.apache.spark.SparkContext._
@@ -128,3 +129,4 @@ object SynthBenchmark {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
index 6c0af20461d3b..14b358d46f6ab 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -110,3 +111,4 @@ object CrossValidatorExample {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
index 54e4073941056..f28671f7869fc 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import scala.collection.mutable
@@ -355,3 +356,4 @@ object DecisionTreeExample {
     println(s"  Root mean squared error (RMSE): $RMSE")
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
index 7b8cc21ed8982..78f31b4ffe56a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -181,3 +182,4 @@ private class MyLogisticRegressionModel(
     copyValues(new MyLogisticRegressionModel(uid, weights), extra)
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
index 33905277c7341..f4a15f806ea81 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import scala.collection.mutable
@@ -236,3 +237,4 @@ object GBTExample {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
index b54466fd48bc5..b73299fb12d3f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import scala.collection.mutable
@@ -140,3 +141,4 @@ object LinearRegressionExample {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
index 3cf193f353fbc..7682557127b51 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import scala.collection.mutable
@@ -157,3 +158,4 @@ object LogisticRegressionExample {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
index 25f21113bf622..cd411397a4b9d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import scopt.OptionParser
@@ -178,3 +179,4 @@ object MovieLensALS {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
index 6927eb8f275cf..bab31f585b0ef 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import java.util.concurrent.TimeUnit.{NANOSECONDS => NANO}
@@ -183,3 +184,4 @@ object OneVsRestExample {
     (NANO.toSeconds(t1 - t0), result)
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
index 9f7cad68a4594..109178f4137b2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import scala.collection.mutable
@@ -244,3 +245,4 @@ object RandomForestExample {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
index a0561e2573fc9..58d7b67674ff7 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -100,3 +101,4 @@ object SimpleParamsExample {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
index 1324b066c30c3..960280137cbf9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.ml
 
 import scala.beans.BeanInfo
@@ -89,3 +90,4 @@ object SimpleTextClassificationPipeline {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
index a113653810b93..1a4016f76c2ad 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.log4j.{Level, Logger}
@@ -153,3 +154,4 @@ object BinaryClassification {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/Correlations.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/Correlations.scala
index e49129c4e7844..026d4ecc6d10a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/Correlations.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/Correlations.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import scopt.OptionParser
@@ -91,3 +92,4 @@ object Correlations {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala
index cb1abbd18fd4d..69988cc1b9334 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CosineSimilarity.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import scopt.OptionParser
@@ -106,3 +107,4 @@ object CosineSimilarity {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
index 520893b26d595..dc13f82488af7 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import java.io.File
@@ -119,3 +120,4 @@ object DatasetExample {
   }
 
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
index 3381941673db8..57ffe3dd2524f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import scala.language.reflectiveCalls
@@ -368,3 +369,4 @@ object DecisionTreeRunner {
   }
   // scalastyle:on structural.type
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
index f8c71ccabc43b..1fce4ba7efd60 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -65,3 +66,4 @@ object DenseGaussianMixture {
     println()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala
index 14cc5cbb679c5..380d85d60e7b4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.log4j.{Level, Logger}
@@ -107,3 +108,4 @@ object DenseKMeans {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala
index 13f24a1e59610..14b930550d554 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import scopt.OptionParser
@@ -80,3 +81,4 @@ object FPGrowthExample {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala
index 7416fb5a40848..e16a6bf033574 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import scopt.OptionParser
@@ -145,3 +146,4 @@ object GradientBoostedTreesRunner {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala
index 31d629f853161..75b0f69cf91aa 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import java.text.BreakIterator
@@ -302,3 +303,4 @@ private class SimpleTokenizer(sc: SparkContext, stopwordFile: String) extends Se
   }
 
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
index 6a456ba7ec07b..8878061a0970b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.log4j.{Level, Logger}
@@ -134,3 +135,4 @@ object LinearRegression {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
index 99588b0984ab2..e43a6f2864c73 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import scala.collection.mutable
@@ -189,3 +190,4 @@ object MovieLensALS {
     math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultivariateSummarizer.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultivariateSummarizer.scala
index 6e4e2d07f284b..5f839c75dd581 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultivariateSummarizer.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultivariateSummarizer.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import scopt.OptionParser
@@ -97,3 +98,4 @@ object MultivariateSummarizer {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala
index 6d8b806569dfd..0723223954610 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.log4j.{Level, Logger}
@@ -154,4 +155,4 @@ object PowerIterationClusteringExample {
     coeff * math.exp(expCoeff * ssquares)
   }
 }
-
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomRDDGeneration.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomRDDGeneration.scala
index 924b586e3af99..bee85ba0f9969 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomRDDGeneration.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomRDDGeneration.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.mllib.random.RandomRDDs
@@ -58,3 +59,4 @@ object RandomRDDGeneration {
   }
 
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SampledRDDs.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SampledRDDs.scala
index 663c12734af68..6963f43e082c4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/SampledRDDs.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SampledRDDs.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.mllib.util.MLUtils
@@ -125,3 +126,4 @@ object SampledRDDs {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
index f1ff4e6911f5e..f81fc292a3bd1 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.log4j.{Level, Logger}
@@ -100,3 +101,4 @@ object SparseNaiveBayes {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingKMeansExample.scala
index 8bb12d2ee9ed2..af03724a8ac62 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingKMeansExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingKMeansExample.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.SparkConf
@@ -75,3 +76,4 @@ object StreamingKMeansExample {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala
index 1a95048bbfe2d..b4a5dca031abd 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.mllib.linalg.Vectors
@@ -69,3 +70,4 @@ object StreamingLinearRegression {
   }
 
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLogisticRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLogisticRegression.scala
index e1998099c2d78..b42f4cb5f9338 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLogisticRegression.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLogisticRegression.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.mllib.linalg.Vectors
@@ -71,3 +72,4 @@ object StreamingLogisticRegression {
   }
 
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala
index 3cd9cb743e309..464fbd385ab5d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -58,3 +59,4 @@ object TallSkinnyPCA {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala
index 4d6690318615a..65b4bc46f0266 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -58,3 +59,4 @@ object TallSkinnySVD {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
index b11e32047dc34..2cc56f04e5c1f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.sql
 
 import org.apache.spark.{SparkConf, SparkContext}
@@ -73,3 +74,4 @@ object RDDRelation {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala
index b7ba60ec28155..bf40bd1ef13df 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.sql.hive
 
 import com.google.common.io.{ByteStreams, Files}
@@ -77,3 +78,4 @@ object HiveFromSpark {
     sc.stop()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/ActorWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/ActorWordCount.scala
index 016de4c63d1d2..e9c9907198769 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/ActorWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/ActorWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import scala.collection.mutable.LinkedList
@@ -170,3 +171,4 @@ object ActorWordCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala
index 30269a7ccae97..28e9bf520e568 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import java.io.{InputStreamReader, BufferedReader, InputStream}
@@ -100,3 +101,4 @@ class CustomReceiver(host: String, port: Int)
    }
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala
index fbe394de4a179..bd78526f8c299 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import kafka.serializer.StringDecoder
@@ -70,3 +71,4 @@ object DirectKafkaWordCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala
index 20e7df7c45b1b..91e52e4eff5a7 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/FlumeEventCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import org.apache.spark.SparkConf
@@ -66,3 +67,4 @@ object FlumeEventCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/FlumePollingEventCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/FlumePollingEventCount.scala
index 1cc8c8d5c23b6..2bdbc37e2a289 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/FlumePollingEventCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/FlumePollingEventCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import org.apache.spark.SparkConf
@@ -65,3 +66,4 @@ object FlumePollingEventCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/HdfsWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/HdfsWordCount.scala
index 4b4667fec44e6..1f282d437dc38 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/HdfsWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/HdfsWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import org.apache.spark.SparkConf
@@ -53,3 +54,4 @@ object HdfsWordCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
index 60416ee343544..b40d17e9c2fa3 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import java.util.HashMap
@@ -101,3 +102,4 @@ object KafkaWordCountProducer {
   }
 
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala
index 813c8554f5193..d772ae309f40d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import org.eclipse.paho.client.mqttv3._
@@ -96,8 +97,10 @@ object MQTTWordCount {
 
   def main(args: Array[String]) {
     if (args.length < 2) {
+      // scalastyle:off println
       System.err.println(
         "Usage: MQTTWordCount <MqttbrokerUrl> <topic>")
+      // scalastyle:on println
       System.exit(1)
     }
 
@@ -113,3 +116,4 @@ object MQTTWordCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala
index 2cd8073dada14..9a57fe286d1ae 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/NetworkWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import org.apache.spark.SparkConf
@@ -57,3 +58,4 @@ object NetworkWordCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala
index a9aaa445bccb6..5322929d177b4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import org.apache.spark.SparkConf
@@ -58,3 +59,4 @@ object RawNetworkGrep {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala
index 751b30ea15782..9916882e4f94a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import java.io.File
@@ -108,3 +109,4 @@ object RecoverableNetworkWordCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala
index 5a6b9216a3fbc..ed617754cbf1c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/SqlNetworkWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import org.apache.spark.SparkConf
@@ -99,3 +100,4 @@ object SQLContextSingleton {
     instance
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/StatefulNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/StatefulNetworkWordCount.scala
index 345d0bc441351..02ba1c2eed0f7 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/StatefulNetworkWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/StatefulNetworkWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import org.apache.spark.SparkConf
@@ -78,3 +79,4 @@ object StatefulNetworkWordCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterAlgebirdCMS.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterAlgebirdCMS.scala
index c10de84a80ffe..825c671a929b1 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterAlgebirdCMS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterAlgebirdCMS.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import com.twitter.algebird._
@@ -113,3 +114,4 @@ object TwitterAlgebirdCMS {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterAlgebirdHLL.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterAlgebirdHLL.scala
index 62db5e663b8af..49826ede70418 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterAlgebirdHLL.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterAlgebirdHLL.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import com.twitter.algebird.HyperLogLogMonoid
@@ -90,3 +91,4 @@ object TwitterAlgebirdHLL {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterPopularTags.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterPopularTags.scala
index f253d75b279f7..49cee1b43c2dc 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterPopularTags.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/TwitterPopularTags.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import org.apache.spark.streaming.{Seconds, StreamingContext}
@@ -82,3 +83,4 @@ object TwitterPopularTags {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/ZeroMQWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/ZeroMQWordCount.scala
index e99d1baa72b9f..6ac9a72c37941 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/ZeroMQWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/ZeroMQWordCount.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import akka.actor.ActorSystem
@@ -97,3 +98,4 @@ object ZeroMQWordCount {
     ssc.awaitTermination()
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala
index 889f052c70263..bea7a47cb2855 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming.clickstream
 
 import java.net.ServerSocket
@@ -108,3 +109,4 @@ object PageViewGenerator {
     }
   }
 }
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala
index fbacaee98690f..ec7d39da8b2e9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming.clickstream
 
 import org.apache.spark.SparkContext._
@@ -107,3 +108,4 @@ object PageViewStream {
     ssc.start()
   }
 }
+// scalastyle:on println
diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
index 8e1715f6dbb95..5b3c79444aa68 100644
--- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
+++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala
@@ -111,7 +111,7 @@ class DirectKafkaStreamSuite
       rdd
     }.foreachRDD { rdd =>
       for (o <- offsetRanges) {
-        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
+        logInfo(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
       }
       val collected = rdd.mapPartitionsWithIndex { (i, iter) =>
       // For each partition, get size of the range in the partition,
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
index be8b62d3cc6ba..de749626ec09c 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off println
 package org.apache.spark.examples.streaming
 
 import java.nio.ByteBuffer
@@ -272,3 +273,4 @@ private[streaming] object StreamingExamples extends Logging {
     }
   }
 }
+// scalastyle:on println
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala b/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala
index be6b9047d932d..5c07b415cd796 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala
@@ -66,7 +66,6 @@ private[graphx] object BytecodeUtils {
       val finder = new MethodInvocationFinder(c.getName, m)
       getClassReader(c).accept(finder, 0)
       for (classMethod <- finder.methodsInvoked) {
-        // println(classMethod)
         if (classMethod._1 == targetClass && classMethod._2 == targetMethod) {
           return true
         } else if (!seen.contains(classMethod)) {
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala
index 9591c4e9b8f4e..989e226305265 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala
@@ -33,7 +33,7 @@ import org.apache.spark.graphx.Edge
 import org.apache.spark.graphx.impl.GraphImpl
 
 /** A collection of graph generating functions. */
-object GraphGenerators {
+object GraphGenerators extends Logging {
 
   val RMATa = 0.45
   val RMATb = 0.15
@@ -142,7 +142,7 @@ object GraphGenerators {
     var edges: Set[Edge[Int]] = Set()
     while (edges.size < numEdges) {
       if (edges.size % 100 == 0) {
-        println(edges.size + " edges")
+        logDebug(edges.size + " edges")
       }
       edges += addEdge(numVertices)
     }
diff --git a/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala
index 186d0cc2a977b..61e44dcab578c 100644
--- a/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala
+++ b/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.graphx.util
 import org.apache.spark.SparkFunSuite
 
 
+// scalastyle:off println
 class BytecodeUtilsSuite extends SparkFunSuite {
 
   import BytecodeUtilsSuite.TestClass
@@ -102,6 +103,7 @@ class BytecodeUtilsSuite extends SparkFunSuite {
   private val c = {e: TestClass => println(e.baz)}
 }
 
+// scalastyle:on println
 
 object BytecodeUtilsSuite {
   class TestClass(val foo: Int, val bar: Long) {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala
index 6eaebaf7dba9f..e6bcff48b022c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala
@@ -64,8 +64,10 @@ object KMeansDataGenerator {
 
   def main(args: Array[String]) {
     if (args.length < 6) {
+      // scalastyle:off println
       println("Usage: KMeansGenerator " +
         "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]")
+      // scalastyle:on println
       System.exit(1)
     }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
index b4e33c98ba7e5..87eeb5db05d26 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
@@ -153,8 +153,10 @@ object LinearDataGenerator {
 
   def main(args: Array[String]) {
     if (args.length < 2) {
+      // scalastyle:off println
       println("Usage: LinearDataGenerator " +
         "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
+      // scalastyle:on println
       System.exit(1)
     }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala
index 9d802678c4a77..c09cbe69bb971 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala
@@ -64,8 +64,10 @@ object LogisticRegressionDataGenerator {
 
   def main(args: Array[String]) {
     if (args.length != 5) {
+      // scalastyle:off println
       println("Usage: LogisticRegressionGenerator " +
         "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
+      // scalastyle:on println
       System.exit(1)
     }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
index bd73a866c8a82..16f430599a515 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
@@ -55,8 +55,10 @@ import org.apache.spark.rdd.RDD
 object MFDataGenerator {
   def main(args: Array[String]) {
     if (args.length < 2) {
+      // scalastyle:off println
       println("Usage: MFDataGenerator " +
         "<master> <outputDir> [m] [n] [rank] [trainSampFact] [noise] [sigma] [test] [testSampFact]")
+      // scalastyle:on println
       System.exit(1)
     }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
index a8e30cc9d730c..ad20b7694a779 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
@@ -37,8 +37,10 @@ object SVMDataGenerator {
 
   def main(args: Array[String]) {
     if (args.length < 2) {
+      // scalastyle:off println
       println("Usage: SVMGenerator " +
         "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
+      // scalastyle:on println
       System.exit(1)
     }
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
index 8c85c96d5c6d8..03120c828ca96 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.ml.feature
 
 import scala.beans.{BeanInfo, BeanProperty}
 
-import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.{Logging, SparkException, SparkFunSuite}
 import org.apache.spark.ml.attribute._
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
@@ -27,7 +27,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
 
-class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext {
+class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {
 
   import VectorIndexerSuite.FeatureData
 
@@ -113,11 +113,11 @@ class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext {
     model.transform(sparsePoints1) // should work
     intercept[SparkException] {
       model.transform(densePoints2).collect()
-      println("Did not throw error when fit, transform were called on vectors of different lengths")
+      logInfo("Did not throw error when fit, transform were called on vectors of different lengths")
     }
     intercept[SparkException] {
       vectorIndexer.fit(badPoints)
-      println("Did not throw error when fitting vectors of different lengths in same RDD.")
+      logInfo("Did not throw error when fitting vectors of different lengths in same RDD.")
     }
   }
 
@@ -196,7 +196,7 @@ class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext {
         }
       } catch {
         case e: org.scalatest.exceptions.TestFailedException =>
-          println(errMsg)
+          logError(errMsg)
           throw e
       }
     }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index c4ae0a16f7c04..178d95a7b94ec 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -21,10 +21,10 @@ import scala.util.Random
 
 import breeze.linalg.{DenseMatrix => BDM, squaredDistance => breezeSquaredDistance}
 
-import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.{Logging, SparkException, SparkFunSuite}
 import org.apache.spark.mllib.util.TestingUtils._
 
-class VectorsSuite extends SparkFunSuite {
+class VectorsSuite extends SparkFunSuite with Logging {
 
   val arr = Array(0.1, 0.0, 0.3, 0.4)
   val n = 4
@@ -142,7 +142,7 @@ class VectorsSuite extends SparkFunSuite {
     malformatted.foreach { s =>
       intercept[SparkException] {
         Vectors.parse(s)
-        println(s"Didn't detect malformatted string $s.")
+        logInfo(s"Didn't detect malformatted string $s.")
       }
     }
   }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
index c292ced75e870..c3eeda012571c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
@@ -19,13 +19,13 @@ package org.apache.spark.mllib.stat
 
 import breeze.linalg.{DenseMatrix => BDM, Matrix => BM}
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.stat.correlation.{Correlations, PearsonCorrelation,
   SpearmanCorrelation}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
-class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext {
+class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {
 
   // test input data
   val xData = Array(1.0, 0.0, -2.0)
@@ -146,7 +146,7 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext {
   def matrixApproxEqual(A: BM[Double], B: BM[Double], threshold: Double = 1e-6): Boolean = {
     for (i <- 0 until A.rows; j <- 0 until A.cols) {
       if (!approxEqual(A(i, j), B(i, j), threshold)) {
-        println("i, j = " + i + ", " + j + " actual: " + A(i, j) + " expected:" + B(i, j))
+        logInfo("i, j = " + i + ", " + j + " actual: " + A(i, j) + " expected:" + B(i, j))
         return false
       }
     }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
index 84dd3b342d4c0..2521b3342181a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.tree
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
@@ -31,7 +31,7 @@ import org.apache.spark.util.Utils
 /**
  * Test suite for [[GradientBoostedTrees]].
  */
-class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext {
+class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {
 
   test("Regression with continuous features: SquaredError") {
     GradientBoostedTreesSuite.testCombinations.foreach {
@@ -50,7 +50,7 @@ class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext
           EnsembleTestHelper.validateRegressor(gbt, GradientBoostedTreesSuite.data, 0.06)
         } catch {
           case e: java.lang.AssertionError =>
-            println(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," +
+            logError(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," +
               s" subsamplingRate=$subsamplingRate")
             throw e
         }
@@ -80,7 +80,7 @@ class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext
           EnsembleTestHelper.validateRegressor(gbt, GradientBoostedTreesSuite.data, 0.85, "mae")
         } catch {
           case e: java.lang.AssertionError =>
-            println(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," +
+            logError(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," +
               s" subsamplingRate=$subsamplingRate")
             throw e
         }
@@ -111,7 +111,7 @@ class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext
           EnsembleTestHelper.validateClassifier(gbt, GradientBoostedTreesSuite.data, 0.9)
         } catch {
           case e: java.lang.AssertionError =>
-            println(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," +
+            logError(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," +
               s" subsamplingRate=$subsamplingRate")
             throw e
         }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
index fa4f74d71b7e7..16d7c3ab39b03 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/NumericParserSuite.scala
@@ -33,7 +33,7 @@ class NumericParserSuite extends SparkFunSuite {
     malformatted.foreach { s =>
       intercept[SparkException] {
         NumericParser.parse(s)
-        println(s"Didn't detect malformatted string $s.")
+        throw new RuntimeException(s"Didn't detect malformatted string $s.")
       }
     }
   }
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 3408c6d51ed4c..4291b0be2a616 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -69,6 +69,7 @@ object SparkBuild extends PomBuild {
     import scala.collection.mutable
     var isAlphaYarn = false
     var profiles: mutable.Seq[String] = mutable.Seq("sbt")
+    // scalastyle:off println
     if (Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined) {
       println("NOTE: SPARK_GANGLIA_LGPL is deprecated, please use -Pspark-ganglia-lgpl flag.")
       profiles ++= Seq("spark-ganglia-lgpl")
@@ -88,6 +89,7 @@ object SparkBuild extends PomBuild {
       println("NOTE: SPARK_YARN is deprecated, please use -Pyarn flag.")
       profiles ++= Seq("yarn")
     }
+    // scalastyle:on println
     profiles
   }
 
@@ -96,8 +98,10 @@ object SparkBuild extends PomBuild {
     case None => backwardCompatibility
     case Some(v) =>
       if (backwardCompatibility.nonEmpty)
+        // scalastyle:off println
         println("Note: We ignore environment variables, when use of profile is detected in " +
           "conjunction with environment variable.")
+        // scalastyle:on println
       v.split("(\\s+|,)").filterNot(_.isEmpty).map(_.trim.replaceAll("-P", "")).toSeq
     }
 
diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkCommandLine.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkCommandLine.scala
index 6480e2d24e044..24fbbc12c08da 100644
--- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkCommandLine.scala
+++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkCommandLine.scala
@@ -39,6 +39,8 @@ class SparkCommandLine(args: List[String], override val settings: Settings)
   }
 
   def this(args: List[String]) {
+    // scalastyle:off println
     this(args, str => Console.println("Error: " + str))
+    // scalastyle:on println
   }
 }
diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index 2b235525250c2..8f7f9074d3f03 100644
--- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -1101,7 +1101,9 @@ object SparkILoop extends Logging {
             val s = super.readLine()
             // helping out by printing the line being interpreted.
             if (s != null)
+              // scalastyle:off println
               output.println(s)
+              // scalastyle:on println
             s
           }
         }
diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala
index 05faef8786d2c..bd3314d94eed6 100644
--- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala
+++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala
@@ -80,11 +80,13 @@ private[repl] trait SparkILoopInit {
     if (!initIsComplete)
       withLock { while (!initIsComplete) initLoopCondition.await() }
     if (initError != null) {
+      // scalastyle:off println
       println("""
         |Failed to initialize the REPL due to an unexpected error.
         |This is a bug, please, report it along with the error diagnostics printed below.
         |%s.""".stripMargin.format(initError)
       )
+      // scalastyle:on println
       false
     } else true
   }
diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala
index 35fb625645022..8791618bd355e 100644
--- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala
+++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala
@@ -1761,7 +1761,9 @@ object SparkIMain {
         if (intp.totalSilence) ()
         else super.printMessage(msg)
       }
+      // scalastyle:off println
       else Console.println(msg)
+      // scalastyle:on println
     }
   }
 }
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index 7a5e94da5cbf3..3c90287249497 100644
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -943,7 +943,9 @@ object SparkILoop {
             val s = super.readLine()
             // helping out by printing the line being interpreted.
             if (s != null)
+              // scalastyle:off println
               output.println(s)
+              // scalastyle:on println
             s
           }
         }
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
index 1cb910f376060..56c009a4e38e7 100644
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
+++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
@@ -129,7 +129,9 @@ class SparkIMain(@BeanProperty val factory: ScriptEngineFactory, initialSettings
   }
   private def tquoted(s: String) = "\"\"\"" + s + "\"\"\""
   private val logScope = scala.sys.props contains "scala.repl.scope"
+  // scalastyle:off println
   private def scopelog(msg: String) = if (logScope) Console.err.println(msg)
+  // scalastyle:on println
 
   // argument is a thunk to execute after init is done
   def initialize(postInitSignal: => Unit) {
@@ -1297,8 +1299,10 @@ class SparkISettings(intp: SparkIMain) {
   def deprecation_=(x: Boolean) = {
     val old = intp.settings.deprecation.value
     intp.settings.deprecation.value = x
+    // scalastyle:off println
     if (!old && x) println("Enabled -deprecation output.")
     else if (old && !x) println("Disabled -deprecation output.")
+    // scalastyle:on println
   }
   def deprecation: Boolean = intp.settings.deprecation.value
 
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkReplReporter.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkReplReporter.scala
index 0711ed4871bb6..272f81eca92c1 100644
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkReplReporter.scala
+++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkReplReporter.scala
@@ -42,7 +42,9 @@ class SparkReplReporter(intp: SparkIMain) extends ConsoleReporter(intp.settings,
       }
       else super.printMessage(msg)
     }
+    // scalastyle:off println
     else Console.println("[init] " + msg)
+    // scalastyle:on println
   }
 
   override def displayPrompt() {
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index d6f927b6fa803..49611703798e8 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -141,12 +141,8 @@ This file is divided into 3 sections:
     <customMessage>Tests must extend org.apache.spark.SparkFunSuite instead.</customMessage>
   </check>
 
-  <!-- ================================================================================ -->
-  <!--       rules we'd like to enforce, but haven't cleaned up the codebase yet        -->
-  <!-- ================================================================================ -->
-
-  <!-- SPARK-7977 We should turn this on, but we'd need to add whitelist to files that are using it first. -->
-  <check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="false">
+  <!-- As of SPARK-7977 all printlns need to be wrapped in '// scalastyle:off/on println' -->
+  <check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
     <parameters><parameter name="regex">^println$</parameter></parameters>
     <customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with
       // scalastyle:off println
@@ -154,6 +150,10 @@ This file is divided into 3 sections:
       // scalastyle:on println]]></customMessage>
   </check>
 
+  <!-- ================================================================================ -->
+  <!--       rules we'd like to enforce, but haven't cleaned up the codebase yet        -->
+  <!-- ================================================================================ -->
+
   <!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. -->
   <!-- Ideally the following two rules should be configurable to rule out string interpolation. -->
   <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
index 7f1b12cdd5800..606fecbe06e47 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala
@@ -67,8 +67,10 @@ package object codegen {
       outfile.write(generatedBytes)
       outfile.close()
 
+      // scalastyle:off println
       println(
         s"javap -p -v -classpath ${dumpDirectory.getCanonicalPath} ${generatedClass.getName}".!!)
+      // scalastyle:on println
     }
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index 2f545bb432165..b89e3382f06a9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -154,7 +154,9 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanTy
   def schemaString: String = schema.treeString
 
   /** Prints out the schema in the tree format */
+  // scalastyle:off println
   def printSchema(): Unit = println(schemaString)
+  // scalastyle:on println
 
   /**
    * A prefix string used when printing the plan.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala
index 07054166a5e88..71293475ca0f9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala
@@ -124,7 +124,9 @@ package object util {
     val startTime = System.nanoTime()
     val ret = f
     val endTime = System.nanoTime()
+    // scalastyle:off println
     println(s"${(endTime - startTime).toDouble / 1000000}ms")
+    // scalastyle:on println
     ret
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index e0b8ff91786a7..b8097403ec3cc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -250,7 +250,9 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
     builder.toString()
   }
 
+  // scalastyle:off println
   def printTreeString(): Unit = println(treeString)
+  // scalastyle:on println
 
   private[sql] def buildFormattedString(prefix: String, builder: StringBuilder): Unit = {
     fields.foreach(field => field.buildFormattedString(prefix, builder))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index f201c8ea8a110..10250264625b2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -860,11 +860,13 @@ class Column(protected[sql] val expr: Expression) extends Logging {
    * @since 1.3.0
    */
   def explain(extended: Boolean): Unit = {
+    // scalastyle:off println
     if (extended) {
       println(expr)
     } else {
       println(expr.prettyString)
     }
+    // scalastyle:on println
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index d7966651b1948..830fba35bb7bc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -308,7 +308,9 @@ class DataFrame private[sql](
    * @group basic
    * @since 1.3.0
    */
+  // scalastyle:off println
   def printSchema(): Unit = println(schema.treeString)
+  // scalastyle:on println
 
   /**
    * Prints the plans (logical and physical) to the console for debugging purposes.
@@ -319,7 +321,9 @@ class DataFrame private[sql](
     ExplainCommand(
       queryExecution.logical,
       extended = extended).queryExecution.executedPlan.executeCollect().map {
+      // scalastyle:off println
       r => println(r.getString(0))
+      // scalastyle:on println
     }
   }
 
@@ -392,7 +396,9 @@ class DataFrame private[sql](
    * @group action
    * @since 1.5.0
    */
+  // scalastyle:off println
   def show(numRows: Int, truncate: Boolean): Unit = println(showString(numRows, truncate))
+  // scalastyle:on println
 
   /**
    * Returns a [[DataFrameNaFunctions]] for working with missing data.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index 2964edac1aba2..e6081cb05bc2d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -24,7 +24,7 @@ import org.apache.spark.unsafe.types.UTF8String
 
 import scala.collection.mutable.HashSet
 
-import org.apache.spark.{AccumulatorParam, Accumulator}
+import org.apache.spark.{AccumulatorParam, Accumulator, Logging}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.trees.TreeNodeRef
@@ -57,7 +57,7 @@ package object debug {
    * Augments [[DataFrame]]s with debug methods.
    */
   @DeveloperApi
-  implicit class DebugQuery(query: DataFrame) {
+  implicit class DebugQuery(query: DataFrame) extends Logging {
     def debug(): Unit = {
       val plan = query.queryExecution.executedPlan
       val visited = new collection.mutable.HashSet[TreeNodeRef]()
@@ -66,7 +66,7 @@ package object debug {
           visited += new TreeNodeRef(s)
           DebugNode(s)
       }
-      println(s"Results returned: ${debugPlan.execute().count()}")
+      logDebug(s"Results returned: ${debugPlan.execute().count()}")
       debugPlan.foreach {
         case d: DebugNode => d.dumpStats()
         case _ =>
@@ -82,11 +82,11 @@ package object debug {
           TypeCheck(s)
       }
       try {
-        println(s"Results returned: ${debugPlan.execute().count()}")
+        logDebug(s"Results returned: ${debugPlan.execute().count()}")
       } catch {
         case e: Exception =>
           def unwrap(e: Throwable): Throwable = if (e.getCause == null) e else unwrap(e.getCause)
-          println(s"Deepest Error: ${unwrap(e)}")
+          logDebug(s"Deepest Error: ${unwrap(e)}")
       }
     }
   }
@@ -119,11 +119,11 @@ package object debug {
     val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())
 
     def dumpStats(): Unit = {
-      println(s"== ${child.simpleString} ==")
-      println(s"Tuples output: ${tupleCount.value}")
+      logDebug(s"== ${child.simpleString} ==")
+      logDebug(s"Tuples output: ${tupleCount.value}")
       child.output.zip(columnStats).foreach { case(attr, metric) =>
         val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}")
-        println(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
+        logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
       }
     }
 
diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
index 039cfa40d26b3..f66a17b20915f 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
@@ -40,7 +40,7 @@ import org.apache.spark.Logging
 import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.util.Utils
 
-private[hive] object SparkSQLCLIDriver {
+private[hive] object SparkSQLCLIDriver extends Logging {
   private var prompt = "spark-sql"
   private var continuedPrompt = "".padTo(prompt.length, ' ')
   private var transport: TSocket = _
@@ -164,7 +164,7 @@ private[hive] object SparkSQLCLIDriver {
       }
     } catch {
       case e: FileNotFoundException =>
-        System.err.println(s"Could not open input file for reading. (${e.getMessage})")
+        logError(s"Could not open input file for reading. (${e.getMessage})")
         System.exit(3)
     }
 
@@ -180,14 +180,14 @@ private[hive] object SparkSQLCLIDriver {
         val historyFile = historyDirectory + File.separator + ".hivehistory"
         reader.setHistory(new History(new File(historyFile)))
       } else {
-        System.err.println("WARNING: Directory for Hive history file: " + historyDirectory +
+        logWarning("WARNING: Directory for Hive history file: " + historyDirectory +
                            " does not exist.   History will not be available during this session.")
       }
     } catch {
       case e: Exception =>
-        System.err.println("WARNING: Encountered an error while trying to initialize Hive's " +
+        logWarning("WARNING: Encountered an error while trying to initialize Hive's " +
                            "history file.  History will not be available during this session.")
-        System.err.println(e.getMessage)
+        logWarning(e.getMessage)
     }
 
     val clientTransportTSocketField = classOf[CliSessionState].getDeclaredField("transport")
@@ -270,6 +270,7 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging {
       val proc: CommandProcessor = CommandProcessorFactory.get(Array(tokens(0)), hconf)
 
       if (proc != null) {
+        // scalastyle:off println
         if (proc.isInstanceOf[Driver] || proc.isInstanceOf[SetProcessor] ||
           proc.isInstanceOf[AddResourceProcessor]) {
           val driver = new SparkSQLDriver
@@ -336,6 +337,7 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging {
           }
           ret = proc.run(cmd_1).getResponseCode
         }
+        // scalastyle:on println
       }
       ret
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index bbc39b892b79e..4684d48aff889 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -34,6 +34,7 @@ import org.apache.hadoop.hive.ql.parse.VariableSubstitution
 import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable}
 
+import org.apache.spark.Logging
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql._
@@ -65,12 +66,12 @@ private[hive] class HiveQLDialect extends ParserDialect {
  *
  * @since 1.0.0
  */
-class HiveContext(sc: SparkContext) extends SQLContext(sc) {
+class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging {
   self =>
 
   import HiveContext._
 
-  println("create HiveContext")
+  logDebug("create HiveContext")
 
   /**
    * When true, enables an experimental feature where metastore tables that use the parquet SerDe
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 2de7a99c122fd..7fc517b646b20 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.parse._
 import org.apache.hadoop.hive.ql.plan.PlanUtils
 import org.apache.hadoop.hive.ql.session.SessionState
 
+import org.apache.spark.Logging
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
@@ -73,7 +74,7 @@ private[hive] case class CreateTableAsSelect(
 }
 
 /** Provides a mapping from HiveQL statements to catalyst logical plans and expression trees. */
-private[hive] object HiveQl {
+private[hive] object HiveQl extends Logging {
   protected val nativeCommands = Seq(
     "TOK_ALTERDATABASE_OWNER",
     "TOK_ALTERDATABASE_PROPERTIES",
@@ -186,7 +187,7 @@ private[hive] object HiveQl {
             .map(ast => Option(ast).map(_.transform(rule)).orNull))
       } catch {
         case e: Exception =>
-          println(dumpTree(n))
+          logError(dumpTree(n).toString)
           throw e
       }
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index cbd2bf6b5eede..9d83ca6c113dc 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -360,7 +360,9 @@ private[hive] class ClientWrapper(
 
         case _ =>
           if (state.out != null) {
+            // scalastyle:off println
             state.out.println(tokens(0) + " " + cmd_1)
+            // scalastyle:on println
           }
           Seq(proc.run(cmd_1).getResponseCode.toString)
       }
diff --git a/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala b/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala
index 0e428ba1d7456..2590040f2ec1c 100644
--- a/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala
+++ b/sql/hive/src/test/resources/regression-test-SPARK-8489/Main.scala
@@ -30,6 +30,7 @@ import org.apache.spark.sql.hive.HiveContext
  */
 object Main {
   def main(args: Array[String]) {
+    // scalastyle:off println
     println("Running regression test for SPARK-8489.")
     val sc = new SparkContext("local", "testing")
     val hc = new HiveContext(sc)
@@ -38,6 +39,7 @@ object Main {
     val df = hc.createDataFrame(Seq(MyCoolClass("1", "2", "3")))
     df.collect()
     println("Regression test for SPARK-8489 success!")
+    // scalastyle:on println
     sc.stop()
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index e9bb32667936c..983c013bcf86a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -17,13 +17,13 @@
 
 package org.apache.spark.sql.hive
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.{Logging, SparkFunSuite}
 import org.apache.spark.sql.hive.test.TestHive
 
 import org.apache.spark.sql.test.ExamplePointUDT
 import org.apache.spark.sql.types.StructType
 
-class HiveMetastoreCatalogSuite extends SparkFunSuite {
+class HiveMetastoreCatalogSuite extends SparkFunSuite with Logging {
 
   test("struct field should accept underscore in sub-column name") {
     val metastr = "struct<a: int, b_1: string, c: string>"
@@ -41,7 +41,7 @@ class HiveMetastoreCatalogSuite extends SparkFunSuite {
   test("duplicated metastore relations") {
     import TestHive.implicits._
     val df = TestHive.sql("SELECT * FROM src")
-    println(df.queryExecution)
+    logInfo(df.queryExecution.toString)
     df.as('a).join(df.as('b), $"a.key" === $"b.key")
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
index a38ed23b5cf9a..917900e5f46dc 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
@@ -90,8 +90,10 @@ class HiveSparkSubmitSuite
       "SPARK_TESTING" -> "1",
       "SPARK_HOME" -> sparkHome
     ).run(ProcessLogger(
+      // scalastyle:off println
       (line: String) => { println(s"out> $line") },
       (line: String) => { println(s"err> $line") }
+      // scalastyle:on println
     ))
 
     try {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
index aa5dbe2db6903..508695919e9a7 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala
@@ -86,8 +86,6 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter {
     val message = intercept[QueryExecutionException] {
       sql("CREATE TABLE doubleCreateAndInsertTest (key int, value string)")
     }.getMessage
-
-    println("message!!!!" + message)
   }
 
   test("Double create does not fail when allowExisting = true") {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index cc294bc3e8bc3..d910af22c3dd1 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -26,6 +26,7 @@ import org.scalatest.BeforeAndAfterAll
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapred.InvalidInputException
 
+import org.apache.spark.Logging
 import org.apache.spark.sql._
 import org.apache.spark.sql.hive.client.{HiveTable, ManagedTable}
 import org.apache.spark.sql.hive.test.TestHive
@@ -40,7 +41,8 @@ import org.apache.spark.util.Utils
 /**
  * Tests for persisting tables created though the data sources API into the metastore.
  */
-class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeAndAfterAll {
+class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeAndAfterAll
+  with Logging {
   override val sqlContext = TestHive
 
   var jsonFilePath: String = _
@@ -415,7 +417,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
            |)
          """.stripMargin)
 
-      sql("DROP TABLE jsonTable").collect().foreach(println)
+      sql("DROP TABLE jsonTable").collect().foreach(i => logInfo(i.toString))
     }
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala
index eaaa88e17002b..1bde5922b5278 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala
@@ -315,7 +315,6 @@ class PairUDF extends GenericUDF {
   )
 
   override def evaluate(args: Array[DeferredObject]): AnyRef = {
-    println("Type = %s".format(args(0).getClass.getName))
     Integer.valueOf(args(0).get.asInstanceOf[TestPair].entry._2)
   }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
index 192aa6a139bcb..1da0b0a54df07 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
@@ -720,12 +720,14 @@ abstract class DStream[T: ClassTag] (
     def foreachFunc: (RDD[T], Time) => Unit = {
       (rdd: RDD[T], time: Time) => {
         val firstNum = rdd.take(num + 1)
+        // scalastyle:off println
         println("-------------------------------------------")
         println("Time: " + time)
         println("-------------------------------------------")
         firstNum.take(num).foreach(println)
         if (firstNum.length > num) println("...")
         println()
+        // scalastyle:on println
       }
     }
     new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register()
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala
index ca2f319f174a2..6addb96752038 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala
@@ -35,7 +35,9 @@ private[streaming]
 object RawTextSender extends Logging {
   def main(args: Array[String]) {
     if (args.length != 4) {
+      // scalastyle:off println
       System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
+      // scalastyle:on println
       System.exit(1)
     }
     // Parse the arguments using a pattern match
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala
index c8eef833eb431..dd32ad5ad811d 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala
@@ -106,7 +106,7 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name:
 }
 
 private[streaming]
-object RecurringTimer {
+object RecurringTimer extends Logging {
 
   def main(args: Array[String]) {
     var lastRecurTime = 0L
@@ -114,7 +114,7 @@ object RecurringTimer {
 
     def onRecur(time: Long) {
       val currentTime = System.currentTimeMillis()
-      println("" + currentTime + ": " + (currentTime - lastRecurTime))
+      logInfo("" + currentTime + ": " + (currentTime - lastRecurTime))
       lastRecurTime = currentTime
     }
     val timer = new  RecurringTimer(new SystemClock(), period, onRecur, "Test")
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala b/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala
index e0f14fd954280..6e9d4431090a2 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala
@@ -43,6 +43,7 @@ object MasterFailureTest extends Logging {
   @volatile var setupCalled = false
 
   def main(args: Array[String]) {
+    // scalastyle:off println
     if (args.size < 2) {
       println(
         "Usage: MasterFailureTest <local/HDFS directory> <# batches> " +
@@ -60,6 +61,7 @@ object MasterFailureTest extends Logging {
     testUpdateStateByKey(directory, numBatches, batchDuration)
 
     println("\n\nSUCCESS\n\n")
+    // scalastyle:on println
   }
 
   def testMap(directory: String, numBatches: Int, batchDuration: Duration) {
@@ -291,10 +293,12 @@ object MasterFailureTest extends Logging {
     }
 
     // Log the output
+    // scalastyle:off println
     println("Expected output, size = " + expectedOutput.size)
     println(expectedOutput.mkString("[", ",", "]"))
     println("Output, size = " + output.size)
     println(output.mkString("[", ",", "]"))
+    // scalastyle:on println
 
     // Match the output with the expected output
     output.foreach(o =>
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/JobGeneratorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/JobGeneratorSuite.scala
index 7865b06c2e3c2..a2dbae149f311 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/JobGeneratorSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/JobGeneratorSuite.scala
@@ -76,7 +76,6 @@ class JobGeneratorSuite extends TestSuiteBase {
         if (time.milliseconds == longBatchTime) {
           while (waitLatch.getCount() > 0) {
             waitLatch.await()
-            println("Await over")
           }
         }
       })
diff --git a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala
index 595ded6ae67fa..9483d2b692ab5 100644
--- a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala
+++ b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala
@@ -92,7 +92,9 @@ object GenerateMIMAIgnore {
         ignoredMembers ++= getAnnotatedOrPackagePrivateMembers(classSymbol)
 
       } catch {
+        // scalastyle:off println
         case _: Throwable => println("Error instrumenting class:" + className)
+        // scalastyle:on println
       }
     }
     (ignoredClasses.flatMap(c => Seq(c, c.replace("$", "#"))).toSet, ignoredMembers.toSet)
@@ -108,7 +110,9 @@ object GenerateMIMAIgnore {
         .filter(_.contains("$$")).map(classSymbol.fullName + "." + _)
     } catch {
       case t: Throwable =>
+        // scalastyle:off println
         println("[WARN] Unable to detect inner functions for class:" + classSymbol.fullName)
+        // scalastyle:on println
         Seq.empty[String]
     }
   }
@@ -128,12 +132,14 @@ object GenerateMIMAIgnore {
       getOrElse(Iterator.empty).mkString("\n")
     File(".generated-mima-class-excludes")
       .writeAll(previousContents + privateClasses.mkString("\n"))
+    // scalastyle:off println
     println("Created : .generated-mima-class-excludes in current directory.")
     val previousMembersContents = Try(File(".generated-mima-member-excludes").lines)
       .getOrElse(Iterator.empty).mkString("\n")
     File(".generated-mima-member-excludes").writeAll(previousMembersContents +
       privateMembers.mkString("\n"))
     println("Created : .generated-mima-member-excludes in current directory.")
+    // scalastyle:on println
   }
 
 
@@ -174,7 +180,9 @@ object GenerateMIMAIgnore {
       try {
         classes += Class.forName(entry.replace('/', '.').stripSuffix(".class"), false, classLoader)
       } catch {
+        // scalastyle:off println
         case _: Throwable => println("Unable to load:" + entry)
+        // scalastyle:on println
       }
     }
     classes
diff --git a/tools/src/main/scala/org/apache/spark/tools/JavaAPICompletenessChecker.scala b/tools/src/main/scala/org/apache/spark/tools/JavaAPICompletenessChecker.scala
index 583823c90c5c6..856ea177a9a10 100644
--- a/tools/src/main/scala/org/apache/spark/tools/JavaAPICompletenessChecker.scala
+++ b/tools/src/main/scala/org/apache/spark/tools/JavaAPICompletenessChecker.scala
@@ -323,11 +323,14 @@ object JavaAPICompletenessChecker {
     val missingMethods = javaEquivalents -- javaMethods
 
     for (method <- missingMethods) {
+      // scalastyle:off println
       println(method)
+      // scalastyle:on println
     }
   }
 
   def main(args: Array[String]) {
+    // scalastyle:off println
     println("Missing RDD methods")
     printMissingMethods(classOf[RDD[_]], classOf[JavaRDD[_]])
     println()
@@ -359,5 +362,6 @@ object JavaAPICompletenessChecker {
     println("Missing PairDStream methods")
     printMissingMethods(classOf[PairDStreamFunctions[_, _]], classOf[JavaPairDStream[_, _]])
     println()
+    // scalastyle:on println
   }
 }
diff --git a/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala b/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala
index baa97616eaff3..0dc2861253f17 100644
--- a/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala
+++ b/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala
@@ -85,7 +85,9 @@ object StoragePerfTester {
             latch.countDown()
           } catch {
             case e: Exception =>
+              // scalastyle:off println
               println("Exception in child thread: " + e + " " + e.getMessage)
+              // scalastyle:on println
               System.exit(1)
           }
         }
@@ -97,9 +99,11 @@ object StoragePerfTester {
     val bytesPerSecond = totalBytes.get() / time
     val bytesPerFile = (totalBytes.get() / (numOutputSplits * numMaps.toDouble)).toLong
 
+    // scalastyle:off println
     System.err.println("files_total\t\t%s".format(numMaps * numOutputSplits))
     System.err.println("bytes_per_file\t\t%s".format(Utils.bytesToString(bytesPerFile)))
     System.err.println("agg_throughput\t\t%s/s".format(Utils.bytesToString(bytesPerSecond.toLong)))
+    // scalastyle:on println
 
     executor.shutdown()
     sc.stop()
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
index 68e9f6b4db7f4..37f793763367e 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
@@ -85,7 +85,9 @@ class ApplicationMasterArguments(val args: Array[String]) {
     }
 
     if (primaryPyFile != null && primaryRFile != null) {
+      // scalastyle:off println
       System.err.println("Cannot have primary-py-file and primary-r-file at the same time")
+      // scalastyle:on println
       System.exit(-1)
     }
 
@@ -93,6 +95,7 @@ class ApplicationMasterArguments(val args: Array[String]) {
   }
 
   def printUsageAndExit(exitCode: Int, unknownParam: Any = null) {
+    // scalastyle:off println
     if (unknownParam != null) {
       System.err.println("Unknown/unsupported param " + unknownParam)
     }
@@ -111,6 +114,7 @@ class ApplicationMasterArguments(val args: Array[String]) {
       |  --executor-cores NUM   Number of cores for the executors (Default: 1)
       |  --executor-memory MEM  Memory per executor (e.g. 1000M, 2G) (Default: 1G)
       """.stripMargin)
+    // scalastyle:on println
     System.exit(exitCode)
   }
 }
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 4d52ae774ea00..f0af6f875f523 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -938,7 +938,7 @@ private[spark] class Client(
 object Client extends Logging {
   def main(argStrings: Array[String]) {
     if (!sys.props.contains("SPARK_SUBMIT")) {
-      println("WARNING: This client is deprecated and will be removed in a " +
+      logWarning("WARNING: This client is deprecated and will be removed in a " +
         "future version of Spark. Use ./bin/spark-submit with \"--master yarn\"")
     }
 
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
index 19d1bbff9993f..20d63d40cf605 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
@@ -123,6 +123,7 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
       throw new SparkException("Executor cores must not be less than " +
         "spark.task.cpus.")
     }
+    // scalastyle:off println
     if (isClusterMode) {
       for (key <- Seq(amMemKey, amMemOverheadKey, amCoresKey)) {
         if (sparkConf.contains(key)) {
@@ -144,11 +145,13 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
         .map(_.toInt)
         .foreach { cores => amCores = cores }
     }
+    // scalastyle:on println
   }
 
   private def parseArgs(inputArgs: List[String]): Unit = {
     var args = inputArgs
 
+    // scalastyle:off println
     while (!args.isEmpty) {
       args match {
         case ("--jar") :: value :: tail =>
@@ -253,6 +256,7 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
           throw new IllegalArgumentException(getUsageMessage(args))
       }
     }
+    // scalastyle:on println
 
     if (primaryPyFile != null && primaryRFile != null) {
       throw new IllegalArgumentException("Cannot have primary-py-file and primary-r-file" +
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
index 335e966519c7c..547863d9a0739 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
@@ -328,12 +328,14 @@ private object YarnClusterDriver extends Logging with Matchers {
 
   def main(args: Array[String]): Unit = {
     if (args.length != 1) {
+      // scalastyle:off println
       System.err.println(
         s"""
         |Invalid command line: ${args.mkString(" ")}
         |
         |Usage: YarnClusterDriver [result file]
         """.stripMargin)
+      // scalastyle:on println
       System.exit(1)
     }
 
@@ -386,12 +388,14 @@ private object YarnClasspathTest {
 
   def main(args: Array[String]): Unit = {
     if (args.length != 2) {
+      // scalastyle:off println
       System.err.println(
         s"""
         |Invalid command line: ${args.mkString(" ")}
         |
         |Usage: YarnClasspathTest [driver result file] [executor result file]
         """.stripMargin)
+      // scalastyle:on println
       System.exit(1)
     }
 

From 11e22b74a080ea58fb9410b5cc6fa4c03f9198f2 Mon Sep 17 00:00:00 2001
From: Iulian Dragos <jaguarul@gmail.com>
Date: Fri, 10 Jul 2015 16:22:49 +0100
Subject: [PATCH 109/549] [SPARK-7944] [SPARK-8013] Remove most of the Spark
 REPL fork for Scala 2.11
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR removes most of the code in the Spark REPL for Scala 2.11 and leaves just a couple of overridden methods in `SparkILoop` in order to:

- change welcome message
- restrict available commands (like `:power`)
- initialize Spark context

The two codebases have diverged and it's extremely hard to backport fixes from the upstream REPL. This somewhat radical step is absolutely necessary in order to fix other REPL tickets (like SPARK-8013 - Hive Thrift server for 2.11). BTW, the Scala REPL has fixed the serialization-unfriendly wrappers thanks to ScrapCodes's work in [#4522](https://github.com/scala/scala/pull/4522)

All tests pass and I tried the `spark-shell` on our Mesos cluster with some simple jobs (including with additional jars), everything looked good.

As soon as Scala 2.11.7 is out we need to upgrade and get a shaded `jline` dependency, clearing the way for SPARK-8013.

/cc pwendell

Author: Iulian Dragos <jaguarul@gmail.com>

Closes #6903 from dragos/issue/no-spark-repl-fork and squashes the following commits:

c596c6f [Iulian Dragos] Merge branch 'master' into issue/no-spark-repl-fork
2b1a305 [Iulian Dragos] Removed spaces around multiple imports.
0ce67a6 [Iulian Dragos] Remove -verbose flag for java compiler (added by mistake in an earlier commit).
10edaf9 [Iulian Dragos] Keep the jline dependency only in the 2.10 build.
529293b [Iulian Dragos] Add back Spark REPL files to rat-excludes, since they are part of the 2.10 real.
d85370d [Iulian Dragos] Remove jline dependency from the Spark REPL.
b541930 [Iulian Dragos] Merge branch 'master' into issue/no-spark-repl-fork
2b15962 [Iulian Dragos] Change jline dependency and bump Scala version.
b300183 [Iulian Dragos] Rename package and add license on top of the file, remove files from rat-excludes and removed `-Yrepl-sync` per reviewer’s request.
9d46d85 [Iulian Dragos] Fix SPARK-7944.
abcc7cb [Iulian Dragos] Remove the REPL forked code.
---
 pom.xml                                       |   18 +-
 repl/pom.xml                                  |   19 +-
 .../scala/org/apache/spark/repl/Main.scala    |   16 +-
 .../apache/spark/repl/SparkExprTyper.scala    |   86 --
 .../org/apache/spark/repl/SparkILoop.scala    |  971 +-----------
 .../org/apache/spark/repl/SparkIMain.scala    | 1323 -----------------
 .../org/apache/spark/repl/SparkImports.scala  |  201 ---
 .../spark/repl/SparkJLineCompletion.scala     |  350 -----
 .../spark/repl/SparkMemberHandlers.scala      |  221 ---
 .../apache/spark/repl/SparkReplReporter.scala |   55 -
 .../org/apache/spark/repl/ReplSuite.scala     |   11 +-
 11 files changed, 90 insertions(+), 3181 deletions(-)
 delete mode 100644 repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala
 delete mode 100644 repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
 delete mode 100644 repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkImports.scala
 delete mode 100644 repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkJLineCompletion.scala
 delete mode 100644 repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkMemberHandlers.scala
 delete mode 100644 repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkReplReporter.scala

diff --git a/pom.xml b/pom.xml
index 172fdef4c73da..c2ebc1a11e770 100644
--- a/pom.xml
+++ b/pom.xml
@@ -341,11 +341,6 @@
   </dependencies>
   <dependencyManagement>
     <dependencies>
-      <dependency>
-        <groupId>${jline.groupid}</groupId>
-        <artifactId>jline</artifactId>
-        <version>${jline.version}</version>
-      </dependency>
       <dependency>
         <groupId>com.twitter</groupId>
         <artifactId>chill_${scala.binary.version}</artifactId>
@@ -1826,6 +1821,15 @@
         <jline.version>${scala.version}</jline.version>
         <jline.groupid>org.scala-lang</jline.groupid>
       </properties>
+      <dependencyManagement>
+        <dependencies>
+          <dependency>
+            <groupId>${jline.groupid}</groupId>
+            <artifactId>jline</artifactId>
+            <version>${jline.version}</version>
+          </dependency>
+        </dependencies>
+      </dependencyManagement>
     </profile>
 
     <profile>
@@ -1844,10 +1848,8 @@
         <property><name>scala-2.11</name></property>
       </activation>
       <properties>
-        <scala.version>2.11.6</scala.version>
+        <scala.version>2.11.7</scala.version>
         <scala.binary.version>2.11</scala.binary.version>
-        <jline.version>2.12.1</jline.version>
-        <jline.groupid>jline</jline.groupid>
       </properties>
     </profile>
 
diff --git a/repl/pom.xml b/repl/pom.xml
index 370b2bc2fa8ed..70c9bd7c01296 100644
--- a/repl/pom.xml
+++ b/repl/pom.xml
@@ -38,11 +38,6 @@
   </properties>
 
   <dependencies>
-    <dependency>
-      <groupId>${jline.groupid}</groupId>
-      <artifactId>jline</artifactId>
-      <version>${jline.version}</version>
-    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_${scala.binary.version}</artifactId>
@@ -161,6 +156,20 @@
     </plugins>
   </build>
   <profiles>
+    <profile>
+      <id>scala-2.10</id>
+      <activation>
+        <property><name>!scala-2.11</name></property>
+      </activation>
+      <dependencies>
+        <dependency>
+          <groupId>${jline.groupid}</groupId>
+          <artifactId>jline</artifactId>
+          <version>${jline.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
+
     <profile>
       <id>scala-2.11</id>
       <activation>
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala
index f4f4b626988e9..eed4a379afa60 100644
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala
+++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.repl
 
+import java.io.File
+
+import scala.tools.nsc.Settings
+
 import org.apache.spark.util.Utils
 import org.apache.spark._
 import org.apache.spark.sql.SQLContext
 
-import scala.tools.nsc.Settings
-import scala.tools.nsc.interpreter.SparkILoop
-
 object Main extends Logging {
 
   val conf = new SparkConf()
@@ -32,7 +33,8 @@ object Main extends Logging {
   val outputDir = Utils.createTempDir(rootDir)
   val s = new Settings()
   s.processArguments(List("-Yrepl-class-based",
-    "-Yrepl-outdir", s"${outputDir.getAbsolutePath}", "-Yrepl-sync"), true)
+    "-Yrepl-outdir", s"${outputDir.getAbsolutePath}",
+    "-classpath", getAddedJars.mkString(File.pathSeparator)), true)
   val classServer = new HttpServer(conf, outputDir, new SecurityManager(conf))
   var sparkContext: SparkContext = _
   var sqlContext: SQLContext = _
@@ -48,7 +50,6 @@ object Main extends Logging {
     Option(sparkContext).map(_.stop)
   }
 
-
   def getAddedJars: Array[String] = {
     val envJars = sys.env.get("ADD_JARS")
     if (envJars.isDefined) {
@@ -84,10 +85,9 @@ object Main extends Logging {
     val loader = Utils.getContextOrSparkClassLoader
     try {
       sqlContext = loader.loadClass(name).getConstructor(classOf[SparkContext])
-        .newInstance(sparkContext).asInstanceOf[SQLContext] 
+        .newInstance(sparkContext).asInstanceOf[SQLContext]
       logInfo("Created sql context (with Hive support)..")
-    }
-    catch {
+    } catch {
       case _: java.lang.ClassNotFoundException | _: java.lang.NoClassDefFoundError =>
         sqlContext = new SQLContext(sparkContext)
         logInfo("Created sql context..")
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala
deleted file mode 100644
index 8e519fa67f649..0000000000000
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala
+++ /dev/null
@@ -1,86 +0,0 @@
-/* NSC -- new Scala compiler
- * Copyright 2005-2013 LAMP/EPFL
- * @author  Paul Phillips
- */
-
-package scala.tools.nsc
-package interpreter
-
-import scala.tools.nsc.ast.parser.Tokens.EOF
-
-trait SparkExprTyper {
-  val repl: SparkIMain
-
-  import repl._
-  import global.{ reporter => _, Import => _, _ }
-  import naming.freshInternalVarName
-
-  def symbolOfLine(code: String): Symbol = {
-    def asExpr(): Symbol = {
-      val name  = freshInternalVarName()
-      // Typing it with a lazy val would give us the right type, but runs
-      // into compiler bugs with things like existentials, so we compile it
-      // behind a def and strip the NullaryMethodType which wraps the expr.
-      val line = "def " + name + " = " + code
-
-      interpretSynthetic(line) match {
-        case IR.Success =>
-          val sym0 = symbolOfTerm(name)
-          // drop NullaryMethodType
-          sym0.cloneSymbol setInfo exitingTyper(sym0.tpe_*.finalResultType)
-        case _          => NoSymbol
-      }
-    }
-    def asDefn(): Symbol = {
-      val old = repl.definedSymbolList.toSet
-
-      interpretSynthetic(code) match {
-        case IR.Success =>
-          repl.definedSymbolList filterNot old match {
-            case Nil        => NoSymbol
-            case sym :: Nil => sym
-            case syms       => NoSymbol.newOverloaded(NoPrefix, syms)
-          }
-        case _ => NoSymbol
-      }
-    }
-    def asError(): Symbol = {
-      interpretSynthetic(code)
-      NoSymbol
-    }
-    beSilentDuring(asExpr()) orElse beSilentDuring(asDefn()) orElse asError()
-  }
-
-  private var typeOfExpressionDepth = 0
-  def typeOfExpression(expr: String, silent: Boolean = true): Type = {
-    if (typeOfExpressionDepth > 2) {
-      repldbg("Terminating typeOfExpression recursion for expression: " + expr)
-      return NoType
-    }
-    typeOfExpressionDepth += 1
-    // Don't presently have a good way to suppress undesirable success output
-    // while letting errors through, so it is first trying it silently: if there
-    // is an error, and errors are desired, then it re-evaluates non-silently
-    // to induce the error message.
-    try beSilentDuring(symbolOfLine(expr).tpe) match {
-      case NoType if !silent => symbolOfLine(expr).tpe // generate error
-      case tpe               => tpe
-    }
-    finally typeOfExpressionDepth -= 1
-  }
-
-  // This only works for proper types.
-  def typeOfTypeString(typeString: String): Type = {
-    def asProperType(): Option[Type] = {
-      val name = freshInternalVarName()
-      val line = "def %s: %s = ???" format (name, typeString)
-      interpretSynthetic(line) match {
-        case IR.Success =>
-          val sym0 = symbolOfTerm(name)
-          Some(sym0.asMethod.returnType)
-        case _          => None
-      }
-    }
-    beSilentDuring(asProperType()) getOrElse NoType
-  }
-}
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index 3c90287249497..bf609ff0f65fc 100644
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -1,88 +1,64 @@
-/* NSC -- new Scala compiler
- * Copyright 2005-2013 LAMP/EPFL
- * @author Alexander Spoon
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
-package scala
-package tools.nsc
-package interpreter
+package org.apache.spark.repl
 
-import scala.language.{ implicitConversions, existentials }
-import scala.annotation.tailrec
-import Predef.{ println => _, _ }
-import interpreter.session._
-import StdReplTags._
-import scala.reflect.api.{Mirror, Universe, TypeCreator}
-import scala.util.Properties.{ jdkHome, javaVersion, versionString, javaVmName }
-import scala.tools.nsc.util.{ ClassPath, Exceptional, stringFromWriter, stringFromStream }
-import scala.reflect.{ClassTag, classTag}
-import scala.reflect.internal.util.{ BatchSourceFile, ScalaClassLoader }
-import ScalaClassLoader._
-import scala.reflect.io.{ File, Directory }
-import scala.tools.util._
-import scala.collection.generic.Clearable
-import scala.concurrent.{ ExecutionContext, Await, Future, future }
-import ExecutionContext.Implicits._
-import java.io.{ BufferedReader, FileReader }
+import java.io.{BufferedReader, FileReader}
 
-/** The Scala interactive shell.  It provides a read-eval-print loop
-  *  around the Interpreter class.
-  *  After instantiation, clients should call the main() method.
-  *
-  *  If no in0 is specified, then input will come from the console, and
-  *  the class will attempt to provide input editing feature such as
-  *  input history.
-  *
-  *  @author Moez A. Abdel-Gawad
-  *  @author  Lex Spoon
-  *  @version 1.2
-  */
-class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter)
-  extends AnyRef
-  with LoopCommands
-{
-  def this(in0: BufferedReader, out: JPrintWriter) = this(Some(in0), out)
-  def this() = this(None, new JPrintWriter(Console.out, true))
-//
-//  @deprecated("Use `intp` instead.", "2.9.0") def interpreter = intp
-//  @deprecated("Use `intp` instead.", "2.9.0") def interpreter_= (i: Interpreter): Unit = intp = i
-
-  var in: InteractiveReader = _   // the input stream from which commands come
-  var settings: Settings = _
-  var intp: SparkIMain = _
+import Predef.{println => _, _}
+import scala.util.Properties.{jdkHome, javaVersion, versionString, javaVmName}
 
-  var globalFuture: Future[Boolean] = _
+import scala.tools.nsc.interpreter.{JPrintWriter, ILoop}
+import scala.tools.nsc.Settings
+import scala.tools.nsc.util.stringFromStream
 
-  protected def asyncMessage(msg: String) {
-    if (isReplInfo || isReplPower)
-      echoAndRefresh(msg)
-  }
+/**
+ *  A Spark-specific interactive shell.
+ */
+class SparkILoop(in0: Option[BufferedReader], out: JPrintWriter)
+    extends ILoop(in0, out) {
+  def this(in0: BufferedReader, out: JPrintWriter) = this(Some(in0), out)
+  def this() = this(None, new JPrintWriter(Console.out, true))
 
   def initializeSpark() {
     intp.beQuietDuring {
-      command( """
+      processLine("""
          @transient val sc = {
            val _sc = org.apache.spark.repl.Main.createSparkContext()
            println("Spark context available as sc.")
            _sc
          }
         """)
-      command( """
+      processLine("""
          @transient val sqlContext = {
            val _sqlContext = org.apache.spark.repl.Main.createSQLContext()
            println("SQL context available as sqlContext.")
            _sqlContext
          }
         """)
-      command("import org.apache.spark.SparkContext._")
-      command("import sqlContext.implicits._")
-      command("import sqlContext.sql")
-      command("import org.apache.spark.sql.functions._")
+      processLine("import org.apache.spark.SparkContext._")
+      processLine("import sqlContext.implicits._")
+      processLine("import sqlContext.sql")
+      processLine("import org.apache.spark.sql.functions._")
     }
   }
 
   /** Print a welcome message */
-  def printWelcome() {
+  override def printWelcome() {
     import org.apache.spark.SPARK_VERSION
     echo("""Welcome to
       ____              __
@@ -98,877 +74,42 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter)
     echo("Type :help for more information.")
   }
 
-  override def echoCommandMessage(msg: String) {
-    intp.reporter printUntruncatedMessage msg
-  }
-
-  // lazy val power = new Power(intp, new StdReplVals(this))(tagOfStdReplVals, classTag[StdReplVals])
-  def history = in.history
-
-  // classpath entries added via :cp
-  var addedClasspath: String = ""
-
-  /** A reverse list of commands to replay if the user requests a :replay */
-  var replayCommandStack: List[String] = Nil
-
-  /** A list of commands to replay if the user requests a :replay */
-  def replayCommands = replayCommandStack.reverse
-
-  /** Record a command for replay should the user request a :replay */
-  def addReplay(cmd: String) = replayCommandStack ::= cmd
-
-  def savingReplayStack[T](body: => T): T = {
-    val saved = replayCommandStack
-    try body
-    finally replayCommandStack = saved
-  }
-  def savingReader[T](body: => T): T = {
-    val saved = in
-    try body
-    finally in = saved
-  }
-
-  /** Close the interpreter and set the var to null. */
-  def closeInterpreter() {
-    if (intp ne null) {
-      intp.close()
-      intp = null
-    }
-  }
-
-  class SparkILoopInterpreter extends SparkIMain(settings, out) {
-    outer =>
-
-    override lazy val formatting = new Formatting {
-      def prompt = SparkILoop.this.prompt
-    }
-    override protected def parentClassLoader =
-      settings.explicitParentLoader.getOrElse( classOf[SparkILoop].getClassLoader )
-  }
-
-  /** Create a new interpreter. */
-  def createInterpreter() {
-    if (addedClasspath != "")
-      settings.classpath append addedClasspath
-
-    intp = new SparkILoopInterpreter
-  }
-
-  /** print a friendly help message */
-  def helpCommand(line: String): Result = {
-    if (line == "") helpSummary()
-    else uniqueCommand(line) match {
-      case Some(lc) => echo("\n" + lc.help)
-      case _        => ambiguousError(line)
-    }
-  }
-  private def helpSummary() = {
-    val usageWidth  = commands map (_.usageMsg.length) max
-    val formatStr   = "%-" + usageWidth + "s %s"
-
-    echo("All commands can be abbreviated, e.g. :he instead of :help.")
-
-    commands foreach { cmd =>
-      echo(formatStr.format(cmd.usageMsg, cmd.help))
-    }
-  }
-  private def ambiguousError(cmd: String): Result = {
-    matchingCommands(cmd) match {
-      case Nil  => echo(cmd + ": no such command.  Type :help for help.")
-      case xs   => echo(cmd + " is ambiguous: did you mean " + xs.map(":" + _.name).mkString(" or ") + "?")
-    }
-    Result(keepRunning = true, None)
-  }
-  private def matchingCommands(cmd: String) = commands filter (_.name startsWith cmd)
-  private def uniqueCommand(cmd: String): Option[LoopCommand] = {
-    // this lets us add commands willy-nilly and only requires enough command to disambiguate
-    matchingCommands(cmd) match {
-      case List(x)  => Some(x)
-      // exact match OK even if otherwise appears ambiguous
-      case xs       => xs find (_.name == cmd)
-    }
-  }
-
-  /** Show the history */
-  lazy val historyCommand = new LoopCommand("history", "show the history (optional num is commands to show)") {
-    override def usage = "[num]"
-    def defaultLines = 20
-
-    def apply(line: String): Result = {
-      if (history eq NoHistory)
-        return "No history available."
-
-      val xs      = words(line)
-      val current = history.index
-      val count   = try xs.head.toInt catch { case _: Exception => defaultLines }
-      val lines   = history.asStrings takeRight count
-      val offset  = current - lines.size + 1
-
-      for ((line, index) <- lines.zipWithIndex)
-        echo("%3d  %s".format(index + offset, line))
-    }
-  }
-
-  // When you know you are most likely breaking into the middle
-  // of a line being typed.  This softens the blow.
-  protected def echoAndRefresh(msg: String) = {
-    echo("\n" + msg)
-    in.redrawLine()
-  }
-  protected def echo(msg: String) = {
-    out println msg
-    out.flush()
-  }
-
-  /** Search the history */
-  def searchHistory(_cmdline: String) {
-    val cmdline = _cmdline.toLowerCase
-    val offset  = history.index - history.size + 1
-
-    for ((line, index) <- history.asStrings.zipWithIndex ; if line.toLowerCase contains cmdline)
-      echo("%d %s".format(index + offset, line))
-  }
-
-  private val currentPrompt = Properties.shellPromptString
-
-  /** Prompt to print when awaiting input */
-  def prompt = currentPrompt
-
   import LoopCommand.{ cmd, nullary }
 
-  /** Standard commands **/
-  lazy val standardCommands = List(
-    cmd("cp", "<path>", "add a jar or directory to the classpath", addClasspath),
-    cmd("edit", "<id>|<line>", "edit history", editCommand),
-    cmd("help", "[command]", "print this summary or command-specific help", helpCommand),
-    historyCommand,
-    cmd("h?", "<string>", "search the history", searchHistory),
-    cmd("imports", "[name name ...]", "show import history, identifying sources of names", importsCommand),
-    //cmd("implicits", "[-v]", "show the implicits in scope", intp.implicitsCommand),
-    cmd("javap", "<path|class>", "disassemble a file or class name", javapCommand),
-    cmd("line", "<id>|<line>", "place line(s) at the end of history", lineCommand),
-    cmd("load", "<path>", "interpret lines in a file", loadCommand),
-    cmd("paste", "[-raw] [path]", "enter paste mode or paste a file", pasteCommand),
-    // nullary("power", "enable power user mode", powerCmd),
-    nullary("quit", "exit the interpreter", () => Result(keepRunning = false, None)),
-    nullary("replay", "reset execution and replay all previous commands", replay),
-    nullary("reset", "reset the repl to its initial state, forgetting all session entries", resetCommand),
-    cmd("save", "<path>", "save replayable session to a file", saveCommand),
-    shCommand,
-    cmd("settings", "[+|-]<options>", "+enable/-disable flags, set compiler options", changeSettings),
-    nullary("silent", "disable/enable automatic printing of results", verbosity),
-//    cmd("type", "[-v] <expr>", "display the type of an expression without evaluating it", typeCommand),
-//    cmd("kind", "[-v] <expr>", "display the kind of expression's type", kindCommand),
-    nullary("warnings", "show the suppressed warnings from the most recent line which had any", warningsCommand)
-  )
-
-  /** Power user commands */
-//  lazy val powerCommands: List[LoopCommand] = List(
-//    cmd("phase", "<phase>", "set the implicit phase for power commands", phaseCommand)
-//  )
-
-  private def importsCommand(line: String): Result = {
-    val tokens    = words(line)
-    val handlers  = intp.languageWildcardHandlers ++ intp.importHandlers
-
-    handlers.filterNot(_.importedSymbols.isEmpty).zipWithIndex foreach {
-      case (handler, idx) =>
-        val (types, terms) = handler.importedSymbols partition (_.name.isTypeName)
-        val imps           = handler.implicitSymbols
-        val found          = tokens filter (handler importsSymbolNamed _)
-        val typeMsg        = if (types.isEmpty) "" else types.size + " types"
-        val termMsg        = if (terms.isEmpty) "" else terms.size + " terms"
-        val implicitMsg    = if (imps.isEmpty) "" else imps.size + " are implicit"
-        val foundMsg       = if (found.isEmpty) "" else found.mkString(" // imports: ", ", ", "")
-        val statsMsg       = List(typeMsg, termMsg, implicitMsg) filterNot (_ == "") mkString ("(", ", ", ")")
-
-        intp.reporter.printMessage("%2d) %-30s %s%s".format(
-          idx + 1,
-          handler.importString,
-          statsMsg,
-          foundMsg
-        ))
-    }
-  }
-
-  private def findToolsJar() = PathResolver.SupplementalLocations.platformTools
+  private val blockedCommands = Set("implicits", "javap", "power", "type", "kind")
 
-  private def addToolsJarToLoader() = {
-    val cl = findToolsJar() match {
-      case Some(tools) => ScalaClassLoader.fromURLs(Seq(tools.toURL), intp.classLoader)
-      case _           => intp.classLoader
-    }
-    if (Javap.isAvailable(cl)) {
-      repldbg(":javap available.")
-      cl
-    }
-    else {
-      repldbg(":javap unavailable: no tools.jar at " + jdkHome)
-      intp.classLoader
-    }
-  }
-//
-//  protected def newJavap() =
-//    JavapClass(addToolsJarToLoader(), new IMain.ReplStrippingWriter(intp), Some(intp))
-//
-//  private lazy val javap = substituteAndLog[Javap]("javap", NoJavap)(newJavap())
-
-  // Still todo: modules.
-//  private def typeCommand(line0: String): Result = {
-//    line0.trim match {
-//      case "" => ":type [-v] <expression>"
-//      case s  => intp.typeCommandInternal(s stripPrefix "-v " trim, verbose = s startsWith "-v ")
-//    }
-//  }
-
-//  private def kindCommand(expr: String): Result = {
-//    expr.trim match {
-//      case "" => ":kind [-v] <expression>"
-//      case s  => intp.kindCommandInternal(s stripPrefix "-v " trim, verbose = s startsWith "-v ")
-//    }
-//  }
-
-  private def warningsCommand(): Result = {
-    if (intp.lastWarnings.isEmpty)
-      "Can't find any cached warnings."
-    else
-      intp.lastWarnings foreach { case (pos, msg) => intp.reporter.warning(pos, msg) }
-  }
-
-  private def changeSettings(args: String): Result = {
-    def showSettings() = {
-      for (s <- settings.userSetSettings.toSeq.sorted) echo(s.toString)
-    }
-    def updateSettings() = {
-      // put aside +flag options
-      val (pluses, rest) = (args split "\\s+").toList partition (_.startsWith("+"))
-      val tmps = new Settings
-      val (ok, leftover) = tmps.processArguments(rest, processAll = true)
-      if (!ok) echo("Bad settings request.")
-      else if (leftover.nonEmpty) echo("Unprocessed settings.")
-      else {
-        // boolean flags set-by-user on tmp copy should be off, not on
-        val offs = tmps.userSetSettings filter (_.isInstanceOf[Settings#BooleanSetting])
-        val (minuses, nonbools) = rest partition (arg => offs exists (_ respondsTo arg))
-        // update non-flags
-        settings.processArguments(nonbools, processAll = true)
-        // also snag multi-value options for clearing, e.g. -Ylog: and -language:
-        for {
-          s <- settings.userSetSettings
-          if s.isInstanceOf[Settings#MultiStringSetting] || s.isInstanceOf[Settings#PhasesSetting]
-          if nonbools exists (arg => arg.head == '-' && arg.last == ':' && (s respondsTo arg.init))
-        } s match {
-          case c: Clearable => c.clear()
-          case _ =>
-        }
-        def update(bs: Seq[String], name: String=>String, setter: Settings#Setting=>Unit) = {
-          for (b <- bs)
-            settings.lookupSetting(name(b)) match {
-              case Some(s) =>
-                if (s.isInstanceOf[Settings#BooleanSetting]) setter(s)
-                else echo(s"Not a boolean flag: $b")
-              case _ =>
-                echo(s"Not an option: $b")
-            }
-        }
-        update(minuses, identity, _.tryToSetFromPropertyValue("false"))  // turn off
-        update(pluses, "-" + _.drop(1), _.tryToSet(Nil))                 // turn on
-      }
-    }
-    if (args.isEmpty) showSettings() else updateSettings()
-  }
-
-  private def javapCommand(line: String): Result = {
-//    if (javap == null)
-//      ":javap unavailable, no tools.jar at %s.  Set JDK_HOME.".format(jdkHome)
-//    else if (line == "")
-//      ":javap [-lcsvp] [path1 path2 ...]"
-//    else
-//      javap(words(line)) foreach { res =>
-//        if (res.isError) return "Failed: " + res.value
-//        else res.show()
-//      }
-  }
-
-  private def pathToPhaseWrapper = intp.originalPath("$r") + ".phased.atCurrent"
-
-  private def phaseCommand(name: String): Result = {
-//    val phased: Phased = power.phased
-//    import phased.NoPhaseName
-//
-//    if (name == "clear") {
-//      phased.set(NoPhaseName)
-//      intp.clearExecutionWrapper()
-//      "Cleared active phase."
-//    }
-//    else if (name == "") phased.get match {
-//      case NoPhaseName => "Usage: :phase <expr> (e.g. typer, erasure.next, erasure+3)"
-//      case ph          => "Active phase is '%s'.  (To clear, :phase clear)".format(phased.get)
-//    }
-//    else {
-//      val what = phased.parse(name)
-//      if (what.isEmpty || !phased.set(what))
-//        "'" + name + "' does not appear to represent a valid phase."
-//      else {
-//        intp.setExecutionWrapper(pathToPhaseWrapper)
-//        val activeMessage =
-//          if (what.toString.length == name.length) "" + what
-//          else "%s (%s)".format(what, name)
-//
-//        "Active phase is now: " + activeMessage
-//      }
-//    }
-  }
+  /** Standard commands **/
+  lazy val sparkStandardCommands: List[SparkILoop.this.LoopCommand] =
+    standardCommands.filter(cmd => !blockedCommands(cmd.name))
 
   /** Available commands */
-  def commands: List[LoopCommand] = standardCommands ++ (
-    // if (isReplPower)
-    //  powerCommands
-    // else
-      Nil
-    )
-
-  val replayQuestionMessage =
-    """|That entry seems to have slain the compiler.  Shall I replay
-      |your session? I can re-run each line except the last one.
-      |[y/n]
-    """.trim.stripMargin
-
-  private val crashRecovery: PartialFunction[Throwable, Boolean] = {
-    case ex: Throwable =>
-      val (err, explain) = (
-        if (intp.isInitializeComplete)
-          (intp.global.throwableAsString(ex), "")
-        else
-          (ex.getMessage, "The compiler did not initialize.\n")
-        )
-      echo(err)
-
-      ex match {
-        case _: NoSuchMethodError | _: NoClassDefFoundError =>
-          echo("\nUnrecoverable error.")
-          throw ex
-        case _  =>
-          def fn(): Boolean =
-            try in.readYesOrNo(explain + replayQuestionMessage, { echo("\nYou must enter y or n.") ; fn() })
-            catch { case _: RuntimeException => false }
-
-          if (fn()) replay()
-          else echo("\nAbandoning crashed session.")
-      }
-      true
-  }
-
-  // return false if repl should exit
-  def processLine(line: String): Boolean = {
-    import scala.concurrent.duration._
-    Await.ready(globalFuture, 60.seconds)
-
-    (line ne null) && (command(line) match {
-      case Result(false, _)      => false
-      case Result(_, Some(line)) => addReplay(line) ; true
-      case _                     => true
-    })
-  }
-
-  private def readOneLine() = {
-    out.flush()
-    in readLine prompt
-  }
-
-  /** The main read-eval-print loop for the repl.  It calls
-    *  command() for each line of input, and stops when
-    *  command() returns false.
-    */
-  @tailrec final def loop() {
-    if ( try processLine(readOneLine()) catch crashRecovery )
-      loop()
-  }
-
-  /** interpret all lines from a specified file */
-  def interpretAllFrom(file: File) {
-    savingReader {
-      savingReplayStack {
-        file applyReader { reader =>
-          in = SimpleReader(reader, out, interactive = false)
-          echo("Loading " + file + "...")
-          loop()
-        }
-      }
-    }
-  }
-
-  /** create a new interpreter and replay the given commands */
-  def replay() {
-    reset()
-    if (replayCommandStack.isEmpty)
-      echo("Nothing to replay.")
-    else for (cmd <- replayCommands) {
-      echo("Replaying: " + cmd)  // flush because maybe cmd will have its own output
-      command(cmd)
-      echo("")
-    }
-  }
-  def resetCommand() {
-    echo("Resetting interpreter state.")
-    if (replayCommandStack.nonEmpty) {
-      echo("Forgetting this session history:\n")
-      replayCommands foreach echo
-      echo("")
-      replayCommandStack = Nil
-    }
-    if (intp.namedDefinedTerms.nonEmpty)
-      echo("Forgetting all expression results and named terms: " + intp.namedDefinedTerms.mkString(", "))
-    if (intp.definedTypes.nonEmpty)
-      echo("Forgetting defined types: " + intp.definedTypes.mkString(", "))
-
-    reset()
-  }
-  def reset() {
-    intp.reset()
-    unleashAndSetPhase()
-  }
-
-  def lineCommand(what: String): Result = editCommand(what, None)
-
-  // :edit id or :edit line
-  def editCommand(what: String): Result = editCommand(what, Properties.envOrNone("EDITOR"))
-
-  def editCommand(what: String, editor: Option[String]): Result = {
-    def diagnose(code: String) = {
-      echo("The edited code is incomplete!\n")
-      val errless = intp compileSources new BatchSourceFile("<pastie>", s"object pastel {\n$code\n}")
-      if (errless) echo("The compiler reports no errors.")
-    }
-    def historicize(text: String) = history match {
-      case jlh: JLineHistory => text.lines foreach jlh.add ; jlh.moveToEnd() ; true
-      case _ => false
-    }
-    def edit(text: String): Result = editor match {
-      case Some(ed) =>
-        val tmp = File.makeTemp()
-        tmp.writeAll(text)
-        try {
-          val pr = new ProcessResult(s"$ed ${tmp.path}")
-          pr.exitCode match {
-            case 0 =>
-              tmp.safeSlurp() match {
-                case Some(edited) if edited.trim.isEmpty => echo("Edited text is empty.")
-                case Some(edited) =>
-                  echo(edited.lines map ("+" + _) mkString "\n")
-                  val res = intp interpret edited
-                  if (res == IR.Incomplete) diagnose(edited)
-                  else {
-                    historicize(edited)
-                    Result(lineToRecord = Some(edited), keepRunning = true)
-                  }
-                case None => echo("Can't read edited text. Did you delete it?")
-              }
-            case x => echo(s"Error exit from $ed ($x), ignoring")
-          }
-        } finally {
-          tmp.delete()
-        }
-      case None =>
-        if (historicize(text)) echo("Placing text in recent history.")
-        else echo(f"No EDITOR defined and you can't change history, echoing your text:%n$text")
-    }
-
-    // if what is a number, use it as a line number or range in history
-    def isNum = what forall (c => c.isDigit || c == '-' || c == '+')
-    // except that "-" means last value
-    def isLast = (what == "-")
-    if (isLast || !isNum) {
-      val name = if (isLast) intp.mostRecentVar else what
-      val sym = intp.symbolOfIdent(name)
-      intp.prevRequestList collectFirst { case r if r.defines contains sym => r } match {
-        case Some(req) => edit(req.line)
-        case None      => echo(s"No symbol in scope: $what")
-      }
-    } else try {
-      val s = what
-      // line 123, 120+3, -3, 120-123, 120-, note -3 is not 0-3 but (cur-3,cur)
-      val (start, len) =
-        if ((s indexOf '+') > 0) {
-          val (a,b) = s splitAt (s indexOf '+')
-          (a.toInt, b.drop(1).toInt)
-        } else {
-          (s indexOf '-') match {
-            case -1 => (s.toInt, 1)
-            case 0  => val n = s.drop(1).toInt ; (history.index - n, n)
-            case _ if s.last == '-' => val n = s.init.toInt ; (n, history.index - n)
-            case i  => val n = s.take(i).toInt ; (n, s.drop(i+1).toInt - n)
-          }
-        }
-      import scala.collection.JavaConverters._
-      val index = (start - 1) max 0
-      val text = history match {
-        case jlh: JLineHistory => jlh.entries(index).asScala.take(len) map (_.value) mkString "\n"
-        case _ => history.asStrings.slice(index, index + len) mkString "\n"
-      }
-      edit(text)
-    } catch {
-      case _: NumberFormatException => echo(s"Bad range '$what'")
-        echo("Use line 123, 120+3, -3, 120-123, 120-, note -3 is not 0-3 but (cur-3,cur)")
-    }
-  }
-
-  /** fork a shell and run a command */
-  lazy val shCommand = new LoopCommand("sh", "run a shell command (result is implicitly => List[String])") {
-    override def usage = "<command line>"
-    def apply(line: String): Result = line match {
-      case ""   => showUsage()
-      case _    =>
-        val toRun = s"new ${classOf[ProcessResult].getName}(${string2codeQuoted(line)})"
-        intp interpret toRun
-        ()
-    }
-  }
-
-  def withFile[A](filename: String)(action: File => A): Option[A] = {
-    val res = Some(File(filename)) filter (_.exists) map action
-    if (res.isEmpty) echo("That file does not exist")  // courtesy side-effect
-    res
-  }
-
-  def loadCommand(arg: String) = {
-    var shouldReplay: Option[String] = None
-    withFile(arg)(f => {
-      interpretAllFrom(f)
-      shouldReplay = Some(":load " + arg)
-    })
-    Result(keepRunning = true, shouldReplay)
-  }
-
-  def saveCommand(filename: String): Result = (
-    if (filename.isEmpty) echo("File name is required.")
-    else if (replayCommandStack.isEmpty) echo("No replay commands in session")
-    else File(filename).printlnAll(replayCommands: _*)
-    )
-
-  def addClasspath(arg: String): Unit = {
-    val f = File(arg).normalize
-    if (f.exists) {
-      addedClasspath = ClassPath.join(addedClasspath, f.path)
-      val totalClasspath = ClassPath.join(settings.classpath.value, addedClasspath)
-      echo("Added '%s'.  Your new classpath is:\n\"%s\"".format(f.path, totalClasspath))
-      replay()
-    }
-    else echo("The path '" + f + "' doesn't seem to exist.")
-  }
-
-  def powerCmd(): Result = {
-    if (isReplPower) "Already in power mode."
-    else enablePowerMode(isDuringInit = false)
-  }
-  def enablePowerMode(isDuringInit: Boolean) = {
-    replProps.power setValue true
-    unleashAndSetPhase()
-    // asyncEcho(isDuringInit, power.banner)
-  }
-  private def unleashAndSetPhase() {
-    if (isReplPower) {
-    //  power.unleash()
-      // Set the phase to "typer"
-      // intp beSilentDuring phaseCommand("typer")
-    }
-  }
-
-  def asyncEcho(async: Boolean, msg: => String) {
-    if (async) asyncMessage(msg)
-    else echo(msg)
-  }
-
-  def verbosity() = {
-    val old = intp.printResults
-    intp.printResults = !old
-    echo("Switched " + (if (old) "off" else "on") + " result printing.")
-  }
-
-  /** Run one command submitted by the user.  Two values are returned:
-    * (1) whether to keep running, (2) the line to record for replay,
-    * if any. */
-  def command(line: String): Result = {
-    if (line startsWith ":") {
-      val cmd = line.tail takeWhile (x => !x.isWhitespace)
-      uniqueCommand(cmd) match {
-        case Some(lc) => lc(line.tail stripPrefix cmd dropWhile (_.isWhitespace))
-        case _        => ambiguousError(cmd)
-      }
-    }
-    else if (intp.global == null) Result(keepRunning = false, None)  // Notice failure to create compiler
-    else Result(keepRunning = true, interpretStartingWith(line))
-  }
-
-  private def readWhile(cond: String => Boolean) = {
-    Iterator continually in.readLine("") takeWhile (x => x != null && cond(x))
-  }
-
-  def pasteCommand(arg: String): Result = {
-    var shouldReplay: Option[String] = None
-    def result = Result(keepRunning = true, shouldReplay)
-    val (raw, file) =
-      if (arg.isEmpty) (false, None)
-      else {
-        val r = """(-raw)?(\s+)?([^\-]\S*)?""".r
-        arg match {
-          case r(flag, sep, name) =>
-            if (flag != null && name != null && sep == null)
-              echo(s"""I assume you mean "$flag $name"?""")
-            (flag != null, Option(name))
-          case _ =>
-            echo("usage: :paste -raw file")
-            return result
-        }
-      }
-    val code = file match {
-      case Some(name) =>
-        withFile(name)(f => {
-          shouldReplay = Some(s":paste $arg")
-          val s = f.slurp.trim
-          if (s.isEmpty) echo(s"File contains no code: $f")
-          else echo(s"Pasting file $f...")
-          s
-        }) getOrElse ""
-      case None =>
-        echo("// Entering paste mode (ctrl-D to finish)\n")
-        val text = (readWhile(_ => true) mkString "\n").trim
-        if (text.isEmpty) echo("\n// Nothing pasted, nothing gained.\n")
-        else echo("\n// Exiting paste mode, now interpreting.\n")
-        text
-    }
-    def interpretCode() = {
-      val res = intp interpret code
-      // if input is incomplete, let the compiler try to say why
-      if (res == IR.Incomplete) {
-        echo("The pasted code is incomplete!\n")
-        // Remembrance of Things Pasted in an object
-        val errless = intp compileSources new BatchSourceFile("<pastie>", s"object pastel {\n$code\n}")
-        if (errless) echo("...but compilation found no error? Good luck with that.")
-      }
-    }
-    def compileCode() = {
-      val errless = intp compileSources new BatchSourceFile("<pastie>", code)
-      if (!errless) echo("There were compilation errors!")
-    }
-    if (code.nonEmpty) {
-      if (raw) compileCode() else interpretCode()
-    }
-    result
-  }
-
-  private object paste extends Pasted {
-    val ContinueString = "     | "
-    val PromptString   = "scala> "
-
-    def interpret(line: String): Unit = {
-      echo(line.trim)
-      intp interpret line
-      echo("")
-    }
-
-    def transcript(start: String) = {
-      echo("\n// Detected repl transcript paste: ctrl-D to finish.\n")
-      apply(Iterator(start) ++ readWhile(_.trim != PromptString.trim))
-    }
-  }
-  import paste.{ ContinueString, PromptString }
-
-  /** Interpret expressions starting with the first line.
-    * Read lines until a complete compilation unit is available
-    * or until a syntax error has been seen.  If a full unit is
-    * read, go ahead and interpret it.  Return the full string
-    * to be recorded for replay, if any.
-    */
-  def interpretStartingWith(code: String): Option[String] = {
-    // signal completion non-completion input has been received
-    in.completion.resetVerbosity()
-
-    def reallyInterpret = {
-      val reallyResult = intp.interpret(code)
-      (reallyResult, reallyResult match {
-        case IR.Error       => None
-        case IR.Success     => Some(code)
-        case IR.Incomplete  =>
-          if (in.interactive && code.endsWith("\n\n")) {
-            echo("You typed two blank lines.  Starting a new command.")
-            None
-          }
-          else in.readLine(ContinueString) match {
-            case null =>
-              // we know compilation is going to fail since we're at EOF and the
-              // parser thinks the input is still incomplete, but since this is
-              // a file being read non-interactively we want to fail.  So we send
-              // it straight to the compiler for the nice error message.
-              intp.compileString(code)
-              None
-
-            case line => interpretStartingWith(code + "\n" + line)
-          }
-      })
-    }
-
-    /** Here we place ourselves between the user and the interpreter and examine
-      *  the input they are ostensibly submitting.  We intervene in several cases:
-      *
-      *  1) If the line starts with "scala> " it is assumed to be an interpreter paste.
-      *  2) If the line starts with "." (but not ".." or "./") it is treated as an invocation
-      *     on the previous result.
-      *  3) If the Completion object's execute returns Some(_), we inject that value
-      *     and avoid the interpreter, as it's likely not valid scala code.
-      */
-    if (code == "") None
-    else if (!paste.running && code.trim.startsWith(PromptString)) {
-      paste.transcript(code)
-      None
-    }
-    else if (Completion.looksLikeInvocation(code) && intp.mostRecentVar != "") {
-      interpretStartingWith(intp.mostRecentVar + code)
-    }
-    else if (code.trim startsWith "//") {
-      // line comment, do nothing
-      None
-    }
-    else
-      reallyInterpret._2
-  }
-
-  // runs :load `file` on any files passed via -i
-  def loadFiles(settings: Settings) = settings match {
-    case settings: GenericRunnerSettings =>
-      for (filename <- settings.loadfiles.value) {
-        val cmd = ":load " + filename
-        command(cmd)
-        addReplay(cmd)
-        echo("")
-      }
-    case _ =>
-  }
-
-  /** Tries to create a JLineReader, falling back to SimpleReader:
-    *  unless settings or properties are such that it should start
-    *  with SimpleReader.
-    */
-  def chooseReader(settings: Settings): InteractiveReader = {
-    if (settings.Xnojline || Properties.isEmacsShell)
-      SimpleReader()
-    else try new JLineReader(
-      if (settings.noCompletion) NoCompletion
-      else new SparkJLineCompletion(intp)
-    )
-    catch {
-      case ex @ (_: Exception | _: NoClassDefFoundError) =>
-        echo("Failed to created JLineReader: " + ex + "\nFalling back to SimpleReader.")
-        SimpleReader()
-    }
-  }
-  protected def tagOfStaticClass[T: ClassTag]: u.TypeTag[T] =
-    u.TypeTag[T](
-      m,
-      new TypeCreator {
-        def apply[U <: Universe with Singleton](m: Mirror[U]): U # Type =
-          m.staticClass(classTag[T].runtimeClass.getName).toTypeConstructor.asInstanceOf[U # Type]
-      })
-
-  private def loopPostInit() {
-    // Bind intp somewhere out of the regular namespace where
-    // we can get at it in generated code.
-    intp.quietBind(NamedParam[SparkIMain]("$intp", intp)(tagOfStaticClass[SparkIMain], classTag[SparkIMain]))
-    // Auto-run code via some setting.
-    ( replProps.replAutorunCode.option
-      flatMap (f => io.File(f).safeSlurp())
-      foreach (intp quietRun _)
-      )
-    // classloader and power mode setup
-    intp.setContextClassLoader()
-    if (isReplPower) {
-     // replProps.power setValue true
-     // unleashAndSetPhase()
-     // asyncMessage(power.banner)
-    }
-    // SI-7418 Now, and only now, can we enable TAB completion.
-    in match {
-      case x: JLineReader => x.consoleReader.postInit
-      case _              =>
-    }
-  }
-  def process(settings: Settings): Boolean = savingContextLoader {
-    this.settings = settings
-    createInterpreter()
-
-    // sets in to some kind of reader depending on environmental cues
-    in = in0.fold(chooseReader(settings))(r => SimpleReader(r, out, interactive = true))
-    globalFuture = future {
-      intp.initializeSynchronous()
-      loopPostInit()
-      !intp.reporter.hasErrors
-    }
-    import scala.concurrent.duration._
-    Await.ready(globalFuture, 10 seconds)
-    printWelcome()
+  override def commands: List[LoopCommand] = sparkStandardCommands
+
+  /** 
+   * We override `loadFiles` because we need to initialize Spark *before* the REPL
+   * sees any files, so that the Spark context is visible in those files. This is a bit of a
+   * hack, but there isn't another hook available to us at this point.
+   */
+  override def loadFiles(settings: Settings): Unit = {
     initializeSpark()
-    loadFiles(settings)
-
-    try loop()
-    catch AbstractOrMissingHandler()
-    finally closeInterpreter()
-
-    true
+    super.loadFiles(settings)
   }
-
-  @deprecated("Use `process` instead", "2.9.0")
-  def main(settings: Settings): Unit = process(settings) //used by sbt
 }
 
 object SparkILoop {
-  implicit def loopToInterpreter(repl: SparkILoop): SparkIMain = repl.intp
 
-  // Designed primarily for use by test code: take a String with a
-  // bunch of code, and prints out a transcript of what it would look
-  // like if you'd just typed it into the repl.
-  def runForTranscript(code: String, settings: Settings): String = {
-    import java.io.{ BufferedReader, StringReader, OutputStreamWriter }
-
-    stringFromStream { ostream =>
-      Console.withOut(ostream) {
-        val output = new JPrintWriter(new OutputStreamWriter(ostream), true) {
-          override def write(str: String) = {
-            // completely skip continuation lines
-            if (str forall (ch => ch.isWhitespace || ch == '|')) ()
-            else super.write(str)
-          }
-        }
-        val input = new BufferedReader(new StringReader(code.trim + "\n")) {
-          override def readLine(): String = {
-            val s = super.readLine()
-            // helping out by printing the line being interpreted.
-            if (s != null)
-              // scalastyle:off println
-              output.println(s)
-              // scalastyle:on println
-            s
-          }
-        }
-        val repl = new SparkILoop(input, output)
-        if (settings.classpath.isDefault)
-          settings.classpath.value = sys.props("java.class.path")
-
-        repl process settings
-      }
-    }
-  }
-
-  /** Creates an interpreter loop with default settings and feeds
-    *  the given code to it as input.
-    */
+  /** 
+   * Creates an interpreter loop with default settings and feeds
+   * the given code to it as input.
+   */
   def run(code: String, sets: Settings = new Settings): String = {
     import java.io.{ BufferedReader, StringReader, OutputStreamWriter }
 
     stringFromStream { ostream =>
       Console.withOut(ostream) {
-        val input    = new BufferedReader(new StringReader(code))
-        val output   = new JPrintWriter(new OutputStreamWriter(ostream), true)
-        val repl     = new SparkILoop(input, output)
+        val input = new BufferedReader(new StringReader(code))
+        val output = new JPrintWriter(new OutputStreamWriter(ostream), true)
+        val repl = new SparkILoop(input, output)
 
         if (sets.classpath.isDefault)
           sets.classpath.value = sys.props("java.class.path")
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
deleted file mode 100644
index 56c009a4e38e7..0000000000000
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkIMain.scala
+++ /dev/null
@@ -1,1323 +0,0 @@
-/* NSC -- new Scala compiler
- * Copyright 2005-2013 LAMP/EPFL
- * @author  Martin Odersky
- */
-
-package scala
-package tools.nsc
-package interpreter
-
-import PartialFunction.cond
-import scala.language.implicitConversions
-import scala.beans.BeanProperty
-import scala.collection.mutable
-import scala.concurrent.{ Future, ExecutionContext }
-import scala.reflect.runtime.{ universe => ru }
-import scala.reflect.{ ClassTag, classTag }
-import scala.reflect.internal.util.{ BatchSourceFile, SourceFile }
-import scala.tools.util.PathResolver
-import scala.tools.nsc.io.AbstractFile
-import scala.tools.nsc.typechecker.{ TypeStrings, StructuredTypeStrings }
-import scala.tools.nsc.util.{ ScalaClassLoader, stringFromReader, stringFromWriter, StackTraceOps }
-import scala.tools.nsc.util.Exceptional.unwrap
-import javax.script.{AbstractScriptEngine, Bindings, ScriptContext, ScriptEngine, ScriptEngineFactory, ScriptException, CompiledScript, Compilable}
-
-/** An interpreter for Scala code.
-  *
-  *  The main public entry points are compile(), interpret(), and bind().
-  *  The compile() method loads a complete Scala file.  The interpret() method
-  *  executes one line of Scala code at the request of the user.  The bind()
-  *  method binds an object to a variable that can then be used by later
-  *  interpreted code.
-  *
-  *  The overall approach is based on compiling the requested code and then
-  *  using a Java classloader and Java reflection to run the code
-  *  and access its results.
-  *
-  *  In more detail, a single compiler instance is used
-  *  to accumulate all successfully compiled or interpreted Scala code.  To
-  *  "interpret" a line of code, the compiler generates a fresh object that
-  *  includes the line of code and which has public member(s) to export
-  *  all variables defined by that code.  To extract the result of an
-  *  interpreted line to show the user, a second "result object" is created
-  *  which imports the variables exported by the above object and then
-  *  exports members called "$eval" and "$print". To accomodate user expressions
-  *  that read from variables or methods defined in previous statements, "import"
-  *  statements are used.
-  *
-  *  This interpreter shares the strengths and weaknesses of using the
-  *  full compiler-to-Java.  The main strength is that interpreted code
-  *  behaves exactly as does compiled code, including running at full speed.
-  *  The main weakness is that redefining classes and methods is not handled
-  *  properly, because rebinding at the Java level is technically difficult.
-  *
-  *  @author Moez A. Abdel-Gawad
-  *  @author Lex Spoon
-  */
-class SparkIMain(@BeanProperty val factory: ScriptEngineFactory, initialSettings: Settings,
-  protected val out: JPrintWriter) extends AbstractScriptEngine with Compilable with SparkImports {
-  imain =>
-
-  setBindings(createBindings, ScriptContext.ENGINE_SCOPE)
-  object replOutput extends ReplOutput(settings.Yreploutdir) { }
-
-  @deprecated("Use replOutput.dir instead", "2.11.0")
-  def virtualDirectory = replOutput.dir
-  // Used in a test case.
-  def showDirectory() = replOutput.show(out)
-
-  private[nsc] var printResults               = true      // whether to print result lines
-  private[nsc] var totalSilence               = false     // whether to print anything
-  private var _initializeComplete             = false     // compiler is initialized
-  private var _isInitialized: Future[Boolean] = null      // set up initialization future
-  private var bindExceptions                  = true      // whether to bind the lastException variable
-  private var _executionWrapper               = ""        // code to be wrapped around all lines
-
-  /** We're going to go to some trouble to initialize the compiler asynchronously.
-    *  It's critical that nothing call into it until it's been initialized or we will
-    *  run into unrecoverable issues, but the perceived repl startup time goes
-    *  through the roof if we wait for it.  So we initialize it with a future and
-    *  use a lazy val to ensure that any attempt to use the compiler object waits
-    *  on the future.
-    */
-  private var _classLoader: util.AbstractFileClassLoader = null                              // active classloader
-  private val _compiler: ReplGlobal                 = newCompiler(settings, reporter)   // our private compiler
-
-  def compilerClasspath: Seq[java.net.URL] = (
-    if (isInitializeComplete) global.classPath.asURLs
-    else new PathResolver(settings).result.asURLs  // the compiler's classpath
-    )
-  def settings = initialSettings
-  // Run the code body with the given boolean settings flipped to true.
-  def withoutWarnings[T](body: => T): T = beQuietDuring {
-    val saved = settings.nowarn.value
-    if (!saved)
-      settings.nowarn.value = true
-
-    try body
-    finally if (!saved) settings.nowarn.value = false
-  }
-
-  /** construct an interpreter that reports to Console */
-  def this(settings: Settings, out: JPrintWriter) = this(null, settings, out)
-  def this(factory: ScriptEngineFactory, settings: Settings) = this(factory, settings, new NewLinePrintWriter(new ConsoleWriter, true))
-  def this(settings: Settings) = this(settings, new NewLinePrintWriter(new ConsoleWriter, true))
-  def this(factory: ScriptEngineFactory) = this(factory, new Settings())
-  def this() = this(new Settings())
-
-  lazy val formatting: Formatting = new Formatting {
-    val prompt = Properties.shellPromptString
-  }
-  lazy val reporter: SparkReplReporter = new SparkReplReporter(this)
-
-  import formatting._
-  import reporter.{ printMessage, printUntruncatedMessage }
-
-  // This exists mostly because using the reporter too early leads to deadlock.
-  private def echo(msg: String) { Console println msg }
-  private def _initSources = List(new BatchSourceFile("<init>", "class $repl_$init { }"))
-  private def _initialize() = {
-    try {
-      // if this crashes, REPL will hang its head in shame
-      val run = new _compiler.Run()
-      assert(run.typerPhase != NoPhase, "REPL requires a typer phase.")
-      run compileSources _initSources
-      _initializeComplete = true
-      true
-    }
-    catch AbstractOrMissingHandler()
-  }
-  private def tquoted(s: String) = "\"\"\"" + s + "\"\"\""
-  private val logScope = scala.sys.props contains "scala.repl.scope"
-  // scalastyle:off println
-  private def scopelog(msg: String) = if (logScope) Console.err.println(msg)
-  // scalastyle:on println
-
-  // argument is a thunk to execute after init is done
-  def initialize(postInitSignal: => Unit) {
-    synchronized {
-      if (_isInitialized == null) {
-        _isInitialized =
-          Future(try _initialize() finally postInitSignal)(ExecutionContext.global)
-      }
-    }
-  }
-  def initializeSynchronous(): Unit = {
-    if (!isInitializeComplete) {
-      _initialize()
-      assert(global != null, global)
-    }
-  }
-  def isInitializeComplete = _initializeComplete
-
-  lazy val global: Global = {
-    if (!isInitializeComplete) _initialize()
-    _compiler
-  }
-
-  import global._
-  import definitions.{ ObjectClass, termMember, dropNullaryMethod}
-
-  lazy val runtimeMirror = ru.runtimeMirror(classLoader)
-
-  private def noFatal(body: => Symbol): Symbol = try body catch { case _: FatalError => NoSymbol }
-
-  def getClassIfDefined(path: String)  = (
-    noFatal(runtimeMirror staticClass path)
-      orElse noFatal(rootMirror staticClass path)
-    )
-  def getModuleIfDefined(path: String) = (
-    noFatal(runtimeMirror staticModule path)
-      orElse noFatal(rootMirror staticModule path)
-    )
-
-  implicit class ReplTypeOps(tp: Type) {
-    def andAlso(fn: Type => Type): Type = if (tp eq NoType) tp else fn(tp)
-  }
-
-  // TODO: If we try to make naming a lazy val, we run into big time
-  // scalac unhappiness with what look like cycles.  It has not been easy to
-  // reduce, but name resolution clearly takes different paths.
-  object naming extends {
-    val global: imain.global.type = imain.global
-  } with Naming {
-    // make sure we don't overwrite their unwisely named res3 etc.
-    def freshUserTermName(): TermName = {
-      val name = newTermName(freshUserVarName())
-      if (replScope containsName name) freshUserTermName()
-      else name
-    }
-    def isInternalTermName(name: Name) = isInternalVarName("" + name)
-  }
-  import naming._
-
-  object deconstruct extends {
-    val global: imain.global.type = imain.global
-  } with StructuredTypeStrings
-
-  lazy val memberHandlers = new {
-    val intp: imain.type = imain
-  } with SparkMemberHandlers
-  import memberHandlers._
-
-  /** Temporarily be quiet */
-  def beQuietDuring[T](body: => T): T = {
-    val saved = printResults
-    printResults = false
-    try body
-    finally printResults = saved
-  }
-  def beSilentDuring[T](operation: => T): T = {
-    val saved = totalSilence
-    totalSilence = true
-    try operation
-    finally totalSilence = saved
-  }
-
-  def quietRun[T](code: String) = beQuietDuring(interpret(code))
-
-  /** takes AnyRef because it may be binding a Throwable or an Exceptional */
-  private def withLastExceptionLock[T](body: => T, alt: => T): T = {
-    assert(bindExceptions, "withLastExceptionLock called incorrectly.")
-    bindExceptions = false
-
-    try     beQuietDuring(body)
-    catch   logAndDiscard("withLastExceptionLock", alt)
-    finally bindExceptions = true
-  }
-
-  def executionWrapper = _executionWrapper
-  def setExecutionWrapper(code: String) = _executionWrapper = code
-  def clearExecutionWrapper() = _executionWrapper = ""
-
-  /** interpreter settings */
-  lazy val isettings = new SparkISettings(this)
-
-  /** Instantiate a compiler.  Overridable. */
-  protected def newCompiler(settings: Settings, reporter: reporters.Reporter): ReplGlobal = {
-    settings.outputDirs setSingleOutput replOutput.dir
-    settings.exposeEmptyPackage.value = true
-    new Global(settings, reporter) with ReplGlobal { override def toString: String = "<global>" }
-  }
-
-  /** Parent classloader.  Overridable. */
-  protected def parentClassLoader: ClassLoader =
-    settings.explicitParentLoader.getOrElse( this.getClass.getClassLoader() )
-
-  /* A single class loader is used for all commands interpreted by this Interpreter.
-     It would also be possible to create a new class loader for each command
-     to interpret.  The advantages of the current approach are:
-
-       - Expressions are only evaluated one time.  This is especially
-         significant for I/O, e.g. "val x = Console.readLine"
-
-     The main disadvantage is:
-
-       - Objects, classes, and methods cannot be rebound.  Instead, definitions
-         shadow the old ones, and old code objects refer to the old
-         definitions.
-  */
-  def resetClassLoader() = {
-    repldbg("Setting new classloader: was " + _classLoader)
-    _classLoader = null
-    ensureClassLoader()
-  }
-  final def ensureClassLoader() {
-    if (_classLoader == null)
-      _classLoader = makeClassLoader()
-  }
-  def classLoader: util.AbstractFileClassLoader = {
-    ensureClassLoader()
-    _classLoader
-  }
-
-  def backticked(s: String): String = (
-    (s split '.').toList map {
-      case "_"                               => "_"
-      case s if nme.keywords(newTermName(s)) => s"`$s`"
-      case s                                 => s
-    } mkString "."
-    )
-  def readRootPath(readPath: String) = getModuleIfDefined(readPath)
-
-  abstract class PhaseDependentOps {
-    def shift[T](op: => T): T
-
-    def path(name: => Name): String = shift(path(symbolOfName(name)))
-    def path(sym: Symbol): String = backticked(shift(sym.fullName))
-    def sig(sym: Symbol): String  = shift(sym.defString)
-  }
-  object typerOp extends PhaseDependentOps {
-    def shift[T](op: => T): T = exitingTyper(op)
-  }
-  object flatOp extends PhaseDependentOps {
-    def shift[T](op: => T): T = exitingFlatten(op)
-  }
-
-  def originalPath(name: String): String = originalPath(name: TermName)
-  def originalPath(name: Name): String   = typerOp path name
-  def originalPath(sym: Symbol): String  = typerOp path sym
-  def flatPath(sym: Symbol): String      = flatOp shift sym.javaClassName
-  def translatePath(path: String) = {
-    val sym = if (path endsWith "$") symbolOfTerm(path.init) else symbolOfIdent(path)
-    sym.toOption map flatPath
-  }
-  def translateEnclosingClass(n: String) = symbolOfTerm(n).enclClass.toOption map flatPath
-
-  private class TranslatingClassLoader(parent: ClassLoader) extends util.AbstractFileClassLoader(replOutput.dir, parent) {
-    /** Overridden here to try translating a simple name to the generated
-      *  class name if the original attempt fails.  This method is used by
-      *  getResourceAsStream as well as findClass.
-      */
-    override protected def findAbstractFile(name: String): AbstractFile =
-      super.findAbstractFile(name) match {
-        case null if _initializeComplete => translatePath(name) map (super.findAbstractFile(_)) orNull
-        case file => file
-      }
-  }
-  private def makeClassLoader(): util.AbstractFileClassLoader =
-    new TranslatingClassLoader(parentClassLoader match {
-      case null   => ScalaClassLoader fromURLs compilerClasspath
-      case p      => new ScalaClassLoader.URLClassLoader(compilerClasspath, p)
-    })
-
-  // Set the current Java "context" class loader to this interpreter's class loader
-  def setContextClassLoader() = classLoader.setAsContext()
-
-  def allDefinedNames: List[Name]  = exitingTyper(replScope.toList.map(_.name).sorted)
-  def unqualifiedIds: List[String] = allDefinedNames map (_.decode) sorted
-
-  /** Most recent tree handled which wasn't wholly synthetic. */
-  private def mostRecentlyHandledTree: Option[Tree] = {
-    prevRequests.reverse foreach { req =>
-      req.handlers.reverse foreach {
-        case x: MemberDefHandler if x.definesValue && !isInternalTermName(x.name) => return Some(x.member)
-        case _ => ()
-      }
-    }
-    None
-  }
-
-  private def updateReplScope(sym: Symbol, isDefined: Boolean) {
-    def log(what: String) {
-      val mark = if (sym.isType) "t " else "v "
-      val name = exitingTyper(sym.nameString)
-      val info = cleanTypeAfterTyper(sym)
-      val defn = sym defStringSeenAs info
-
-      scopelog(f"[$mark$what%6s] $name%-25s $defn%s")
-    }
-    if (ObjectClass isSubClass sym.owner) return
-    // unlink previous
-    replScope lookupAll sym.name foreach { sym =>
-      log("unlink")
-      replScope unlink sym
-    }
-    val what = if (isDefined) "define" else "import"
-    log(what)
-    replScope enter sym
-  }
-
-  def recordRequest(req: Request) {
-    if (req == null)
-      return
-
-    prevRequests += req
-
-    // warning about serially defining companions.  It'd be easy
-    // enough to just redefine them together but that may not always
-    // be what people want so I'm waiting until I can do it better.
-    exitingTyper {
-      req.defines filterNot (s => req.defines contains s.companionSymbol) foreach { newSym =>
-        val oldSym = replScope lookup newSym.name.companionName
-        if (Seq(oldSym, newSym).permutations exists { case Seq(s1, s2) => s1.isClass && s2.isModule }) {
-          replwarn(s"warning: previously defined $oldSym is not a companion to $newSym.")
-          replwarn("Companions must be defined together; you may wish to use :paste mode for this.")
-        }
-      }
-    }
-    exitingTyper {
-      req.imports foreach (sym => updateReplScope(sym, isDefined = false))
-      req.defines foreach (sym => updateReplScope(sym, isDefined = true))
-    }
-  }
-
-  private[nsc] def replwarn(msg: => String) {
-    if (!settings.nowarnings)
-      printMessage(msg)
-  }
-
-  def compileSourcesKeepingRun(sources: SourceFile*) = {
-    val run = new Run()
-    assert(run.typerPhase != NoPhase, "REPL requires a typer phase.")
-    reporter.reset()
-    run compileSources sources.toList
-    (!reporter.hasErrors, run)
-  }
-
-  /** Compile an nsc SourceFile.  Returns true if there are
-    *  no compilation errors, or false otherwise.
-    */
-  def compileSources(sources: SourceFile*): Boolean =
-    compileSourcesKeepingRun(sources: _*)._1
-
-  /** Compile a string.  Returns true if there are no
-    *  compilation errors, or false otherwise.
-    */
-  def compileString(code: String): Boolean =
-    compileSources(new BatchSourceFile("<script>", code))
-
-  /** Build a request from the user. `trees` is `line` after being parsed.
-    */
-  private def buildRequest(line: String, trees: List[Tree]): Request = {
-    executingRequest = new Request(line, trees)
-    executingRequest
-  }
-
-  private def safePos(t: Tree, alt: Int): Int =
-    try t.pos.start
-    catch { case _: UnsupportedOperationException => alt }
-
-  // Given an expression like 10 * 10 * 10 we receive the parent tree positioned
-  // at a '*'.  So look at each subtree and find the earliest of all positions.
-  private def earliestPosition(tree: Tree): Int = {
-    var pos = Int.MaxValue
-    tree foreach { t =>
-      pos = math.min(pos, safePos(t, Int.MaxValue))
-    }
-    pos
-  }
-
-  private def requestFromLine(line: String, synthetic: Boolean): Either[IR.Result, Request] = {
-    val content = indentCode(line)
-    val trees = parse(content) match {
-      case parse.Incomplete     => return Left(IR.Incomplete)
-      case parse.Error          => return Left(IR.Error)
-      case parse.Success(trees) => trees
-    }
-    repltrace(
-      trees map (t => {
-        // [Eugene to Paul] previously it just said `t map ...`
-        // because there was an implicit conversion from Tree to a list of Trees
-        // however Martin and I have removed the conversion
-        // (it was conflicting with the new reflection API),
-        // so I had to rewrite this a bit
-        val subs = t collect { case sub => sub }
-        subs map (t0 =>
-          "  " + safePos(t0, -1) + ": " + t0.shortClass + "\n"
-          ) mkString ""
-      }) mkString "\n"
-    )
-    // If the last tree is a bare expression, pinpoint where it begins using the
-    // AST node position and snap the line off there.  Rewrite the code embodied
-    // by the last tree as a ValDef instead, so we can access the value.
-    val last = trees.lastOption.getOrElse(EmptyTree)
-    last match {
-      case _:Assign                        => // we don't want to include assignments
-      case _:TermTree | _:Ident | _:Select => // ... but do want other unnamed terms.
-        val varName  = if (synthetic) freshInternalVarName() else freshUserVarName()
-        val rewrittenLine = (
-          // In theory this would come out the same without the 1-specific test, but
-          // it's a cushion against any more sneaky parse-tree position vs. code mismatches:
-          // this way such issues will only arise on multiple-statement repl input lines,
-          // which most people don't use.
-          if (trees.size == 1) "val " + varName + " =\n" + content
-          else {
-            // The position of the last tree
-            val lastpos0 = earliestPosition(last)
-            // Oh boy, the parser throws away parens so "(2+2)" is mispositioned,
-            // with increasingly hard to decipher positions as we move on to "() => 5",
-            // (x: Int) => x + 1, and more.  So I abandon attempts to finesse and just
-            // look for semicolons and newlines, which I'm sure is also buggy.
-            val (raw1, raw2) = content splitAt lastpos0
-            repldbg("[raw] " + raw1 + "   <--->   " + raw2)
-
-            val adjustment = (raw1.reverse takeWhile (ch => (ch != ';') && (ch != '\n'))).size
-            val lastpos = lastpos0 - adjustment
-
-            // the source code split at the laboriously determined position.
-            val (l1, l2) = content splitAt lastpos
-            repldbg("[adj] " + l1 + "   <--->   " + l2)
-
-            val prefix   = if (l1.trim == "") "" else l1 + ";\n"
-            // Note to self: val source needs to have this precise structure so that
-            // error messages print the user-submitted part without the "val res0 = " part.
-            val combined   = prefix + "val " + varName + " =\n" + l2
-
-            repldbg(List(
-              "    line" -> line,
-              " content" -> content,
-              "     was" -> l2,
-              "combined" -> combined) map {
-              case (label, s) => label + ": '" + s + "'"
-            } mkString "\n"
-            )
-            combined
-          }
-          )
-        // Rewriting    "foo ; bar ; 123"
-        // to           "foo ; bar ; val resXX = 123"
-        requestFromLine(rewrittenLine, synthetic) match {
-          case Right(req) => return Right(req withOriginalLine line)
-          case x          => return x
-        }
-      case _ =>
-    }
-    Right(buildRequest(line, trees))
-  }
-
-  // dealias non-public types so we don't see protected aliases like Self
-  def dealiasNonPublic(tp: Type) = tp match {
-    case TypeRef(_, sym, _) if sym.isAliasType && !sym.isPublic => tp.dealias
-    case _                                                      => tp
-  }
-
-  /**
-   *  Interpret one line of input. All feedback, including parse errors
-   *  and evaluation results, are printed via the supplied compiler's
-   *  reporter. Values defined are available for future interpreted strings.
-   *
-   *  The return value is whether the line was interpreter successfully,
-   *  e.g. that there were no parse errors.
-   */
-  def interpret(line: String): IR.Result = interpret(line, synthetic = false)
-  def interpretSynthetic(line: String): IR.Result = interpret(line, synthetic = true)
-  def interpret(line: String, synthetic: Boolean): IR.Result = compile(line, synthetic) match {
-    case Left(result) => result
-    case Right(req)   => new WrappedRequest(req).loadAndRunReq
-  }
-
-  private def compile(line: String, synthetic: Boolean): Either[IR.Result, Request] = {
-    if (global == null) Left(IR.Error)
-    else requestFromLine(line, synthetic) match {
-      case Left(result) => Left(result)
-      case Right(req)   =>
-        // null indicates a disallowed statement type; otherwise compile and
-        // fail if false (implying e.g. a type error)
-        if (req == null || !req.compile) Left(IR.Error) else Right(req)
-    }
-  }
-
-  var code = ""
-  var bound = false
-  def compiled(script: String): CompiledScript = {
-    if (!bound) {
-      quietBind("engine" -> this.asInstanceOf[ScriptEngine])
-      bound = true
-    }
-    val cat = code + script
-    compile(cat, false) match {
-      case Left(result) => result match {
-        case IR.Incomplete => {
-          code = cat + "\n"
-          new CompiledScript {
-            def eval(context: ScriptContext): Object = null
-            def getEngine: ScriptEngine = SparkIMain.this
-          }
-        }
-        case _ => {
-          code = ""
-          throw new ScriptException("compile-time error")
-        }
-      }
-      case Right(req)   => {
-        code = ""
-        new WrappedRequest(req)
-      }
-    }
-  }
-
-  private class WrappedRequest(val req: Request) extends CompiledScript {
-    var recorded = false
-
-    /** In Java we would have to wrap any checked exception in the declared
-      *  ScriptException. Runtime exceptions and errors would be ok and would
-      *  not need to be caught. So let us do the same in Scala : catch and
-      *  wrap any checked exception, and let runtime exceptions and errors
-      *  escape. We could have wrapped runtime exceptions just like other
-      *  exceptions in ScriptException, this is a choice.
-      */
-    @throws[ScriptException]
-    def eval(context: ScriptContext): Object = {
-      val result = req.lineRep.evalEither match {
-        case Left(e: RuntimeException) => throw e
-        case Left(e: Exception) => throw new ScriptException(e)
-        case Left(e) => throw e
-        case Right(result) => result.asInstanceOf[Object]
-      }
-      if (!recorded) {
-        recordRequest(req)
-        recorded = true
-      }
-      result
-    }
-
-    def loadAndRunReq = classLoader.asContext {
-      val (result, succeeded) = req.loadAndRun
-
-      /** To our displeasure, ConsoleReporter offers only printMessage,
-        *  which tacks a newline on the end.  Since that breaks all the
-        *  output checking, we have to take one off to balance.
-        */
-      if (succeeded) {
-        if (printResults && result != "")
-          printMessage(result stripSuffix "\n")
-        else if (isReplDebug) // show quiet-mode activity
-          printMessage(result.trim.lines map ("[quiet] " + _) mkString "\n")
-
-        // Book-keeping.  Have to record synthetic requests too,
-        // as they may have been issued for information, e.g. :type
-        recordRequest(req)
-        IR.Success
-      }
-      else {
-        // don't truncate stack traces
-        printUntruncatedMessage(result)
-        IR.Error
-      }
-    }
-
-    def getEngine: ScriptEngine = SparkIMain.this
-  }
-
-  /** Bind a specified name to a specified value.  The name may
-    *  later be used by expressions passed to interpret.
-    *
-    *  @param name      the variable name to bind
-    *  @param boundType the type of the variable, as a string
-    *  @param value     the object value to bind to it
-    *  @return          an indication of whether the binding succeeded
-    */
-  def bind(name: String, boundType: String, value: Any, modifiers: List[String] = Nil): IR.Result = {
-    val bindRep = new ReadEvalPrint()
-    bindRep.compile("""
-                      |object %s {
-                      |  var value: %s = _
-                      |  def set(x: Any) = value = x.asInstanceOf[%s]
-                      |}
-                    """.stripMargin.format(bindRep.evalName, boundType, boundType)
-    )
-    bindRep.callEither("set", value) match {
-      case Left(ex) =>
-        repldbg("Set failed in bind(%s, %s, %s)".format(name, boundType, value))
-        repldbg(util.stackTraceString(ex))
-        IR.Error
-
-      case Right(_) =>
-        val line = "%sval %s = %s.value".format(modifiers map (_ + " ") mkString, name, bindRep.evalPath)
-        repldbg("Interpreting: " + line)
-        interpret(line)
-    }
-  }
-  def directBind(name: String, boundType: String, value: Any): IR.Result = {
-    val result = bind(name, boundType, value)
-    if (result == IR.Success)
-      directlyBoundNames += newTermName(name)
-    result
-  }
-  def directBind(p: NamedParam): IR.Result                                    = directBind(p.name, p.tpe, p.value)
-  def directBind[T: ru.TypeTag : ClassTag](name: String, value: T): IR.Result = directBind((name, value))
-
-  def rebind(p: NamedParam): IR.Result = {
-    val name     = p.name
-    val newType  = p.tpe
-    val tempName = freshInternalVarName()
-
-    quietRun("val %s = %s".format(tempName, name))
-    quietRun("val %s = %s.asInstanceOf[%s]".format(name, tempName, newType))
-  }
-  def quietBind(p: NamedParam): IR.Result                               = beQuietDuring(bind(p))
-  def bind(p: NamedParam): IR.Result                                    = bind(p.name, p.tpe, p.value)
-  def bind[T: ru.TypeTag : ClassTag](name: String, value: T): IR.Result = bind((name, value))
-
-  /** Reset this interpreter, forgetting all user-specified requests. */
-  def reset() {
-    clearExecutionWrapper()
-    resetClassLoader()
-    resetAllCreators()
-    prevRequests.clear()
-    resetReplScope()
-    replOutput.dir.clear()
-  }
-
-  /** This instance is no longer needed, so release any resources
-    *  it is using.  The reporter's output gets flushed.
-    */
-  def close() {
-    reporter.flush()
-  }
-
-  /** Here is where we:
-    *
-    *  1) Read some source code, and put it in the "read" object.
-    *  2) Evaluate the read object, and put the result in the "eval" object.
-    *  3) Create a String for human consumption, and put it in the "print" object.
-    *
-    *  Read! Eval! Print! Some of that not yet centralized here.
-    */
-  class ReadEvalPrint(val lineId: Int) {
-    def this() = this(freshLineId())
-
-    val packageName = sessionNames.line + lineId
-    val readName    = sessionNames.read
-    val evalName    = sessionNames.eval
-    val printName   = sessionNames.print
-    val resultName  = sessionNames.result
-
-    def bindError(t: Throwable) = {
-      if (!bindExceptions) // avoid looping if already binding
-        throw t
-
-      val unwrapped = unwrap(t)
-
-      // Example input: $line3.$read$$iw$$iw$
-      val classNameRegex = (naming.lineRegex + ".*").r
-      def isWrapperInit(x: StackTraceElement) = cond(x.getClassName) {
-        case classNameRegex() if x.getMethodName == nme.CONSTRUCTOR.decoded => true
-      }
-      val stackTrace = unwrapped stackTracePrefixString (!isWrapperInit(_))
-
-      withLastExceptionLock[String]({
-        directBind[Throwable]("lastException", unwrapped)(StdReplTags.tagOfThrowable, classTag[Throwable])
-        stackTrace
-      }, stackTrace)
-    }
-
-    // TODO: split it out into a package object and a regular
-    // object and we can do that much less wrapping.
-    def packageDecl = "package " + packageName
-
-    def pathTo(name: String)   = packageName + "." + name
-    def packaged(code: String) = packageDecl + "\n\n" + code
-
-    def readPath  = pathTo(readName)
-    def evalPath  = pathTo(evalName)
-
-    def call(name: String, args: Any*): AnyRef = {
-      val m = evalMethod(name)
-      repldbg("Invoking: " + m)
-      if (args.nonEmpty)
-        repldbg("  with args: " + args.mkString(", "))
-
-      m.invoke(evalClass, args.map(_.asInstanceOf[AnyRef]): _*)
-    }
-
-    def callEither(name: String, args: Any*): Either[Throwable, AnyRef] =
-      try Right(call(name, args: _*))
-      catch { case ex: Throwable => Left(ex) }
-
-    class EvalException(msg: String, cause: Throwable) extends RuntimeException(msg, cause) { }
-
-    private def evalError(path: String, ex: Throwable) =
-      throw new EvalException("Failed to load '" + path + "': " + ex.getMessage, ex)
-
-    private def load(path: String): Class[_] = {
-      try Class.forName(path, true, classLoader)
-      catch { case ex: Throwable => evalError(path, unwrap(ex)) }
-    }
-
-    lazy val evalClass = load(evalPath)
-
-    def evalEither = callEither(resultName) match {
-      case Left(ex) => ex match {
-        case ex: NullPointerException => Right(null)
-        case ex => Left(unwrap(ex))
-      }
-      case Right(result) => Right(result)
-    }
-
-    def compile(source: String): Boolean = compileAndSaveRun("<console>", source)
-
-    /** The innermost object inside the wrapper, found by
-      * following accessPath into the outer one.
-      */
-    def resolvePathToSymbol(accessPath: String): Symbol = {
-      val readRoot: global.Symbol = readRootPath(readPath) // the outermost wrapper
-      ((".INSTANCE" + accessPath) split '.').foldLeft(readRoot: Symbol) {
-        case (sym, "")    => sym
-        case (sym, name)  => exitingTyper(termMember(sym, name))
-      }
-    }
-    /** We get a bunch of repeated warnings for reasons I haven't
-      *  entirely figured out yet.  For now, squash.
-      */
-    private def updateRecentWarnings(run: Run) {
-      def loop(xs: List[(Position, String)]): List[(Position, String)] = xs match {
-        case Nil                  => Nil
-        case ((pos, msg)) :: rest =>
-          val filtered = rest filter { case (pos0, msg0) =>
-            (msg != msg0) || (pos.lineContent.trim != pos0.lineContent.trim) || {
-              // same messages and same line content after whitespace removal
-              // but we want to let through multiple warnings on the same line
-              // from the same run.  The untrimmed line will be the same since
-              // there's no whitespace indenting blowing it.
-              (pos.lineContent == pos0.lineContent)
-            }
-          }
-          ((pos, msg)) :: loop(filtered)
-      }
-      val warnings = loop(run.reporting.allConditionalWarnings)
-      if (warnings.nonEmpty)
-        mostRecentWarnings = warnings
-    }
-    private def evalMethod(name: String) = evalClass.getMethods filter (_.getName == name) match {
-      case Array()       => null
-      case Array(method) => method
-      case xs            => sys.error("Internal error: eval object " + evalClass + ", " + xs.mkString("\n", "\n", ""))
-    }
-    private def compileAndSaveRun(label: String, code: String) = {
-      showCodeIfDebugging(code)
-      val (success, run) = compileSourcesKeepingRun(new BatchSourceFile(label, packaged(code)))
-      updateRecentWarnings(run)
-      success
-    }
-  }
-
-  /** One line of code submitted by the user for interpretation */
-  class Request(val line: String, val trees: List[Tree]) {
-    def defines    = defHandlers flatMap (_.definedSymbols)
-    def imports    = importedSymbols
-    def value      = Some(handlers.last) filter (h => h.definesValue) map (h => definedSymbols(h.definesTerm.get)) getOrElse NoSymbol
-
-    val lineRep = new ReadEvalPrint()
-
-    private var _originalLine: String = null
-    def withOriginalLine(s: String): this.type = { _originalLine = s ; this }
-    def originalLine = if (_originalLine == null) line else _originalLine
-
-    /** handlers for each tree in this request */
-    val handlers: List[MemberHandler] = trees map (memberHandlers chooseHandler _)
-    def defHandlers = handlers collect { case x: MemberDefHandler => x }
-
-    /** list of names used by this expression */
-    val referencedNames: List[Name] = handlers flatMap (_.referencedNames)
-
-    /** def and val names */
-    def termNames = handlers flatMap (_.definesTerm)
-    def typeNames = handlers flatMap (_.definesType)
-    def importedSymbols = handlers flatMap {
-      case x: ImportHandler => x.importedSymbols
-      case _                => Nil
-    }
-
-    val definedClasses = handlers.exists {
-      case _: ClassHandler => true
-      case _ => false
-    }
-    /** Code to import bound names from previous lines - accessPath is code to
-      * append to objectName to access anything bound by request.
-      */
-    lazy val ComputedImports(importsPreamble, importsTrailer, accessPath) =
-      exitingTyper(importsCode(referencedNames.toSet, ObjectSourceCode, definedClasses))
-
-    /** the line of code to compute */
-    def toCompute = line
-
-    /** The path of the value that contains the user code. */
-    def fullAccessPath = s"${lineRep.readPath}.INSTANCE$accessPath"
-
-    /** The path of the given member of the wrapping instance. */
-    def fullPath(vname: String) = s"$fullAccessPath.`$vname`"
-
-    /** generate the source code for the object that computes this request */
-    abstract class Wrapper extends SparkIMain.CodeAssembler[MemberHandler] {
-      def path = originalPath("$intp")
-      def envLines = {
-        if (!isReplPower) Nil // power mode only for now
-        else List("def %s = %s".format("$line", tquoted(originalLine)), "def %s = Nil".format("$trees"))
-      }
-      def preamble = s"""
-        |$preambleHeader
-        |%s%s%s
-      """.stripMargin.format(lineRep.readName, envLines.map("  " + _ + ";\n").mkString,
-          importsPreamble, indentCode(toCompute))
-
-      val generate = (m: MemberHandler) => m extraCodeToEvaluate Request.this
-
-      /** A format string with %s for $read, specifying the wrapper definition. */
-      def preambleHeader: String
-
-      /** Like preambleHeader for an import wrapper. */
-      def prewrap: String = preambleHeader + "\n"
-
-      /** Like postamble for an import wrapper. */
-      def postwrap: String
-    }
-
-    private class ObjectBasedWrapper extends Wrapper {
-      def preambleHeader = "object %s {"
-
-      def postamble = importsTrailer + "\n}"
-
-      def postwrap = "}\n"
-    }
-
-    private class ClassBasedWrapper extends Wrapper {
-      def preambleHeader = "class %s extends Serializable {"
-
-      /** Adds an object that instantiates the outer wrapping class. */
-      def postamble  = s"""
-                          |$importsTrailer
-                          |}
-                          |object ${lineRep.readName} {
-                          |   val INSTANCE = new ${lineRep.readName}();
-                          |}
-                          |""".stripMargin
-
-      import nme.{ INTERPRETER_IMPORT_WRAPPER => iw }
-
-      /** Adds a val that instantiates the wrapping class. */
-      def postwrap = s"}\nval $iw = new $iw\n"
-    }
-
-    private lazy val ObjectSourceCode: Wrapper = new ClassBasedWrapper
-    private object ResultObjectSourceCode extends SparkIMain.CodeAssembler[MemberHandler] {
-      /** We only want to generate this code when the result
-        *  is a value which can be referred to as-is.
-        */
-      val evalResult = Request.this.value match {
-        case NoSymbol => ""
-        case sym      =>
-          "lazy val %s = %s".format(lineRep.resultName,  fullPath(sym.decodedName))
-      }
-      // first line evaluates object to make sure constructor is run
-      // initial "" so later code can uniformly be: + etc
-      val preamble = """
-                       |object %s {
-                       |  %s
-                       |  lazy val %s: String = %s {
-                       |    %s
-                       |    (""
-                     """.stripMargin.format(
-          lineRep.evalName, evalResult, lineRep.printName,
-          executionWrapper, fullAccessPath
-        )
-
-      val postamble = """
-                        |    )
-                        |  }
-                        |}
-                      """.stripMargin
-      val generate = (m: MemberHandler) => m resultExtractionCode Request.this
-    }
-
-    /** Compile the object file.  Returns whether the compilation succeeded.
-      *  If all goes well, the "types" map is computed. */
-    lazy val compile: Boolean = {
-      // error counting is wrong, hence interpreter may overlook failure - so we reset
-      reporter.reset()
-
-      // compile the object containing the user's code
-      lineRep.compile(ObjectSourceCode(handlers)) && {
-        // extract and remember types
-        typeOf
-        typesOfDefinedTerms
-
-        // Assign symbols to the original trees
-        // TODO - just use the new trees.
-        defHandlers foreach { dh =>
-          val name = dh.member.name
-          definedSymbols get name foreach { sym =>
-            dh.member setSymbol sym
-            repldbg("Set symbol of " + name + " to " + symbolDefString(sym))
-          }
-        }
-
-        // compile the result-extraction object
-        val handls = if (printResults) handlers else Nil
-        withoutWarnings(lineRep compile ResultObjectSourceCode(handls))
-      }
-    }
-
-    lazy val resultSymbol =  lineRep.resolvePathToSymbol(accessPath)
-
-    def applyToResultMember[T](name: Name, f: Symbol => T) = exitingTyper(f(resultSymbol.info.nonPrivateDecl(name)))
-
-    /* typeOf lookup with encoding */
-    def lookupTypeOf(name: Name) = typeOf.getOrElse(name, typeOf(global.encode(name.toString)))
-
-    private def typeMap[T](f: Type => T) =
-      mapFrom[Name, Name, T](termNames ++ typeNames)(x => f(cleanMemberDecl(resultSymbol, x)))
-
-    /** Types of variables defined by this request. */
-    lazy val compilerTypeOf = typeMap[Type](x => x) withDefaultValue NoType
-    /** String representations of same. */
-    lazy val typeOf         = typeMap[String](tp => exitingTyper(tp.toString))
-
-    lazy val definedSymbols = (
-      termNames.map(x => x -> applyToResultMember(x, x => x)) ++
-        typeNames.map(x => x -> compilerTypeOf(x).typeSymbolDirect)
-      ).toMap[Name, Symbol] withDefaultValue NoSymbol
-
-    lazy val typesOfDefinedTerms = mapFrom[Name, Name, Type](termNames)(x => applyToResultMember(x, _.tpe))
-
-    /** load and run the code using reflection */
-    def loadAndRun: (String, Boolean) = {
-      try   { ("" + (lineRep call sessionNames.print), true) }
-      catch { case ex: Throwable => (lineRep.bindError(ex), false) }
-    }
-
-    override def toString = "Request(line=%s, %s trees)".format(line, trees.size)
-  }
-
-  def createBindings: Bindings = new IBindings {
-    override def put(name: String, value: Object): Object = {
-      val n = name.indexOf(":")
-      val p: NamedParam = if (n < 0) (name, value) else {
-        val nme = name.substring(0, n).trim
-        val tpe = name.substring(n + 1).trim
-        NamedParamClass(nme, tpe, value)
-      }
-      if (!p.name.startsWith("javax.script")) bind(p)
-      null
-    }
-  }
-
-  @throws[ScriptException]
-  def compile(script: String): CompiledScript = eval("new javax.script.CompiledScript { def eval(context: javax.script.ScriptContext): Object = { " + script + " }.asInstanceOf[Object]; def getEngine: javax.script.ScriptEngine = engine }").asInstanceOf[CompiledScript]
-
-  @throws[ScriptException]
-  def compile(reader: java.io.Reader): CompiledScript = compile(stringFromReader(reader))
-
-  @throws[ScriptException]
-  def eval(script: String, context: ScriptContext): Object = compiled(script).eval(context)
-
-  @throws[ScriptException]
-  def eval(reader: java.io.Reader, context: ScriptContext): Object = eval(stringFromReader(reader), context)
-
-  override def finalize = close
-
-  /** Returns the name of the most recent interpreter result.
-    *  Mostly this exists so you can conveniently invoke methods on
-    *  the previous result.
-    */
-  def mostRecentVar: String =
-    if (mostRecentlyHandledTree.isEmpty) ""
-    else "" + (mostRecentlyHandledTree.get match {
-      case x: ValOrDefDef           => x.name
-      case Assign(Ident(name), _)   => name
-      case ModuleDef(_, name, _)    => name
-      case _                        => naming.mostRecentVar
-    })
-
-  private var mostRecentWarnings: List[(global.Position, String)] = Nil
-  def lastWarnings = mostRecentWarnings
-
-  private lazy val importToGlobal  = global mkImporter ru
-  private lazy val importToRuntime = ru.internal createImporter global
-  private lazy val javaMirror = ru.rootMirror match {
-    case x: ru.JavaMirror => x
-    case _                => null
-  }
-  private implicit def importFromRu(sym: ru.Symbol): Symbol = importToGlobal importSymbol sym
-  private implicit def importToRu(sym: Symbol): ru.Symbol   = importToRuntime importSymbol sym
-
-  def classOfTerm(id: String): Option[JClass] = symbolOfTerm(id) match {
-    case NoSymbol => None
-    case sym      => Some(javaMirror runtimeClass importToRu(sym).asClass)
-  }
-
-  def typeOfTerm(id: String): Type = symbolOfTerm(id).tpe
-
-  def valueOfTerm(id: String): Option[Any] = exitingTyper {
-    def value() = {
-      val sym0    = symbolOfTerm(id)
-      val sym     = (importToRuntime importSymbol sym0).asTerm
-      val module  = runtimeMirror.reflectModule(sym.owner.companionSymbol.asModule).instance
-      val module1 = runtimeMirror.reflect(module)
-      val invoker = module1.reflectField(sym)
-
-      invoker.get
-    }
-
-    try Some(value()) catch { case _: Exception => None }
-  }
-
-  /** It's a bit of a shotgun approach, but for now we will gain in
-    *  robustness. Try a symbol-producing operation at phase typer, and
-    *  if that is NoSymbol, try again at phase flatten. I'll be able to
-    *  lose this and run only from exitingTyper as soon as I figure out
-    *  exactly where a flat name is sneaking in when calculating imports.
-    */
-  def tryTwice(op: => Symbol): Symbol = exitingTyper(op) orElse exitingFlatten(op)
-
-  def symbolOfIdent(id: String): Symbol  = symbolOfType(id) orElse symbolOfTerm(id)
-  def symbolOfType(id: String): Symbol   = tryTwice(replScope lookup (id: TypeName))
-  def symbolOfTerm(id: String): Symbol   = tryTwice(replScope lookup (id: TermName))
-  def symbolOfName(id: Name): Symbol     = replScope lookup id
-
-  def runtimeClassAndTypeOfTerm(id: String): Option[(JClass, Type)] = {
-    classOfTerm(id) flatMap { clazz =>
-      clazz.supers find (!_.isScalaAnonymous) map { nonAnon =>
-        (nonAnon, runtimeTypeOfTerm(id))
-      }
-    }
-  }
-
-  def runtimeTypeOfTerm(id: String): Type = {
-    typeOfTerm(id) andAlso { tpe =>
-      val clazz      = classOfTerm(id) getOrElse { return NoType }
-      val staticSym  = tpe.typeSymbol
-      val runtimeSym = getClassIfDefined(clazz.getName)
-
-      if ((runtimeSym != NoSymbol) && (runtimeSym != staticSym) && (runtimeSym isSubClass staticSym))
-        runtimeSym.info
-      else NoType
-    }
-  }
-
-  def cleanTypeAfterTyper(sym: => Symbol): Type = {
-    exitingTyper(
-      dealiasNonPublic(
-        dropNullaryMethod(
-          sym.tpe_*
-        )
-      )
-    )
-  }
-  def cleanMemberDecl(owner: Symbol, member: Name): Type =
-    cleanTypeAfterTyper(owner.info nonPrivateDecl member)
-
-  object exprTyper extends {
-    val repl: SparkIMain.this.type = imain
-  } with SparkExprTyper { }
-
-  /** Parse a line into and return parsing result (error, incomplete or success with list of trees) */
-  object parse {
-    abstract sealed class Result
-    case object Error extends Result
-    case object Incomplete extends Result
-    case class Success(trees: List[Tree]) extends Result
-
-    def apply(line: String): Result = debugging(s"""parse("$line")""")  {
-      var isIncomplete = false
-      currentRun.parsing.withIncompleteHandler((_, _) => isIncomplete = true) {
-        reporter.reset()
-        val trees = newUnitParser(line).parseStats()
-        if (reporter.hasErrors) Error
-        else if (isIncomplete) Incomplete
-        else Success(trees)
-      }
-    }
-  }
-
-  def symbolOfLine(code: String): Symbol =
-    exprTyper.symbolOfLine(code)
-
-  def typeOfExpression(expr: String, silent: Boolean = true): Type =
-    exprTyper.typeOfExpression(expr, silent)
-
-  protected def onlyTerms(xs: List[Name]): List[TermName] = xs collect { case x: TermName => x }
-  protected def onlyTypes(xs: List[Name]): List[TypeName] = xs collect { case x: TypeName => x }
-
-  def definedTerms      = onlyTerms(allDefinedNames) filterNot isInternalTermName
-  def definedTypes      = onlyTypes(allDefinedNames)
-  def definedSymbolList = prevRequestList flatMap (_.defines) filterNot (s => isInternalTermName(s.name))
-
-  // Terms with user-given names (i.e. not res0 and not synthetic)
-  def namedDefinedTerms = definedTerms filterNot (x => isUserVarName("" + x) || directlyBoundNames(x))
-
-  private var _replScope: Scope = _
-  private def resetReplScope() {
-    _replScope = newScope
-  }
-  def replScope = {
-    if (_replScope eq null)
-      _replScope = newScope
-
-    _replScope
-  }
-
-  private var executingRequest: Request = _
-  private val prevRequests       = mutable.ListBuffer[Request]()
-  private val directlyBoundNames = mutable.Set[Name]()
-
-  def allHandlers     = prevRequestList flatMap (_.handlers)
-  def lastRequest     = if (prevRequests.isEmpty) null else prevRequests.last
-  def prevRequestList = prevRequests.toList
-  def importHandlers  = allHandlers collect { case x: ImportHandler => x }
-
-  def withoutUnwrapping(op: => Unit): Unit = {
-    val saved = isettings.unwrapStrings
-    isettings.unwrapStrings = false
-    try op
-    finally isettings.unwrapStrings = saved
-  }
-
-  def symbolDefString(sym: Symbol) = {
-    TypeStrings.quieter(
-      exitingTyper(sym.defString),
-      sym.owner.name + ".this.",
-      sym.owner.fullName + "."
-    )
-  }
-
-  def showCodeIfDebugging(code: String) {
-    /** Secret bookcase entrance for repl debuggers: end the line
-      *  with "// show" and see what's going on.
-      */
-    def isShow = code.lines exists (_.trim endsWith "// show")
-    if (isReplDebug || isShow) {
-      beSilentDuring(parse(code)) match {
-        case parse.Success(ts) =>
-          ts foreach { t =>
-            withoutUnwrapping(echo(asCompactString(t)))
-          }
-        case _ =>
-      }
-    }
-  }
-
-  // debugging
-  def debugging[T](msg: String)(res: T) = {
-    repldbg(msg + " " + res)
-    res
-  }
-}
-
-/** Utility methods for the Interpreter. */
-object SparkIMain {
-  import java.util.Arrays.{ asList => asJavaList }
-
-  // The two name forms this is catching are the two sides of this assignment:
-  //
-  // $line3.$read.$iw.$iw.Bippy =
-  //   $line3.$read$$iw$$iw$Bippy@4a6a00ca
-  private def removeLineWrapper(s: String) = s.replaceAll("""\$line\d+[./]\$(read|eval|print)[$.]""", "")
-  private def removeIWPackages(s: String)  = s.replaceAll("""\$(iw|read|eval|print)[$.]""", "")
-  private def removeSparkVals(s: String) = s.replaceAll("""\$VAL[0-9]+[$.]""", "")
-  def stripString(s: String)               = removeSparkVals(removeIWPackages(removeLineWrapper(s)))
-  
-  trait CodeAssembler[T] {
-    def preamble: String
-    def generate: T => String
-    def postamble: String
-
-    def apply(contributors: List[T]): String = stringFromWriter { code =>
-      code println preamble
-      contributors map generate foreach (code println _)
-      code println postamble
-    }
-  }
-
-  trait StrippingWriter {
-    def isStripping: Boolean
-    def stripImpl(str: String): String
-    def strip(str: String): String = if (isStripping) stripImpl(str) else str
-  }
-  trait TruncatingWriter {
-    def maxStringLength: Int
-    def isTruncating: Boolean
-    def truncate(str: String): String = {
-      if (isTruncating && (maxStringLength != 0 && str.length > maxStringLength))
-        (str take maxStringLength - 3) + "..."
-      else str
-    }
-  }
-  abstract class StrippingTruncatingWriter(out: JPrintWriter)
-    extends JPrintWriter(out)
-    with StrippingWriter
-    with TruncatingWriter {
-    self =>
-
-    def clean(str: String): String = truncate(strip(str))
-    override def write(str: String) = super.write(clean(str))
-  }
-  class SparkReplStrippingWriter(intp: SparkIMain) extends StrippingTruncatingWriter(intp.out) {
-    import intp._
-    def maxStringLength    = isettings.maxPrintString
-    def isStripping        = isettings.unwrapStrings
-    def isTruncating       = reporter.truncationOK
-
-    def stripImpl(str: String): String = naming.unmangle(str)
-  }
-}
-
-/** Settings for the interpreter
-  *
-  * @version 1.0
-  * @author Lex Spoon, 2007/3/24
-  **/
-class SparkISettings(intp: SparkIMain) {
-  /** The maximum length of toString to use when printing the result
-    *  of an evaluation.  0 means no maximum.  If a printout requires
-    *  more than this number of characters, then the printout is
-    *  truncated.
-    */
-  var maxPrintString = replProps.maxPrintString.option.getOrElse(800)
-
-  /** The maximum number of completion candidates to print for tab
-    *  completion without requiring confirmation.
-    */
-  var maxAutoprintCompletion = 250
-
-  /** String unwrapping can be disabled if it is causing issues.
-    *  Setting this to false means you will see Strings like "$iw.$iw.".
-    */
-  var unwrapStrings = true
-
-  def deprecation_=(x: Boolean) = {
-    val old = intp.settings.deprecation.value
-    intp.settings.deprecation.value = x
-    // scalastyle:off println
-    if (!old && x) println("Enabled -deprecation output.")
-    else if (old && !x) println("Disabled -deprecation output.")
-    // scalastyle:on println
-  }
-  def deprecation: Boolean = intp.settings.deprecation.value
-
-  def allSettings = Map[String, Any](
-    "maxPrintString" -> maxPrintString,
-    "maxAutoprintCompletion" -> maxAutoprintCompletion,
-    "unwrapStrings" -> unwrapStrings,
-    "deprecation" -> deprecation
-  )
-
-  private def allSettingsString =
-    allSettings.toList sortBy (_._1) map { case (k, v) => "  " + k + " = " + v + "\n" } mkString
-
-  override def toString = """
-                            | SparkISettings {
-                            | %s
-                            | }""".stripMargin.format(allSettingsString)
-}
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkImports.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkImports.scala
deleted file mode 100644
index e60406d1e5ad8..0000000000000
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkImports.scala
+++ /dev/null
@@ -1,201 +0,0 @@
-/* NSC -- new Scala compiler
- * Copyright 2005-2013 LAMP/EPFL
- * @author  Paul Phillips
- */
-
-package scala.tools.nsc
-package interpreter
-
-import scala.collection.{ mutable, immutable }
-
-trait SparkImports {
-  self: SparkIMain =>
-
-  import global._
-  import definitions.{ ObjectClass, ScalaPackage, JavaLangPackage, PredefModule }
-  import memberHandlers._
-
-  /** Synthetic import handlers for the language defined imports. */
-  private def makeWildcardImportHandler(sym: Symbol): ImportHandler = {
-    val hd :: tl = sym.fullName.split('.').toList map newTermName
-    val tree = Import(
-      tl.foldLeft(Ident(hd): Tree)((x, y) => Select(x, y)),
-      ImportSelector.wildList
-    )
-    tree setSymbol sym
-    new ImportHandler(tree)
-  }
-
-  /** Symbols whose contents are language-defined to be imported. */
-  def languageWildcardSyms: List[Symbol] = List(JavaLangPackage, ScalaPackage, PredefModule)
-  def languageWildcardHandlers = languageWildcardSyms map makeWildcardImportHandler
-
-  def allImportedNames = importHandlers flatMap (_.importedNames)
-
-  /** Types which have been wildcard imported, such as:
-    *    val x = "abc" ; import x._  // type java.lang.String
-    *    import java.lang.String._   // object java.lang.String
-    *
-    *  Used by tab completion.
-    *
-    *  XXX right now this gets import x._ and import java.lang.String._,
-    *  but doesn't figure out import String._.  There's a lot of ad hoc
-    *  scope twiddling which should be swept away in favor of digging
-    *  into the compiler scopes.
-    */
-  def sessionWildcards: List[Type] = {
-    importHandlers filter (_.importsWildcard) map (_.targetType) distinct
-  }
-
-  def languageSymbols        = languageWildcardSyms flatMap membersAtPickler
-  def sessionImportedSymbols = importHandlers flatMap (_.importedSymbols)
-  def importedSymbols        = languageSymbols ++ sessionImportedSymbols
-  def importedTermSymbols    = importedSymbols collect { case x: TermSymbol => x }
-
-  /** Tuples of (source, imported symbols) in the order they were imported.
-    */
-  def importedSymbolsBySource: List[(Symbol, List[Symbol])] = {
-    val lang    = languageWildcardSyms map (sym => (sym, membersAtPickler(sym)))
-    val session = importHandlers filter (_.targetType != NoType) map { mh =>
-      (mh.targetType.typeSymbol, mh.importedSymbols)
-    }
-
-    lang ++ session
-  }
-  def implicitSymbolsBySource: List[(Symbol, List[Symbol])] = {
-    importedSymbolsBySource map {
-      case (k, vs) => (k, vs filter (_.isImplicit))
-    } filterNot (_._2.isEmpty)
-  }
-
-  /** Compute imports that allow definitions from previous
-    *  requests to be visible in a new request.  Returns
-    *  three pieces of related code:
-    *
-    *  1. An initial code fragment that should go before
-    *  the code of the new request.
-    *
-    *  2. A code fragment that should go after the code
-    *  of the new request.
-    *
-    *  3. An access path which can be traversed to access
-    *  any bindings inside code wrapped by #1 and #2 .
-    *
-    * The argument is a set of Names that need to be imported.
-    *
-    * Limitations: This method is not as precise as it could be.
-    * (1) It does not process wildcard imports to see what exactly
-    * they import.
-    * (2) If it imports any names from a request, it imports all
-    * of them, which is not really necessary.
-    * (3) It imports multiple same-named implicits, but only the
-    * last one imported is actually usable.
-    */
-  case class ComputedImports(prepend: String, append: String, access: String)
-  protected def importsCode(wanted: Set[Name], wrapper: Request#Wrapper, definedClass: Boolean): ComputedImports = {
-    /** Narrow down the list of requests from which imports
-      *  should be taken.  Removes requests which cannot contribute
-      *  useful imports for the specified set of wanted names.
-      */
-    case class ReqAndHandler(req: Request, handler: MemberHandler) { }
-
-    def reqsToUse: List[ReqAndHandler] = {
-      /** Loop through a list of MemberHandlers and select which ones to keep.
-        * 'wanted' is the set of names that need to be imported.
-        */
-      def select(reqs: List[ReqAndHandler], wanted: Set[Name]): List[ReqAndHandler] = {
-        // Single symbol imports might be implicits! See bug #1752.  Rather than
-        // try to finesse this, we will mimic all imports for now.
-        def keepHandler(handler: MemberHandler) = handler match {
-          case h: ImportHandler if definedClass => h.importedNames.exists(x => wanted.contains(x))
-          case _: ImportHandler => true
-          case x                => x.definesImplicit || (x.definedNames exists wanted)
-        }
-
-        reqs match {
-          case Nil                                    => Nil
-          case rh :: rest if !keepHandler(rh.handler) => select(rest, wanted)
-          case rh :: rest                             =>
-            import rh.handler._
-            val newWanted = wanted ++ referencedNames -- definedNames -- importedNames
-            rh :: select(rest, newWanted)
-        }
-      }
-
-      /** Flatten the handlers out and pair each with the original request */
-      select(allReqAndHandlers reverseMap { case (r, h) => ReqAndHandler(r, h) }, wanted).reverse
-    }
-
-    val code, trailingBraces, accessPath = new StringBuilder
-    val currentImps = mutable.HashSet[Name]()
-
-    // add code for a new object to hold some imports
-    def addWrapper() {
-      import nme.{ INTERPRETER_IMPORT_WRAPPER => iw }
-      code append (wrapper.prewrap format iw)
-      trailingBraces append wrapper.postwrap
-      accessPath append s".$iw"
-      currentImps.clear()
-    }
-
-    def maybeWrap(names: Name*) = if (names exists currentImps) addWrapper()
-
-    def wrapBeforeAndAfter[T](op: => T): T = {
-      addWrapper()
-      try op finally addWrapper()
-    }
-
-    // loop through previous requests, adding imports for each one
-    wrapBeforeAndAfter {
-      for (ReqAndHandler(req, handler) <- reqsToUse) {
-        handler match {
-          // If the user entered an import, then just use it; add an import wrapping
-          // level if the import might conflict with some other import
-          case x: ImportHandler if x.importsWildcard =>
-            wrapBeforeAndAfter(code append (x.member + "\n"))
-          case x: ImportHandler =>
-            maybeWrap(x.importedNames: _*)
-            code append (x.member + "\n")
-            currentImps ++= x.importedNames
-
-          case x: ClassHandler =>
-             for (imv <- x.definedNames) {
-              val objName = req.lineRep.readPath
-              code.append("import " + objName + ".INSTANCE" + req.accessPath + ".`" + imv + "`\n")
-            }
-
-          // For other requests, import each defined name.
-          // import them explicitly instead of with _, so that
-          // ambiguity errors will not be generated. Also, quote
-          // the name of the variable, so that we don't need to
-          // handle quoting keywords separately.
-          case x =>
-            for (imv <- x.definedNames) {
-              if (currentImps contains imv) addWrapper()
-              val objName = req.lineRep.readPath
-              val valName = "$VAL" + newValId()
-              if(!code.toString.endsWith(".`" + imv + "`;\n")) { // Which means already imported
-                code.append("val " + valName + " = " + objName + ".INSTANCE\n")
-                code.append("import " + valName + req.accessPath + ".`" + imv + "`;\n")
-              }
-              currentImps += imv
-            }
-        }
-      }
-    }
-    addWrapper()
-    ComputedImports(code.toString, trailingBraces.toString, accessPath.toString)
-  }
-  private var curValId = 0
-
-  private def newValId(): Int = {
-    curValId += 1
-    curValId
-  }
-
-  private def allReqAndHandlers =
-    prevRequestList flatMap (req => req.handlers map (req -> _))
-
-  private def membersAtPickler(sym: Symbol): List[Symbol] =
-    enteringPickler(sym.info.nonPrivateMembers.toList)
-}
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkJLineCompletion.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkJLineCompletion.scala
deleted file mode 100644
index 7fe6dcb32827f..0000000000000
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkJLineCompletion.scala
+++ /dev/null
@@ -1,350 +0,0 @@
-/* NSC -- new Scala compiler
- * Copyright 2005-2013 LAMP/EPFL
- * @author Paul Phillips
- */
-
-package scala.tools.nsc
-package interpreter
-
-import Completion._
-import scala.collection.mutable.ListBuffer
-import scala.reflect.internal.util.StringOps.longestCommonPrefix
-
-// REPL completor - queries supplied interpreter for valid
-// completions based on current contents of buffer.
-class SparkJLineCompletion(val intp: SparkIMain) extends Completion with CompletionOutput {
-  val global: intp.global.type = intp.global
-  import global._
-  import definitions._
-  import rootMirror.{ RootClass, getModuleIfDefined }
-  import intp.{ debugging }
-
-  // verbosity goes up with consecutive tabs
-  private var verbosity: Int = 0
-  def resetVerbosity() = verbosity = 0
-
-  def getSymbol(name: String, isModule: Boolean) = (
-    if (isModule) getModuleIfDefined(name)
-    else getModuleIfDefined(name)
-    )
-
-  trait CompilerCompletion {
-    def tp: Type
-    def effectiveTp = tp match {
-      case MethodType(Nil, resType)   => resType
-      case NullaryMethodType(resType) => resType
-      case _                          => tp
-    }
-
-    // for some reason any's members don't show up in subclasses, which
-    // we need so 5.<tab> offers asInstanceOf etc.
-    private def anyMembers = AnyTpe.nonPrivateMembers
-    def anyRefMethodsToShow = Set("isInstanceOf", "asInstanceOf", "toString")
-
-    def tos(sym: Symbol): String = sym.decodedName
-    def memberNamed(s: String) = exitingTyper(effectiveTp member newTermName(s))
-
-    // XXX we'd like to say "filterNot (_.isDeprecated)" but this causes the
-    // compiler to crash for reasons not yet known.
-    def members     = exitingTyper((effectiveTp.nonPrivateMembers.toList ++ anyMembers) filter (_.isPublic))
-    def methods     = members.toList filter (_.isMethod)
-    def packages    = members.toList filter (_.hasPackageFlag)
-    def aliases     = members.toList filter (_.isAliasType)
-
-    def memberNames   = members map tos
-    def methodNames   = methods map tos
-    def packageNames  = packages map tos
-    def aliasNames    = aliases map tos
-  }
-
-  object NoTypeCompletion extends TypeMemberCompletion(NoType) {
-    override def memberNamed(s: String) = NoSymbol
-    override def members = Nil
-    override def follow(s: String) = None
-    override def alternativesFor(id: String) = Nil
-  }
-
-  object TypeMemberCompletion {
-    def apply(tp: Type, runtimeType: Type, param: NamedParam): TypeMemberCompletion = {
-      new TypeMemberCompletion(tp) {
-        var upgraded = false
-        lazy val upgrade = {
-          intp rebind param
-          intp.reporter.printMessage("\nRebinding stable value %s from %s to %s".format(param.name, tp, param.tpe))
-          upgraded = true
-          new TypeMemberCompletion(runtimeType)
-        }
-        override def completions(verbosity: Int) = {
-          super.completions(verbosity) ++ (
-            if (verbosity == 0) Nil
-            else upgrade.completions(verbosity)
-            )
-        }
-        override def follow(s: String) = super.follow(s) orElse {
-          if (upgraded) upgrade.follow(s)
-          else None
-        }
-        override def alternativesFor(id: String) = super.alternativesFor(id) ++ (
-          if (upgraded) upgrade.alternativesFor(id)
-          else Nil
-          ) distinct
-      }
-    }
-    def apply(tp: Type): TypeMemberCompletion = {
-      if (tp eq NoType) NoTypeCompletion
-      else if (tp.typeSymbol.isPackageClass) new PackageCompletion(tp)
-      else new TypeMemberCompletion(tp)
-    }
-    def imported(tp: Type) = new ImportCompletion(tp)
-  }
-
-  class TypeMemberCompletion(val tp: Type) extends CompletionAware
-  with CompilerCompletion {
-    def excludeEndsWith: List[String] = Nil
-    def excludeStartsWith: List[String] = List("<") // <byname>, <repeated>, etc.
-    def excludeNames: List[String] = (anyref.methodNames filterNot anyRefMethodsToShow) :+ "_root_"
-
-    def methodSignatureString(sym: Symbol) = {
-      IMain stripString exitingTyper(new MethodSymbolOutput(sym).methodString())
-    }
-
-    def exclude(name: String): Boolean = (
-      (name contains "$") ||
-        (excludeNames contains name) ||
-        (excludeEndsWith exists (name endsWith _)) ||
-        (excludeStartsWith exists (name startsWith _))
-      )
-    def filtered(xs: List[String]) = xs filterNot exclude distinct
-
-    def completions(verbosity: Int) =
-      debugging(tp + " completions ==> ")(filtered(memberNames))
-
-    override def follow(s: String): Option[CompletionAware] =
-      debugging(tp + " -> '" + s + "' ==> ")(Some(TypeMemberCompletion(memberNamed(s).tpe)) filterNot (_ eq NoTypeCompletion))
-
-    override def alternativesFor(id: String): List[String] =
-      debugging(id + " alternatives ==> ") {
-        val alts = members filter (x => x.isMethod && tos(x) == id) map methodSignatureString
-
-        if (alts.nonEmpty) "" :: alts else Nil
-      }
-
-    override def toString = "%s (%d members)".format(tp, members.size)
-  }
-
-  class PackageCompletion(tp: Type) extends TypeMemberCompletion(tp) {
-    override def excludeNames = anyref.methodNames
-  }
-
-  class LiteralCompletion(lit: Literal) extends TypeMemberCompletion(lit.value.tpe) {
-    override def completions(verbosity: Int) = verbosity match {
-      case 0    => filtered(memberNames)
-      case _    => memberNames
-    }
-  }
-
-  class ImportCompletion(tp: Type) extends TypeMemberCompletion(tp) {
-    override def completions(verbosity: Int) = verbosity match {
-      case 0    => filtered(members filterNot (_.isSetter) map tos)
-      case _    => super.completions(verbosity)
-    }
-  }
-
-  // not for completion but for excluding
-  object anyref extends TypeMemberCompletion(AnyRefTpe) { }
-
-  // the unqualified vals/defs/etc visible in the repl
-  object ids extends CompletionAware {
-    override def completions(verbosity: Int) = intp.unqualifiedIds ++ List("classOf") //, "_root_")
-    // now we use the compiler for everything.
-    override def follow(id: String): Option[CompletionAware] = {
-      if (!completions(0).contains(id))
-        return None
-
-      val tpe = intp typeOfExpression id
-      if (tpe == NoType)
-        return None
-
-      def default = Some(TypeMemberCompletion(tpe))
-
-      // only rebinding vals in power mode for now.
-      if (!isReplPower) default
-      else intp runtimeClassAndTypeOfTerm id match {
-        case Some((clazz, runtimeType)) =>
-          val sym = intp.symbolOfTerm(id)
-          if (sym.isStable) {
-            val param = new NamedParam.Untyped(id, intp valueOfTerm id getOrElse null)
-            Some(TypeMemberCompletion(tpe, runtimeType, param))
-          }
-          else default
-        case _        =>
-          default
-      }
-    }
-    override def toString = "<repl ids> (%s)".format(completions(0).size)
-  }
-
-  // user-issued wildcard imports like "import global._" or "import String._"
-  private def imported = intp.sessionWildcards map TypeMemberCompletion.imported
-
-  // literal Ints, Strings, etc.
-  object literals extends CompletionAware {
-    def simpleParse(code: String): Option[Tree] = newUnitParser(code).parseStats().lastOption
-    def completions(verbosity: Int) = Nil
-
-    override def follow(id: String) = simpleParse(id).flatMap {
-      case x: Literal   => Some(new LiteralCompletion(x))
-      case _            => None
-    }
-  }
-
-  // top level packages
-  object rootClass extends TypeMemberCompletion(RootClass.tpe) {
-    override def completions(verbosity: Int) = super.completions(verbosity) :+ "_root_"
-    override def follow(id: String) = id match {
-      case "_root_" => Some(this)
-      case _        => super.follow(id)
-    }
-  }
-  // members of Predef
-  object predef extends TypeMemberCompletion(PredefModule.tpe) {
-    override def excludeEndsWith    = super.excludeEndsWith ++ List("Wrapper", "ArrayOps")
-    override def excludeStartsWith  = super.excludeStartsWith ++ List("wrap")
-    override def excludeNames       = anyref.methodNames
-
-    override def exclude(name: String) = super.exclude(name) || (
-      (name contains "2")
-      )
-
-    override def completions(verbosity: Int) = verbosity match {
-      case 0    => Nil
-      case _    => super.completions(verbosity)
-    }
-  }
-  // members of scala.*
-  object scalalang extends PackageCompletion(ScalaPackage.tpe) {
-    def arityClasses = List("Product", "Tuple", "Function")
-    def skipArity(name: String) = arityClasses exists (x => name != x && (name startsWith x))
-    override def exclude(name: String) = super.exclude(name) || (
-      skipArity(name)
-      )
-
-    override def completions(verbosity: Int) = verbosity match {
-      case 0    => filtered(packageNames ++ aliasNames)
-      case _    => super.completions(verbosity)
-    }
-  }
-  // members of java.lang.*
-  object javalang extends PackageCompletion(JavaLangPackage.tpe) {
-    override lazy val excludeEndsWith   = super.excludeEndsWith ++ List("Exception", "Error")
-    override lazy val excludeStartsWith = super.excludeStartsWith ++ List("CharacterData")
-
-    override def completions(verbosity: Int) = verbosity match {
-      case 0    => filtered(packageNames)
-      case _    => super.completions(verbosity)
-    }
-  }
-
-  // the list of completion aware objects which should be consulted
-  // for top level unqualified, it's too noisy to let much in.
-  lazy val topLevelBase: List[CompletionAware] = List(ids, rootClass, predef, scalalang, javalang, literals)
-  def topLevel = topLevelBase ++ imported
-  def topLevelThreshold = 50
-
-  // the first tier of top level objects (doesn't include file completion)
-  def topLevelFor(parsed: Parsed): List[String] = {
-    val buf = new ListBuffer[String]
-    topLevel foreach { ca =>
-      buf ++= (ca completionsFor parsed)
-
-      if (buf.size > topLevelThreshold)
-        return buf.toList.sorted
-    }
-    buf.toList
-  }
-
-  // the most recent result
-  def lastResult = Forwarder(() => ids follow intp.mostRecentVar)
-
-  def lastResultFor(parsed: Parsed) = {
-    /** The logic is a little tortured right now because normally '.' is
-      *  ignored as a delimiter, but on .<tab> it needs to be propagated.
-      */
-    val xs = lastResult completionsFor parsed
-    if (parsed.isEmpty) xs map ("." + _) else xs
-  }
-
-  def completer(): ScalaCompleter = new JLineTabCompletion
-
-  /** This gets a little bit hairy.  It's no small feat delegating everything
-    *  and also keeping track of exactly where the cursor is and where it's supposed
-    *  to end up.  The alternatives mechanism is a little hacky: if there is an empty
-    *  string in the list of completions, that means we are expanding a unique
-    *  completion, so don't update the "last" buffer because it'll be wrong.
-    */
-  class JLineTabCompletion extends ScalaCompleter {
-    // For recording the buffer on the last tab hit
-    private var lastBuf: String = ""
-    private var lastCursor: Int = -1
-
-    // Does this represent two consecutive tabs?
-    def isConsecutiveTabs(buf: String, cursor: Int) =
-      cursor == lastCursor && buf == lastBuf
-
-    // This is jline's entry point for completion.
-    override def complete(buf: String, cursor: Int): Candidates = {
-      verbosity = if (isConsecutiveTabs(buf, cursor)) verbosity + 1 else 0
-      repldbg(f"%ncomplete($buf, $cursor%d) last = ($lastBuf, $lastCursor%d), verbosity: $verbosity")
-
-      // we don't try lower priority completions unless higher ones return no results.
-      def tryCompletion(p: Parsed, completionFunction: Parsed => List[String]): Option[Candidates] = {
-        val winners = completionFunction(p)
-        if (winners.isEmpty)
-          return None
-        val newCursor =
-          if (winners contains "") p.cursor
-          else {
-            val advance = longestCommonPrefix(winners)
-            lastCursor = p.position + advance.length
-            lastBuf = (buf take p.position) + advance
-            repldbg(s"tryCompletion($p, _) lastBuf = $lastBuf, lastCursor = $lastCursor, p.position = ${p.position}")
-            p.position
-          }
-
-        Some(Candidates(newCursor, winners))
-      }
-
-      def mkDotted = Parsed.dotted(buf, cursor) withVerbosity verbosity
-
-      // a single dot is special cased to completion on the previous result
-      def lastResultCompletion =
-        if (!looksLikeInvocation(buf)) None
-        else tryCompletion(Parsed.dotted(buf drop 1, cursor), lastResultFor)
-
-      def tryAll = (
-        lastResultCompletion
-          orElse tryCompletion(mkDotted, topLevelFor)
-          getOrElse Candidates(cursor, Nil)
-        )
-
-      /**
-       *  This is the kickoff point for all manner of theoretically
-       *  possible compiler unhappiness. The fault may be here or
-       *  elsewhere, but we don't want to crash the repl regardless.
-       *  The compiler makes it impossible to avoid catching Throwable
-       *  with its unfortunate tendency to throw java.lang.Errors and
-       *  AssertionErrors as the hats drop. We take two swings at it
-       *  because there are some spots which like to throw an assertion
-       *  once, then work after that. Yeah, what can I say.
-       */
-      try tryAll
-      catch { case ex: Throwable =>
-        repldbg("Error: complete(%s, %s) provoked".format(buf, cursor) + ex)
-        Candidates(cursor,
-          if (isReplDebug) List("<error:" + ex + ">")
-          else Nil
-        )
-      }
-    }
-  }
-}
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkMemberHandlers.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkMemberHandlers.scala
deleted file mode 100644
index 0e22bc806d922..0000000000000
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkMemberHandlers.scala
+++ /dev/null
@@ -1,221 +0,0 @@
-/* NSC -- new Scala compiler
- * Copyright 2005-2013 LAMP/EPFL
- * @author  Martin Odersky
- */
-
-package scala.tools.nsc
-package interpreter
-
-import scala.collection.{ mutable, immutable }
-import scala.language.implicitConversions
-
-trait SparkMemberHandlers {
-  val intp: SparkIMain
-
-  import intp.{ Request, global, naming }
-  import global._
-  import naming._
-
-  private def codegenln(leadingPlus: Boolean, xs: String*): String = codegen(leadingPlus, (xs ++ Array("\n")): _*)
-  private def codegenln(xs: String*): String = codegenln(true, xs: _*)
-  private def codegen(leadingPlus: Boolean, xs: String*): String = {
-    val front = if (leadingPlus) "+ " else ""
-    front + (xs map string2codeQuoted mkString " + ")
-  }
-  private implicit def name2string(name: Name) = name.toString
-
-  /** A traverser that finds all mentioned identifiers, i.e. things
-    *  that need to be imported.  It might return extra names.
-    */
-  private class ImportVarsTraverser extends Traverser {
-    val importVars = new mutable.HashSet[Name]()
-
-    override def traverse(ast: Tree) = ast match {
-      case Ident(name) =>
-        // XXX this is obviously inadequate but it's going to require some effort
-        // to get right.
-        if (name.toString startsWith "x$") ()
-        else importVars += name
-      case _        => super.traverse(ast)
-    }
-  }
-  private object ImportVarsTraverser {
-    def apply(member: Tree) = {
-      val ivt = new ImportVarsTraverser()
-      ivt traverse member
-      ivt.importVars.toList
-    }
-  }
-
-  private def isTermMacro(ddef: DefDef): Boolean = ddef.mods.isMacro
-
-  def chooseHandler(member: Tree): MemberHandler = member match {
-    case member: DefDef if isTermMacro(member) => new TermMacroHandler(member)
-    case member: DefDef                        => new DefHandler(member)
-    case member: ValDef                        => new ValHandler(member)
-    case member: ModuleDef                     => new ModuleHandler(member)
-    case member: ClassDef                      => new ClassHandler(member)
-    case member: TypeDef                       => new TypeAliasHandler(member)
-    case member: Assign                        => new AssignHandler(member)
-    case member: Import                        => new ImportHandler(member)
-    case DocDef(_, documented)                 => chooseHandler(documented)
-    case member                                => new GenericHandler(member)
-  }
-
-  sealed abstract class MemberDefHandler(override val member: MemberDef) extends MemberHandler(member) {
-    override def name: Name = member.name
-    def mods: Modifiers     = member.mods
-    def keyword             = member.keyword
-    def prettyName          = name.decode
-
-    override def definesImplicit = member.mods.isImplicit
-    override def definesTerm: Option[TermName] = Some(name.toTermName) filter (_ => name.isTermName)
-    override def definesType: Option[TypeName] = Some(name.toTypeName) filter (_ => name.isTypeName)
-    override def definedSymbols = if (symbol.exists) symbol :: Nil else Nil
-  }
-
-  /** Class to handle one member among all the members included
-    *  in a single interpreter request.
-    */
-  sealed abstract class MemberHandler(val member: Tree) {
-    def name: Name      = nme.NO_NAME
-    def path            = intp.originalPath(symbol).replaceFirst("read", "read.INSTANCE")
-    def symbol          = if (member.symbol eq null) NoSymbol else member.symbol
-    def definesImplicit = false
-    def definesValue    = false
-
-    def definesTerm     = Option.empty[TermName]
-    def definesType     = Option.empty[TypeName]
-
-    private lazy val _referencedNames = ImportVarsTraverser(member)
-    def referencedNames = _referencedNames
-    def importedNames   = List[Name]()
-    def definedNames    = definesTerm.toList ++ definesType.toList
-    def definedSymbols  = List[Symbol]()
-
-    def extraCodeToEvaluate(req: Request): String = ""
-    def resultExtractionCode(req: Request): String = ""
-
-    private def shortName = this.getClass.toString split '.' last
-    override def toString = shortName + referencedNames.mkString(" (refs: ", ", ", ")")
-  }
-
-  class GenericHandler(member: Tree) extends MemberHandler(member)
-
-  class ValHandler(member: ValDef) extends MemberDefHandler(member) {
-    val maxStringElements = 1000  // no need to mkString billions of elements
-    override def definesValue = true
-
-    override def resultExtractionCode(req: Request): String = {
-
-      val isInternal = isUserVarName(name) && req.lookupTypeOf(name) == "Unit"
-      if (!mods.isPublic || isInternal) ""
-      else {
-        // if this is a lazy val we avoid evaluating it here
-        val resultString =
-          if (mods.isLazy) codegenln(false, "<lazy>")
-          else any2stringOf(path, maxStringElements)
-
-        val vidString =
-          if (replProps.vids) s"""" + " @ " + "%%8x".format(System.identityHashCode($path)) + " """.trim
-          else ""
-
-        """ + "%s%s: %s = " + %s""".format(string2code(prettyName), vidString, string2code(req typeOf name), resultString)
-      }
-    }
-  }
-
-  class DefHandler(member: DefDef) extends MemberDefHandler(member) {
-    override def definesValue = flattensToEmpty(member.vparamss) // true if 0-arity
-    override def resultExtractionCode(req: Request) =
-      if (mods.isPublic) codegenln(name, ": ", req.typeOf(name)) else ""
-  }
-
-  abstract class MacroHandler(member: DefDef) extends MemberDefHandler(member) {
-    override def referencedNames = super.referencedNames.flatMap(name => List(name.toTermName, name.toTypeName))
-    override def definesValue = false
-    override def definesTerm: Option[TermName] = Some(name.toTermName)
-    override def definesType: Option[TypeName] = None
-    override def resultExtractionCode(req: Request) = if (mods.isPublic) codegenln(notification(req)) else ""
-    def notification(req: Request): String
-  }
-
-  class TermMacroHandler(member: DefDef) extends MacroHandler(member) {
-    def notification(req: Request) = s"defined term macro $name: ${req.typeOf(name)}"
-  }
-
-  class AssignHandler(member: Assign) extends MemberHandler(member) {
-    val Assign(lhs, rhs) = member
-    override lazy val name = newTermName(freshInternalVarName())
-
-    override def definesTerm = Some(name)
-    override def definesValue = true
-    override def extraCodeToEvaluate(req: Request) =
-      """val %s = %s""".format(name, lhs)
-
-    /** Print out lhs instead of the generated varName */
-    override def resultExtractionCode(req: Request) = {
-      val lhsType = string2code(req lookupTypeOf name)
-      val res     = string2code(req fullPath name)
-      """ + "%s: %s = " + %s + "\n" """.format(string2code(lhs.toString), lhsType, res) + "\n"
-    }
-  }
-
-  class ModuleHandler(module: ModuleDef) extends MemberDefHandler(module) {
-    override def definesTerm = Some(name.toTermName)
-    override def definesValue = true
-
-    override def resultExtractionCode(req: Request) = codegenln("defined object ", name)
-  }
-
-  class ClassHandler(member: ClassDef) extends MemberDefHandler(member) {
-    override def definedSymbols = List(symbol, symbol.companionSymbol) filterNot (_ == NoSymbol)
-    override def definesType = Some(name.toTypeName)
-    override def definesTerm = Some(name.toTermName) filter (_ => mods.isCase)
-
-    override def resultExtractionCode(req: Request) =
-      codegenln("defined %s %s".format(keyword, name))
-  }
-
-  class TypeAliasHandler(member: TypeDef) extends MemberDefHandler(member) {
-    private def isAlias = mods.isPublic && treeInfo.isAliasTypeDef(member)
-    override def definesType = Some(name.toTypeName) filter (_ => isAlias)
-
-    override def resultExtractionCode(req: Request) =
-      codegenln("defined type alias ", name) + "\n"
-  }
-
-  class ImportHandler(imp: Import) extends MemberHandler(imp) {
-    val Import(expr, selectors) = imp
-    def targetType = intp.global.rootMirror.getModuleIfDefined("" + expr) match {
-      case NoSymbol => intp.typeOfExpression("" + expr)
-      case sym      => sym.thisType
-    }
-    private def importableTargetMembers = importableMembers(targetType).toList
-    // wildcard imports, e.g. import foo._
-    private def selectorWild    = selectors filter (_.name == nme.USCOREkw)
-    // renamed imports, e.g. import foo.{ bar => baz }
-    private def selectorRenames = selectors map (_.rename) filterNot (_ == null)
-
-    /** Whether this import includes a wildcard import */
-    val importsWildcard = selectorWild.nonEmpty
-
-    def implicitSymbols = importedSymbols filter (_.isImplicit)
-    def importedSymbols = individualSymbols ++ wildcardSymbols
-
-    private val selectorNames = selectorRenames filterNot (_ == nme.USCOREkw) flatMap (_.bothNames) toSet
-    lazy val individualSymbols: List[Symbol] = exitingTyper(importableTargetMembers filter (m => selectorNames(m.name)))
-    lazy val wildcardSymbols: List[Symbol]   = exitingTyper(if (importsWildcard) importableTargetMembers else Nil)
-
-    /** Complete list of names imported by a wildcard */
-    lazy val wildcardNames: List[Name]   = wildcardSymbols map (_.name)
-    lazy val individualNames: List[Name] = individualSymbols map (_.name)
-
-    /** The names imported by this statement */
-    override lazy val importedNames: List[Name] = wildcardNames ++ individualNames
-    lazy val importsSymbolNamed: Set[String] = importedNames map (_.toString) toSet
-
-    def importString = imp.toString
-    override def resultExtractionCode(req: Request) = codegenln(importString) + "\n"
-  }
-}
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkReplReporter.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkReplReporter.scala
deleted file mode 100644
index 272f81eca92c1..0000000000000
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkReplReporter.scala
+++ /dev/null
@@ -1,55 +0,0 @@
-/* NSC -- new Scala compiler
- * Copyright 2002-2013 LAMP/EPFL
- * @author Paul Phillips
- */
-
-package scala.tools.nsc
-package interpreter
-
-import reporters._
-import SparkIMain._
-
-import scala.reflect.internal.util.Position
-
-/** Like ReplGlobal, a layer for ensuring extra functionality.
-  */
-class SparkReplReporter(intp: SparkIMain) extends ConsoleReporter(intp.settings, Console.in, new SparkReplStrippingWriter(intp)) {
-  def printUntruncatedMessage(msg: String) = withoutTruncating(printMessage(msg))
-
-  /** Whether very long lines can be truncated.  This exists so important
-    *  debugging information (like printing the classpath) is not rendered
-    *  invisible due to the max message length.
-    */
-  private var _truncationOK: Boolean = !intp.settings.verbose
-  def truncationOK = _truncationOK
-  def withoutTruncating[T](body: => T): T = {
-    val saved = _truncationOK
-    _truncationOK = false
-    try body
-    finally _truncationOK = saved
-  }
-
-  override def warning(pos: Position, msg: String): Unit = withoutTruncating(super.warning(pos, msg))
-  override def error(pos: Position, msg: String): Unit   = withoutTruncating(super.error(pos, msg))
-
-  override def printMessage(msg: String) {
-    // Avoiding deadlock if the compiler starts logging before
-    // the lazy val is complete.
-    if (intp.isInitializeComplete) {
-      if (intp.totalSilence) {
-        if (isReplTrace)
-          super.printMessage("[silent] " + msg)
-      }
-      else super.printMessage(msg)
-    }
-    // scalastyle:off println
-    else Console.println("[init] " + msg)
-    // scalastyle:on println
-  }
-
-  override def displayPrompt() {
-    if (intp.totalSilence) ()
-    else super.displayPrompt()
-  }
-
-}
diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala
index 9ecc7c229e38a..e1cee97de32bc 100644
--- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala
+++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala
@@ -22,14 +22,11 @@ import java.net.URLClassLoader
 
 import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.duration._
-import scala.tools.nsc.interpreter.SparkILoop
 
 import org.apache.commons.lang3.StringEscapeUtils
 import org.apache.spark.{SparkContext, SparkFunSuite}
 import org.apache.spark.util.Utils
 
-
-
 class ReplSuite extends SparkFunSuite {
 
   def runInterpreter(master: String, input: String): String = {
@@ -87,10 +84,6 @@ class ReplSuite extends SparkFunSuite {
       settings = new scala.tools.nsc.Settings
       settings.usejavacp.value = true
       org.apache.spark.repl.Main.interp = this
-      override def createInterpreter() {
-        intp = new SparkILoopInterpreter
-        intp.setContextClassLoader()
-      }
     }
 
     val out = new StringWriter()
@@ -274,7 +267,7 @@ class ReplSuite extends SparkFunSuite {
 
   test("SPARK-2632 importing a method from non serializable class and not using it.") {
     val output = runInterpreter("local",
-    """
+      """
       |class TestClass() { def testMethod = 3 }
       |val t = new TestClass
       |import t.testMethod
@@ -319,7 +312,7 @@ class ReplSuite extends SparkFunSuite {
     assertDoesNotContain("Exception", output)
     assertContains("ret: Array[Foo] = Array(Foo(1),", output)
   }
-  
+
   test("collecting objects of class defined in repl - shuffling") {
     val output = runInterpreter("local-cluster[1,1,512]",
       """

From 5dd45bde4ae6f5a10cfe005de3041abd075da4d6 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Fri, 10 Jul 2015 09:48:17 -0700
Subject: [PATCH 110/549] [SPARK-8958] Dynamic allocation: change cached
 timeout to infinity

pwendell and I discussed this a little more offline and concluded that it would be good to keep it more conservative. Losing cached blocks may be very expensive and we should only allow it if the user knows what he/she is doing.

FYI harishreedharan sryza.

Author: Andrew Or <andrew@databricks.com>

Closes #7329 from andrewor14/da-cached-timeout and squashes the following commits:

cef0b4e [Andrew Or] Change timeout to infinity
---
 .../scala/org/apache/spark/ExecutorAllocationManager.scala    | 2 +-
 docs/configuration.md                                         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
index 49329423dca76..0c50b4002cf7b 100644
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
@@ -102,7 +102,7 @@ private[spark] class ExecutorAllocationManager(
     "spark.dynamicAllocation.executorIdleTimeout", "60s")
 
   private val cachedExecutorIdleTimeoutS = conf.getTimeAsSeconds(
-    "spark.dynamicAllocation.cachedExecutorIdleTimeout", s"${2 * executorIdleTimeoutS}s")
+    "spark.dynamicAllocation.cachedExecutorIdleTimeout", s"${Integer.MAX_VALUE}s")
 
   // During testing, the methods to actually kill and add executors are mocked out
   private val testing = conf.getBoolean("spark.dynamicAllocation.testing", false)
diff --git a/docs/configuration.md b/docs/configuration.md
index 892c02b27df32..443322e1eadf4 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1206,7 +1206,7 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 <tr>
   <td><code>spark.dynamicAllocation.cachedExecutorIdleTimeout</code></td>
-  <td>2 * executorIdleTimeout</td>
+  <td>infinity</td>
   <td>
     If dynamic allocation is enabled and an executor which has cached data blocks has been idle for more than this duration,
     the executor will be removed. For more details, see this
@@ -1222,7 +1222,7 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 <tr>
   <td><code>spark.dynamicAllocation.maxExecutors</code></td>
-  <td>Integer.MAX_VALUE</td>
+  <td>infinity</td>
   <td>
     Upper bound for the number of executors if dynamic allocation is enabled.
   </td>

From db6d57f87a9d657d764e01879014bf406971855a Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Fri, 10 Jul 2015 09:50:46 -0700
Subject: [PATCH 111/549] [CORE] [MINOR] change the log level to info

Too many logs even when set the log level to warning.

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7340 from chenghao-intel/log and squashes the following commits:

59658cf [Cheng Hao] change the log level to info
---
 core/src/main/scala/org/apache/spark/executor/Executor.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 8f916e0502ecb..f7ef92bc80f91 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -443,7 +443,7 @@ private[spark] class Executor(
     try {
       val response = heartbeatReceiverRef.askWithRetry[HeartbeatResponse](message)
       if (response.reregisterBlockManager) {
-        logWarning("Told to re-register on heartbeat")
+        logInfo("Told to re-register on heartbeat")
         env.blockManager.reregister()
       }
     } catch {

From c185f3a45ddbc073192f7da41303941ee4cebd4f Mon Sep 17 00:00:00 2001
From: Min Zhou <coderplay@gmail.com>
Date: Fri, 10 Jul 2015 09:52:40 -0700
Subject: [PATCH 112/549] [SPARK-8675] Executors created by LocalBackend won't
 get the same classpath as other executor backends

AFAIK, some spark application always use LocalBackend to do some local initiatives, spark sql is an example. Starting a LocalPoint won't add user classpath into executor.
```java
  override def start() {
    localEndpoint = SparkEnv.get.rpcEnv.setupEndpoint(
      "LocalBackendEndpoint", new LocalEndpoint(SparkEnv.get.rpcEnv, scheduler, this, totalCores))
  }
```
Thus will cause local executor fail with these scenarios, loading hadoop built-in native libraries, loading other user defined native libraries, loading user jars, reading s3 config from a site.xml file, etc

Author: Min Zhou <coderplay@gmail.com>

Closes #7091 from coderplay/master and squashes the following commits:

365838f [Min Zhou] Fixed java.net.MalformedURLException, add default scheme, support relative path
d215b7f [Min Zhou] Follows spark standard scala style, make the auto testing happy
84ad2cd [Min Zhou] Use system specific path separator instead of ','
01f5d1a [Min Zhou] Merge branch 'master' of https://github.com/apache/spark
e528be7 [Min Zhou] Merge branch 'master' of https://github.com/apache/spark
45bf62c [Min Zhou] SPARK-8675 Executors created by LocalBackend won't get the same classpath as other executor backends
---
 .../spark/scheduler/local/LocalBackend.scala  | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
index 3078a1b10be8b..776e5d330e3c7 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.scheduler.local
 
+import java.io.File
+import java.net.URL
 import java.nio.ByteBuffer
 
 import org.apache.spark.{Logging, SparkConf, SparkContext, SparkEnv, TaskState}
@@ -40,6 +42,7 @@ private case class StopExecutor()
  */
 private[spark] class LocalEndpoint(
     override val rpcEnv: RpcEnv,
+    userClassPath: Seq[URL],
     scheduler: TaskSchedulerImpl,
     executorBackend: LocalBackend,
     private val totalCores: Int)
@@ -51,7 +54,7 @@ private[spark] class LocalEndpoint(
   private val localExecutorHostname = "localhost"
 
   private val executor = new Executor(
-    localExecutorId, localExecutorHostname, SparkEnv.get, isLocal = true)
+    localExecutorId, localExecutorHostname, SparkEnv.get, userClassPath, isLocal = true)
 
   override def receive: PartialFunction[Any, Unit] = {
     case ReviveOffers =>
@@ -97,10 +100,22 @@ private[spark] class LocalBackend(
 
   private val appId = "local-" + System.currentTimeMillis
   var localEndpoint: RpcEndpointRef = null
+  private val userClassPath = getUserClasspath(conf)
+
+  /**
+   * Returns a list of URLs representing the user classpath.
+   *
+   * @param conf Spark configuration.
+   */
+  def getUserClasspath(conf: SparkConf): Seq[URL] = {
+    val userClassPathStr = conf.getOption("spark.executor.extraClassPath")
+    userClassPathStr.map(_.split(File.pathSeparator)).toSeq.flatten.map(new File(_).toURI.toURL)
+  }
 
   override def start() {
     localEndpoint = SparkEnv.get.rpcEnv.setupEndpoint(
-      "LocalBackendEndpoint", new LocalEndpoint(SparkEnv.get.rpcEnv, scheduler, this, totalCores))
+      "LocalBackendEndpoint",
+      new LocalEndpoint(SparkEnv.get.rpcEnv, userClassPath, scheduler, this, totalCores))
   }
 
   override def stop() {

From 05ac023dc8d9004a27c2f06ee875b0ff3743ccdd Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Fri, 10 Jul 2015 13:05:23 -0700
Subject: [PATCH 113/549] [HOTFIX] fix flaky test in PySpark SQL

It may loss precision in microseconds when using float for it.

Author: Davies Liu <davies@databricks.com>

Closes #7344 from davies/fix_date_test and squashes the following commits:

249ec61 [Davies Liu] fix flaky test
---
 python/pyspark/sql/tests.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 4d7cad5a1ab88..ed903334beafa 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -705,11 +705,12 @@ def test_filter_with_datetime(self):
     def test_time_with_timezone(self):
         day = datetime.date.today()
         now = datetime.datetime.now()
-        ts = time.mktime(now.timetuple()) + now.microsecond / 1e6
+        ts = time.mktime(now.timetuple())
         # class in __main__ is not serializable
         from pyspark.sql.tests import UTC
         utc = UTC()
-        utcnow = datetime.datetime.fromtimestamp(ts, utc)
+        utcnow = datetime.datetime.utcfromtimestamp(ts)  # without microseconds
+        utcnow = datetime.datetime(*(utcnow.timetuple()[:6] + (now.microsecond, utc)))
         df = self.sqlCtx.createDataFrame([(day, now, utcnow)])
         day1, now1, utcnow1 = df.first()
         self.assertEqual(day1, day)

From 0772026c2fc88aa85423034006b758f6ff0cc2ed Mon Sep 17 00:00:00 2001
From: rahulpalamuttam <rahulpalamut@gmail.com>
Date: Fri, 10 Jul 2015 16:07:31 -0700
Subject: [PATCH 114/549] [SPARK-8923] [DOCUMENTATION, MLLIB] Add @since tags
 to mllib.fpm

Author: rahulpalamuttam <rahulpalamut@gmail.com>

Closes #7341 from rahulpalamuttam/TaggingMLlibfpm and squashes the following commits:

bef2843 [rahulpalamuttam] fix @since tags in mmlib.fpm
cd86252 [rahulpalamuttam] Add @since tags to mllib.fpm
---
 .../spark/mllib/fpm/AssociationRules.scala    | 10 ++++++++++
 .../org/apache/spark/mllib/fpm/FPGrowth.scala | 19 +++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
index 7e2bbfe31c1b7..72d0ea0c12e1e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
@@ -31,6 +31,8 @@ import org.apache.spark.rdd.RDD
  *
  * Generates association rules from a [[RDD[FreqItemset[Item]]]. This method only generates
  * association rules which have a single item as the consequent.
+ *
+ * @since 1.5.0
  */
 @Experimental
 class AssociationRules private[fpm] (
@@ -38,11 +40,15 @@ class AssociationRules private[fpm] (
 
   /**
    * Constructs a default instance with default parameters {minConfidence = 0.8}.
+   *
+   * @since 1.5.0
    */
   def this() = this(0.8)
 
   /**
    * Sets the minimal confidence (default: `0.8`).
+   *
+   * @since 1.5.0
    */
   def setMinConfidence(minConfidence: Double): this.type = {
     require(minConfidence >= 0.0 && minConfidence <= 1.0)
@@ -54,6 +60,8 @@ class AssociationRules private[fpm] (
    * Computes the association rules with confidence above [[minConfidence]].
    * @param freqItemsets frequent itemset model obtained from [[FPGrowth]]
    * @return a [[Set[Rule[Item]]] containing the assocation rules.
+   *
+   * @since 1.5.0
    */
   def run[Item: ClassTag](freqItemsets: RDD[FreqItemset[Item]]): RDD[Rule[Item]] = {
     // For candidate rule X => Y, generate (X, (Y, freq(X union Y)))
@@ -90,6 +98,8 @@ object AssociationRules {
    * @param antecedent hypotheses of the rule
    * @param consequent conclusion of the rule
    * @tparam Item item type
+   *
+   * @since 1.5.0
    */
   @Experimental
   class Rule[Item] private[fpm] (
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index 9cb9a00dbd9c7..e2370a52f4930 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -38,12 +38,15 @@ import org.apache.spark.storage.StorageLevel
  * Model trained by [[FPGrowth]], which holds frequent itemsets.
  * @param freqItemsets frequent itemset, which is an RDD of [[FreqItemset]]
  * @tparam Item item type
+ *
+ * @since 1.3.0
  */
 @Experimental
 class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable {
   /**
    * Generates association rules for the [[Item]]s in [[freqItemsets]].
    * @param confidence minimal confidence of the rules produced
+   * @since 1.5.0
    */
   def generateAssociationRules(confidence: Double): RDD[AssociationRules.Rule[Item]] = {
     val associationRules = new AssociationRules(confidence)
@@ -67,6 +70,8 @@ class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) ex
  *
  * @see [[http://en.wikipedia.org/wiki/Association_rule_learning Association rule learning
  *       (Wikipedia)]]
+ *
+ * @since 1.3.0
  */
 @Experimental
 class FPGrowth private (
@@ -76,11 +81,15 @@ class FPGrowth private (
   /**
    * Constructs a default instance with default parameters {minSupport: `0.3`, numPartitions: same
    * as the input data}.
+   *
+   * @since 1.3.0
    */
   def this() = this(0.3, -1)
 
   /**
    * Sets the minimal support level (default: `0.3`).
+   *
+   * @since 1.3.0
    */
   def setMinSupport(minSupport: Double): this.type = {
     this.minSupport = minSupport
@@ -89,6 +98,8 @@ class FPGrowth private (
 
   /**
    * Sets the number of partitions used by parallel FP-growth (default: same as input data).
+   *
+   * @since 1.3.0
    */
   def setNumPartitions(numPartitions: Int): this.type = {
     this.numPartitions = numPartitions
@@ -99,6 +110,8 @@ class FPGrowth private (
    * Computes an FP-Growth model that contains frequent itemsets.
    * @param data input data set, each element contains a transaction
    * @return an [[FPGrowthModel]]
+   *
+   * @since 1.3.0
    */
   def run[Item: ClassTag](data: RDD[Array[Item]]): FPGrowthModel[Item] = {
     if (data.getStorageLevel == StorageLevel.NONE) {
@@ -199,6 +212,8 @@ class FPGrowth private (
 
 /**
  * :: Experimental ::
+ *
+ * @since 1.3.0
  */
 @Experimental
 object FPGrowth {
@@ -208,11 +223,15 @@ object FPGrowth {
    * @param items items in this itemset. Java users should call [[FreqItemset#javaItems]] instead.
    * @param freq frequency
    * @tparam Item item type
+   *
+   * @since 1.3.0
    */
   class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable {
 
     /**
      * Returns items in a Java List.
+     *
+     * @since 1.3.0
      */
     def javaItems: java.util.List[Item] = {
       items.toList.asJava

From fb8807c9b04f27467b36fc9d0177ef92dd012670 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 10 Jul 2015 16:44:51 -0700
Subject: [PATCH 115/549] [SPARK-7078] [SPARK-7079] Binary processing sort for
 Spark SQL

This patch adds a cache-friendly external sorter which operates on serialized bytes and uses this sorter to implement a new sort operator for Spark SQL and DataFrames.

### Overview of the new sorter

The new sorter design is inspired by [Alphasort](http://research.microsoft.com/pubs/68249/alphasort.doc) and implements a key-prefix optimization in order to improve the cache friendliness of the sort.  In naive sort implementations, the sorting algorithm operates on an array of record pointers.  To compare two records for ordering, the sorter must dereference these pointers, which likely involves random memory access, then compare the objects themselves.

![image](https://cloud.githubusercontent.com/assets/50748/8611390/3b1402ae-2675-11e5-8308-1a10bf347e6e.png)

In a key-prefix sort, the sort operates on an array which stores the record pointer alongside a prefix of the record's key. When comparing two records for ordering, the sorter first compares the the stored key prefixes. If the ordering can be determined from the key prefixes (i.e. the prefixes are unequal), then the sort can avoid directly comparing the records, avoiding random memory accesses and full record comparisons. For example, if we're sorting a list of strings then we can store the first 8 bytes of the UTF-8 encoded string as the key-prefix and can perform unsigned byte-at-a-time comparisons to determine the ordering of strings based on their prefixes, only resorting to full comparisons for strings that share a common prefix.  In cases where the sort key can fit entirely in the space allotted for the key prefix (e.g. the sorting key is an integer), we completely avoid direct record comparison.

In this patch's implementation of key-prefix sorting, our sorter's internal array stores a 64-bit long and 64-bit pointer for each record being sorted. The key prefixes are generated by the user when inserting records into the sorter, which uses a user-defined comparison function for comparing them.  The `PrefixComparators` object implements a set of comparators for many common types, including primitive numeric types and UTF-8 strings.

The actual sorting is implemented by `UnsafeInMemorySorter`.  Most consumers will not use this directly, but instead will use `UnsafeExternalSorter`, a class which implements a sort that can spill to disk in response to memory pressure.  Internally, `UnsafeExternalSorter` creates `UnsafeInMemorySorters` to perform sorting and uses `UnsafeSortSpillReader/Writer` to spill and read back runs of sorted records and `UnsafeSortSpillMerger` to merge multiple sorted spills into a single sorted iterator.  This external sorter integrates with Spark's existing ShuffleMemoryManager for controlling spilling.

Many parts of this sorter's design are based on / copied from the more specialized external sort implementation that I designed for the new UnsafeShuffleManager write path; see #5868 for more details on that patch.

### Sorting rows in Spark SQL

For now, `UnsafeExternalSorter` is only used by Spark SQL, which uses it to implement a new sort operator, `UnsafeExternalSort`.  This sort operator uses a SQL-specific class called `UnsafeExternalRowSorter` that configures an `UnsafeExternalSorter` to use prefix generators and comparators that operate on rows encoded in the UnsafeRow format that was designed for Project Tungsten.

I used some interesting unit-testing techniques to test this patch's SQL-specific components.  `UnsafeExternalSortSuite` uses the SQL random data generators introduced in #7176 to test the UnsafeSort operator with all atomic types both with and without nullability and in both ascending and descending sort orders.  `PrefixComparatorsSuite` contains a cool use of ScalaCheck + ScalaTest's `GeneratorDrivenPropertyChecks` in order to test UTF8String prefix comparison.

### Misc. additional improvements made in this patch

This patch made several miscellaneous improvements to related code in Spark SQL:

- The logic for selecting physical sort operator implementations, which was partially duplicated in both `Exchange` and `SparkStrategies, has now been consolidated into a `getSortOperator()` helper function in `SparkStrategies`.
- The `SparkPlanTest` unit testing helper trait has been extended with new methods for comparing the output produced by two different physical plans. This makes it easy to write tests which assert that two physical operator implementations should produce the same output.  I also added a method for disabling the implicit sorting of outputs prior to comparing them, a change which is necessary in order to be able to write proper SparkPlan tests for sort operators.

### Tasks deferred to followup patches

While most of this patch's features are reasonably well-tested and complete, there are a number of tasks that are intentionally being deferred to followup patches:

- Add tests which mock the ShuffleMemoryManager to check that memory pressure properly triggers spilling (there are examples of this type of test in #5868).
- Add tests to ensure that spill files are properly cleaned up after errors.  I'd like to do this in the context of a patch which introduces more general metrics for ensuring proper cleanup of tasks' temporary files; see https://issues.apache.org/jira/browse/SPARK-8966 for more details.
- Metrics integration: there are some open questions regarding how to track / report spill metrics for non-shuffle operations, so I've deferred most of the IO / shuffle metrics integration for now.
- Performance profiling.

<!-- Reviewable:start -->
[<img src="https://reviewable.io/review_button.png" height=40 alt="Review on Reviewable"/>](https://reviewable.io/reviews/apache/spark/6444)
<!-- Reviewable:end -->

Author: Josh Rosen <joshrosen@databricks.com>

Closes #6444 from JoshRosen/sql-external-sort and squashes the following commits:

6beb467 [Josh Rosen] Remove a bunch of overloaded methods to avoid default args. issue
2bbac9c [Josh Rosen] Merge remote-tracking branch 'origin/master' into sql-external-sort
35dad9f [Josh Rosen] Make sortAnswers = false the default in SparkPlanTest
5135200 [Josh Rosen] Fix spill reading for large rows; add test
2f48777 [Josh Rosen] Add test and fix bug for sorting empty arrays
d1e28bc [Josh Rosen] Merge remote-tracking branch 'origin/master' into sql-external-sort
cd05866 [Josh Rosen] Fix scalastyle
3947fc1 [Josh Rosen] Merge remote-tracking branch 'origin/master' into sql-external-sort
d13ac55 [Josh Rosen] Hacky approach to copying of UnsafeRows for sort followed by limit.
845bea3 [Josh Rosen] Remove unnecessary zeroing of row conversion buffer
c56ec18 [Josh Rosen] Clean up final row copying code.
d31f180 [Josh Rosen] Re-enable NullType sorting test now that SPARK-8868 is fixed
844f4ca [Josh Rosen] Merge remote-tracking branch 'origin/master' into sql-external-sort
293f109 [Josh Rosen] Add missing license header.
f99a612 [Josh Rosen] Fix bugs in string prefix comparison.
9d00afc [Josh Rosen] Clean up prefix comparators for integral types
88aff18 [Josh Rosen] NULL_PREFIX has to be negative infinity for floating point types
613e16f [Josh Rosen] Test with larger data.
1d7ffaa [Josh Rosen] Somewhat hacky fix for descending sorts
08701e7 [Josh Rosen] Fix prefix comparison of null primitives.
b86e684 [Josh Rosen] Set global = true in UnsafeExternalSortSuite.
1c7bad8 [Josh Rosen] Make sorting of answers explicit in SparkPlanTest.checkAnswer().
b81a920 [Josh Rosen] Temporarily enable only the passing sort tests
5d6109d [Josh Rosen] Fix inconsistent handling / encoding of record lengths.
87b6ed9 [Josh Rosen] Fix critical issues in test which led to false negatives.
8d7fbe7 [Josh Rosen] Fixes to multiple spilling-related bugs.
82e21c1 [Josh Rosen] Force spilling in UnsafeExternalSortSuite.
88b72db [Josh Rosen] Test ascending and descending sort orders.
f27be09 [Josh Rosen] Fix tests by binding attributes.
0a79d39 [Josh Rosen] Revert "Undo part of a SparkPlanTest change in #7162 that broke my test."
7c3c864 [Josh Rosen] Undo part of a SparkPlanTest change in #7162 that broke my test.
9969c14 [Josh Rosen] Merge remote-tracking branch 'origin/master' into sql-external-sort
5822e6f [Josh Rosen] Fix test compilation issue
939f824 [Josh Rosen] Remove code gen experiment.
0dfe919 [Josh Rosen] Implement prefix sort for strings (albeit inefficiently).
66a813e [Josh Rosen] Prefix comparators for float and double
b310c88 [Josh Rosen] Integrate prefix comparators for Int and Long (others coming soon)
95058d9 [Josh Rosen] Add missing SortPrefixUtils file
4c37ba6 [Josh Rosen] Add tests for sorting on all primitive types.
6890863 [Josh Rosen] Fix memory leak on empty inputs.
d246e29 [Josh Rosen] Fix consideration of column types when choosing sort implementation.
6b156fb [Josh Rosen] Some WIP work on prefix comparison.
7f875f9 [Josh Rosen] Commit failing test demonstrating bug in handling objects in spills
41b8881 [Josh Rosen] Get UnsafeInMemorySorterSuite to pass (WIP)
90c2b6a [Josh Rosen] Update test name
6d6a1e6 [Josh Rosen] Centralize logic for picking sort operator implementations
9869ec2 [Josh Rosen] Clean up Exchange code a bit
82bb0ec [Josh Rosen] Fix IntelliJ complaint due to negated if condition
1db845a [Josh Rosen] Many more changes to harmonize with shuffle sorter
ebf9eea [Josh Rosen] Harmonization with shuffle's unsafe sorter
206bfa2 [Josh Rosen] Add some missing newlines at the ends of files
26c8931 [Josh Rosen] Back out some Hive changes that aren't needed anymore
62f0bb8 [Josh Rosen] Update to reflect SparkPlanTest changes
21d7d93 [Josh Rosen] Back out of BlockObjectWriter change
7eafecf [Josh Rosen] Port test to SparkPlanTest
d468a88 [Josh Rosen] Update for InternalRow refactoring
269cf86 [Josh Rosen] Back out SMJ operator change; isolate changes to selection of sort op.
1b841ca [Josh Rosen] WIP towards copying
b420a71 [Josh Rosen] Move most of the existing SMJ code into Java.
dfdb93f [Josh Rosen] SparkFunSuite change
73cc761 [Josh Rosen] Fix whitespace
9cc98f5 [Josh Rosen] Move more code to Java; fix bugs in UnsafeRowConverter length type.
c8792de [Josh Rosen] Remove some debug logging
dda6752 [Josh Rosen] Commit some missing code from an old git stash.
58f36d0 [Josh Rosen] Merge in a sketch of a unit test for the new sorter (now failing).
2bd8c9a [Josh Rosen] Import my original tests and get them to pass.
d5d3106 [Josh Rosen] WIP towards external sorter for Spark SQL.
---
 core/pom.xml                                  |  20 +-
 .../DummySerializerInstance.java              |   9 +-
 .../unsafe/UnsafeShuffleExternalSorter.java   |   1 +
 .../unsafe/sort/PrefixComparator.java         |  29 ++
 .../unsafe/sort/PrefixComparators.java        | 109 +++++++
 .../unsafe/sort/RecordComparator.java         |  37 +++
 .../sort/RecordPointerAndKeyPrefix.java       |  31 ++
 .../unsafe/sort/UnsafeExternalSorter.java     | 282 ++++++++++++++++++
 .../unsafe/sort/UnsafeInMemorySorter.java     | 189 ++++++++++++
 .../unsafe/sort/UnsafeSortDataFormat.java     |  80 +++++
 .../unsafe/sort/UnsafeSorterIterator.java     |  35 +++
 .../unsafe/sort/UnsafeSorterSpillMerger.java  |  91 ++++++
 .../unsafe/sort/UnsafeSorterSpillReader.java  |  98 ++++++
 .../unsafe/sort/UnsafeSorterSpillWriter.java  | 146 +++++++++
 .../sort/UnsafeExternalSorterSuite.java       | 202 +++++++++++++
 .../sort/UnsafeInMemorySorterSuite.java       | 139 +++++++++
 .../unsafe/sort/PrefixComparatorsSuite.scala  |  50 ++++
 .../sql/catalyst/expressions/UnsafeRow.java   |   7 +-
 .../execution/UnsafeExternalRowSorter.java    | 216 ++++++++++++++
 .../spark/sql/AbstractScalaRowIterator.scala  |  27 ++
 .../apache/spark/sql/execution/Exchange.scala |  13 +-
 .../spark/sql/execution/SortPrefixUtils.scala |  97 ++++++
 .../spark/sql/execution/SparkStrategies.scala |  22 +-
 .../spark/sql/execution/basicOperators.scala  |  73 +++++
 .../spark/sql/execution/SortSuite.scala       |  11 +-
 .../spark/sql/execution/SparkPlanTest.scala   | 253 ++++++++++------
 .../execution/UnsafeExternalSortSuite.scala   | 104 +++++++
 .../sql/execution/joins/OuterJoinSuite.scala  |  21 +-
 28 files changed, 2254 insertions(+), 138 deletions(-)
 rename core/src/main/java/org/apache/spark/{shuffle/unsafe => serializer}/DummySerializerInstance.java (91%)
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparator.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordComparator.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordPointerAndKeyPrefix.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
 create mode 100644 core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java
 create mode 100644 core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java
 create mode 100644 core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java
 create mode 100644 core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
 create mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/AbstractScalaRowIterator.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala

diff --git a/core/pom.xml b/core/pom.xml
index aee0d92620606..558cc3fb9f2f3 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -343,28 +343,28 @@
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>org.mockito</groupId>
-      <artifactId>mockito-core</artifactId>
+      <groupId>org.hamcrest</groupId>
+      <artifactId>hamcrest-core</artifactId>
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>org.scalacheck</groupId>
-      <artifactId>scalacheck_${scala.binary.version}</artifactId>
+      <groupId>org.hamcrest</groupId>
+      <artifactId>hamcrest-library</artifactId>
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>org.hamcrest</groupId>
-      <artifactId>hamcrest-core</artifactId>
+      <groupId>org.scalacheck</groupId>
+      <artifactId>scalacheck_${scala.binary.version}</artifactId>
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>org.hamcrest</groupId>
-      <artifactId>hamcrest-library</artifactId>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
     <dependency>
diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/DummySerializerInstance.java b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
similarity index 91%
rename from core/src/main/java/org/apache/spark/shuffle/unsafe/DummySerializerInstance.java
rename to core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
index 3f746b886bc9b..0399abc63c235 100644
--- a/core/src/main/java/org/apache/spark/shuffle/unsafe/DummySerializerInstance.java
+++ b/core/src/main/java/org/apache/spark/serializer/DummySerializerInstance.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.spark.shuffle.unsafe;
+package org.apache.spark.serializer;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -24,9 +24,7 @@
 
 import scala.reflect.ClassTag;
 
-import org.apache.spark.serializer.DeserializationStream;
-import org.apache.spark.serializer.SerializationStream;
-import org.apache.spark.serializer.SerializerInstance;
+import org.apache.spark.annotation.Private;
 import org.apache.spark.unsafe.PlatformDependent;
 
 /**
@@ -35,7 +33,8 @@
  * `write() OutputStream methods), but DiskBlockObjectWriter still calls some methods on it. To work
  * around this, we pass a dummy no-op serializer.
  */
-final class DummySerializerInstance extends SerializerInstance {
+@Private
+public final class DummySerializerInstance extends SerializerInstance {
 
   public static final DummySerializerInstance INSTANCE = new DummySerializerInstance();
 
diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
index 9e9ed94b7890c..56289573209fb 100644
--- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
@@ -30,6 +30,7 @@
 import org.apache.spark.SparkConf;
 import org.apache.spark.TaskContext;
 import org.apache.spark.executor.ShuffleWriteMetrics;
+import org.apache.spark.serializer.DummySerializerInstance;
 import org.apache.spark.serializer.SerializerInstance;
 import org.apache.spark.shuffle.ShuffleMemoryManager;
 import org.apache.spark.storage.*;
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparator.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparator.java
new file mode 100644
index 0000000000000..45b78829e4cf7
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparator.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import org.apache.spark.annotation.Private;
+
+/**
+ * Compares 8-byte key prefixes in prefix sort. Subclasses may implement type-specific
+ * comparisons, such as lexicographic comparison for strings.
+ */
+@Private
+public abstract class PrefixComparator {
+  public abstract int compare(long prefix1, long prefix2);
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
new file mode 100644
index 0000000000000..438742565c51d
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import com.google.common.base.Charsets;
+import com.google.common.primitives.Longs;
+import com.google.common.primitives.UnsignedBytes;
+
+import org.apache.spark.annotation.Private;
+import org.apache.spark.unsafe.types.UTF8String;
+
+@Private
+public class PrefixComparators {
+  private PrefixComparators() {}
+
+  public static final StringPrefixComparator STRING = new StringPrefixComparator();
+  public static final IntegralPrefixComparator INTEGRAL = new IntegralPrefixComparator();
+  public static final FloatPrefixComparator FLOAT = new FloatPrefixComparator();
+  public static final DoublePrefixComparator DOUBLE = new DoublePrefixComparator();
+
+  public static final class StringPrefixComparator extends PrefixComparator {
+    @Override
+    public int compare(long aPrefix, long bPrefix) {
+      // TODO: can done more efficiently
+      byte[] a = Longs.toByteArray(aPrefix);
+      byte[] b = Longs.toByteArray(bPrefix);
+      for (int i = 0; i < 8; i++) {
+        int c = UnsignedBytes.compare(a[i], b[i]);
+        if (c != 0) return c;
+      }
+      return 0;
+    }
+
+    public long computePrefix(byte[] bytes) {
+      if (bytes == null) {
+        return 0L;
+      } else {
+        byte[] padded = new byte[8];
+        System.arraycopy(bytes, 0, padded, 0, Math.min(bytes.length, 8));
+        return Longs.fromByteArray(padded);
+      }
+    }
+
+    public long computePrefix(String value) {
+      return value == null ? 0L : computePrefix(value.getBytes(Charsets.UTF_8));
+    }
+
+    public long computePrefix(UTF8String value) {
+      return value == null ? 0L : computePrefix(value.getBytes());
+    }
+  }
+
+  /**
+   * Prefix comparator for all integral types (boolean, byte, short, int, long).
+   */
+  public static final class IntegralPrefixComparator extends PrefixComparator {
+    @Override
+    public int compare(long a, long b) {
+      return (a < b) ? -1 : (a > b) ? 1 : 0;
+    }
+
+    public final long NULL_PREFIX = Long.MIN_VALUE;
+  }
+
+  public static final class FloatPrefixComparator extends PrefixComparator {
+    @Override
+    public int compare(long aPrefix, long bPrefix) {
+      float a = Float.intBitsToFloat((int) aPrefix);
+      float b = Float.intBitsToFloat((int) bPrefix);
+      return (a < b) ? -1 : (a > b) ? 1 : 0;
+    }
+
+    public long computePrefix(float value) {
+      return Float.floatToIntBits(value) & 0xffffffffL;
+    }
+
+    public final long NULL_PREFIX = computePrefix(Float.NEGATIVE_INFINITY);
+  }
+
+  public static final class DoublePrefixComparator extends PrefixComparator {
+    @Override
+    public int compare(long aPrefix, long bPrefix) {
+      double a = Double.longBitsToDouble(aPrefix);
+      double b = Double.longBitsToDouble(bPrefix);
+      return (a < b) ? -1 : (a > b) ? 1 : 0;
+    }
+
+    public long computePrefix(double value) {
+      return Double.doubleToLongBits(value);
+    }
+
+    public final long NULL_PREFIX = computePrefix(Double.NEGATIVE_INFINITY);
+  }
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordComparator.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordComparator.java
new file mode 100644
index 0000000000000..09e4258792204
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordComparator.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+/**
+ * Compares records for ordering. In cases where the entire sorting key can fit in the 8-byte
+ * prefix, this may simply return 0.
+ */
+public abstract class RecordComparator {
+
+  /**
+   * Compare two records for order.
+   *
+   * @return a negative integer, zero, or a positive integer as the first record is less than,
+   *         equal to, or greater than the second.
+   */
+  public abstract int compare(
+    Object leftBaseObject,
+    long leftBaseOffset,
+    Object rightBaseObject,
+    long rightBaseOffset);
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordPointerAndKeyPrefix.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordPointerAndKeyPrefix.java
new file mode 100644
index 0000000000000..0c4ebde407cfc
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RecordPointerAndKeyPrefix.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+final class RecordPointerAndKeyPrefix {
+  /**
+   * A pointer to a record; see {@link org.apache.spark.unsafe.memory.TaskMemoryManager} for a
+   * description of how these addresses are encoded.
+   */
+  public long recordPointer;
+
+  /**
+   * A key prefix, for use in comparisons.
+   */
+  public long keyPrefix;
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
new file mode 100644
index 0000000000000..4d6731ee60af3
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.TaskContext;
+import org.apache.spark.executor.ShuffleWriteMetrics;
+import org.apache.spark.shuffle.ShuffleMemoryManager;
+import org.apache.spark.storage.BlockManager;
+import org.apache.spark.unsafe.PlatformDependent;
+import org.apache.spark.unsafe.memory.MemoryBlock;
+import org.apache.spark.unsafe.memory.TaskMemoryManager;
+import org.apache.spark.util.Utils;
+
+/**
+ * External sorter based on {@link UnsafeInMemorySorter}.
+ */
+public final class UnsafeExternalSorter {
+
+  private final Logger logger = LoggerFactory.getLogger(UnsafeExternalSorter.class);
+
+  private static final int PAGE_SIZE = 1 << 27;  // 128 megabytes
+  @VisibleForTesting
+  static final int MAX_RECORD_SIZE = PAGE_SIZE - 4;
+
+  private final PrefixComparator prefixComparator;
+  private final RecordComparator recordComparator;
+  private final int initialSize;
+  private final TaskMemoryManager memoryManager;
+  private final ShuffleMemoryManager shuffleMemoryManager;
+  private final BlockManager blockManager;
+  private final TaskContext taskContext;
+  private ShuffleWriteMetrics writeMetrics;
+
+  /** The buffer size to use when writing spills using DiskBlockObjectWriter */
+  private final int fileBufferSizeBytes;
+
+  /**
+   * Memory pages that hold the records being sorted. The pages in this list are freed when
+   * spilling, although in principle we could recycle these pages across spills (on the other hand,
+   * this might not be necessary if we maintained a pool of re-usable pages in the TaskMemoryManager
+   * itself).
+   */
+  private final LinkedList<MemoryBlock> allocatedPages = new LinkedList<MemoryBlock>();
+
+  // These variables are reset after spilling:
+  private UnsafeInMemorySorter sorter;
+  private MemoryBlock currentPage = null;
+  private long currentPagePosition = -1;
+  private long freeSpaceInCurrentPage = 0;
+
+  private final LinkedList<UnsafeSorterSpillWriter> spillWriters = new LinkedList<>();
+
+  public UnsafeExternalSorter(
+      TaskMemoryManager memoryManager,
+      ShuffleMemoryManager shuffleMemoryManager,
+      BlockManager blockManager,
+      TaskContext taskContext,
+      RecordComparator recordComparator,
+      PrefixComparator prefixComparator,
+      int initialSize,
+      SparkConf conf) throws IOException {
+    this.memoryManager = memoryManager;
+    this.shuffleMemoryManager = shuffleMemoryManager;
+    this.blockManager = blockManager;
+    this.taskContext = taskContext;
+    this.recordComparator = recordComparator;
+    this.prefixComparator = prefixComparator;
+    this.initialSize = initialSize;
+    // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units
+    this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
+    initializeForWriting();
+  }
+
+  // TODO: metrics tracking + integration with shuffle write metrics
+  // need to connect the write metrics to task metrics so we count the spill IO somewhere.
+
+  /**
+   * Allocates new sort data structures. Called when creating the sorter and after each spill.
+   */
+  private void initializeForWriting() throws IOException {
+    this.writeMetrics = new ShuffleWriteMetrics();
+    // TODO: move this sizing calculation logic into a static method of sorter:
+    final long memoryRequested = initialSize * 8L * 2;
+    final long memoryAcquired = shuffleMemoryManager.tryToAcquire(memoryRequested);
+    if (memoryAcquired != memoryRequested) {
+      shuffleMemoryManager.release(memoryAcquired);
+      throw new IOException("Could not acquire " + memoryRequested + " bytes of memory");
+    }
+
+    this.sorter =
+      new UnsafeInMemorySorter(memoryManager, recordComparator, prefixComparator, initialSize);
+  }
+
+  /**
+   * Sort and spill the current records in response to memory pressure.
+   */
+  @VisibleForTesting
+  public void spill() throws IOException {
+    logger.info("Thread {} spilling sort data of {} to disk ({} {} so far)",
+      Thread.currentThread().getId(),
+      Utils.bytesToString(getMemoryUsage()),
+      spillWriters.size(),
+      spillWriters.size() > 1 ? " times" : " time");
+
+    final UnsafeSorterSpillWriter spillWriter =
+      new UnsafeSorterSpillWriter(blockManager, fileBufferSizeBytes, writeMetrics,
+        sorter.numRecords());
+    spillWriters.add(spillWriter);
+    final UnsafeSorterIterator sortedRecords = sorter.getSortedIterator();
+    while (sortedRecords.hasNext()) {
+      sortedRecords.loadNext();
+      final Object baseObject = sortedRecords.getBaseObject();
+      final long baseOffset = sortedRecords.getBaseOffset();
+      final int recordLength = sortedRecords.getRecordLength();
+      spillWriter.write(baseObject, baseOffset, recordLength, sortedRecords.getKeyPrefix());
+    }
+    spillWriter.close();
+    final long sorterMemoryUsage = sorter.getMemoryUsage();
+    sorter = null;
+    shuffleMemoryManager.release(sorterMemoryUsage);
+    final long spillSize = freeMemory();
+    taskContext.taskMetrics().incMemoryBytesSpilled(spillSize);
+    initializeForWriting();
+  }
+
+  private long getMemoryUsage() {
+    return sorter.getMemoryUsage() + (allocatedPages.size() * (long) PAGE_SIZE);
+  }
+
+  public long freeMemory() {
+    long memoryFreed = 0;
+    for (MemoryBlock block : allocatedPages) {
+      memoryManager.freePage(block);
+      shuffleMemoryManager.release(block.size());
+      memoryFreed += block.size();
+    }
+    allocatedPages.clear();
+    currentPage = null;
+    currentPagePosition = -1;
+    freeSpaceInCurrentPage = 0;
+    return memoryFreed;
+  }
+
+  /**
+   * Checks whether there is enough space to insert a new record into the sorter.
+   *
+   * @param requiredSpace the required space in the data page, in bytes, including space for storing
+   *                      the record size.
+
+   * @return true if the record can be inserted without requiring more allocations, false otherwise.
+   */
+  private boolean haveSpaceForRecord(int requiredSpace) {
+    assert (requiredSpace > 0);
+    return (sorter.hasSpaceForAnotherRecord() && (requiredSpace <= freeSpaceInCurrentPage));
+  }
+
+  /**
+   * Allocates more memory in order to insert an additional record. This will request additional
+   * memory from the {@link ShuffleMemoryManager} and spill if the requested memory can not be
+   * obtained.
+   *
+   * @param requiredSpace the required space in the data page, in bytes, including space for storing
+   *                      the record size.
+   */
+  private void allocateSpaceForRecord(int requiredSpace) throws IOException {
+    // TODO: merge these steps to first calculate total memory requirements for this insert,
+    // then try to acquire; no point in acquiring sort buffer only to spill due to no space in the
+    // data page.
+    if (!sorter.hasSpaceForAnotherRecord()) {
+      logger.debug("Attempting to expand sort pointer array");
+      final long oldPointerArrayMemoryUsage = sorter.getMemoryUsage();
+      final long memoryToGrowPointerArray = oldPointerArrayMemoryUsage * 2;
+      final long memoryAcquired = shuffleMemoryManager.tryToAcquire(memoryToGrowPointerArray);
+      if (memoryAcquired < memoryToGrowPointerArray) {
+        shuffleMemoryManager.release(memoryAcquired);
+        spill();
+      } else {
+        sorter.expandPointerArray();
+        shuffleMemoryManager.release(oldPointerArrayMemoryUsage);
+      }
+    }
+
+    if (requiredSpace > freeSpaceInCurrentPage) {
+      logger.trace("Required space {} is less than free space in current page ({})", requiredSpace,
+        freeSpaceInCurrentPage);
+      // TODO: we should track metrics on the amount of space wasted when we roll over to a new page
+      // without using the free space at the end of the current page. We should also do this for
+      // BytesToBytesMap.
+      if (requiredSpace > PAGE_SIZE) {
+        throw new IOException("Required space " + requiredSpace + " is greater than page size (" +
+          PAGE_SIZE + ")");
+      } else {
+        final long memoryAcquired = shuffleMemoryManager.tryToAcquire(PAGE_SIZE);
+        if (memoryAcquired < PAGE_SIZE) {
+          shuffleMemoryManager.release(memoryAcquired);
+          spill();
+          final long memoryAcquiredAfterSpilling = shuffleMemoryManager.tryToAcquire(PAGE_SIZE);
+          if (memoryAcquiredAfterSpilling != PAGE_SIZE) {
+            shuffleMemoryManager.release(memoryAcquiredAfterSpilling);
+            throw new IOException("Unable to acquire " + PAGE_SIZE + " bytes of memory");
+          }
+        }
+        currentPage = memoryManager.allocatePage(PAGE_SIZE);
+        currentPagePosition = currentPage.getBaseOffset();
+        freeSpaceInCurrentPage = PAGE_SIZE;
+        allocatedPages.add(currentPage);
+      }
+    }
+  }
+
+  /**
+   * Write a record to the sorter.
+   */
+  public void insertRecord(
+      Object recordBaseObject,
+      long recordBaseOffset,
+      int lengthInBytes,
+      long prefix) throws IOException {
+    // Need 4 bytes to store the record length.
+    final int totalSpaceRequired = lengthInBytes + 4;
+    if (!haveSpaceForRecord(totalSpaceRequired)) {
+      allocateSpaceForRecord(totalSpaceRequired);
+    }
+
+    final long recordAddress =
+      memoryManager.encodePageNumberAndOffset(currentPage, currentPagePosition);
+    final Object dataPageBaseObject = currentPage.getBaseObject();
+    PlatformDependent.UNSAFE.putInt(dataPageBaseObject, currentPagePosition, lengthInBytes);
+    currentPagePosition += 4;
+    PlatformDependent.copyMemory(
+      recordBaseObject,
+      recordBaseOffset,
+      dataPageBaseObject,
+      currentPagePosition,
+      lengthInBytes);
+    currentPagePosition += lengthInBytes;
+
+    sorter.insertRecord(recordAddress, prefix);
+  }
+
+  public UnsafeSorterIterator getSortedIterator() throws IOException {
+    final UnsafeSorterIterator inMemoryIterator = sorter.getSortedIterator();
+    int numIteratorsToMerge = spillWriters.size() + (inMemoryIterator.hasNext() ? 1 : 0);
+    if (spillWriters.isEmpty()) {
+      return inMemoryIterator;
+    } else {
+      final UnsafeSorterSpillMerger spillMerger =
+        new UnsafeSorterSpillMerger(recordComparator, prefixComparator, numIteratorsToMerge);
+      for (UnsafeSorterSpillWriter spillWriter : spillWriters) {
+        spillMerger.addSpill(spillWriter.getReader(blockManager));
+      }
+      spillWriters.clear();
+      if (inMemoryIterator.hasNext()) {
+        spillMerger.addSpill(inMemoryIterator);
+      }
+      return spillMerger.getSortedIterator();
+    }
+  }
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
new file mode 100644
index 0000000000000..fc34ad9cff369
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import java.util.Comparator;
+
+import org.apache.spark.unsafe.PlatformDependent;
+import org.apache.spark.util.collection.Sorter;
+import org.apache.spark.unsafe.memory.TaskMemoryManager;
+
+/**
+ * Sorts records using an AlphaSort-style key-prefix sort. This sort stores pointers to records
+ * alongside a user-defined prefix of the record's sorting key. When the underlying sort algorithm
+ * compares records, it will first compare the stored key prefixes; if the prefixes are not equal,
+ * then we do not need to traverse the record pointers to compare the actual records. Avoiding these
+ * random memory accesses improves cache hit rates.
+ */
+public final class UnsafeInMemorySorter {
+
+  private static final class SortComparator implements Comparator<RecordPointerAndKeyPrefix> {
+
+    private final RecordComparator recordComparator;
+    private final PrefixComparator prefixComparator;
+    private final TaskMemoryManager memoryManager;
+
+    SortComparator(
+        RecordComparator recordComparator,
+        PrefixComparator prefixComparator,
+        TaskMemoryManager memoryManager) {
+      this.recordComparator = recordComparator;
+      this.prefixComparator = prefixComparator;
+      this.memoryManager = memoryManager;
+    }
+
+    @Override
+    public int compare(RecordPointerAndKeyPrefix r1, RecordPointerAndKeyPrefix r2) {
+      final int prefixComparisonResult = prefixComparator.compare(r1.keyPrefix, r2.keyPrefix);
+      if (prefixComparisonResult == 0) {
+        final Object baseObject1 = memoryManager.getPage(r1.recordPointer);
+        final long baseOffset1 = memoryManager.getOffsetInPage(r1.recordPointer) + 4; // skip length
+        final Object baseObject2 = memoryManager.getPage(r2.recordPointer);
+        final long baseOffset2 = memoryManager.getOffsetInPage(r2.recordPointer) + 4; // skip length
+        return recordComparator.compare(baseObject1, baseOffset1, baseObject2, baseOffset2);
+      } else {
+        return prefixComparisonResult;
+      }
+    }
+  }
+
+  private final TaskMemoryManager memoryManager;
+  private final Sorter<RecordPointerAndKeyPrefix, long[]> sorter;
+  private final Comparator<RecordPointerAndKeyPrefix> sortComparator;
+
+  /**
+   * Within this buffer, position {@code 2 * i} holds a pointer pointer to the record at
+   * index {@code i}, while position {@code 2 * i + 1} in the array holds an 8-byte key prefix.
+   */
+  private long[] pointerArray;
+
+  /**
+   * The position in the sort buffer where new records can be inserted.
+   */
+  private int pointerArrayInsertPosition = 0;
+
+  public UnsafeInMemorySorter(
+      final TaskMemoryManager memoryManager,
+      final RecordComparator recordComparator,
+      final PrefixComparator prefixComparator,
+      int initialSize) {
+    assert (initialSize > 0);
+    this.pointerArray = new long[initialSize * 2];
+    this.memoryManager = memoryManager;
+    this.sorter = new Sorter<>(UnsafeSortDataFormat.INSTANCE);
+    this.sortComparator = new SortComparator(recordComparator, prefixComparator, memoryManager);
+  }
+
+  /**
+   * @return the number of records that have been inserted into this sorter.
+   */
+  public int numRecords() {
+    return pointerArrayInsertPosition / 2;
+  }
+
+  public long getMemoryUsage() {
+    return pointerArray.length * 8L;
+  }
+
+  public boolean hasSpaceForAnotherRecord() {
+    return pointerArrayInsertPosition + 2 < pointerArray.length;
+  }
+
+  public void expandPointerArray() {
+    final long[] oldArray = pointerArray;
+    // Guard against overflow:
+    final int newLength = oldArray.length * 2 > 0 ? (oldArray.length * 2) : Integer.MAX_VALUE;
+    pointerArray = new long[newLength];
+    System.arraycopy(oldArray, 0, pointerArray, 0, oldArray.length);
+  }
+
+  /**
+   * Inserts a record to be sorted. Assumes that the record pointer points to a record length
+   * stored as a 4-byte integer, followed by the record's bytes.
+   *
+   * @param recordPointer pointer to a record in a data page, encoded by {@link TaskMemoryManager}.
+   * @param keyPrefix a user-defined key prefix
+   */
+  public void insertRecord(long recordPointer, long keyPrefix) {
+    if (!hasSpaceForAnotherRecord()) {
+      expandPointerArray();
+    }
+    pointerArray[pointerArrayInsertPosition] = recordPointer;
+    pointerArrayInsertPosition++;
+    pointerArray[pointerArrayInsertPosition] = keyPrefix;
+    pointerArrayInsertPosition++;
+  }
+
+  private static final class SortedIterator extends UnsafeSorterIterator {
+
+    private final TaskMemoryManager memoryManager;
+    private final int sortBufferInsertPosition;
+    private final long[] sortBuffer;
+    private int position = 0;
+    private Object baseObject;
+    private long baseOffset;
+    private long keyPrefix;
+    private int recordLength;
+
+    SortedIterator(
+        TaskMemoryManager memoryManager,
+        int sortBufferInsertPosition,
+        long[] sortBuffer) {
+      this.memoryManager = memoryManager;
+      this.sortBufferInsertPosition = sortBufferInsertPosition;
+      this.sortBuffer = sortBuffer;
+    }
+
+    @Override
+    public boolean hasNext() {
+      return position < sortBufferInsertPosition;
+    }
+
+    @Override
+    public void loadNext() {
+      // This pointer points to a 4-byte record length, followed by the record's bytes
+      final long recordPointer = sortBuffer[position];
+      baseObject = memoryManager.getPage(recordPointer);
+      baseOffset = memoryManager.getOffsetInPage(recordPointer) + 4;  // Skip over record length
+      recordLength = PlatformDependent.UNSAFE.getInt(baseObject, baseOffset - 4);
+      keyPrefix = sortBuffer[position + 1];
+      position += 2;
+    }
+
+    @Override
+    public Object getBaseObject() { return baseObject; }
+
+    @Override
+    public long getBaseOffset() { return baseOffset; }
+
+    @Override
+    public int getRecordLength() { return recordLength; }
+
+    @Override
+    public long getKeyPrefix() { return keyPrefix; }
+  }
+
+  /**
+   * Return an iterator over record pointers in sorted order. For efficiency, all calls to
+   * {@code next()} will return the same mutable object.
+   */
+  public UnsafeSorterIterator getSortedIterator() {
+    sorter.sort(pointerArray, 0, pointerArrayInsertPosition / 2, sortComparator);
+    return new SortedIterator(memoryManager, pointerArrayInsertPosition, pointerArray);
+  }
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java
new file mode 100644
index 0000000000000..d09c728a7a638
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSortDataFormat.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import org.apache.spark.util.collection.SortDataFormat;
+
+/**
+ * Supports sorting an array of (record pointer, key prefix) pairs.
+ * Used in {@link UnsafeInMemorySorter}.
+ * <p>
+ * Within each long[] buffer, position {@code 2 * i} holds a pointer pointer to the record at
+ * index {@code i}, while position {@code 2 * i + 1} in the array holds an 8-byte key prefix.
+ */
+final class UnsafeSortDataFormat extends SortDataFormat<RecordPointerAndKeyPrefix, long[]> {
+
+  public static final UnsafeSortDataFormat INSTANCE = new UnsafeSortDataFormat();
+
+  private UnsafeSortDataFormat() { }
+
+  @Override
+  public RecordPointerAndKeyPrefix getKey(long[] data, int pos) {
+    // Since we re-use keys, this method shouldn't be called.
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public RecordPointerAndKeyPrefix newKey() {
+    return new RecordPointerAndKeyPrefix();
+  }
+
+  @Override
+  public RecordPointerAndKeyPrefix getKey(long[] data, int pos, RecordPointerAndKeyPrefix reuse) {
+    reuse.recordPointer = data[pos * 2];
+    reuse.keyPrefix = data[pos * 2 + 1];
+    return reuse;
+  }
+
+  @Override
+  public void swap(long[] data, int pos0, int pos1) {
+    long tempPointer = data[pos0 * 2];
+    long tempKeyPrefix = data[pos0 * 2 + 1];
+    data[pos0 * 2] = data[pos1 * 2];
+    data[pos0 * 2 + 1] = data[pos1 * 2 + 1];
+    data[pos1 * 2] = tempPointer;
+    data[pos1 * 2 + 1] = tempKeyPrefix;
+  }
+
+  @Override
+  public void copyElement(long[] src, int srcPos, long[] dst, int dstPos) {
+    dst[dstPos * 2] = src[srcPos * 2];
+    dst[dstPos * 2 + 1] = src[srcPos * 2 + 1];
+  }
+
+  @Override
+  public void copyRange(long[] src, int srcPos, long[] dst, int dstPos, int length) {
+    System.arraycopy(src, srcPos * 2, dst, dstPos * 2, length * 2);
+  }
+
+  @Override
+  public long[] allocate(int length) {
+    assert (length < Integer.MAX_VALUE / 2) : "Length " + length + " is too large";
+    return new long[length * 2];
+  }
+
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java
new file mode 100644
index 0000000000000..16ac2e8d821ba
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import java.io.IOException;
+
+public abstract class UnsafeSorterIterator {
+
+  public abstract boolean hasNext();
+
+  public abstract void loadNext() throws IOException;
+
+  public abstract Object getBaseObject();
+
+  public abstract long getBaseOffset();
+
+  public abstract int getRecordLength();
+
+  public abstract long getKeyPrefix();
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java
new file mode 100644
index 0000000000000..8272c2a5be0d1
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import java.io.IOException;
+import java.util.Comparator;
+import java.util.PriorityQueue;
+
+final class UnsafeSorterSpillMerger {
+
+  private final PriorityQueue<UnsafeSorterIterator> priorityQueue;
+
+  public UnsafeSorterSpillMerger(
+      final RecordComparator recordComparator,
+      final PrefixComparator prefixComparator,
+      final int numSpills) {
+    final Comparator<UnsafeSorterIterator> comparator = new Comparator<UnsafeSorterIterator>() {
+
+      @Override
+      public int compare(UnsafeSorterIterator left, UnsafeSorterIterator right) {
+        final int prefixComparisonResult =
+          prefixComparator.compare(left.getKeyPrefix(), right.getKeyPrefix());
+        if (prefixComparisonResult == 0) {
+          return recordComparator.compare(
+            left.getBaseObject(), left.getBaseOffset(),
+            right.getBaseObject(), right.getBaseOffset());
+        } else {
+          return prefixComparisonResult;
+        }
+      }
+    };
+    priorityQueue = new PriorityQueue<UnsafeSorterIterator>(numSpills, comparator);
+  }
+
+  public void addSpill(UnsafeSorterIterator spillReader) throws IOException {
+    if (spillReader.hasNext()) {
+      spillReader.loadNext();
+    }
+    priorityQueue.add(spillReader);
+  }
+
+  public UnsafeSorterIterator getSortedIterator() throws IOException {
+    return new UnsafeSorterIterator() {
+
+      private UnsafeSorterIterator spillReader;
+
+      @Override
+      public boolean hasNext() {
+        return !priorityQueue.isEmpty() || (spillReader != null && spillReader.hasNext());
+      }
+
+      @Override
+      public void loadNext() throws IOException {
+        if (spillReader != null) {
+          if (spillReader.hasNext()) {
+            spillReader.loadNext();
+            priorityQueue.add(spillReader);
+          }
+        }
+        spillReader = priorityQueue.remove();
+      }
+
+      @Override
+      public Object getBaseObject() { return spillReader.getBaseObject(); }
+
+      @Override
+      public long getBaseOffset() { return spillReader.getBaseOffset(); }
+
+      @Override
+      public int getRecordLength() { return spillReader.getRecordLength(); }
+
+      @Override
+      public long getKeyPrefix() { return spillReader.getKeyPrefix(); }
+    };
+  }
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
new file mode 100644
index 0000000000000..29e9e0f30f934
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import java.io.*;
+
+import com.google.common.io.ByteStreams;
+
+import org.apache.spark.storage.BlockId;
+import org.apache.spark.storage.BlockManager;
+import org.apache.spark.unsafe.PlatformDependent;
+
+/**
+ * Reads spill files written by {@link UnsafeSorterSpillWriter} (see that class for a description
+ * of the file format).
+ */
+final class UnsafeSorterSpillReader extends UnsafeSorterIterator {
+
+  private InputStream in;
+  private DataInputStream din;
+
+  // Variables that change with every record read:
+  private int recordLength;
+  private long keyPrefix;
+  private int numRecordsRemaining;
+
+  private byte[] arr = new byte[1024 * 1024];
+  private Object baseObject = arr;
+  private final long baseOffset = PlatformDependent.BYTE_ARRAY_OFFSET;
+
+  public UnsafeSorterSpillReader(
+      BlockManager blockManager,
+      File file,
+      BlockId blockId) throws IOException {
+    assert (file.length() > 0);
+    final BufferedInputStream bs = new BufferedInputStream(new FileInputStream(file));
+    this.in = blockManager.wrapForCompression(blockId, bs);
+    this.din = new DataInputStream(this.in);
+    numRecordsRemaining = din.readInt();
+  }
+
+  @Override
+  public boolean hasNext() {
+    return (numRecordsRemaining > 0);
+  }
+
+  @Override
+  public void loadNext() throws IOException {
+    recordLength = din.readInt();
+    keyPrefix = din.readLong();
+    if (recordLength > arr.length) {
+      arr = new byte[recordLength];
+      baseObject = arr;
+    }
+    ByteStreams.readFully(in, arr, 0, recordLength);
+    numRecordsRemaining--;
+    if (numRecordsRemaining == 0) {
+      in.close();
+      in = null;
+      din = null;
+    }
+  }
+
+  @Override
+  public Object getBaseObject() {
+    return baseObject;
+  }
+
+  @Override
+  public long getBaseOffset() {
+    return baseOffset;
+  }
+
+  @Override
+  public int getRecordLength() {
+    return recordLength;
+  }
+
+  @Override
+  public long getKeyPrefix() {
+    return keyPrefix;
+  }
+}
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java
new file mode 100644
index 0000000000000..b8d66659804ad
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import java.io.File;
+import java.io.IOException;
+
+import scala.Tuple2;
+
+import org.apache.spark.executor.ShuffleWriteMetrics;
+import org.apache.spark.serializer.DummySerializerInstance;
+import org.apache.spark.storage.BlockId;
+import org.apache.spark.storage.BlockManager;
+import org.apache.spark.storage.BlockObjectWriter;
+import org.apache.spark.storage.TempLocalBlockId;
+import org.apache.spark.unsafe.PlatformDependent;
+
+/**
+ * Spills a list of sorted records to disk. Spill files have the following format:
+ *
+ *   [# of records (int)] [[len (int)][prefix (long)][data (bytes)]...]
+ */
+final class UnsafeSorterSpillWriter {
+
+  static final int DISK_WRITE_BUFFER_SIZE = 1024 * 1024;
+
+  // Small writes to DiskBlockObjectWriter will be fairly inefficient. Since there doesn't seem to
+  // be an API to directly transfer bytes from managed memory to the disk writer, we buffer
+  // data through a byte array.
+  private byte[] writeBuffer = new byte[DISK_WRITE_BUFFER_SIZE];
+
+  private final File file;
+  private final BlockId blockId;
+  private final int numRecordsToWrite;
+  private BlockObjectWriter writer;
+  private int numRecordsSpilled = 0;
+
+  public UnsafeSorterSpillWriter(
+      BlockManager blockManager,
+      int fileBufferSize,
+      ShuffleWriteMetrics writeMetrics,
+      int numRecordsToWrite) throws IOException {
+    final Tuple2<TempLocalBlockId, File> spilledFileInfo =
+      blockManager.diskBlockManager().createTempLocalBlock();
+    this.file = spilledFileInfo._2();
+    this.blockId = spilledFileInfo._1();
+    this.numRecordsToWrite = numRecordsToWrite;
+    // Unfortunately, we need a serializer instance in order to construct a DiskBlockObjectWriter.
+    // Our write path doesn't actually use this serializer (since we end up calling the `write()`
+    // OutputStream methods), but DiskBlockObjectWriter still calls some methods on it. To work
+    // around this, we pass a dummy no-op serializer.
+    writer = blockManager.getDiskWriter(
+      blockId, file, DummySerializerInstance.INSTANCE, fileBufferSize, writeMetrics);
+    // Write the number of records
+    writeIntToBuffer(numRecordsToWrite, 0);
+    writer.write(writeBuffer, 0, 4);
+  }
+
+  // Based on DataOutputStream.writeLong.
+  private void writeLongToBuffer(long v, int offset) throws IOException {
+    writeBuffer[offset + 0] = (byte)(v >>> 56);
+    writeBuffer[offset + 1] = (byte)(v >>> 48);
+    writeBuffer[offset + 2] = (byte)(v >>> 40);
+    writeBuffer[offset + 3] = (byte)(v >>> 32);
+    writeBuffer[offset + 4] = (byte)(v >>> 24);
+    writeBuffer[offset + 5] = (byte)(v >>> 16);
+    writeBuffer[offset + 6] = (byte)(v >>>  8);
+    writeBuffer[offset + 7] = (byte)(v >>>  0);
+  }
+
+  // Based on DataOutputStream.writeInt.
+  private void writeIntToBuffer(int v, int offset) throws IOException {
+    writeBuffer[offset + 0] = (byte)(v >>> 24);
+    writeBuffer[offset + 1] = (byte)(v >>> 16);
+    writeBuffer[offset + 2] = (byte)(v >>>  8);
+    writeBuffer[offset + 3] = (byte)(v >>>  0);
+  }
+
+  /**
+   * Write a record to a spill file.
+   *
+   * @param baseObject the base object / memory page containing the record
+   * @param baseOffset the base offset which points directly to the record data.
+   * @param recordLength the length of the record.
+   * @param keyPrefix a sort key prefix
+   */
+  public void write(
+      Object baseObject,
+      long baseOffset,
+      int recordLength,
+      long keyPrefix) throws IOException {
+    if (numRecordsSpilled == numRecordsToWrite) {
+      throw new IllegalStateException(
+        "Number of records written exceeded numRecordsToWrite = " + numRecordsToWrite);
+    } else {
+      numRecordsSpilled++;
+    }
+    writeIntToBuffer(recordLength, 0);
+    writeLongToBuffer(keyPrefix, 4);
+    int dataRemaining = recordLength;
+    int freeSpaceInWriteBuffer = DISK_WRITE_BUFFER_SIZE - 4 - 8; // space used by prefix + len
+    long recordReadPosition = baseOffset;
+    while (dataRemaining > 0) {
+      final int toTransfer = Math.min(freeSpaceInWriteBuffer, dataRemaining);
+      PlatformDependent.copyMemory(
+        baseObject,
+        recordReadPosition,
+        writeBuffer,
+        PlatformDependent.BYTE_ARRAY_OFFSET + (DISK_WRITE_BUFFER_SIZE - freeSpaceInWriteBuffer),
+        toTransfer);
+      writer.write(writeBuffer, 0, (DISK_WRITE_BUFFER_SIZE - freeSpaceInWriteBuffer) + toTransfer);
+      recordReadPosition += toTransfer;
+      dataRemaining -= toTransfer;
+      freeSpaceInWriteBuffer = DISK_WRITE_BUFFER_SIZE;
+    }
+    if (freeSpaceInWriteBuffer < DISK_WRITE_BUFFER_SIZE) {
+      writer.write(writeBuffer, 0, (DISK_WRITE_BUFFER_SIZE - freeSpaceInWriteBuffer));
+    }
+    writer.recordWritten();
+  }
+
+  public void close() throws IOException {
+    writer.commitAndClose();
+    writer = null;
+    writeBuffer = null;
+  }
+
+  public UnsafeSorterSpillReader getReader(BlockManager blockManager) throws IOException {
+    return new UnsafeSorterSpillReader(blockManager, file, blockId);
+  }
+}
diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java
new file mode 100644
index 0000000000000..ea8755e21eb68
--- /dev/null
+++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import java.io.File;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.UUID;
+
+import scala.Tuple2;
+import scala.Tuple2$;
+import scala.runtime.AbstractFunction1;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mock;
+import org.mockito.MockitoAnnotations;
+import org.mockito.invocation.InvocationOnMock;
+import org.mockito.stubbing.Answer;
+import static org.junit.Assert.*;
+import static org.mockito.AdditionalAnswers.returnsFirstArg;
+import static org.mockito.AdditionalAnswers.returnsSecondArg;
+import static org.mockito.Answers.RETURNS_SMART_NULLS;
+import static org.mockito.Mockito.*;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.TaskContext;
+import org.apache.spark.executor.ShuffleWriteMetrics;
+import org.apache.spark.executor.TaskMetrics;
+import org.apache.spark.serializer.SerializerInstance;
+import org.apache.spark.shuffle.ShuffleMemoryManager;
+import org.apache.spark.storage.*;
+import org.apache.spark.unsafe.PlatformDependent;
+import org.apache.spark.unsafe.memory.ExecutorMemoryManager;
+import org.apache.spark.unsafe.memory.MemoryAllocator;
+import org.apache.spark.unsafe.memory.TaskMemoryManager;
+import org.apache.spark.util.Utils;
+
+public class UnsafeExternalSorterSuite {
+
+  final TaskMemoryManager memoryManager =
+    new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP));
+  // Use integer comparison for comparing prefixes (which are partition ids, in this case)
+  final PrefixComparator prefixComparator = new PrefixComparator() {
+    @Override
+    public int compare(long prefix1, long prefix2) {
+      return (int) prefix1 - (int) prefix2;
+    }
+  };
+  // Since the key fits within the 8-byte prefix, we don't need to do any record comparison, so
+  // use a dummy comparator
+  final RecordComparator recordComparator = new RecordComparator() {
+    @Override
+    public int compare(
+      Object leftBaseObject,
+      long leftBaseOffset,
+      Object rightBaseObject,
+      long rightBaseOffset) {
+      return 0;
+    }
+  };
+
+  @Mock(answer = RETURNS_SMART_NULLS) ShuffleMemoryManager shuffleMemoryManager;
+  @Mock(answer = RETURNS_SMART_NULLS) BlockManager blockManager;
+  @Mock(answer = RETURNS_SMART_NULLS) DiskBlockManager diskBlockManager;
+  @Mock(answer = RETURNS_SMART_NULLS) TaskContext taskContext;
+
+  File tempDir;
+
+  private static final class CompressStream extends AbstractFunction1<OutputStream, OutputStream> {
+    @Override
+    public OutputStream apply(OutputStream stream) {
+      return stream;
+    }
+  }
+
+  @Before
+  public void setUp() {
+    MockitoAnnotations.initMocks(this);
+    tempDir = new File(Utils.createTempDir$default$1());
+    taskContext = mock(TaskContext.class);
+    when(taskContext.taskMetrics()).thenReturn(new TaskMetrics());
+    when(shuffleMemoryManager.tryToAcquire(anyLong())).then(returnsFirstArg());
+    when(blockManager.diskBlockManager()).thenReturn(diskBlockManager);
+    when(diskBlockManager.createTempLocalBlock()).thenAnswer(new Answer<Tuple2<TempLocalBlockId, File>>() {
+      @Override
+      public Tuple2<TempLocalBlockId, File> answer(InvocationOnMock invocationOnMock) throws Throwable {
+        TempLocalBlockId blockId = new TempLocalBlockId(UUID.randomUUID());
+        File file = File.createTempFile("spillFile", ".spill", tempDir);
+        return Tuple2$.MODULE$.apply(blockId, file);
+      }
+    });
+    when(blockManager.getDiskWriter(
+      any(BlockId.class),
+      any(File.class),
+      any(SerializerInstance.class),
+      anyInt(),
+      any(ShuffleWriteMetrics.class))).thenAnswer(new Answer<DiskBlockObjectWriter>() {
+      @Override
+      public DiskBlockObjectWriter answer(InvocationOnMock invocationOnMock) throws Throwable {
+        Object[] args = invocationOnMock.getArguments();
+
+        return new DiskBlockObjectWriter(
+          (BlockId) args[0],
+          (File) args[1],
+          (SerializerInstance) args[2],
+          (Integer) args[3],
+          new CompressStream(),
+          false,
+          (ShuffleWriteMetrics) args[4]
+        );
+      }
+    });
+    when(blockManager.wrapForCompression(any(BlockId.class), any(InputStream.class)))
+      .then(returnsSecondArg());
+  }
+
+  private static void insertNumber(UnsafeExternalSorter sorter, int value) throws Exception {
+    final int[] arr = new int[] { value };
+    sorter.insertRecord(arr, PlatformDependent.INT_ARRAY_OFFSET, 4, value);
+  }
+
+  @Test
+  public void testSortingOnlyByPrefix() throws Exception {
+
+    final UnsafeExternalSorter sorter = new UnsafeExternalSorter(
+      memoryManager,
+      shuffleMemoryManager,
+      blockManager,
+      taskContext,
+      recordComparator,
+      prefixComparator,
+      1024,
+      new SparkConf());
+
+    insertNumber(sorter, 5);
+    insertNumber(sorter, 1);
+    insertNumber(sorter, 3);
+    sorter.spill();
+    insertNumber(sorter, 4);
+    sorter.spill();
+    insertNumber(sorter, 2);
+
+    UnsafeSorterIterator iter = sorter.getSortedIterator();
+
+    for (int i = 1; i <= 5; i++) {
+      iter.loadNext();
+      assertEquals(i, iter.getKeyPrefix());
+      assertEquals(4, iter.getRecordLength());
+      // TODO: read rest of value.
+    }
+
+    // TODO: test for cleanup:
+    // assert(tempDir.isEmpty)
+  }
+
+  @Test
+  public void testSortingEmptyArrays() throws Exception {
+
+    final UnsafeExternalSorter sorter = new UnsafeExternalSorter(
+      memoryManager,
+      shuffleMemoryManager,
+      blockManager,
+      taskContext,
+      recordComparator,
+      prefixComparator,
+      1024,
+      new SparkConf());
+
+    sorter.insertRecord(null, 0, 0, 0);
+    sorter.insertRecord(null, 0, 0, 0);
+    sorter.spill();
+    sorter.insertRecord(null, 0, 0, 0);
+    sorter.spill();
+    sorter.insertRecord(null, 0, 0, 0);
+    sorter.insertRecord(null, 0, 0, 0);
+
+    UnsafeSorterIterator iter = sorter.getSortedIterator();
+
+    for (int i = 1; i <= 5; i++) {
+      iter.loadNext();
+      assertEquals(0, iter.getKeyPrefix());
+      assertEquals(0, iter.getRecordLength());
+    }
+  }
+
+}
diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java
new file mode 100644
index 0000000000000..909500930539c
--- /dev/null
+++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.*;
+import static org.junit.Assert.*;
+import static org.mockito.Mockito.mock;
+
+import org.apache.spark.HashPartitioner;
+import org.apache.spark.unsafe.PlatformDependent;
+import org.apache.spark.unsafe.memory.ExecutorMemoryManager;
+import org.apache.spark.unsafe.memory.MemoryAllocator;
+import org.apache.spark.unsafe.memory.MemoryBlock;
+import org.apache.spark.unsafe.memory.TaskMemoryManager;
+
+public class UnsafeInMemorySorterSuite {
+
+  private static String getStringFromDataPage(Object baseObject, long baseOffset, int length) {
+    final byte[] strBytes = new byte[length];
+    PlatformDependent.copyMemory(
+      baseObject,
+      baseOffset,
+      strBytes,
+      PlatformDependent.BYTE_ARRAY_OFFSET, length);
+    return new String(strBytes);
+  }
+
+  @Test
+  public void testSortingEmptyInput() {
+    final UnsafeInMemorySorter sorter = new UnsafeInMemorySorter(
+      new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP)),
+      mock(RecordComparator.class),
+      mock(PrefixComparator.class),
+      100);
+    final UnsafeSorterIterator iter = sorter.getSortedIterator();
+    assert(!iter.hasNext());
+  }
+
+  @Test
+  public void testSortingOnlyByIntegerPrefix() throws Exception {
+    final String[] dataToSort = new String[] {
+      "Boba",
+      "Pearls",
+      "Tapioca",
+      "Taho",
+      "Condensed Milk",
+      "Jasmine",
+      "Milk Tea",
+      "Lychee",
+      "Mango"
+    };
+    final TaskMemoryManager memoryManager =
+      new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP));
+    final MemoryBlock dataPage = memoryManager.allocatePage(2048);
+    final Object baseObject = dataPage.getBaseObject();
+    // Write the records into the data page:
+    long position = dataPage.getBaseOffset();
+    for (String str : dataToSort) {
+      final byte[] strBytes = str.getBytes("utf-8");
+      PlatformDependent.UNSAFE.putInt(baseObject, position, strBytes.length);
+      position += 4;
+      PlatformDependent.copyMemory(
+        strBytes,
+        PlatformDependent.BYTE_ARRAY_OFFSET,
+        baseObject,
+        position,
+        strBytes.length);
+      position += strBytes.length;
+    }
+    // Since the key fits within the 8-byte prefix, we don't need to do any record comparison, so
+    // use a dummy comparator
+    final RecordComparator recordComparator = new RecordComparator() {
+      @Override
+      public int compare(
+        Object leftBaseObject,
+        long leftBaseOffset,
+        Object rightBaseObject,
+        long rightBaseOffset) {
+        return 0;
+      }
+    };
+    // Compute key prefixes based on the records' partition ids
+    final HashPartitioner hashPartitioner = new HashPartitioner(4);
+    // Use integer comparison for comparing prefixes (which are partition ids, in this case)
+    final PrefixComparator prefixComparator = new PrefixComparator() {
+      @Override
+      public int compare(long prefix1, long prefix2) {
+        return (int) prefix1 - (int) prefix2;
+      }
+    };
+    UnsafeInMemorySorter sorter = new UnsafeInMemorySorter(memoryManager, recordComparator,
+      prefixComparator, dataToSort.length);
+    // Given a page of records, insert those records into the sorter one-by-one:
+    position = dataPage.getBaseOffset();
+    for (int i = 0; i < dataToSort.length; i++) {
+      // position now points to the start of a record (which holds its length).
+      final int recordLength = PlatformDependent.UNSAFE.getInt(baseObject, position);
+      final long address = memoryManager.encodePageNumberAndOffset(dataPage, position);
+      final String str = getStringFromDataPage(baseObject, position + 4, recordLength);
+      final int partitionId = hashPartitioner.getPartition(str);
+      sorter.insertRecord(address, partitionId);
+      position += 4 + recordLength;
+    }
+    final UnsafeSorterIterator iter = sorter.getSortedIterator();
+    int iterLength = 0;
+    long prevPrefix = -1;
+    Arrays.sort(dataToSort);
+    while (iter.hasNext()) {
+      iter.loadNext();
+      final String str =
+        getStringFromDataPage(iter.getBaseObject(), iter.getBaseOffset(), iter.getRecordLength());
+      final long keyPrefix = iter.getKeyPrefix();
+      assertThat(str, isIn(Arrays.asList(dataToSort)));
+      assertThat(keyPrefix, greaterThanOrEqualTo(prevPrefix));
+      prevPrefix = keyPrefix;
+      iterLength++;
+    }
+    assertEquals(dataToSort.length, iterLength);
+  }
+}
diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
new file mode 100644
index 0000000000000..dd505dfa7d758
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util.collection.unsafe.sort
+
+import org.scalatest.prop.PropertyChecks
+
+import org.apache.spark.SparkFunSuite
+
+class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
+
+  test("String prefix comparator") {
+
+    def testPrefixComparison(s1: String, s2: String): Unit = {
+      val s1Prefix = PrefixComparators.STRING.computePrefix(s1)
+      val s2Prefix = PrefixComparators.STRING.computePrefix(s2)
+      val prefixComparisonResult = PrefixComparators.STRING.compare(s1Prefix, s2Prefix)
+      assert(
+        (prefixComparisonResult == 0) ||
+        (prefixComparisonResult < 0 && s1 < s2) ||
+        (prefixComparisonResult > 0 && s1 > s2))
+    }
+
+    // scalastyle:off
+    val regressionTests = Table(
+      ("s1", "s2"),
+      ("abc", "世界"),
+      ("你好", "世界"),
+      ("你好123", "你好122")
+    )
+    // scalastyle:on
+
+    forAll (regressionTests) { (s1: String, s2: String) => testPrefixComparison(s1, s2) }
+    forAll { (s1: String, s2: String) => testPrefixComparison(s1, s2) }
+  }
+}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index edb7202245289..4b99030d1046f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -61,9 +61,10 @@ public final class UnsafeRow extends MutableRow {
   /** A pool to hold non-primitive objects */
   private ObjectPool pool;
 
-  Object getBaseObject() { return baseObject; }
-  long getBaseOffset() { return baseOffset; }
-  ObjectPool getPool() { return pool; }
+  public Object getBaseObject() { return baseObject; }
+  public long getBaseOffset() { return baseOffset; }
+  public int getSizeInBytes() { return sizeInBytes; }
+  public ObjectPool getPool() { return pool; }
 
   /** The number of fields in this row, used for calculating the bitset width (and in assertions) */
   private int numFields;
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
new file mode 100644
index 0000000000000..b94601cf6d818
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution;
+
+import java.io.IOException;
+
+import scala.collection.Iterator;
+import scala.math.Ordering;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.spark.SparkEnv;
+import org.apache.spark.TaskContext;
+import org.apache.spark.sql.AbstractScalaRowIterator;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.ObjectUnsafeColumnWriter;
+import org.apache.spark.sql.catalyst.expressions.UnsafeColumnWriter;
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
+import org.apache.spark.sql.catalyst.expressions.UnsafeRowConverter;
+import org.apache.spark.sql.catalyst.util.ObjectPool;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.unsafe.PlatformDependent;
+import org.apache.spark.util.collection.unsafe.sort.PrefixComparator;
+import org.apache.spark.util.collection.unsafe.sort.RecordComparator;
+import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter;
+import org.apache.spark.util.collection.unsafe.sort.UnsafeSorterIterator;
+
+final class UnsafeExternalRowSorter {
+
+  /**
+   * If positive, forces records to be spilled to disk at the given frequency (measured in numbers
+   * of records). This is only intended to be used in tests.
+   */
+  private int testSpillFrequency = 0;
+
+  private long numRowsInserted = 0;
+
+  private final StructType schema;
+  private final UnsafeRowConverter rowConverter;
+  private final PrefixComputer prefixComputer;
+  private final UnsafeExternalSorter sorter;
+  private byte[] rowConversionBuffer = new byte[1024 * 8];
+
+  public static abstract class PrefixComputer {
+    abstract long computePrefix(InternalRow row);
+  }
+
+  public UnsafeExternalRowSorter(
+      StructType schema,
+      Ordering<InternalRow> ordering,
+      PrefixComparator prefixComparator,
+      PrefixComputer prefixComputer) throws IOException {
+    this.schema = schema;
+    this.rowConverter = new UnsafeRowConverter(schema);
+    this.prefixComputer = prefixComputer;
+    final SparkEnv sparkEnv = SparkEnv.get();
+    final TaskContext taskContext = TaskContext.get();
+    sorter = new UnsafeExternalSorter(
+      taskContext.taskMemoryManager(),
+      sparkEnv.shuffleMemoryManager(),
+      sparkEnv.blockManager(),
+      taskContext,
+      new RowComparator(ordering, schema.length(), null),
+      prefixComparator,
+      4096,
+      sparkEnv.conf()
+    );
+  }
+
+  /**
+   * Forces spills to occur every `frequency` records. Only for use in tests.
+   */
+  @VisibleForTesting
+  void setTestSpillFrequency(int frequency) {
+    assert frequency > 0 : "Frequency must be positive";
+    testSpillFrequency = frequency;
+  }
+
+  @VisibleForTesting
+  void insertRow(InternalRow row) throws IOException {
+    final int sizeRequirement = rowConverter.getSizeRequirement(row);
+    if (sizeRequirement > rowConversionBuffer.length) {
+      rowConversionBuffer = new byte[sizeRequirement];
+    }
+    final int bytesWritten = rowConverter.writeRow(
+      row, rowConversionBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, sizeRequirement, null);
+    assert (bytesWritten == sizeRequirement);
+    final long prefix = prefixComputer.computePrefix(row);
+    sorter.insertRecord(
+      rowConversionBuffer,
+      PlatformDependent.BYTE_ARRAY_OFFSET,
+      sizeRequirement,
+      prefix
+    );
+    numRowsInserted++;
+    if (testSpillFrequency > 0 && (numRowsInserted % testSpillFrequency) == 0) {
+      spill();
+    }
+  }
+
+  @VisibleForTesting
+  void spill() throws IOException {
+    sorter.spill();
+  }
+
+  private void cleanupResources() {
+    sorter.freeMemory();
+  }
+
+  @VisibleForTesting
+  Iterator<InternalRow> sort() throws IOException {
+    try {
+      final UnsafeSorterIterator sortedIterator = sorter.getSortedIterator();
+      if (!sortedIterator.hasNext()) {
+        // Since we won't ever call next() on an empty iterator, we need to clean up resources
+        // here in order to prevent memory leaks.
+        cleanupResources();
+      }
+      return new AbstractScalaRowIterator() {
+
+        private final int numFields = schema.length();
+        private final UnsafeRow row = new UnsafeRow();
+
+        @Override
+        public boolean hasNext() {
+          return sortedIterator.hasNext();
+        }
+
+        @Override
+        public InternalRow next() {
+          try {
+            sortedIterator.loadNext();
+            row.pointTo(
+              sortedIterator.getBaseObject(),
+              sortedIterator.getBaseOffset(),
+              numFields,
+              sortedIterator.getRecordLength(),
+              null);
+            if (!hasNext()) {
+              row.copy(); // so that we don't have dangling pointers to freed page
+              cleanupResources();
+            }
+            return row;
+          } catch (IOException e) {
+            cleanupResources();
+            // Scala iterators don't declare any checked exceptions, so we need to use this hack
+            // to re-throw the exception:
+            PlatformDependent.throwException(e);
+          }
+          throw new RuntimeException("Exception should have been re-thrown in next()");
+        };
+      };
+    } catch (IOException e) {
+      cleanupResources();
+      throw e;
+    }
+  }
+
+
+  public Iterator<InternalRow> sort(Iterator<InternalRow> inputIterator) throws IOException {
+      while (inputIterator.hasNext()) {
+        insertRow(inputIterator.next());
+      }
+      return sort();
+  }
+
+  /**
+   * Return true if UnsafeExternalRowSorter can sort rows with the given schema, false otherwise.
+   */
+  public static boolean supportsSchema(StructType schema) {
+    // TODO: add spilling note to explain why we do this for now:
+    for (StructField field : schema.fields()) {
+      if (UnsafeColumnWriter.forType(field.dataType()) instanceof ObjectUnsafeColumnWriter) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private static final class RowComparator extends RecordComparator {
+    private final Ordering<InternalRow> ordering;
+    private final int numFields;
+    private final ObjectPool objPool;
+    private final UnsafeRow row1 = new UnsafeRow();
+    private final UnsafeRow row2 = new UnsafeRow();
+
+    public RowComparator(Ordering<InternalRow> ordering, int numFields, ObjectPool objPool) {
+      this.numFields = numFields;
+      this.ordering = ordering;
+      this.objPool = objPool;
+    }
+
+    @Override
+    public int compare(Object baseObj1, long baseOff1, Object baseObj2, long baseOff2) {
+      row1.pointTo(baseObj1, baseOff1, numFields, -1, objPool);
+      row2.pointTo(baseObj2, baseOff2, numFields, -1, objPool);
+      return ordering.compare(row1, row2);
+    }
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/AbstractScalaRowIterator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/AbstractScalaRowIterator.scala
new file mode 100644
index 0000000000000..cfefb13e7721e
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/AbstractScalaRowIterator.scala
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.catalyst.InternalRow
+
+/**
+ * Shim to allow us to implement [[scala.Iterator]] in Java. Scala 2.11+ has an AbstractIterator
+ * class for this, but that class is `private[scala]` in 2.10. We need to explicitly fix this to
+ * `Row` in order to work around a spurious IntelliJ compiler error.
+ */
+private[spark] abstract class AbstractScalaRowIterator extends Iterator[InternalRow]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index 74d933404551c..4b783e30d95e1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -289,11 +289,8 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
         }
 
         val withSort = if (needSort) {
-          if (sqlContext.conf.externalSortEnabled) {
-            ExternalSort(rowOrdering, global = false, withShuffle)
-          } else {
-            Sort(rowOrdering, global = false, withShuffle)
-          }
+          sqlContext.planner.BasicOperators.getSortOperator(
+            rowOrdering, global = false, withShuffle)
         } else {
           withShuffle
         }
@@ -321,11 +318,7 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
           case (UnspecifiedDistribution, Seq(), child) =>
             child
           case (UnspecifiedDistribution, rowOrdering, child) =>
-            if (sqlContext.conf.externalSortEnabled) {
-              ExternalSort(rowOrdering, global = false, child)
-            } else {
-              Sort(rowOrdering, global = false, child)
-            }
+            sqlContext.planner.BasicOperators.getSortOperator(rowOrdering, global = false, child)
 
           case (dist, ordering, _) =>
             sys.error(s"Don't know how to ensure $dist with ordering $ordering")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
new file mode 100644
index 0000000000000..2dee3542d6101
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.SortOrder
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, PrefixComparator}
+
+
+object SortPrefixUtils {
+
+  /**
+   * A dummy prefix comparator which always claims that prefixes are equal. This is used in cases
+   * where we don't know how to generate or compare prefixes for a SortOrder.
+   */
+  private object NoOpPrefixComparator extends PrefixComparator {
+    override def compare(prefix1: Long, prefix2: Long): Int = 0
+  }
+
+  def getPrefixComparator(sortOrder: SortOrder): PrefixComparator = {
+    sortOrder.dataType match {
+      case StringType => PrefixComparators.STRING
+      case BooleanType | ByteType | ShortType | IntegerType | LongType => PrefixComparators.INTEGRAL
+      case FloatType => PrefixComparators.FLOAT
+      case DoubleType => PrefixComparators.DOUBLE
+      case _ => NoOpPrefixComparator
+    }
+  }
+
+  def getPrefixComputer(sortOrder: SortOrder): InternalRow => Long = {
+    sortOrder.dataType match {
+      case StringType => (row: InternalRow) => {
+        PrefixComparators.STRING.computePrefix(sortOrder.child.eval(row).asInstanceOf[UTF8String])
+      }
+      case BooleanType =>
+        (row: InternalRow) => {
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else if (sortOrder.child.eval(row).asInstanceOf[Boolean]) 1
+          else 0
+        }
+      case ByteType =>
+        (row: InternalRow) => {
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else sortOrder.child.eval(row).asInstanceOf[Byte]
+        }
+      case ShortType =>
+        (row: InternalRow) => {
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else sortOrder.child.eval(row).asInstanceOf[Short]
+        }
+      case IntegerType =>
+        (row: InternalRow) => {
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else sortOrder.child.eval(row).asInstanceOf[Int]
+        }
+      case LongType =>
+        (row: InternalRow) => {
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else sortOrder.child.eval(row).asInstanceOf[Long]
+        }
+      case FloatType => (row: InternalRow) => {
+        val exprVal = sortOrder.child.eval(row)
+        if (exprVal == null) PrefixComparators.FLOAT.NULL_PREFIX
+        else PrefixComparators.FLOAT.computePrefix(sortOrder.child.eval(row).asInstanceOf[Float])
+      }
+      case DoubleType => (row: InternalRow) => {
+        val exprVal = sortOrder.child.eval(row)
+        if (exprVal == null) PrefixComparators.DOUBLE.NULL_PREFIX
+        else PrefixComparators.DOUBLE.computePrefix(sortOrder.child.eval(row).asInstanceOf[Double])
+      }
+      case _ => (row: InternalRow) => 0L
+    }
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 59b9b553a7ae5..ce25af58b6cab 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -302,6 +302,22 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   object BasicOperators extends Strategy {
     def numPartitions: Int = self.numPartitions
 
+    /**
+     * Picks an appropriate sort operator.
+     *
+     * @param global when true performs a global sort of all partitions by shuffling the data first
+     *               if necessary.
+     */
+    def getSortOperator(sortExprs: Seq[SortOrder], global: Boolean, child: SparkPlan): SparkPlan = {
+      if (sqlContext.conf.unsafeEnabled && UnsafeExternalSort.supportsSchema(child.schema)) {
+        execution.UnsafeExternalSort(sortExprs, global, child)
+      } else if (sqlContext.conf.externalSortEnabled) {
+        execution.ExternalSort(sortExprs, global, child)
+      } else {
+        execution.Sort(sortExprs, global, child)
+      }
+    }
+
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case r: RunnableCommand => ExecutedCommand(r) :: Nil
 
@@ -313,11 +329,9 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case logical.SortPartitions(sortExprs, child) =>
         // This sort only sorts tuples within a partition. Its requiredDistribution will be
         // an UnspecifiedDistribution.
-        execution.Sort(sortExprs, global = false, planLater(child)) :: Nil
-      case logical.Sort(sortExprs, global, child) if sqlContext.conf.externalSortEnabled =>
-        execution.ExternalSort(sortExprs, global, planLater(child)):: Nil
+        getSortOperator(sortExprs, global = false, planLater(child)) :: Nil
       case logical.Sort(sortExprs, global, child) =>
-        execution.Sort(sortExprs, global, planLater(child)):: Nil
+        getSortOperator(sortExprs, global, planLater(child)):: Nil
       case logical.Project(projectList, child) =>
         execution.Project(projectList, planLater(child)) :: Nil
       case logical.Filter(condition, child) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index de14e6ad79ad6..4c063c299ba53 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution
 
+import org.apache.spark.sql.types.StructType
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.{RDD, ShuffledRDD}
 import org.apache.spark.shuffle.sort.SortShuffleManager
@@ -27,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.util.collection.ExternalSorter
+import org.apache.spark.util.collection.unsafe.sort.PrefixComparator
 import org.apache.spark.util.{CompletionIterator, MutablePair}
 import org.apache.spark.{HashPartitioner, SparkEnv}
 
@@ -246,6 +248,77 @@ case class ExternalSort(
   override def outputOrdering: Seq[SortOrder] = sortOrder
 }
 
+/**
+ * :: DeveloperApi ::
+ * Optimized version of [[ExternalSort]] that operates on binary data (implemented as part of
+ * Project Tungsten).
+ *
+ * @param global when true performs a global sort of all partitions by shuffling the data first
+ *               if necessary.
+ * @param testSpillFrequency Method for configuring periodic spilling in unit tests. If set, will
+ *                           spill every `frequency` records.
+ */
+@DeveloperApi
+case class UnsafeExternalSort(
+    sortOrder: Seq[SortOrder],
+    global: Boolean,
+    child: SparkPlan,
+    testSpillFrequency: Int = 0)
+  extends UnaryNode {
+
+  private[this] val schema: StructType = child.schema
+
+  override def requiredChildDistribution: Seq[Distribution] =
+    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
+
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
+    assert(codegenEnabled, "UnsafeExternalSort requires code generation to be enabled")
+    def doSort(iterator: Iterator[InternalRow]): Iterator[InternalRow] = {
+      val ordering = newOrdering(sortOrder, child.output)
+      val boundSortExpression = BindReferences.bindReference(sortOrder.head, child.output)
+      // Hack until we generate separate comparator implementations for ascending vs. descending
+      // (or choose to codegen them):
+      val prefixComparator = {
+        val comp = SortPrefixUtils.getPrefixComparator(boundSortExpression)
+        if (sortOrder.head.direction == Descending) {
+          new PrefixComparator {
+            override def compare(p1: Long, p2: Long): Int = -1 * comp.compare(p1, p2)
+          }
+        } else {
+          comp
+        }
+      }
+      val prefixComputer = {
+        val prefixComputer = SortPrefixUtils.getPrefixComputer(boundSortExpression)
+        new UnsafeExternalRowSorter.PrefixComputer {
+          override def computePrefix(row: InternalRow): Long = prefixComputer(row)
+        }
+      }
+      val sorter = new UnsafeExternalRowSorter(schema, ordering, prefixComparator, prefixComputer)
+      if (testSpillFrequency > 0) {
+        sorter.setTestSpillFrequency(testSpillFrequency)
+      }
+      sorter.sort(iterator)
+    }
+    child.execute().mapPartitions(doSort, preservesPartitioning = true)
+  }
+
+  override def output: Seq[Attribute] = child.output
+
+  override def outputOrdering: Seq[SortOrder] = sortOrder
+}
+
+@DeveloperApi
+object UnsafeExternalSort {
+  /**
+   * Return true if UnsafeExternalSort can sort rows with the given schema, false otherwise.
+   */
+  def supportsSchema(schema: StructType): Boolean = {
+    UnsafeExternalRowSorter.supportsSchema(schema)
+  }
+}
+
+
 /**
  * :: DeveloperApi ::
  * Return a new RDD that has exactly `numPartitions` partitions.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala
index a1e3ca11b1ad9..a2c10fdaf6cdb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution
 
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.dsl.expressions._
 
 class SortSuite extends SparkPlanTest {
@@ -33,12 +34,14 @@ class SortSuite extends SparkPlanTest {
 
     checkAnswer(
       input.toDF("a", "b", "c"),
-      ExternalSort('a.asc :: 'b.asc :: Nil, global = false, _: SparkPlan),
-      input.sorted)
+      ExternalSort('a.asc :: 'b.asc :: Nil, global = true, _: SparkPlan),
+      input.sortBy(t => (t._1, t._2)).map(Row.fromTuple),
+      sortAnswers = false)
 
     checkAnswer(
       input.toDF("a", "b", "c"),
-      ExternalSort('b.asc :: 'a.asc :: Nil, global = false, _: SparkPlan),
-      input.sortBy(t => (t._2, t._1)))
+      ExternalSort('b.asc :: 'a.asc :: Nil, global = true, _: SparkPlan),
+      input.sortBy(t => (t._2, t._1)).map(Row.fromTuple),
+      sortAnswers = false)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
index 108b1122f7bff..6a8f394545816 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
@@ -17,18 +17,15 @@
 
 package org.apache.spark.sql.execution
 
-import scala.language.implicitConversions
-import scala.reflect.runtime.universe.TypeTag
-import scala.util.control.NonFatal
-
 import org.apache.spark.SparkFunSuite
-
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
-import org.apache.spark.sql.catalyst.expressions.BoundReference
 import org.apache.spark.sql.catalyst.util._
-
 import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.{DataFrameHolder, Row, DataFrame}
+import org.apache.spark.sql.{DataFrame, DataFrameHolder, Row}
+
+import scala.language.implicitConversions
+import scala.reflect.runtime.universe.TypeTag
+import scala.util.control.NonFatal
 
 /**
  * Base class for writing tests for individual physical operators. For an example of how this
@@ -49,12 +46,19 @@ class SparkPlanTest extends SparkFunSuite {
    * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
    *                     the physical operator that's being tested.
    * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
+   * @param sortAnswers if true, the answers will be sorted by their toString representations prior
+   *                    to being compared.
    */
   protected def checkAnswer(
       input: DataFrame,
       planFunction: SparkPlan => SparkPlan,
-      expectedAnswer: Seq[Row]): Unit = {
-    checkAnswer(input :: Nil, (plans: Seq[SparkPlan]) => planFunction(plans.head), expectedAnswer)
+      expectedAnswer: Seq[Row],
+      sortAnswers: Boolean = true): Unit = {
+    doCheckAnswer(
+      input :: Nil,
+      (plans: Seq[SparkPlan]) => planFunction(plans.head),
+      expectedAnswer,
+      sortAnswers)
   }
 
   /**
@@ -64,86 +68,131 @@ class SparkPlanTest extends SparkFunSuite {
    * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
    *                     the physical operator that's being tested.
    * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
+   * @param sortAnswers if true, the answers will be sorted by their toString representations prior
+   *                    to being compared.
    */
-  protected def checkAnswer(
+  protected def checkAnswer2(
       left: DataFrame,
       right: DataFrame,
       planFunction: (SparkPlan, SparkPlan) => SparkPlan,
-      expectedAnswer: Seq[Row]): Unit = {
-    checkAnswer(left :: right :: Nil,
-      (plans: Seq[SparkPlan]) => planFunction(plans(0), plans(1)), expectedAnswer)
+      expectedAnswer: Seq[Row],
+      sortAnswers: Boolean = true): Unit = {
+    doCheckAnswer(
+      left :: right :: Nil,
+      (plans: Seq[SparkPlan]) => planFunction(plans(0), plans(1)),
+      expectedAnswer,
+      sortAnswers)
   }
 
   /**
    * Runs the plan and makes sure the answer matches the expected result.
    * @param input the input data to be used.
-   * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
-   *                     the physical operator that's being tested.
+   * @param planFunction a function which accepts a sequence of input SparkPlans and uses them to
+   *                     instantiate the physical operator that's being tested.
    * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
+   * @param sortAnswers if true, the answers will be sorted by their toString representations prior
+   *                    to being compared.
    */
-  protected def checkAnswer(
+  protected def doCheckAnswer(
       input: Seq[DataFrame],
       planFunction: Seq[SparkPlan] => SparkPlan,
-      expectedAnswer: Seq[Row]): Unit = {
-    SparkPlanTest.checkAnswer(input, planFunction, expectedAnswer) match {
+      expectedAnswer: Seq[Row],
+      sortAnswers: Boolean = true): Unit = {
+    SparkPlanTest.checkAnswer(input, planFunction, expectedAnswer, sortAnswers) match {
       case Some(errorMessage) => fail(errorMessage)
       case None =>
     }
   }
 
   /**
-   * Runs the plan and makes sure the answer matches the expected result.
+   * Runs the plan and makes sure the answer matches the result produced by a reference plan.
    * @param input the input data to be used.
    * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
    *                     the physical operator that's being tested.
-   * @param expectedAnswer the expected result in a [[Seq]] of [[Product]]s.
+   * @param expectedPlanFunction a function which accepts the input SparkPlan and uses it to
+   *                             instantiate a reference implementation of the physical operator
+   *                             that's being tested. The result of executing this plan will be
+   *                             treated as the source-of-truth for the test.
+   * @param sortAnswers if true, the answers will be sorted by their toString representations prior
+   *                    to being compared.
    */
-  protected def checkAnswer[A <: Product : TypeTag](
+  protected def checkThatPlansAgree(
       input: DataFrame,
       planFunction: SparkPlan => SparkPlan,
-      expectedAnswer: Seq[A]): Unit = {
-    val expectedRows = expectedAnswer.map(Row.fromTuple)
-    checkAnswer(input, planFunction, expectedRows)
+      expectedPlanFunction: SparkPlan => SparkPlan,
+      sortAnswers: Boolean = true): Unit = {
+    SparkPlanTest.checkAnswer(input, planFunction, expectedPlanFunction, sortAnswers) match {
+      case Some(errorMessage) => fail(errorMessage)
+      case None =>
+    }
   }
+}
 
-  /**
-   * Runs the plan and makes sure the answer matches the expected result.
-   * @param left the left input data to be used.
-   * @param right the right input data to be used.
-   * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
-   *                     the physical operator that's being tested.
-   * @param expectedAnswer the expected result in a [[Seq]] of [[Product]]s.
-   */
-  protected def checkAnswer[A <: Product : TypeTag](
-      left: DataFrame,
-      right: DataFrame,
-      planFunction: (SparkPlan, SparkPlan) => SparkPlan,
-      expectedAnswer: Seq[A]): Unit = {
-    val expectedRows = expectedAnswer.map(Row.fromTuple)
-    checkAnswer(left, right, planFunction, expectedRows)
-  }
+/**
+ * Helper methods for writing tests of individual physical operators.
+ */
+object SparkPlanTest {
 
   /**
-   * Runs the plan and makes sure the answer matches the expected result.
+   * Runs the plan and makes sure the answer matches the result produced by a reference plan.
    * @param input the input data to be used.
    * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
    *                     the physical operator that's being tested.
-   * @param expectedAnswer the expected result in a [[Seq]] of [[Product]]s.
+   * @param expectedPlanFunction a function which accepts the input SparkPlan and uses it to
+   *                             instantiate a reference implementation of the physical operator
+   *                             that's being tested. The result of executing this plan will be
+   *                             treated as the source-of-truth for the test.
    */
-  protected def checkAnswer[A <: Product : TypeTag](
-      input: Seq[DataFrame],
-      planFunction: Seq[SparkPlan] => SparkPlan,
-      expectedAnswer: Seq[A]): Unit = {
-    val expectedRows = expectedAnswer.map(Row.fromTuple)
-    checkAnswer(input, planFunction, expectedRows)
-  }
+  def checkAnswer(
+      input: DataFrame,
+      planFunction: SparkPlan => SparkPlan,
+      expectedPlanFunction: SparkPlan => SparkPlan,
+      sortAnswers: Boolean): Option[String] = {
 
-}
+    val outputPlan = planFunction(input.queryExecution.sparkPlan)
+    val expectedOutputPlan = expectedPlanFunction(input.queryExecution.sparkPlan)
 
-/**
- * Helper methods for writing tests of individual physical operators.
- */
-object SparkPlanTest {
+    val expectedAnswer: Seq[Row] = try {
+      executePlan(expectedOutputPlan)
+    } catch {
+      case NonFatal(e) =>
+        val errorMessage =
+          s"""
+             | Exception thrown while executing Spark plan to calculate expected answer:
+             | $expectedOutputPlan
+             | == Exception ==
+             | $e
+             | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
+          """.stripMargin
+        return Some(errorMessage)
+    }
+
+    val actualAnswer: Seq[Row] = try {
+      executePlan(outputPlan)
+    } catch {
+      case NonFatal(e) =>
+        val errorMessage =
+          s"""
+             | Exception thrown while executing Spark plan:
+             | $outputPlan
+             | == Exception ==
+             | $e
+             | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
+          """.stripMargin
+        return Some(errorMessage)
+    }
+
+    compareAnswers(actualAnswer, expectedAnswer, sortAnswers).map { errorMessage =>
+      s"""
+         | Results do not match.
+         | Actual result Spark plan:
+         | $outputPlan
+         | Expected result Spark plan:
+         | $expectedOutputPlan
+         | $errorMessage
+       """.stripMargin
+    }
+  }
 
   /**
    * Runs the plan and makes sure the answer matches the expected result.
@@ -151,28 +200,45 @@ object SparkPlanTest {
    * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
    *                     the physical operator that's being tested.
    * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s.
+   * @param sortAnswers if true, the answers will be sorted by their toString representations prior
+   *                    to being compared.
    */
   def checkAnswer(
       input: Seq[DataFrame],
       planFunction: Seq[SparkPlan] => SparkPlan,
-      expectedAnswer: Seq[Row]): Option[String] = {
+      expectedAnswer: Seq[Row],
+      sortAnswers: Boolean): Option[String] = {
 
     val outputPlan = planFunction(input.map(_.queryExecution.sparkPlan))
 
-    // A very simple resolver to make writing tests easier. In contrast to the real resolver
-    // this is always case sensitive and does not try to handle scoping or complex type resolution.
-    val resolvedPlan = TestSQLContext.prepareForExecution.execute(
-      outputPlan transform {
-        case plan: SparkPlan =>
-          val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap
-          plan.transformExpressions {
-            case UnresolvedAttribute(Seq(u)) =>
-              inputMap.getOrElse(u,
-                sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap"))
-          }
-      }
-    )
+    val sparkAnswer: Seq[Row] = try {
+      executePlan(outputPlan)
+    } catch {
+      case NonFatal(e) =>
+        val errorMessage =
+          s"""
+             | Exception thrown while executing Spark plan:
+             | $outputPlan
+             | == Exception ==
+             | $e
+             | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
+          """.stripMargin
+        return Some(errorMessage)
+    }
 
+    compareAnswers(sparkAnswer, expectedAnswer, sortAnswers).map { errorMessage =>
+      s"""
+         | Results do not match for Spark plan:
+         | $outputPlan
+         | $errorMessage
+       """.stripMargin
+    }
+  }
+
+  private def compareAnswers(
+      sparkAnswer: Seq[Row],
+      expectedAnswer: Seq[Row],
+      sort: Boolean): Option[String] = {
     def prepareAnswer(answer: Seq[Row]): Seq[Row] = {
       // Converts data to types that we can do equality comparison using Scala collections.
       // For BigDecimal type, the Scala type has a better definition of equality test (similar to
@@ -187,40 +253,43 @@ object SparkPlanTest {
           case o => o
         })
       }
-      converted.sortBy(_.toString())
-    }
-
-    val sparkAnswer: Seq[Row] = try {
-      resolvedPlan.executeCollect().toSeq
-    } catch {
-      case NonFatal(e) =>
-        val errorMessage =
-          s"""
-             | Exception thrown while executing Spark plan:
-             | $outputPlan
-             | == Exception ==
-             | $e
-             | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
-          """.stripMargin
-        return Some(errorMessage)
+      if (sort) {
+        converted.sortBy(_.toString())
+      } else {
+        converted
+      }
     }
-
     if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) {
       val errorMessage =
         s"""
-           | Results do not match for Spark plan:
-           | $outputPlan
            | == Results ==
            | ${sideBySide(
-              s"== Correct Answer - ${expectedAnswer.size} ==" +:
+              s"== Expected Answer - ${expectedAnswer.size} ==" +:
               prepareAnswer(expectedAnswer).map(_.toString()),
-              s"== Spark Answer - ${sparkAnswer.size} ==" +:
+              s"== Actual Answer - ${sparkAnswer.size} ==" +:
               prepareAnswer(sparkAnswer).map(_.toString())).mkString("\n")}
       """.stripMargin
-      return Some(errorMessage)
+      Some(errorMessage)
+    } else {
+      None
     }
+  }
 
-    None
+  private def executePlan(outputPlan: SparkPlan): Seq[Row] = {
+    // A very simple resolver to make writing tests easier. In contrast to the real resolver
+    // this is always case sensitive and does not try to handle scoping or complex type resolution.
+    val resolvedPlan = TestSQLContext.prepareForExecution.execute(
+      outputPlan transform {
+        case plan: SparkPlan =>
+          val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap
+          plan.transformExpressions {
+            case UnresolvedAttribute(Seq(u)) =>
+              inputMap.getOrElse(u,
+                sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap"))
+          }
+      }
+    )
+    resolvedPlan.executeCollect().toSeq
   }
 }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
new file mode 100644
index 0000000000000..4f4c1f28564cb
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import scala.util.Random
+
+import org.scalatest.BeforeAndAfterAll
+
+import org.apache.spark.sql.{RandomDataGenerator, Row, SQLConf}
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.test.TestSQLContext
+import org.apache.spark.sql.types._
+
+class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
+
+  override def beforeAll(): Unit = {
+    TestSQLContext.conf.setConf(SQLConf.CODEGEN_ENABLED, true)
+  }
+
+  override def afterAll(): Unit = {
+    TestSQLContext.conf.setConf(SQLConf.CODEGEN_ENABLED, SQLConf.CODEGEN_ENABLED.defaultValue.get)
+  }
+
+  ignore("sort followed by limit should not leak memory") {
+    // TODO: this test is going to fail until we implement a proper iterator interface
+    // with a close() method.
+    TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "true")
+    checkThatPlansAgree(
+      (1 to 100).map(v => Tuple1(v)).toDF("a"),
+      (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
+      (child: SparkPlan) => Limit(10, Sort('a.asc :: Nil, global = true, child)),
+      sortAnswers = false
+    )
+  }
+
+  test("sort followed by limit") {
+    TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "false")
+    try {
+      checkThatPlansAgree(
+        (1 to 100).map(v => Tuple1(v)).toDF("a"),
+        (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
+        (child: SparkPlan) => Limit(10, Sort('a.asc :: Nil, global = true, child)),
+        sortAnswers = false
+      )
+    } finally {
+      TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "true")
+
+    }
+  }
+
+  test("sorting does not crash for large inputs") {
+    val sortOrder = 'a.asc :: Nil
+    val stringLength = 1024 * 1024 * 2
+    checkThatPlansAgree(
+      Seq(Tuple1("a" * stringLength), Tuple1("b" * stringLength)).toDF("a").repartition(1),
+      UnsafeExternalSort(sortOrder, global = true, _: SparkPlan, testSpillFrequency = 1),
+      Sort(sortOrder, global = true, _: SparkPlan),
+      sortAnswers = false
+    )
+  }
+
+  // Test sorting on different data types
+  for (
+    dataType <- DataTypeTestUtils.atomicTypes ++ Set(NullType)
+    if !dataType.isInstanceOf[DecimalType]; // We don't have an unsafe representation for decimals
+    nullable <- Seq(true, false);
+    sortOrder <- Seq('a.asc :: Nil, 'a.desc :: Nil);
+    randomDataGenerator <- RandomDataGenerator.forType(dataType, nullable)
+  ) {
+    test(s"sorting on $dataType with nullable=$nullable, sortOrder=$sortOrder") {
+      val inputData = Seq.fill(1000)(randomDataGenerator()).filter {
+        case d: Double => !d.isNaN
+        case f: Float => !java.lang.Float.isNaN(f)
+        case x => true
+      }
+      val inputDf = TestSQLContext.createDataFrame(
+        TestSQLContext.sparkContext.parallelize(Random.shuffle(inputData).map(v => Row(v))),
+        StructType(StructField("a", dataType, nullable = true) :: Nil)
+      )
+      assert(UnsafeExternalSort.supportsSchema(inputDf.schema))
+      checkThatPlansAgree(
+        inputDf,
+        UnsafeExternalSort(sortOrder, global = true, _: SparkPlan, testSpillFrequency = 23),
+        Sort(sortOrder, global = true, _: SparkPlan),
+        sortAnswers = false
+      )
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala
index 5707d2fb300ae..2c27da596bc4f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.joins
 
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions.{Expression, LessThan}
 import org.apache.spark.sql.catalyst.plans.{FullOuter, LeftOuter, RightOuter}
@@ -41,23 +42,23 @@ class OuterJoinSuite extends SparkPlanTest {
   val condition = Some(LessThan('b, 'd))
 
   test("shuffled hash outer join") {
-    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+    checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) =>
       ShuffledHashOuterJoin(leftKeys, rightKeys, LeftOuter, condition, left, right),
       Seq(
         (1, 2.0, null, null),
         (2, 1.0, 2, 3.0),
         (3, 3.0, null, null)
-      ))
+      ).map(Row.fromTuple))
 
-    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+    checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) =>
       ShuffledHashOuterJoin(leftKeys, rightKeys, RightOuter, condition, left, right),
       Seq(
         (2, 1.0, 2, 3.0),
         (null, null, 3, 2.0),
         (null, null, 4, 1.0)
-      ))
+      ).map(Row.fromTuple))
 
-    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+    checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) =>
       ShuffledHashOuterJoin(leftKeys, rightKeys, FullOuter, condition, left, right),
       Seq(
         (1, 2.0, null, null),
@@ -65,24 +66,24 @@ class OuterJoinSuite extends SparkPlanTest {
         (3, 3.0, null, null),
         (null, null, 3, 2.0),
         (null, null, 4, 1.0)
-      ))
+      ).map(Row.fromTuple))
   }
 
   test("broadcast hash outer join") {
-    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+    checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) =>
       BroadcastHashOuterJoin(leftKeys, rightKeys, LeftOuter, condition, left, right),
       Seq(
         (1, 2.0, null, null),
         (2, 1.0, 2, 3.0),
         (3, 3.0, null, null)
-      ))
+      ).map(Row.fromTuple))
 
-    checkAnswer(left, right, (left: SparkPlan, right: SparkPlan) =>
+    checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) =>
       BroadcastHashOuterJoin(leftKeys, rightKeys, RightOuter, condition, left, right),
       Seq(
         (2, 1.0, 2, 3.0),
         (null, null, 3, 2.0),
         (null, null, 4, 1.0)
-      ))
+      ).map(Row.fromTuple))
   }
 }

From 857e325f304a02b1cab2463c1e15af7627a600bd Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Fri, 10 Jul 2015 16:49:45 -0700
Subject: [PATCH 116/549] [SPARK-8990] [SQL] SPARK-8990
 DataFrameReader.parquet() should respect user specified options

Author: Cheng Lian <lian@databricks.com>

Closes #7347 from liancheng/spark-8990 and squashes the following commits:

045698c [Cheng Lian] SPARK-8990 DataFrameReader.parquet() should respect user specified options
---
 .../apache/spark/sql/DataFrameReader.scala    |  2 +-
 .../spark/sql/parquet/ParquetQuerySuite.scala | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 1828ed1aab50b..9ad6e21da7bf7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -263,7 +263,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) {
       val globbedPaths = paths.map(new Path(_)).flatMap(SparkHadoopUtil.get.globPath).toArray
       sqlContext.baseRelationToDataFrame(
         new ParquetRelation2(
-          globbedPaths.map(_.toString), None, None, Map.empty[String, String])(sqlContext))
+          globbedPaths.map(_.toString), None, None, extraOptions.toMap)(sqlContext))
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
index a0a81c4309c0f..21007d95ed752 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
@@ -142,6 +142,27 @@ class ParquetQuerySuiteBase extends QueryTest with ParquetTest {
       testSchemaMerging(2)
     }
   }
+
+  test("SPARK-8990 DataFrameReader.parquet() should respect user specified options") {
+    withTempPath { dir =>
+      val basePath = dir.getCanonicalPath
+      sqlContext.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString)
+      sqlContext.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=a").toString)
+
+      // Disables the global SQL option for schema merging
+      withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "false") {
+        assertResult(2) {
+          // Disables schema merging via data source option
+          sqlContext.read.option("mergeSchema", "false").parquet(basePath).columns.length
+        }
+
+        assertResult(3) {
+          // Enables schema merging via data source option
+          sqlContext.read.option("mergeSchema", "true").parquet(basePath).columns.length
+        }
+      }
+    }
+  }
 }
 
 class ParquetDataSourceOnQuerySuite extends ParquetQuerySuiteBase with BeforeAndAfterAll {

From b6fc0adf6874fc26ab27cdaa8ebb28474c0681f0 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies.liu@gmail.com>
Date: Fri, 10 Jul 2015 17:44:21 -0700
Subject: [PATCH 117/549] add inline comment for python tests

---
 python/pyspark/sql/tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index ed903334beafa..241eac45cfe36 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -710,6 +710,7 @@ def test_time_with_timezone(self):
         from pyspark.sql.tests import UTC
         utc = UTC()
         utcnow = datetime.datetime.utcfromtimestamp(ts)  # without microseconds
+        # add microseconds to utcnow (keeping year,month,day,hour,minute,second)
         utcnow = datetime.datetime(*(utcnow.timetuple()[:6] + (now.microsecond, utc)))
         df = self.sqlCtx.createDataFrame([(day, now, utcnow)])
         day1, now1, utcnow1 = df.first()

From 33630883685eafcc3ee4521ea8363be342f6e6b4 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Fri, 10 Jul 2015 18:15:36 -0700
Subject: [PATCH 118/549] [SPARK-8961] [SQL] Makes
 BaseWriterContainer.outputWriterForRow accepts InternalRow instead of Row

This is a follow-up of [SPARK-8888] [1], which also aims to optimize writing dynamic partitions.

Three more changes can be made here:

1. Using `InternalRow` instead of `Row` in `BaseWriterContainer.outputWriterForRow`
2. Using `Cast` expressions to convert partition columns to strings, so that we can leverage code generation.
3. Replacing the FP-style `zip` and `map` calls with a faster imperative `while` loop.

[1]: https://issues.apache.org/jira/browse/SPARK-8888

Author: Cheng Lian <lian@databricks.com>

Closes #7331 from liancheng/spark-8961 and squashes the following commits:

b5ab9ae [Cheng Lian] Casts Java iterator to Scala iterator explicitly
719e63b [Cheng Lian] Makes BaseWriterContainer.outputWriterForRow accepts InternalRow instead of Row
---
 .../apache/spark/sql/sources/commands.scala   | 73 +++++++++++--------
 1 file changed, 42 insertions(+), 31 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
index 9189d176111d5..5c6ef2dc90c73 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
@@ -19,6 +19,8 @@ package org.apache.spark.sql.sources
 
 import java.util.{Date, UUID}
 
+import scala.collection.JavaConversions.asScalaIterator
+
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter, FileOutputFormat}
@@ -26,15 +28,14 @@ import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceF
 import org.apache.spark._
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.execution.RunnableCommand
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.{DataFrame, Row, SQLConf, SQLContext, SaveMode}
+import org.apache.spark.sql.types.StringType
 import org.apache.spark.util.SerializableConfiguration
 
 private[sql] case class InsertIntoDataSource(
@@ -170,14 +171,14 @@ private[sql] case class InsertIntoHadoopFsRelation(
       try {
         writerContainer.executorSideSetup(taskContext)
 
-        val converter = if (needsConversion) {
+        val converter: InternalRow => Row = if (needsConversion) {
           CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row]
         } else {
           r: InternalRow => r.asInstanceOf[Row]
         }
         while (iterator.hasNext) {
-          val row = converter(iterator.next())
-          writerContainer.outputWriterForRow(row).write(row)
+          val internalRow = iterator.next()
+          writerContainer.outputWriterForRow(internalRow).write(converter(internalRow))
         }
 
         writerContainer.commitTask()
@@ -239,7 +240,9 @@ private[sql] case class InsertIntoHadoopFsRelation(
       try {
         writerContainer.executorSideSetup(taskContext)
 
-        val partitionProj = newProjection(codegenEnabled, partitionOutput, output)
+        // Projects all partition columns and casts them to strings to build partition directories.
+        val partitionCasts = partitionOutput.map(Cast(_, StringType))
+        val partitionProj = newProjection(codegenEnabled, partitionCasts, output)
         val dataProj = newProjection(codegenEnabled, dataOutput, output)
 
         val dataConverter: InternalRow => Row = if (needsConversion) {
@@ -247,15 +250,11 @@ private[sql] case class InsertIntoHadoopFsRelation(
         } else {
           r: InternalRow => r.asInstanceOf[Row]
         }
-        val partitionSchema = StructType.fromAttributes(partitionOutput)
-        val partConverter: InternalRow => Row =
-          CatalystTypeConverters.createToScalaConverter(partitionSchema)
-            .asInstanceOf[InternalRow => Row]
 
         while (iterator.hasNext) {
-          val row = iterator.next()
-          val partitionPart = partConverter(partitionProj(row))
-          val dataPart = dataConverter(dataProj(row))
+          val internalRow = iterator.next()
+          val partitionPart = partitionProj(internalRow)
+          val dataPart = dataConverter(dataProj(internalRow))
           writerContainer.outputWriterForRow(partitionPart).write(dataPart)
         }
 
@@ -435,7 +434,7 @@ private[sql] abstract class BaseWriterContainer(
   }
 
   // Called on executor side when writing rows
-  def outputWriterForRow(row: Row): OutputWriter
+  def outputWriterForRow(row: InternalRow): OutputWriter
 
   protected def initWriters(): Unit
 
@@ -477,7 +476,7 @@ private[sql] class DefaultWriterContainer(
     writer = outputWriterFactory.newInstance(getWorkPath, dataSchema, taskAttemptContext)
   }
 
-  override def outputWriterForRow(row: Row): OutputWriter = writer
+  override def outputWriterForRow(row: InternalRow): OutputWriter = writer
 
   override def commitTask(): Unit = {
     try {
@@ -518,23 +517,36 @@ private[sql] class DynamicPartitionWriterContainer(
     outputWriters = new java.util.HashMap[String, OutputWriter]
   }
 
-  override def outputWriterForRow(row: Row): OutputWriter = {
-    // TODO (SPARK-8888): zip and all the stuff happening here is very inefficient.
-    val partitionPath = partitionColumns.zip(row.toSeq).map { case (col, rawValue) =>
-      val string = if (rawValue == null) null else String.valueOf(rawValue)
-      val valueString = if (string == null || string.isEmpty) {
-        defaultPartitionName
-      } else {
-        PartitioningUtils.escapePathName(string)
+  // The `row` argument is supposed to only contain partition column values which have been casted
+  // to strings.
+  override def outputWriterForRow(row: InternalRow): OutputWriter = {
+    val partitionPath = {
+      val partitionPathBuilder = new StringBuilder
+      var i = 0
+
+      while (i < partitionColumns.length) {
+        val col = partitionColumns(i)
+        val partitionValueString = {
+          val string = row.getString(i)
+          if (string.eq(null)) defaultPartitionName else PartitioningUtils.escapePathName(string)
+        }
+
+        if (i > 0) {
+          partitionPathBuilder.append(Path.SEPARATOR_CHAR)
+        }
+
+        partitionPathBuilder.append(s"$col=$partitionValueString")
+        i += 1
       }
-      s"/$col=$valueString"
-    }.mkString.stripPrefix(Path.SEPARATOR)
+
+      partitionPathBuilder.toString()
+    }
 
     val writer = outputWriters.get(partitionPath)
     if (writer.eq(null)) {
       val path = new Path(getWorkPath, partitionPath)
-      taskAttemptContext.getConfiguration.set("spark.sql.sources.output.path",
-        new Path(outputPath, partitionPath).toString)
+      taskAttemptContext.getConfiguration.set(
+        "spark.sql.sources.output.path", new Path(outputPath, partitionPath).toString)
       val newWriter = outputWriterFactory.newInstance(path.toString, dataSchema, taskAttemptContext)
       outputWriters.put(partitionPath, newWriter)
       newWriter
@@ -545,8 +557,7 @@ private[sql] class DynamicPartitionWriterContainer(
 
   private def clearOutputWriters(): Unit = {
     if (!outputWriters.isEmpty) {
-      val iter = scala.collection.JavaConversions.asScalaIterator(outputWriters.values().iterator())
-      iter.foreach(_.close())
+      asScalaIterator(outputWriters.values().iterator()).foreach(_.close())
       outputWriters.clear()
     }
   }

From 6e1c7e2798bb0b7b120e28a07c5d70fa162dd724 Mon Sep 17 00:00:00 2001
From: Scott Taylor <github@megatron.me.uk>
Date: Fri, 10 Jul 2015 19:29:32 -0700
Subject: [PATCH 119/549] [SPARK-7735] [PYSPARK] Raise Exception on non-zero
 exit from pipe commands

This will allow problems with piped commands to be detected.
This will also allow tasks to be retried where errors are rare (such as network problems in piped commands).

Author: Scott Taylor <github@megatron.me.uk>

Closes #6262 from megatron-me-uk/patch-2 and squashes the following commits:

04ae1d5 [Scott Taylor] Remove spurious empty line
98fa101 [Scott Taylor] fix blank line style error
574b564 [Scott Taylor] Merge pull request #2 from megatron-me-uk/patch-4
0c1e762 [Scott Taylor] Update rdd pipe method for checkCode
ab9a2e1 [Scott Taylor] Update rdd pipe tests for checkCode
eb4801c [Scott Taylor] fix fail_condition
b0ac3a4 [Scott Taylor] Merge pull request #1 from megatron-me-uk/megatron-me-uk-patch-1
a307d13 [Scott Taylor] update rdd tests to test pipe modes
34fcdc3 [Scott Taylor] add optional argument 'mode' for rdd.pipe
a0c0161 [Scott Taylor] fix generator issue
8a9ef9c [Scott Taylor] make check_return_code an iterator
0486ae3 [Scott Taylor] style fixes
8ed89a6 [Scott Taylor] Chain generators to prevent potential deadlock
4153b02 [Scott Taylor] fix list.sort returns None
491d3fc [Scott Taylor] Pass a function handle to assertRaises
3344a21 [Scott Taylor] wrap assertRaises with QuietTest
3ab8c7a [Scott Taylor] remove whitespace for style
cc1a73d [Scott Taylor] fix style issues in pipe test
8db4073 [Scott Taylor] Add a test for rdd pipe functions
1b3dc4e [Scott Taylor] fix missing space around operator style
0974f98 [Scott Taylor] add space between words in multiline string
45f4977 [Scott Taylor] fix line too long style error
5745d85 [Scott Taylor] Remove space to fix style
f552d49 [Scott Taylor] Catch non-zero exit from pipe commands
---
 python/pyspark/rdd.py   | 16 ++++++++++++++--
 python/pyspark/tests.py | 12 ++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 79dafb0a4ef27..3218bed5c74fc 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -700,12 +700,14 @@ def groupBy(self, f, numPartitions=None):
         return self.map(lambda x: (f(x), x)).groupByKey(numPartitions)
 
     @ignore_unicode_prefix
-    def pipe(self, command, env={}):
+    def pipe(self, command, env={}, checkCode=False):
         """
         Return an RDD created by piping elements to a forked external process.
 
         >>> sc.parallelize(['1', '2', '', '3']).pipe('cat').collect()
         [u'1', u'2', u'', u'3']
+
+        :param checkCode: whether or not to check the return value of the shell command.
         """
         def func(iterator):
             pipe = Popen(
@@ -717,7 +719,17 @@ def pipe_objs(out):
                     out.write(s.encode('utf-8'))
                 out.close()
             Thread(target=pipe_objs, args=[pipe.stdin]).start()
-            return (x.rstrip(b'\n').decode('utf-8') for x in iter(pipe.stdout.readline, b''))
+
+            def check_return_code():
+                pipe.wait()
+                if checkCode and pipe.returncode:
+                    raise Exception("Pipe function `%s' exited "
+                                    "with error code %d" % (command, pipe.returncode))
+                else:
+                    for i in range(0):
+                        yield i
+            return (x.rstrip(b'\n').decode('utf-8') for x in
+                    chain(iter(pipe.stdout.readline, b''), check_return_code()))
         return self.mapPartitions(func)
 
     def foreach(self, f):
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 17256dfc95744..c5c0add49d02c 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -885,6 +885,18 @@ def test_sortByKey_uses_all_partitions_not_only_first_and_last(self):
             for size in sizes:
                 self.assertGreater(size, 0)
 
+    def test_pipe_functions(self):
+        data = ['1', '2', '3']
+        rdd = self.sc.parallelize(data)
+        with QuietTest(self.sc):
+            self.assertEqual([], rdd.pipe('cc').collect())
+            self.assertRaises(Py4JJavaError, rdd.pipe('cc', checkCode=True).collect)
+        result = rdd.pipe('cat').collect()
+        result.sort()
+        [self.assertEqual(x, y) for x, y in zip(data, result)]
+        self.assertRaises(Py4JJavaError, rdd.pipe('grep 4', checkCode=True).collect)
+        self.assertEqual([], rdd.pipe('grep 4').collect())
+
 
 class ProfilerTests(PySparkTestCase):
 

From 9c5075775d741eacbeeb2df77ea30611356b6e1a Mon Sep 17 00:00:00 2001
From: "jose.cambronero" <jose.cambronero@cloudera.com>
Date: Fri, 10 Jul 2015 20:55:45 -0700
Subject: [PATCH 120/549] [SPARK-8598] [MLLIB] Implementation of 1-sample,
 two-sided, Kolmogorov Smirnov Test for RDDs

This contribution is my original work and I license it to the project under it's open source license.

Author: jose.cambronero <jose.cambronero@cloudera.com>

Closes #6994 from josepablocam/master and squashes the following commits:

bbb30b1 [jose.cambronero] renamed KSTestResult to KolmogorovSmirnovTestResult, to stay consistent with method name
0d0c201 [jose.cambronero] kstTest -> kolmogorovSmirnovTest in statistics.md
1f56371 [jose.cambronero] changed ksTest in public API to kolmogorovSmirnovTest for clarity
a48ae7b [jose.cambronero] refactor code to account for serializable RealDistribution. Reuse testOneSample( _, cdf)
1bb44bd [jose.cambronero]  style and doc changes. Factored out ks test into 2 separate tests
2ec2aa6 [jose.cambronero] initialize to stdnormal when no params passed (and log). Change unit tests to approximate equivalence rather than strict
a4bc0c7 [jose.cambronero] changed ksTest(data, distName) to ksTest(data, distName, params*) after api discussions. Changed tests and docs accordingly
7e66f57 [jose.cambronero] copied implementation note to public api docs, and added @see for links to wiki info
e760ebd [jose.cambronero] line length changes to fit style check
3288e42 [jose.cambronero] addressed style changes, correctness change to simpler approach, and fixed edge case for foldLeft in searchOneSampleCandidates when a partition is empty
9026895 [jose.cambronero] addressed style changes, correctness change to simpler approach, and fixed edge case for foldLeft in searchOneSampleCandidates when a partition is empty
1226b30 [jose.cambronero] reindent multi-line lambdas, prior intepretation of style guide was wrong on my part
9c0f1af [jose.cambronero] additional style changes incorporated and added documentation to mllib statistics docs
3f81ad2 [jose.cambronero] renamed ks1 sample test for clarity
992293b [jose.cambronero] Style changes as per comments and added implementation note explaining the distributed approach.
6a4784f [jose.cambronero] specified what distributions are available for the convenience method ksTest(data, name) (solely standard normal)
4b8ba61 [jose.cambronero] fixed off by 1/N in cases when post-constant adjustment ecdf is above cdf, but prior to adj it was below
0b5e8ec [jose.cambronero] changed KS one sample test to perform just 1 distributed pass (in addition to the sorting pass), operates on each partition separately. Implementation of Sandy Ryza's algorithm
16b5c4c [jose.cambronero] renamed dat to data and eliminated recalc of RDD size by sharing as argument between empirical and evalOneSampleP
c18dc66 [jose.cambronero] removed ksTestOpt from API and changed comments in HypothesisTestSuite accordingly
f6951b6 [jose.cambronero] changed style and some comments based on feedback from pull request
b9cff3a [jose.cambronero] made small changes to pass style check
ce8e9a1 [jose.cambronero] added kstest testing in HypothesisTestSuite
4da189b [jose.cambronero] added user facing ks test functions
c659ea1 [jose.cambronero] created KS test class
13dfe4d [jose.cambronero] created test result class for ks test
---
 docs/mllib-statistics.md                      |  37 +++-
 .../apache/spark/mllib/stat/Statistics.scala  |  40 +++-
 .../stat/test/KolmogorovSmirnovTest.scala     | 194 ++++++++++++++++++
 .../spark/mllib/stat/test/TestResult.scala    |  17 ++
 .../mllib/stat/HypothesisTestSuite.scala      | 101 +++++++++
 5 files changed, 387 insertions(+), 2 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala

diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index 887eae7f4f07b..de5d6485f9b5f 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -283,7 +283,7 @@ approxSample = data.sampleByKey(False, fractions);
 
 Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically 
 significant, whether this result occurred by chance or not. MLlib currently supports Pearson's 
-chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine 
+chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine
 whether the goodness of fit or the independence test is conducted. The goodness of fit test requires 
 an input type of `Vector`, whereas the independence test requires a `Matrix` as input.
 
@@ -422,6 +422,41 @@ for i, result in enumerate(featureTestResults):
 
 </div>
 
+Additionally, MLlib provides a 1-sample, 2-sided implementation of the Kolmogorov-Smirnov (KS) test
+for equality of probability distributions. By providing the name of a theoretical distribution
+(currently solely supported for the normal distribution) and its parameters, or a function to 
+calculate the cumulative distribution according to a given theoretical distribution, the user can
+test the null hypothesis that their sample is drawn from that distribution. In the case that the
+user tests against the normal distribution (`distName="norm"`), but does not provide distribution
+parameters, the test initializes to the standard normal distribution and logs an appropriate 
+message.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
+run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
+and interpret the hypothesis tests.
+
+{% highlight scala %}
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.stat.Statistics._
+
+val data: RDD[Double] = ... // an RDD of sample data
+
+// run a KS test for the sample versus a standard normal distribution
+val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
+println(testResult) // summary of the test including the p-value, test statistic,
+                      // and null hypothesis
+                      // if our p-value indicates significance, we can reject the null hypothesis
+
+// perform a KS test using a cumulative distribution function of our making
+val myCDF: Double => Double = ...
+val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
+{% endhighlight %}
+</div>
+</div>
+
+
 ## Random data generation
 
 Random data generation is useful for randomized algorithms, prototyping, and performance testing.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 900007ec6bc74..90332028cfb3a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -17,13 +17,16 @@
 
 package org.apache.spark.mllib.stat
 
+import scala.annotation.varargs
+
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.mllib.linalg.distributed.RowMatrix
 import org.apache.spark.mllib.linalg.{Matrix, Vector}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.stat.correlation.Correlations
-import org.apache.spark.mllib.stat.test.{ChiSqTest, ChiSqTestResult}
+import org.apache.spark.mllib.stat.test.{ChiSqTest, ChiSqTestResult, KolmogorovSmirnovTest,
+  KolmogorovSmirnovTestResult}
 import org.apache.spark.rdd.RDD
 
 /**
@@ -158,4 +161,39 @@ object Statistics {
   def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
     ChiSqTest.chiSquaredFeatures(data)
   }
+
+  /**
+   * Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a
+   * continuous distribution. By comparing the largest difference between the empirical cumulative
+   * distribution of the sample data and the theoretical distribution we can provide a test for the
+   * the null hypothesis that the sample data comes from that theoretical distribution.
+   * For more information on KS Test:
+   * @see [[https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test]]
+   *
+   * @param data an `RDD[Double]` containing the sample of data to test
+   * @param cdf a `Double => Double` function to calculate the theoretical CDF at a given value
+   * @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] object containing test
+   *        statistic, p-value, and null hypothesis.
+   */
+  def kolmogorovSmirnovTest(data: RDD[Double], cdf: Double => Double)
+    : KolmogorovSmirnovTestResult = {
+    KolmogorovSmirnovTest.testOneSample(data, cdf)
+  }
+
+  /**
+   * Convenience function to conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability
+   * distribution equality. Currently supports the normal distribution, taking as parameters
+   * the mean and standard deviation.
+   * (distName = "norm")
+   * @param data an `RDD[Double]` containing the sample of data to test
+   * @param distName a `String` name for a theoretical distribution
+   * @param params `Double*` specifying the parameters to be used for the theoretical distribution
+   * @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] object containing test
+   *        statistic, p-value, and null hypothesis.
+   */
+  @varargs
+  def kolmogorovSmirnovTest(data: RDD[Double], distName: String, params: Double*)
+    : KolmogorovSmirnovTestResult = {
+    KolmogorovSmirnovTest.testOneSample(data, distName, params: _*)
+  }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
new file mode 100644
index 0000000000000..d89b0059d83f3
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.stat.test
+
+import scala.annotation.varargs
+
+import org.apache.commons.math3.distribution.{NormalDistribution, RealDistribution}
+import org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest
+
+import org.apache.spark.Logging
+import org.apache.spark.rdd.RDD
+
+/**
+ * Conduct the two-sided Kolmogorov Smirnov (KS) test for data sampled from a
+ * continuous distribution. By comparing the largest difference between the empirical cumulative
+ * distribution of the sample data and the theoretical distribution we can provide a test for the
+ * the null hypothesis that the sample data comes from that theoretical distribution.
+ * For more information on KS Test:
+ * @see [[https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test]]
+ *
+ * Implementation note: We seek to implement the KS test with a minimal number of distributed
+ * passes. We sort the RDD, and then perform the following operations on a per-partition basis:
+ * calculate an empirical cumulative distribution value for each observation, and a theoretical
+ * cumulative distribution value. We know the latter to be correct, while the former will be off by
+ * a constant (how large the constant is depends on how many values precede it in other partitions).
+ * However, given that this constant simply shifts the empirical CDF upwards, but doesn't
+ * change its shape, and furthermore, that constant is the same within a given partition, we can
+ * pick 2 values in each partition that can potentially resolve to the largest global distance.
+ * Namely, we pick the minimum distance and the maximum distance. Additionally, we keep track of how
+ * many elements are in each partition. Once these three values have been returned for every
+ * partition, we can collect and operate locally. Locally, we can now adjust each distance by the
+ * appropriate constant (the cumulative sum of number of elements in the prior partitions divided by
+ * thedata set size). Finally, we take the maximum absolute value, and this is the statistic.
+ */
+private[stat] object KolmogorovSmirnovTest extends Logging {
+
+  // Null hypothesis for the type of KS test to be included in the result.
+  object NullHypothesis extends Enumeration {
+    type NullHypothesis = Value
+    val OneSampleTwoSided = Value("Sample follows theoretical distribution")
+  }
+
+  /**
+   * Runs a KS test for 1 set of sample data, comparing it to a theoretical distribution
+   * @param data `RDD[Double]` data on which to run test
+   * @param cdf `Double => Double` function to calculate the theoretical CDF
+   * @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] summarizing the test
+   *        results (p-value, statistic, and null hypothesis)
+   */
+  def testOneSample(data: RDD[Double], cdf: Double => Double): KolmogorovSmirnovTestResult = {
+    val n = data.count().toDouble
+    val localData = data.sortBy(x => x).mapPartitions { part =>
+      val partDiffs = oneSampleDifferences(part, n, cdf) // local distances
+      searchOneSampleCandidates(partDiffs) // candidates: local extrema
+    }.collect()
+    val ksStat = searchOneSampleStatistic(localData, n) // result: global extreme
+    evalOneSampleP(ksStat, n.toLong)
+  }
+
+  /**
+   * Runs a KS test for 1 set of sample data, comparing it to a theoretical distribution
+   * @param data `RDD[Double]` data on which to run test
+   * @param distObj `RealDistribution` a theoretical distribution
+   * @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] summarizing the test
+   *        results (p-value, statistic, and null hypothesis)
+   */
+  def testOneSample(data: RDD[Double], distObj: RealDistribution): KolmogorovSmirnovTestResult = {
+    val cdf = (x: Double) => distObj.cumulativeProbability(x)
+    testOneSample(data, cdf)
+  }
+
+  /**
+   * Calculate unadjusted distances between the empirical CDF and the theoretical CDF in a
+   * partition
+   * @param partData `Iterator[Double]` 1 partition of a sorted RDD
+   * @param n `Double` the total size of the RDD
+   * @param cdf `Double => Double` a function the calculates the theoretical CDF of a value
+   * @return `Iterator[(Double, Double)] `Unadjusted (ie. off by a constant) potential extrema
+   *        in a partition. The first element corresponds to the (empirical CDF - 1/N) - CDF,
+   *        the second element corresponds to empirical CDF - CDF.  We can then search the resulting
+   *        iterator for the minimum of the first and the maximum of the second element, and provide
+   *        this as a partition's candidate extrema
+   */
+  private def oneSampleDifferences(partData: Iterator[Double], n: Double, cdf: Double => Double)
+    : Iterator[(Double, Double)] = {
+    // zip data with index (within that partition)
+    // calculate local (unadjusted) empirical CDF and subtract CDF
+    partData.zipWithIndex.map { case (v, ix) =>
+      // dp and dl are later adjusted by constant, when global info is available
+      val dp = (ix + 1) / n
+      val dl = ix / n
+      val cdfVal = cdf(v)
+      (dl - cdfVal, dp - cdfVal)
+    }
+  }
+
+  /**
+   * Search the unadjusted differences in a partition and return the
+   * two extrema (furthest below and furthest above CDF), along with a count of elements in that
+   * partition
+   * @param partDiffs `Iterator[(Double, Double)]` the unadjusted differences between empirical CDF
+   *                 and CDFin a partition, which come as a tuple of
+   *                 (empirical CDF - 1/N - CDF, empirical CDF - CDF)
+   * @return `Iterator[(Double, Double, Double)]` the local extrema and a count of elements
+   */
+  private def searchOneSampleCandidates(partDiffs: Iterator[(Double, Double)])
+    : Iterator[(Double, Double, Double)] = {
+    val initAcc = (Double.MaxValue, Double.MinValue, 0.0)
+    val pResults = partDiffs.foldLeft(initAcc) { case ((pMin, pMax, pCt), (dl, dp)) =>
+      (math.min(pMin, dl), math.max(pMax, dp), pCt + 1)
+    }
+    val results = if (pResults == initAcc) Array[(Double, Double, Double)]() else Array(pResults)
+    results.iterator
+  }
+
+  /**
+   * Find the global maximum distance between empirical CDF and CDF (i.e. the KS statistic) after
+   * adjusting local extrema estimates from individual partitions with the amount of elements in
+   * preceding partitions
+   * @param localData `Array[(Double, Double, Double)]` A local array containing the collected
+   *                 results of `searchOneSampleCandidates` across all partitions
+   * @param n `Double`The size of the RDD
+   * @return The one-sample Kolmogorov Smirnov Statistic
+   */
+  private def searchOneSampleStatistic(localData: Array[(Double, Double, Double)], n: Double)
+    : Double = {
+    val initAcc = (Double.MinValue, 0.0)
+    // adjust differences based on the number of elements preceding it, which should provide
+    // the correct distance between empirical CDF and CDF
+    val results = localData.foldLeft(initAcc) { case ((prevMax, prevCt), (minCand, maxCand, ct)) =>
+      val adjConst = prevCt / n
+      val dist1 = math.abs(minCand + adjConst)
+      val dist2 = math.abs(maxCand + adjConst)
+      val maxVal = Array(prevMax, dist1, dist2).max
+      (maxVal, prevCt + ct)
+    }
+    results._1
+  }
+
+  /**
+   * A convenience function that allows running the KS test for 1 set of sample data against
+   * a named distribution
+   * @param data the sample data that we wish to evaluate
+   * @param distName the name of the theoretical distribution
+   * @param params Variable length parameter for distribution's parameters
+   * @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] summarizing the
+   *        test results (p-value, statistic, and null hypothesis)
+   */
+  @varargs
+  def testOneSample(data: RDD[Double], distName: String, params: Double*)
+    : KolmogorovSmirnovTestResult = {
+    val distObj =
+      distName match {
+        case "norm" => {
+          if (params.nonEmpty) {
+            // parameters are passed, then can only be 2
+            require(params.length == 2, "Normal distribution requires mean and standard " +
+              "deviation as parameters")
+            new NormalDistribution(params(0), params(1))
+          } else {
+            // if no parameters passed in initializes to standard normal
+            logInfo("No parameters specified for normal distribution," +
+              "initialized to standard normal (i.e. N(0, 1))")
+            new NormalDistribution(0, 1)
+          }
+        }
+        case  _ => throw new UnsupportedOperationException(s"$distName not yet supported through" +
+          s" convenience method. Current options are:['norm'].")
+      }
+
+    testOneSample(data, distObj)
+  }
+
+  private def evalOneSampleP(ksStat: Double, n: Long): KolmogorovSmirnovTestResult = {
+    val pval = 1 - new KolmogorovSmirnovTest().cdf(ksStat, n.toInt)
+    new KolmogorovSmirnovTestResult(pval, ksStat, NullHypothesis.OneSampleTwoSided.toString)
+  }
+}
+
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala
index 4784f9e947908..f44be13706695 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala
@@ -90,3 +90,20 @@ class ChiSqTestResult private[stat] (override val pValue: Double,
       super.toString
   }
 }
+
+/**
+ * :: Experimental ::
+ * Object containing the test results for the Kolmogorov-Smirnov test.
+ */
+@Experimental
+class KolmogorovSmirnovTestResult private[stat] (
+    override val pValue: Double,
+    override val statistic: Double,
+    override val nullHypothesis: String) extends TestResult[Int] {
+
+  override val degreesOfFreedom = 0
+
+  override def toString: String = {
+    "Kolmogorov-Smirnov test summary:\n" + super.toString
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
index b084a5fb4313f..142b90e764a7c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
@@ -19,6 +19,10 @@ package org.apache.spark.mllib.stat
 
 import java.util.Random
 
+import org.apache.commons.math3.distribution.{ExponentialDistribution,
+  NormalDistribution, UniformRealDistribution}
+import org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest
+
 import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.mllib.linalg.{DenseVector, Matrices, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -153,4 +157,101 @@ class HypothesisTestSuite extends SparkFunSuite with MLlibTestSparkContext {
       Statistics.chiSqTest(sc.parallelize(continuousFeature, 2))
     }
   }
+
+  test("1 sample Kolmogorov-Smirnov test: apache commons math3 implementation equivalence") {
+    // Create theoretical distributions
+    val stdNormalDist = new NormalDistribution(0, 1)
+    val expDist = new ExponentialDistribution(0.6)
+    val unifDist = new UniformRealDistribution()
+
+    // set seeds
+    val seed = 10L
+    stdNormalDist.reseedRandomGenerator(seed)
+    expDist.reseedRandomGenerator(seed)
+    unifDist.reseedRandomGenerator(seed)
+
+    // Sample data from the distributions and parallelize it
+    val n = 100000
+    val sampledNorm = sc.parallelize(stdNormalDist.sample(n), 10)
+    val sampledExp = sc.parallelize(expDist.sample(n), 10)
+    val sampledUnif = sc.parallelize(unifDist.sample(n), 10)
+
+    // Use a apache math commons local KS test to verify calculations
+    val ksTest = new KolmogorovSmirnovTest()
+    val pThreshold = 0.05
+
+    // Comparing a standard normal sample to a standard normal distribution
+    val result1 = Statistics.kolmogorovSmirnovTest(sampledNorm, "norm", 0, 1)
+    val referenceStat1 = ksTest.kolmogorovSmirnovStatistic(stdNormalDist, sampledNorm.collect())
+    val referencePVal1 = 1 - ksTest.cdf(referenceStat1, n)
+    // Verify vs apache math commons ks test
+    assert(result1.statistic ~== referenceStat1 relTol 1e-4)
+    assert(result1.pValue ~== referencePVal1 relTol 1e-4)
+    // Cannot reject null hypothesis
+    assert(result1.pValue > pThreshold)
+
+    // Comparing an exponential sample to a standard normal distribution
+    val result2 = Statistics.kolmogorovSmirnovTest(sampledExp, "norm", 0, 1)
+    val referenceStat2 = ksTest.kolmogorovSmirnovStatistic(stdNormalDist, sampledExp.collect())
+    val referencePVal2 = 1 - ksTest.cdf(referenceStat2, n)
+    // verify vs apache math commons ks test
+    assert(result2.statistic ~== referenceStat2 relTol 1e-4)
+    assert(result2.pValue ~== referencePVal2 relTol 1e-4)
+    // reject null hypothesis
+    assert(result2.pValue < pThreshold)
+
+    // Testing the use of a user provided CDF function
+    // Distribution is not serializable, so will have to create in the lambda
+    val expCDF = (x: Double) => new ExponentialDistribution(0.2).cumulativeProbability(x)
+
+    // Comparing an exponential sample with mean X to an exponential distribution with mean Y
+    // Where X != Y
+    val result3 = Statistics.kolmogorovSmirnovTest(sampledExp, expCDF)
+    val referenceStat3 = ksTest.kolmogorovSmirnovStatistic(new ExponentialDistribution(0.2),
+      sampledExp.collect())
+    val referencePVal3 = 1 - ksTest.cdf(referenceStat3, sampledNorm.count().toInt)
+    // verify vs apache math commons ks test
+    assert(result3.statistic ~== referenceStat3 relTol 1e-4)
+    assert(result3.pValue ~== referencePVal3 relTol 1e-4)
+    // reject null hypothesis
+    assert(result3.pValue < pThreshold)
+  }
+
+  test("1 sample Kolmogorov-Smirnov test: R implementation equivalence") {
+    /*
+      Comparing results with R's implementation of Kolmogorov-Smirnov for 1 sample
+      > sessionInfo()
+      R version 3.2.0 (2015-04-16)
+      Platform: x86_64-apple-darwin13.4.0 (64-bit)
+      > set.seed(20)
+      > v <- rnorm(20)
+      > v
+       [1]  1.16268529 -0.58592447  1.78546500 -1.33259371 -0.44656677  0.56960612
+       [7] -2.88971761 -0.86901834 -0.46170268 -0.55554091 -0.02013537 -0.15038222
+      [13] -0.62812676  1.32322085 -1.52135057 -0.43742787  0.97057758  0.02822264
+      [19] -0.08578219  0.38921440
+      > ks.test(v, pnorm, alternative = "two.sided")
+
+               One-sample Kolmogorov-Smirnov test
+
+      data:  v
+      D = 0.18874, p-value = 0.4223
+      alternative hypothesis: two-sided
+    */
+
+    val rKSStat = 0.18874
+    val rKSPVal = 0.4223
+    val rData = sc.parallelize(
+      Array(
+        1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
+        -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
+        -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
+        -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
+        0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
+      )
+    )
+    val rCompResult = Statistics.kolmogorovSmirnovTest(rData, "norm", 0, 1)
+    assert(rCompResult.statistic ~== rKSStat relTol 1e-4)
+    assert(rCompResult.pValue ~== rKSPVal relTol 1e-4)
+  }
 }

From 7f6be1f24d4f2fcb3d3ec181b5abf241709a8b6d Mon Sep 17 00:00:00 2001
From: zhangjiajin <zhangjiajin@huawei.com>
Date: Fri, 10 Jul 2015 21:11:46 -0700
Subject: [PATCH 121/549] [SPARK-6487] [MLLIB] Add sequential pattern mining
 algorithm PrefixSpan to Spark MLlib

Add parallel PrefixSpan algorithm and test file.
Support non-temporal sequences.

Author: zhangjiajin <zhangjiajin@huawei.com>
Author: zhang jiajin <zhangjiajin@huawei.com>

Closes #7258 from zhangjiajin/master and squashes the following commits:

ca9c4c8 [zhangjiajin] Modified the code according to the review comments.
574e56c [zhangjiajin] Add new object LocalPrefixSpan, and do some optimization.
ba5df34 [zhangjiajin] Fix a Scala style error.
4c60fb3 [zhangjiajin] Fix some Scala style errors.
1dd33ad [zhangjiajin] Modified the code according to the review comments.
89bc368 [zhangjiajin] Fixed a Scala style error.
a2eb14c [zhang jiajin] Delete PrefixspanSuite.scala
951fd42 [zhang jiajin] Delete Prefixspan.scala
575995f [zhangjiajin] Modified the code according to the review comments.
91fd7e6 [zhangjiajin] Add new algorithm PrefixSpan and test file.
---
 .../spark/mllib/fpm/LocalPrefixSpan.scala     | 113 +++++++++++++
 .../apache/spark/mllib/fpm/PrefixSpan.scala   | 157 ++++++++++++++++++
 .../spark/mllib/fpm/PrefixSpanSuite.scala     | 120 +++++++++++++
 3 files changed, 390 insertions(+)
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
new file mode 100644
index 0000000000000..39c48b084e550
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.fpm
+
+import org.apache.spark.Logging
+import org.apache.spark.annotation.Experimental
+
+/**
+ *
+ * :: Experimental ::
+ *
+ * Calculate all patterns of a projected database in local.
+ */
+@Experimental
+private[fpm] object LocalPrefixSpan extends Logging with Serializable {
+
+  /**
+   * Calculate all patterns of a projected database.
+   * @param minCount minimum count
+   * @param maxPatternLength maximum pattern length
+   * @param prefix prefix
+   * @param projectedDatabase the projected dabase
+   * @return a set of sequential pattern pairs,
+   *         the key of pair is sequential pattern (a list of items),
+   *         the value of pair is the pattern's count.
+   */
+  def run(
+      minCount: Long,
+      maxPatternLength: Int,
+      prefix: Array[Int],
+      projectedDatabase: Array[Array[Int]]): Array[(Array[Int], Long)] = {
+    val frequentPrefixAndCounts = getFreqItemAndCounts(minCount, projectedDatabase)
+    val frequentPatternAndCounts = frequentPrefixAndCounts
+      .map(x => (prefix ++ Array(x._1), x._2))
+    val prefixProjectedDatabases = getPatternAndProjectedDatabase(
+      prefix, frequentPrefixAndCounts.map(_._1), projectedDatabase)
+
+    val continueProcess = prefixProjectedDatabases.nonEmpty && prefix.length + 1 < maxPatternLength
+    if (continueProcess) {
+      val nextPatterns = prefixProjectedDatabases
+        .map(x => run(minCount, maxPatternLength, x._1, x._2))
+        .reduce(_ ++ _)
+      frequentPatternAndCounts ++ nextPatterns
+    } else {
+      frequentPatternAndCounts
+    }
+  }
+
+  /**
+   * calculate suffix sequence following a prefix in a sequence
+   * @param prefix prefix
+   * @param sequence sequence
+   * @return suffix sequence
+   */
+  def getSuffix(prefix: Int, sequence: Array[Int]): Array[Int] = {
+    val index = sequence.indexOf(prefix)
+    if (index == -1) {
+      Array()
+    } else {
+      sequence.drop(index + 1)
+    }
+  }
+
+  /**
+   * Generates frequent items by filtering the input data using minimal count level.
+   * @param minCount the absolute minimum count
+   * @param sequences sequences data
+   * @return array of item and count pair
+   */
+  private def getFreqItemAndCounts(
+      minCount: Long,
+      sequences: Array[Array[Int]]): Array[(Int, Long)] = {
+    sequences.flatMap(_.distinct)
+      .groupBy(x => x)
+      .mapValues(_.length.toLong)
+      .filter(_._2 >= minCount)
+      .toArray
+  }
+
+  /**
+   * Get the frequent prefixes' projected database.
+   * @param prePrefix the frequent prefixes' prefix
+   * @param frequentPrefixes frequent prefixes
+   * @param sequences sequences data
+   * @return prefixes and projected database
+   */
+  private def getPatternAndProjectedDatabase(
+      prePrefix: Array[Int],
+      frequentPrefixes: Array[Int],
+      sequences: Array[Array[Int]]): Array[(Array[Int], Array[Array[Int]])] = {
+    val filteredProjectedDatabase = sequences
+      .map(x => x.filter(frequentPrefixes.contains(_)))
+    frequentPrefixes.map { x =>
+      val sub = filteredProjectedDatabase.map(y => getSuffix(x, y)).filter(_.nonEmpty)
+      (prePrefix ++ Array(x), sub)
+    }.filter(x => x._2.nonEmpty)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
new file mode 100644
index 0000000000000..9d8c60ef0fc45
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.fpm
+
+import org.apache.spark.Logging
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
+
+/**
+ *
+ * :: Experimental ::
+ *
+ * A parallel PrefixSpan algorithm to mine sequential pattern.
+ * The PrefixSpan algorithm is described in
+ * [[http://doi.org/10.1109/ICDE.2001.914830]].
+ *
+ * @param minSupport the minimal support level of the sequential pattern, any pattern appears
+ *                   more than  (minSupport * size-of-the-dataset) times will be output
+ * @param maxPatternLength the maximal length of the sequential pattern, any pattern appears
+ *                   less than maxPatternLength will be output
+ *
+ * @see [[https://en.wikipedia.org/wiki/Sequential_Pattern_Mining Sequential Pattern Mining
+ *       (Wikipedia)]]
+ */
+@Experimental
+class PrefixSpan private (
+    private var minSupport: Double,
+    private var maxPatternLength: Int) extends Logging with Serializable {
+
+  /**
+   * Constructs a default instance with default parameters
+   * {minSupport: `0.1`, maxPatternLength: `10`}.
+   */
+  def this() = this(0.1, 10)
+
+  /**
+   * Sets the minimal support level (default: `0.1`).
+   */
+  def setMinSupport(minSupport: Double): this.type = {
+    require(minSupport >= 0 && minSupport <= 1,
+      "The minimum support value must be between 0 and 1, including 0 and 1.")
+    this.minSupport = minSupport
+    this
+  }
+
+  /**
+   * Sets maximal pattern length (default: `10`).
+   */
+  def setMaxPatternLength(maxPatternLength: Int): this.type = {
+    require(maxPatternLength >= 1,
+      "The maximum pattern length value must be greater than 0.")
+    this.maxPatternLength = maxPatternLength
+    this
+  }
+
+  /**
+   * Find the complete set of sequential patterns in the input sequences.
+   * @param sequences input data set, contains a set of sequences,
+   *                  a sequence is an ordered list of elements.
+   * @return a set of sequential pattern pairs,
+   *         the key of pair is pattern (a list of elements),
+   *         the value of pair is the pattern's count.
+   */
+  def run(sequences: RDD[Array[Int]]): RDD[(Array[Int], Long)] = {
+    if (sequences.getStorageLevel == StorageLevel.NONE) {
+      logWarning("Input data is not cached.")
+    }
+    val minCount = getMinCount(sequences)
+    val lengthOnePatternsAndCounts =
+      getFreqItemAndCounts(minCount, sequences).collect()
+    val prefixAndProjectedDatabase = getPrefixAndProjectedDatabase(
+      lengthOnePatternsAndCounts.map(_._1), sequences)
+    val groupedProjectedDatabase = prefixAndProjectedDatabase
+      .map(x => (x._1.toSeq, x._2))
+      .groupByKey()
+      .map(x => (x._1.toArray, x._2.toArray))
+    val nextPatterns = getPatternsInLocal(minCount, groupedProjectedDatabase)
+    val lengthOnePatternsAndCountsRdd =
+      sequences.sparkContext.parallelize(
+        lengthOnePatternsAndCounts.map(x => (Array(x._1), x._2)))
+    val allPatterns = lengthOnePatternsAndCountsRdd ++ nextPatterns
+    allPatterns
+  }
+
+  /**
+   * Get the minimum count (sequences count * minSupport).
+   * @param sequences input data set, contains a set of sequences,
+   * @return minimum count,
+   */
+  private def getMinCount(sequences: RDD[Array[Int]]): Long = {
+    if (minSupport == 0) 0L else math.ceil(sequences.count() * minSupport).toLong
+  }
+
+  /**
+   * Generates frequent items by filtering the input data using minimal count level.
+   * @param minCount the absolute minimum count
+   * @param sequences original sequences data
+   * @return array of item and count pair
+   */
+  private def getFreqItemAndCounts(
+      minCount: Long,
+      sequences: RDD[Array[Int]]): RDD[(Int, Long)] = {
+    sequences.flatMap(_.distinct.map((_, 1L)))
+      .reduceByKey(_ + _)
+      .filter(_._2 >= minCount)
+  }
+
+  /**
+   * Get the frequent prefixes' projected database.
+   * @param frequentPrefixes frequent prefixes
+   * @param sequences sequences data
+   * @return prefixes and projected database
+   */
+  private def getPrefixAndProjectedDatabase(
+      frequentPrefixes: Array[Int],
+      sequences: RDD[Array[Int]]): RDD[(Array[Int], Array[Int])] = {
+    val filteredSequences = sequences.map { p =>
+      p.filter (frequentPrefixes.contains(_) )
+    }
+    filteredSequences.flatMap { x =>
+      frequentPrefixes.map { y =>
+        val sub = LocalPrefixSpan.getSuffix(y, x)
+        (Array(y), sub)
+      }.filter(_._2.nonEmpty)
+    }
+  }
+
+  /**
+   * calculate the patterns in local.
+   * @param minCount the absolute minimum count
+   * @param data patterns and projected sequences data data
+   * @return patterns
+   */
+  private def getPatternsInLocal(
+      minCount: Long,
+      data: RDD[(Array[Int], Array[Array[Int]])]): RDD[(Array[Int], Long)] = {
+    data.flatMap { x =>
+      LocalPrefixSpan.run(minCount, maxPatternLength, x._1, x._2)
+    }
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
new file mode 100644
index 0000000000000..413436d3db85f
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.mllib.fpm
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.rdd.RDD
+
+class PrefixspanSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  test("PrefixSpan using Integer type") {
+
+    /*
+      library("arulesSequences")
+      prefixSpanSeqs = read_baskets("prefixSpanSeqs", info = c("sequenceID","eventID","SIZE"))
+      freqItemSeq = cspade(
+        prefixSpanSeqs,
+        parameter = list(support =
+          2 / length(unique(transactionInfo(prefixSpanSeqs)$sequenceID)), maxlen = 2 ))
+      resSeq = as(freqItemSeq, "data.frame")
+      resSeq
+    */
+
+    val sequences = Array(
+      Array(1, 3, 4, 5),
+      Array(2, 3, 1),
+      Array(2, 4, 1),
+      Array(3, 1, 3, 4, 5),
+      Array(3, 4, 4, 3),
+      Array(6, 5, 3))
+
+    val rdd = sc.parallelize(sequences, 2).cache()
+
+    def compareResult(
+        expectedValue: Array[(Array[Int], Long)],
+        actualValue: Array[(Array[Int], Long)]): Boolean = {
+      val sortedExpectedValue = expectedValue.sortWith{ (x, y) =>
+        x._1.mkString(",") + ":" + x._2 < y._1.mkString(",") + ":" + y._2
+      }
+      val sortedActualValue = actualValue.sortWith{ (x, y) =>
+        x._1.mkString(",") + ":" + x._2 < y._1.mkString(",") + ":" + y._2
+      }
+      sortedExpectedValue.zip(sortedActualValue)
+        .map(x => x._1._1.mkString(",") == x._2._1.mkString(",") && x._1._2 == x._2._2)
+        .reduce(_&&_)
+    }
+
+    val prefixspan = new PrefixSpan()
+      .setMinSupport(0.33)
+      .setMaxPatternLength(50)
+    val result1 = prefixspan.run(rdd)
+    val expectedValue1 = Array(
+      (Array(1), 4L),
+      (Array(1, 3), 2L),
+      (Array(1, 3, 4), 2L),
+      (Array(1, 3, 4, 5), 2L),
+      (Array(1, 3, 5), 2L),
+      (Array(1, 4), 2L),
+      (Array(1, 4, 5), 2L),
+      (Array(1, 5), 2L),
+      (Array(2), 2L),
+      (Array(2, 1), 2L),
+      (Array(3), 5L),
+      (Array(3, 1), 2L),
+      (Array(3, 3), 2L),
+      (Array(3, 4), 3L),
+      (Array(3, 4, 5), 2L),
+      (Array(3, 5), 2L),
+      (Array(4), 4L),
+      (Array(4, 5), 2L),
+      (Array(5), 3L)
+    )
+    assert(compareResult(expectedValue1, result1.collect()))
+
+    prefixspan.setMinSupport(0.5).setMaxPatternLength(50)
+    val result2 = prefixspan.run(rdd)
+    val expectedValue2 = Array(
+      (Array(1), 4L),
+      (Array(3), 5L),
+      (Array(3, 4), 3L),
+      (Array(4), 4L),
+      (Array(5), 3L)
+    )
+    assert(compareResult(expectedValue2, result2.collect()))
+
+    prefixspan.setMinSupport(0.33).setMaxPatternLength(2)
+    val result3 = prefixspan.run(rdd)
+    val expectedValue3 = Array(
+      (Array(1), 4L),
+      (Array(1, 3), 2L),
+      (Array(1, 4), 2L),
+      (Array(1, 5), 2L),
+      (Array(2, 1), 2L),
+      (Array(2), 2L),
+      (Array(3), 5L),
+      (Array(3, 1), 2L),
+      (Array(3, 3), 2L),
+      (Array(3, 4), 3L),
+      (Array(3, 5), 2L),
+      (Array(4), 4L),
+      (Array(4, 5), 2L),
+      (Array(5), 3L)
+    )
+    assert(compareResult(expectedValue3, result3.collect()))
+  }
+}

From 0c5207c66db8cf76950d53404abd43e085f8d45b Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Fri, 10 Jul 2015 21:25:09 -0700
Subject: [PATCH 122/549] [SPARK-8994] [ML] tiny cleanups to Params, Pipeline

Made default impl of Params.validateParams empty
CC mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #7349 from jkbradley/pipeline-small-cleanups and squashes the following commits:

4e0f013 [Joseph K. Bradley] small cleanups after SPARK-5956
---
 mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala     | 2 ++
 mllib/src/main/scala/org/apache/spark/ml/param/params.scala | 4 +---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index a1f3851d804ff..aef2c019d2871 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -95,6 +95,8 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] {
   /** @group setParam */
   def setStages(value: Array[PipelineStage]): this.type = { set(stages, value); this }
 
+  // Below, we clone stages so that modifications to the list of stages will not change
+  // the Param value in the Pipeline.
   /** @group getParam */
   def getStages: Array[PipelineStage] = $(stages).clone()
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 50c0d855066f8..d034d7ec6b60e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -341,9 +341,7 @@ trait Params extends Identifiable with Serializable {
    * those are checked during schema validation.
    */
   def validateParams(): Unit = {
-    params.filter(isDefined).foreach { param =>
-      param.asInstanceOf[Param[Any]].validate($(param))
-    }
+    // Do nothing by default.  Override to handle Param interactions.
   }
 
   /**

From c472eb17ae7f0910f304e414ea5ccbb77a9e153a Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Fri, 10 Jul 2015 23:25:11 -0700
Subject: [PATCH 123/549] [SPARK-8970][SQL] remove unnecessary abstraction for
 ExtractValue

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7339 from cloud-fan/minor and squashes the following commits:

84a2128 [Wenchen Fan] remove unapply
6a37c12 [Wenchen Fan] remove unnecessary abstraction for ExtractValue
---
 .../sql/catalyst/analysis/Analyzer.scala      |  3 +-
 .../expressions/complexTypeExtractors.scala   | 36 ++++---------------
 .../sql/catalyst/optimizer/Optimizer.scala    |  8 +++--
 3 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 3fdc6d62bc786..891408e310049 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -141,7 +141,8 @@ class Analyzer(
           child match {
             case _: UnresolvedAttribute => u
             case ne: NamedExpression => ne
-            case ev: ExtractValueWithStruct => Alias(ev, ev.field.name)()
+            case g: GetStructField => Alias(g, g.field.name)()
+            case g: GetArrayStructFields => Alias(g, g.field.name)()
             case g: Generator if g.resolved && g.elementTypes.size > 1 => MultiAlias(g, Nil)
             case e if !e.resolved => u
             case other => Alias(other, s"_c$i")()
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
index 73cc930c45832..5504781edca1b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
@@ -78,12 +78,6 @@ object ExtractValue {
     }
   }
 
-  def unapply(g: ExtractValue): Option[(Expression, Expression)] = g match {
-    case o: GetArrayItem => Some((o.child, o.ordinal))
-    case o: GetMapValue => Some((o.child, o.key))
-    case s: ExtractValueWithStruct => Some((s.child, null))
-  }
-
   /**
    * Find the ordinal of StructField, report error if no desired field or over one
    * desired fields are found.
@@ -103,32 +97,17 @@ object ExtractValue {
   }
 }
 
-/**
- * A common interface of all kinds of extract value expressions.
- * Note: concrete extract value expressions are created only by `ExtractValue.apply`,
- * we don't need to do type check for them.
- */
-trait ExtractValue {
-  self: Expression =>
-}
-
-abstract class ExtractValueWithStruct extends UnaryExpression with ExtractValue {
-  self: Product =>
-
-  def field: StructField
-  override def toString: String = s"$child.${field.name}"
-}
-
 /**
  * Returns the value of fields in the Struct `child`.
  *
  * No need to do type checking since it is handled by [[ExtractValue]].
  */
 case class GetStructField(child: Expression, field: StructField, ordinal: Int)
-  extends ExtractValueWithStruct {
+  extends UnaryExpression {
 
   override def dataType: DataType = field.dataType
   override def nullable: Boolean = child.nullable || field.nullable
+  override def toString: String = s"$child.${field.name}"
 
   protected override def nullSafeEval(input: Any): Any =
     input.asInstanceOf[InternalRow](ordinal)
@@ -155,10 +134,11 @@ case class GetArrayStructFields(
     child: Expression,
     field: StructField,
     ordinal: Int,
-    containsNull: Boolean) extends ExtractValueWithStruct {
+    containsNull: Boolean) extends UnaryExpression {
 
   override def dataType: DataType = ArrayType(field.dataType, containsNull)
   override def nullable: Boolean = child.nullable || containsNull || field.nullable
+  override def toString: String = s"$child.${field.name}"
 
   protected override def nullSafeEval(input: Any): Any = {
     input.asInstanceOf[Seq[InternalRow]].map { row =>
@@ -191,8 +171,7 @@ case class GetArrayStructFields(
  *
  * No need to do type checking since it is handled by [[ExtractValue]].
  */
-case class GetArrayItem(child: Expression, ordinal: Expression)
-  extends BinaryExpression with ExtractValue {
+case class GetArrayItem(child: Expression, ordinal: Expression) extends BinaryExpression {
 
   override def toString: String = s"$child[$ordinal]"
 
@@ -231,12 +210,11 @@ case class GetArrayItem(child: Expression, ordinal: Expression)
 }
 
 /**
- * Returns the value of key `ordinal` in Map `child`.
+ * Returns the value of key `key` in Map `child`.
  *
  * No need to do type checking since it is handled by [[ExtractValue]].
  */
-case class GetMapValue(child: Expression, key: Expression)
-  extends BinaryExpression with ExtractValue {
+case class GetMapValue(child: Expression, key: Expression) extends BinaryExpression {
 
   override def toString: String = s"$child[$key]"
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 7d41ef9aafa44..5d80214abf141 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -275,8 +275,12 @@ object NullPropagation extends Rule[LogicalPlan] {
       case e @ Count(Literal(null, _)) => Cast(Literal(0L), e.dataType)
       case e @ IsNull(c) if !c.nullable => Literal.create(false, BooleanType)
       case e @ IsNotNull(c) if !c.nullable => Literal.create(true, BooleanType)
-      case e @ ExtractValue(Literal(null, _), _) => Literal.create(null, e.dataType)
-      case e @ ExtractValue(_, Literal(null, _)) => Literal.create(null, e.dataType)
+      case e @ GetArrayItem(Literal(null, _), _) => Literal.create(null, e.dataType)
+      case e @ GetArrayItem(_, Literal(null, _)) => Literal.create(null, e.dataType)
+      case e @ GetMapValue(Literal(null, _), _) => Literal.create(null, e.dataType)
+      case e @ GetMapValue(_, Literal(null, _)) => Literal.create(null, e.dataType)
+      case e @ GetStructField(Literal(null, _), _, _) => Literal.create(null, e.dataType)
+      case e @ GetArrayStructFields(Literal(null, _), _, _, _) => Literal.create(null, e.dataType)
       case e @ EqualNullSafe(Literal(null, _), r) => IsNull(r)
       case e @ EqualNullSafe(l, Literal(null, _)) => IsNull(l)
       case e @ Count(expr) if !expr.nullable => Count(Literal(1))

From 30090884f96efde72d9653f2253070b68f87782c Mon Sep 17 00:00:00 2001
From: Kay Ousterhout <kayousterhout@gmail.com>
Date: Sun, 12 Jul 2015 20:45:09 -0400
Subject: [PATCH 124/549] [SPARK-8880] Fix confusing Stage.attemptId member
 variable

Author: Kay Ousterhout <kayousterhout@gmail.com>

Closes #7275 from kayousterhout/SPARK-8880 and squashes the following commits:

3e9ce7c [Kay Ousterhout] Added missing return type
e150278 [Kay Ousterhout] [SPARK-8880] Fix confusing Stage.attemptId member variable
---
 .../apache/spark/scheduler/DAGScheduler.scala |  6 +++---
 .../org/apache/spark/scheduler/Stage.scala    | 20 ++++++++++++-------
 .../apache/spark/scheduler/StageInfo.scala    |  4 ++--
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 6841fa835747f..f3d87ee5c4fd1 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -872,7 +872,7 @@ class DAGScheduler(
     // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
     // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
     // event.
-    stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))
+    stage.makeNewStageAttempt(partitionsToCompute.size)
     outputCommitCoordinator.stageStart(stage.id)
     listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
 
@@ -937,8 +937,8 @@ class DAGScheduler(
       logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
       stage.pendingTasks ++= tasks
       logDebug("New pending tasks: " + stage.pendingTasks)
-      taskScheduler.submitTasks(
-        new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.firstJobId, properties))
+      taskScheduler.submitTasks(new TaskSet(
+        tasks.toArray, stage.id, stage.latestInfo.attemptId, stage.firstJobId, properties))
       stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
     } else {
       // Because we posted SparkListenerStageSubmitted earlier, we should mark
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala
index c59d6e4f5bc04..b86724de2cb73 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala
@@ -62,22 +62,28 @@ private[spark] abstract class Stage(
 
   var pendingTasks = new HashSet[Task[_]]
 
+  /** The ID to use for the next new attempt for this stage. */
   private var nextAttemptId: Int = 0
 
   val name = callSite.shortForm
   val details = callSite.longForm
 
-  /** Pointer to the latest [StageInfo] object, set by DAGScheduler. */
-  var latestInfo: StageInfo = StageInfo.fromStage(this)
+  /**
+   * Pointer to the [StageInfo] object for the most recent attempt. This needs to be initialized
+   * here, before any attempts have actually been created, because the DAGScheduler uses this
+   * StageInfo to tell SparkListeners when a job starts (which happens before any stage attempts
+   * have been created).
+   */
+  private var _latestInfo: StageInfo = StageInfo.fromStage(this, nextAttemptId)
 
-  /** Return a new attempt id, starting with 0. */
-  def newAttemptId(): Int = {
-    val id = nextAttemptId
+  /** Creates a new attempt for this stage by creating a new StageInfo with a new attempt ID. */
+  def makeNewStageAttempt(numPartitionsToCompute: Int): Unit = {
+    _latestInfo = StageInfo.fromStage(this, nextAttemptId, Some(numPartitionsToCompute))
     nextAttemptId += 1
-    id
   }
 
-  def attemptId: Int = nextAttemptId
+  /** Returns the StageInfo for the most recent attempt for this stage. */
+  def latestInfo: StageInfo = _latestInfo
 
   override final def hashCode(): Int = id
   override final def equals(other: Any): Boolean = other match {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala
index e439d2a7e1229..5d2abbc67e9d9 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala
@@ -70,12 +70,12 @@ private[spark] object StageInfo {
    * shuffle dependencies. Therefore, all ancestor RDDs related to this Stage's RDD through a
    * sequence of narrow dependencies should also be associated with this Stage.
    */
-  def fromStage(stage: Stage, numTasks: Option[Int] = None): StageInfo = {
+  def fromStage(stage: Stage, attemptId: Int, numTasks: Option[Int] = None): StageInfo = {
     val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
     val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos
     new StageInfo(
       stage.id,
-      stage.attemptId,
+      attemptId,
       stage.name,
       numTasks.getOrElse(stage.numTasks),
       rddInfos,

From 20b474335c68c644150fdc8443a2d0d2dad5e27d Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Sun, 12 Jul 2015 20:25:06 -0700
Subject: [PATCH 125/549] [SPARK-9006] [PYSPARK] fix microsecond loss in Python
 3

It may loss a microsecond if using timestamp as float, should be `int` instead.

Author: Davies Liu <davies@databricks.com>

Closes #7363 from davies/fix_microsecond and squashes the following commits:

36f6007 [Davies Liu] fix microsecond loss in Python 3
---
 python/pyspark/sql/types.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index d63857691675a..f75791fad1612 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -188,7 +188,8 @@ def toInternal(self, dt):
 
     def fromInternal(self, ts):
         if ts is not None:
-            return datetime.datetime.fromtimestamp(ts / 1e6)
+            # using int to avoid precision loss in float
+            return datetime.datetime.fromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000)
 
 
 class DecimalType(FractionalType):

From 92540d22e45f9300f413f520a1770e9f36cfa833 Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Mon, 13 Jul 2015 00:14:32 -0700
Subject: [PATCH 126/549] [SPARK-8203] [SPARK-8204] [SQL] conditional function:
 least/greatest

chenghao-intel zhichao-li qiansl127

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #6851 from adrian-wang/udflg and squashes the following commits:

0f1bff2 [Daoyuan Wang] address comments from davis
7a6bdbb [Daoyuan Wang] add '.' for hex()
c1f6824 [Daoyuan Wang] add codegen, test for all types
ec625b0 [Daoyuan Wang] conditional function: least/greatest
---
 .../catalyst/analysis/FunctionRegistry.scala  |   2 +
 .../catalyst/expressions/conditionals.scala   | 103 +++++++++++++++++-
 .../ConditionalExpressionSuite.scala          |  81 ++++++++++++++
 .../org/apache/spark/sql/functions.scala      |  60 +++++++++-
 .../spark/sql/DataFrameFunctionsSuite.scala   |  22 ++++
 5 files changed, 263 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index f62d79f8cea6d..ed69c42dcb825 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -76,9 +76,11 @@ object FunctionRegistry {
     expression[CreateArray]("array"),
     expression[Coalesce]("coalesce"),
     expression[Explode]("explode"),
+    expression[Greatest]("greatest"),
     expression[If]("if"),
     expression[IsNull]("isnull"),
     expression[IsNotNull]("isnotnull"),
+    expression[Least]("least"),
     expression[Coalesce]("nvl"),
     expression[Rand]("rand"),
     expression[Randn]("randn"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index 395e84f089e41..e6a705fb8055b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen._
-import org.apache.spark.sql.types.{BooleanType, DataType}
+import org.apache.spark.sql.catalyst.util.TypeUtils
+import org.apache.spark.sql.types.{NullType, BooleanType, DataType}
 
 
 case class If(predicate: Expression, trueValue: Expression, falseValue: Expression)
@@ -312,3 +313,103 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
     }.mkString
   }
 }
+
+case class Least(children: Expression*) extends Expression {
+  require(children.length > 1, "LEAST requires at least 2 arguments, got " + children.length)
+
+  override def nullable: Boolean = children.forall(_.nullable)
+  override def foldable: Boolean = children.forall(_.foldable)
+
+  private lazy val ordering = TypeUtils.getOrdering(dataType)
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (children.map(_.dataType).distinct.count(_ != NullType) > 1) {
+      TypeCheckResult.TypeCheckFailure(
+        s"The expressions should all have the same type," +
+          s" got LEAST (${children.map(_.dataType)}).")
+    } else {
+      TypeUtils.checkForOrderingExpr(dataType, "function " + prettyName)
+    }
+  }
+
+  override def dataType: DataType = children.head.dataType
+
+  override def eval(input: InternalRow): Any = {
+    children.foldLeft[Any](null)((r, c) => {
+      val evalc = c.eval(input)
+      if (evalc != null) {
+        if (r == null || ordering.lt(evalc, r)) evalc else r
+      } else {
+        r
+      }
+    })
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val evalChildren = children.map(_.gen(ctx))
+    def updateEval(i: Int): String =
+      s"""
+        if (!${evalChildren(i).isNull} && (${ev.isNull} ||
+          ${ctx.genComp(dataType, evalChildren(i).primitive, ev.primitive)} < 0)) {
+          ${ev.isNull} = false;
+          ${ev.primitive} = ${evalChildren(i).primitive};
+        }
+      """
+    s"""
+      ${evalChildren.map(_.code).mkString("\n")}
+      boolean ${ev.isNull} = true;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      ${(0 until children.length).map(updateEval).mkString("\n")}
+    """
+  }
+}
+
+case class Greatest(children: Expression*) extends Expression {
+  require(children.length > 1, "GREATEST requires at least 2 arguments, got " + children.length)
+
+  override def nullable: Boolean = children.forall(_.nullable)
+  override def foldable: Boolean = children.forall(_.foldable)
+
+  private lazy val ordering = TypeUtils.getOrdering(dataType)
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (children.map(_.dataType).distinct.count(_ != NullType) > 1) {
+      TypeCheckResult.TypeCheckFailure(
+        s"The expressions should all have the same type," +
+          s" got GREATEST (${children.map(_.dataType)}).")
+    } else {
+      TypeUtils.checkForOrderingExpr(dataType, "function " + prettyName)
+    }
+  }
+
+  override def dataType: DataType = children.head.dataType
+
+  override def eval(input: InternalRow): Any = {
+    children.foldLeft[Any](null)((r, c) => {
+      val evalc = c.eval(input)
+      if (evalc != null) {
+        if (r == null || ordering.gt(evalc, r)) evalc else r
+      } else {
+        r
+      }
+    })
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val evalChildren = children.map(_.gen(ctx))
+    def updateEval(i: Int): String =
+      s"""
+        if (!${evalChildren(i).isNull} && (${ev.isNull} ||
+          ${ctx.genComp(dataType, evalChildren(i).primitive, ev.primitive)} > 0)) {
+          ${ev.isNull} = false;
+          ${ev.primitive} = ${evalChildren(i).primitive};
+        }
+      """
+    s"""
+      ${evalChildren.map(_.code).mkString("\n")}
+      boolean ${ev.isNull} = true;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      ${(0 until children.length).map(updateEval).mkString("\n")}
+    """
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
index 372848ea9a596..aaf40cc83e762 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
@@ -17,7 +17,10 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.sql.{Timestamp, Date}
+
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types._
 
@@ -134,4 +137,82 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     checkEvaluation(CaseKeyWhen(literalNull, Seq(c2, c5, c1, c6)), "c", row)
   }
 
+  test("function least") {
+    val row = create_row(1, 2, "a", "b", "c")
+    val c1 = 'a.int.at(0)
+    val c2 = 'a.int.at(1)
+    val c3 = 'a.string.at(2)
+    val c4 = 'a.string.at(3)
+    val c5 = 'a.string.at(4)
+    checkEvaluation(Least(c4, c3, c5), "a", row)
+    checkEvaluation(Least(c1, c2), 1, row)
+    checkEvaluation(Least(c1, c2, Literal(-1)), -1, row)
+    checkEvaluation(Least(c4, c5, c3, c3, Literal("a")), "a", row)
+
+    checkEvaluation(Least(Literal(null), Literal(null)), null, InternalRow.empty)
+    checkEvaluation(Least(Literal(-1.0), Literal(2.5)), -1.0, InternalRow.empty)
+    checkEvaluation(Least(Literal(-1), Literal(2)), -1, InternalRow.empty)
+    checkEvaluation(
+      Least(Literal((-1.0).toFloat), Literal(2.5.toFloat)), (-1.0).toFloat, InternalRow.empty)
+    checkEvaluation(
+      Least(Literal(Long.MaxValue), Literal(Long.MinValue)), Long.MinValue, InternalRow.empty)
+    checkEvaluation(Least(Literal(1.toByte), Literal(2.toByte)), 1.toByte, InternalRow.empty)
+    checkEvaluation(
+      Least(Literal(1.toShort), Literal(2.toByte.toShort)), 1.toShort, InternalRow.empty)
+    checkEvaluation(Least(Literal("abc"), Literal("aaaa")), "aaaa", InternalRow.empty)
+    checkEvaluation(Least(Literal(true), Literal(false)), false, InternalRow.empty)
+    checkEvaluation(
+      Least(
+        Literal(BigDecimal("1234567890987654321123456")),
+        Literal(BigDecimal("1234567890987654321123458"))),
+      BigDecimal("1234567890987654321123456"), InternalRow.empty)
+    checkEvaluation(
+      Least(Literal(Date.valueOf("2015-01-01")), Literal(Date.valueOf("2015-07-01"))),
+      Date.valueOf("2015-01-01"), InternalRow.empty)
+    checkEvaluation(
+      Least(
+        Literal(Timestamp.valueOf("2015-07-01 08:00:00")),
+        Literal(Timestamp.valueOf("2015-07-01 10:00:00"))),
+      Timestamp.valueOf("2015-07-01 08:00:00"), InternalRow.empty)
+  }
+
+  test("function greatest") {
+    val row = create_row(1, 2, "a", "b", "c")
+    val c1 = 'a.int.at(0)
+    val c2 = 'a.int.at(1)
+    val c3 = 'a.string.at(2)
+    val c4 = 'a.string.at(3)
+    val c5 = 'a.string.at(4)
+    checkEvaluation(Greatest(c4, c5, c3), "c", row)
+    checkEvaluation(Greatest(c2, c1), 2, row)
+    checkEvaluation(Greatest(c1, c2, Literal(2)), 2, row)
+    checkEvaluation(Greatest(c4, c5, c3, Literal("ccc")), "ccc", row)
+
+    checkEvaluation(Greatest(Literal(null), Literal(null)), null, InternalRow.empty)
+    checkEvaluation(Greatest(Literal(-1.0), Literal(2.5)), 2.5, InternalRow.empty)
+    checkEvaluation(Greatest(Literal(-1), Literal(2)), 2, InternalRow.empty)
+    checkEvaluation(
+      Greatest(Literal((-1.0).toFloat), Literal(2.5.toFloat)), 2.5.toFloat, InternalRow.empty)
+    checkEvaluation(
+      Greatest(Literal(Long.MaxValue), Literal(Long.MinValue)), Long.MaxValue, InternalRow.empty)
+    checkEvaluation(Greatest(Literal(1.toByte), Literal(2.toByte)), 2.toByte, InternalRow.empty)
+    checkEvaluation(
+      Greatest(Literal(1.toShort), Literal(2.toByte.toShort)), 2.toShort, InternalRow.empty)
+    checkEvaluation(Greatest(Literal("abc"), Literal("aaaa")), "abc", InternalRow.empty)
+    checkEvaluation(Greatest(Literal(true), Literal(false)), true, InternalRow.empty)
+    checkEvaluation(
+      Greatest(
+        Literal(BigDecimal("1234567890987654321123456")),
+        Literal(BigDecimal("1234567890987654321123458"))),
+      BigDecimal("1234567890987654321123458"), InternalRow.empty)
+    checkEvaluation(
+      Greatest(Literal(Date.valueOf("2015-01-01")), Literal(Date.valueOf("2015-07-01"))),
+      Date.valueOf("2015-07-01"), InternalRow.empty)
+    checkEvaluation(
+      Greatest(
+        Literal(Timestamp.valueOf("2015-07-01 08:00:00")),
+        Literal(Timestamp.valueOf("2015-07-01 10:00:00"))),
+      Timestamp.valueOf("2015-07-01 10:00:00"), InternalRow.empty)
+  }
+
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 08bf37a5c223c..ffa52f62588dc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -599,7 +599,7 @@ object functions {
   /**
    * Creates a new row for each element in the given array or map column.
    */
-   def explode(e: Column): Column = Explode(e.expr)
+  def explode(e: Column): Column = Explode(e.expr)
 
   /**
    * Converts a string exprsesion to lower case.
@@ -1073,15 +1073,41 @@ object functions {
   def floor(columnName: String): Column = floor(Column(columnName))
 
   /**
-   * Computes hex value of the given column
+   * Returns the greatest value of the list of values.
    *
-   * @group math_funcs
+   * @group normal_funcs
    * @since 1.5.0
    */
+  @scala.annotation.varargs
+  def greatest(exprs: Column*): Column = if (exprs.length < 2) {
+    sys.error("GREATEST takes at least 2 parameters")
+  } else {
+    Greatest(exprs.map(_.expr): _*)
+  }
+
+  /**
+   * Returns the greatest value of the list of column names.
+   *
+   * @group normal_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def greatest(columnName: String, columnNames: String*): Column = if (columnNames.isEmpty) {
+    sys.error("GREATEST takes at least 2 parameters")
+  } else {
+    greatest((columnName +: columnNames).map(Column.apply): _*)
+  }
+
+  /**
+    * Computes hex value of the given column.
+    *
+    * @group math_funcs
+    * @since 1.5.0
+    */
   def hex(column: Column): Column = Hex(column.expr)
 
   /**
-   * Computes hex value of the given input
+   * Computes hex value of the given input.
    *
    * @group math_funcs
    * @since 1.5.0
@@ -1171,6 +1197,32 @@ object functions {
    */
   def hypot(l: Double, rightName: String): Column = hypot(l, Column(rightName))
 
+  /**
+   * Returns the least value of the list of values.
+   *
+   * @group normal_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def least(exprs: Column*): Column = if (exprs.length < 2) {
+    sys.error("LEAST takes at least 2 parameters")
+  } else {
+    Least(exprs.map(_.expr): _*)
+  }
+
+  /**
+   * Returns the least value of the list of column names.
+   *
+   * @group normal_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def least(columnName: String, columnNames: String*): Column = if (columnNames.isEmpty) {
+    sys.error("LEAST takes at least 2 parameters")
+  } else {
+    least((columnName +: columnNames).map(Column.apply): _*)
+  }
+
   /**
    * Computes the natural logarithm of the given value.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 173280375c411..6cebec95d2850 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -381,4 +381,26 @@ class DataFrameFunctionsSuite extends QueryTest {
       df.selectExpr("split(a, '[1-9]+')"),
       Row(Seq("aa", "bb", "cc")))
   }
+
+  test("conditional function: least") {
+    checkAnswer(
+      testData2.select(least(lit(-1), lit(0), col("a"), col("b"))).limit(1),
+      Row(-1)
+    )
+    checkAnswer(
+      ctx.sql("SELECT least(a, 2) as l from testData2 order by l"),
+      Seq(Row(1), Row(1), Row(2), Row(2), Row(2), Row(2))
+    )
+  }
+
+  test("conditional function: greatest") {
+    checkAnswer(
+      testData2.select(greatest(lit(2), lit(3), col("a"), col("b"))).limit(1),
+      Row(3)
+    )
+    checkAnswer(
+      ctx.sql("SELECT greatest(a, 2) as g from testData2 order by g"),
+      Seq(Row(2), Row(2), Row(2), Row(2), Row(3), Row(3))
+    )
+  }
 }

From 6b89943834a8d9d5d0ecfd97efcc10056d08532a Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 13 Jul 2015 00:49:39 -0700
Subject: [PATCH 127/549] [SPARK-8944][SQL] Support casting between
 IntervalType and StringType

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7355 from cloud-fan/fromString and squashes the following commits:

3bbb9d6 [Wenchen Fan] fix code gen
7dab957 [Wenchen Fan] naming fix
0fbbe19 [Wenchen Fan] address comments
ac1f3d1 [Wenchen Fan] Support casting between IntervalType and StringType
---
 .../spark/sql/catalyst/expressions/Cast.scala | 17 ++++++-
 .../sql/catalyst/expressions/CastSuite.scala  | 10 ++++
 .../apache/spark/unsafe/types/Interval.java   | 48 +++++++++++++++++++
 .../spark/unsafe/types/IntervalSuite.java     | 46 ++++++++++++++++++
 4 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 7f2383dedc035..ab02addfb4d25 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.unsafe.types.{Interval, UTF8String}
 
 
 object Cast {
@@ -55,6 +55,9 @@ object Cast {
 
     case (_, DateType) => true
 
+    case (StringType, IntervalType) => true
+    case (IntervalType, StringType) => true
+
     case (StringType, _: NumericType) => true
     case (BooleanType, _: NumericType) => true
     case (DateType, _: NumericType) => true
@@ -232,6 +235,13 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case _ => _ => null
   }
 
+  // IntervalConverter
+  private[this] def castToInterval(from: DataType): Any => Any = from match {
+    case StringType =>
+      buildCast[UTF8String](_, s => Interval.fromString(s.toString))
+    case _ => _ => null
+  }
+
   // LongConverter
   private[this] def castToLong(from: DataType): Any => Any = from match {
     case StringType =>
@@ -405,6 +415,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
     case DateType => castToDate(from)
     case decimal: DecimalType => castToDecimal(from, decimal)
     case TimestampType => castToTimestamp(from)
+    case IntervalType => castToInterval(from)
     case BooleanType => castToBoolean(from)
     case ByteType => castToByte(from)
     case ShortType => castToShort(from)
@@ -442,6 +453,10 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
       case (_, StringType) =>
         defineCodeGen(ctx, ev, c => s"${ctx.stringType}.fromString(String.valueOf($c))")
 
+      case (StringType, IntervalType) =>
+        defineCodeGen(ctx, ev, c =>
+          s"org.apache.spark.unsafe.types.Interval.fromString($c.toString())")
+
       // fallback for DecimalType, this must be before other numeric types
       case (_, dt: DecimalType) =>
         super.genCode(ctx, ev)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 919fdd470b79a..1de161c367a1d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -563,4 +563,14 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
       InternalRow(0L)))
   }
 
+  test("case between string and interval") {
+    import org.apache.spark.unsafe.types.Interval
+
+    checkEvaluation(Cast(Literal("interval -3 month 7 hours"), IntervalType),
+      new Interval(-3, 7 * Interval.MICROS_PER_HOUR))
+    checkEvaluation(Cast(Literal.create(
+      new Interval(15, -3 * Interval.MICROS_PER_DAY), IntervalType), StringType),
+      "interval 1 years 3 months -3 days")
+  }
+
 }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
index 0af982d4844c2..eb7475e9df869 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
@@ -18,6 +18,8 @@
 package org.apache.spark.unsafe.types;
 
 import java.io.Serializable;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * The internal representation of interval type.
@@ -30,6 +32,52 @@ public final class Interval implements Serializable {
   public static final long MICROS_PER_DAY = MICROS_PER_HOUR * 24;
   public static final long MICROS_PER_WEEK = MICROS_PER_DAY * 7;
 
+  /**
+   * A function to generate regex which matches interval string's unit part like "3 years".
+   *
+   * First, we can leave out some units in interval string, and we only care about the value of
+   * unit, so here we use non-capturing group to wrap the actual regex.
+   * At the beginning of the actual regex, we should match spaces before the unit part.
+   * Next is the number part, starts with an optional "-" to represent negative value. We use
+   * capturing group to wrap this part as we need the value later.
+   * Finally is the unit name, ends with an optional "s".
+   */
+  private static String unitRegex(String unit) {
+    return "(?:\\s+(-?\\d+)\\s+" + unit + "s?)?";
+  }
+
+  private static Pattern p = Pattern.compile("interval" + unitRegex("year") + unitRegex("month") +
+    unitRegex("week") + unitRegex("day") + unitRegex("hour") + unitRegex("minute") +
+    unitRegex("second") + unitRegex("millisecond") + unitRegex("microsecond"));
+
+  private static long toLong(String s) {
+    if (s == null) {
+      return 0;
+    } else {
+      return Long.valueOf(s);
+    }
+  }
+
+  public static Interval fromString(String s) {
+    if (s == null) {
+      return null;
+    }
+    Matcher m = p.matcher(s);
+    if (!m.matches() || s.equals("interval")) {
+      return null;
+    } else {
+      long months = toLong(m.group(1)) * 12 + toLong(m.group(2));
+      long microseconds = toLong(m.group(3)) * MICROS_PER_WEEK;
+      microseconds += toLong(m.group(4)) * MICROS_PER_DAY;
+      microseconds += toLong(m.group(5)) * MICROS_PER_HOUR;
+      microseconds += toLong(m.group(6)) * MICROS_PER_MINUTE;
+      microseconds += toLong(m.group(7)) * MICROS_PER_SECOND;
+      microseconds += toLong(m.group(8)) * MICROS_PER_MILLI;
+      microseconds += toLong(m.group(9));
+      return new Interval((int) months, microseconds);
+    }
+  }
+
   public final int months;
   public final long microseconds;
 
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
index 0f4f38b2b03be..44a949a371f2b 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
@@ -56,4 +56,50 @@ public void toStringTest() {
     i = new Interval(34, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
     assertEquals(i.toString(), "interval 2 years 10 months 3 weeks 13 hours 123 microseconds");
   }
+
+  @Test
+  public void fromStringTest() {
+    testSingleUnit("year", 3, 36, 0);
+    testSingleUnit("month", 3, 3, 0);
+    testSingleUnit("week", 3, 0, 3 * MICROS_PER_WEEK);
+    testSingleUnit("day", 3, 0, 3 * MICROS_PER_DAY);
+    testSingleUnit("hour", 3, 0, 3 * MICROS_PER_HOUR);
+    testSingleUnit("minute", 3, 0, 3 * MICROS_PER_MINUTE);
+    testSingleUnit("second", 3, 0, 3 * MICROS_PER_SECOND);
+    testSingleUnit("millisecond", 3, 0, 3 * MICROS_PER_MILLI);
+    testSingleUnit("microsecond", 3, 0, 3);
+
+    String input;
+
+    input = "interval   -5  years  23   month";
+    Interval result = new Interval(-5 * 12 + 23, 0);
+    assertEquals(Interval.fromString(input), result);
+
+    // Error cases
+    input = "interval   3month 1 hour";
+    assertEquals(Interval.fromString(input), null);
+
+    input = "interval 3 moth 1 hour";
+    assertEquals(Interval.fromString(input), null);
+
+    input = "interval";
+    assertEquals(Interval.fromString(input), null);
+
+    input = "int";
+    assertEquals(Interval.fromString(input), null);
+
+    input = "";
+    assertEquals(Interval.fromString(input), null);
+
+    input = null;
+    assertEquals(Interval.fromString(input), null);
+  }
+
+  private void testSingleUnit(String unit, int number, int months, long microseconds) {
+    String input1 = "interval " + number + " " + unit;
+    String input2 = "interval " + number + " " + unit + "s";
+    Interval result = new Interval(months, microseconds);
+    assertEquals(Interval.fromString(input1), result);
+    assertEquals(Interval.fromString(input2), result);
+  }
 }

From a5bc803b7271028e34de1548d55b80ecfb812a7b Mon Sep 17 00:00:00 2001
From: "Vincent D. Warmerdam" <vincentwarmerdam@gmail.com>
Date: Mon, 13 Jul 2015 08:15:54 -0700
Subject: [PATCH 128/549] [SPARK-8596] Add module for rstudio link to spark

shivaram, added module for rstudio install

Author: Vincent D. Warmerdam <vincentwarmerdam@gmail.com>

Closes #7366 from koaning/rstudio-install and squashes the following commits:

e47c2da [Vincent D. Warmerdam] added rstudio module
---
 ec2/spark_ec2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index ae4f2ecc5bde7..7c83d68e7993e 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -793,7 +793,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
             ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar)
 
     modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs',
-               'mapreduce', 'spark-standalone', 'tachyon']
+               'mapreduce', 'spark-standalone', 'tachyon', 'rstudio']
 
     if opts.hadoop_major_version == "1":
         modules = list(filter(lambda x: x != "mapreduce", modules))

From 7f487c8bde14dbdd244a3493ad11a129ef2bb327 Mon Sep 17 00:00:00 2001
From: Sun Rui <rui.sun@intel.com>
Date: Mon, 13 Jul 2015 08:21:47 -0700
Subject: [PATCH 129/549] [SPARK-6797] [SPARKR] Add support for YARN cluster
 mode.

This PR enables SparkR to dynamically ship the SparkR binary package to the AM node in YARN cluster mode, thus it is no longer required that the SparkR package be installed on each worker node.

This PR uses the JDK jar tool to package the SparkR package, because jar is thought to be available on both Linux/Windows platforms where JDK has been installed.

This PR does not address the R worker involved in RDD API. Will address it in a separate JIRA issue.

This PR does not address SBT build. SparkR installation and packaging by SBT will be addressed in a separate JIRA issue.

R/install-dev.bat is not tested. shivaram , Could you help to test it?

Author: Sun Rui <rui.sun@intel.com>

Closes #6743 from sun-rui/SPARK-6797 and squashes the following commits:

ca63c86 [Sun Rui] Adjust MimaExcludes after rebase.
7313374 [Sun Rui] Fix unit test errors.
72695fb [Sun Rui] Fix unit test failures.
193882f [Sun Rui] Fix Mima test error.
fe25a33 [Sun Rui] Fix Mima test error.
35ecfa3 [Sun Rui] Fix comments.
c38a005 [Sun Rui] Unzipped SparkR binary package is still required for standalone and Mesos modes.
b05340c [Sun Rui] Fix scala style.
2ca5048 [Sun Rui] Fix comments.
1acefd1 [Sun Rui] Fix scala style.
0aa1e97 [Sun Rui] Fix scala style.
41d4f17 [Sun Rui] Add support for locating SparkR package for R workers required by RDD APIs.
49ff948 [Sun Rui] Invoke jar.exe with full path in install-dev.bat.
7b916c5 [Sun Rui] Use 'rem' consistently.
3bed438 [Sun Rui] Add a comment.
681afb0 [Sun Rui] Fix a bug that RRunner does not handle client deployment modes.
cedfbe2 [Sun Rui] [SPARK-6797][SPARKR] Add support for YARN cluster mode.
---
 R/install-dev.bat                             |  5 ++
 R/install-dev.sh                              |  8 ++-
 R/pkg/DESCRIPTION                             |  1 -
 R/pkg/R/RDD.R                                 |  2 -
 R/pkg/R/pairRDD.R                             |  1 -
 R/pkg/R/sparkR.R                              | 10 ---
 R/pkg/R/zzz.R                                 | 20 ------
 R/pkg/inst/profile/general.R                  |  4 +-
 .../scala/org/apache/spark/api/r/RRDD.scala   | 21 +++---
 .../scala/org/apache/spark/api/r/RUtils.scala | 65 +++++++++++++++++++
 .../org/apache/spark/deploy/RRunner.scala     |  7 +-
 .../org/apache/spark/deploy/SparkSubmit.scala | 27 ++++++++
 .../spark/deploy/SparkSubmitSuite.scala       |  3 +-
 make-distribution.sh                          |  1 +
 project/MimaExcludes.scala                    | 12 ++++
 15 files changed, 133 insertions(+), 54 deletions(-)
 delete mode 100644 R/pkg/R/zzz.R
 create mode 100644 core/src/main/scala/org/apache/spark/api/r/RUtils.scala

diff --git a/R/install-dev.bat b/R/install-dev.bat
index 008a5c668bc45..f32670b67de96 100644
--- a/R/install-dev.bat
+++ b/R/install-dev.bat
@@ -25,3 +25,8 @@ set SPARK_HOME=%~dp0..
 MKDIR %SPARK_HOME%\R\lib
 
 R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib"  %SPARK_HOME%\R\pkg\
+
+rem Zip the SparkR package so that it can be distributed to worker nodes on YARN
+pushd %SPARK_HOME%\R\lib
+%JAVA_HOME%\bin\jar.exe cfM "%SPARK_HOME%\R\lib\sparkr.zip" SparkR
+popd
diff --git a/R/install-dev.sh b/R/install-dev.sh
index 1edd551f8d243..4972bb9217072 100755
--- a/R/install-dev.sh
+++ b/R/install-dev.sh
@@ -34,7 +34,7 @@ LIB_DIR="$FWDIR/lib"
 
 mkdir -p $LIB_DIR
 
-pushd $FWDIR
+pushd $FWDIR > /dev/null
 
 # Generate Rd files if devtools is installed
 Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
@@ -42,4 +42,8 @@ Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtoo
 # Install SparkR to $LIB_DIR
 R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
 
-popd
+# Zip the SparkR package so that it can be distributed to worker nodes on YARN
+cd $LIB_DIR
+jar cfM "$LIB_DIR/sparkr.zip" SparkR
+
+popd > /dev/null
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index efc85bbc4b316..d028821534b1a 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -32,4 +32,3 @@ Collate:
     'serialize.R'
     'sparkR.R'
     'utils.R'
-    'zzz.R'
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index 89511141d3ef7..d2d096709245d 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -165,7 +165,6 @@ setMethod("getJRDD", signature(rdd = "PipelinedRDD"),
                                    serializedFuncArr,
                                    rdd@env$prev_serializedMode,
                                    packageNamesArr,
-                                   as.character(.sparkREnv[["libname"]]),
                                    broadcastArr,
                                    callJMethod(prev_jrdd, "classTag"))
             } else {
@@ -175,7 +174,6 @@ setMethod("getJRDD", signature(rdd = "PipelinedRDD"),
                                    rdd@env$prev_serializedMode,
                                    serializedMode,
                                    packageNamesArr,
-                                   as.character(.sparkREnv[["libname"]]),
                                    broadcastArr,
                                    callJMethod(prev_jrdd, "classTag"))
             }
diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
index 0f1179e0aa51a..ebc6ff65e9d0f 100644
--- a/R/pkg/R/pairRDD.R
+++ b/R/pkg/R/pairRDD.R
@@ -215,7 +215,6 @@ setMethod("partitionBy",
                                        serializedHashFuncBytes,
                                        getSerializedMode(x),
                                        packageNamesArr,
-                                       as.character(.sparkREnv$libname),
                                        broadcastArr,
                                        callJMethod(jrdd, "classTag"))
 
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 048eb8ed541e4..172335809dec2 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -17,10 +17,6 @@
 
 .sparkREnv <- new.env()
 
-sparkR.onLoad <- function(libname, pkgname) {
-  .sparkREnv$libname <- libname
-}
-
 # Utility function that returns TRUE if we have an active connection to the
 # backend and FALSE otherwise
 connExists <- function(env) {
@@ -80,7 +76,6 @@ sparkR.stop <- function() {
 #' @param sparkEnvir Named list of environment variables to set on worker nodes.
 #' @param sparkExecutorEnv Named list of environment variables to be used when launching executors.
 #' @param sparkJars Character string vector of jar files to pass to the worker nodes.
-#' @param sparkRLibDir The path where R is installed on the worker nodes.
 #' @param sparkPackages Character string vector of packages from spark-packages.org
 #' @export
 #' @examples
@@ -101,7 +96,6 @@ sparkR.init <- function(
   sparkEnvir = list(),
   sparkExecutorEnv = list(),
   sparkJars = "",
-  sparkRLibDir = "",
   sparkPackages = "") {
 
   if (exists(".sparkRjsc", envir = .sparkREnv)) {
@@ -170,10 +164,6 @@ sparkR.init <- function(
     sparkHome <- normalizePath(sparkHome)
   }
 
-  if (nchar(sparkRLibDir) != 0) {
-    .sparkREnv$libname <- sparkRLibDir
-  }
-
   sparkEnvirMap <- new.env()
   for (varname in names(sparkEnvir)) {
     sparkEnvirMap[[varname]] <- sparkEnvir[[varname]]
diff --git a/R/pkg/R/zzz.R b/R/pkg/R/zzz.R
deleted file mode 100644
index 301feade65fa3..0000000000000
--- a/R/pkg/R/zzz.R
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-.onLoad <- function(libname, pkgname) {
-  sparkR.onLoad(libname, pkgname)
-}
diff --git a/R/pkg/inst/profile/general.R b/R/pkg/inst/profile/general.R
index 8fe711b622086..2a8a8213d0849 100644
--- a/R/pkg/inst/profile/general.R
+++ b/R/pkg/inst/profile/general.R
@@ -16,7 +16,7 @@
 #
 
 .First <- function() {
-  home <- Sys.getenv("SPARK_HOME")
-  .libPaths(c(file.path(home, "R", "lib"), .libPaths()))
+  packageDir <- Sys.getenv("SPARKR_PACKAGE_DIR")
+  .libPaths(c(packageDir, .libPaths()))
   Sys.setenv(NOAWT=1)
 }
diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
index ff1702f7dea48..23a470d6afcae 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
@@ -39,7 +39,6 @@ private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
     deserializer: String,
     serializer: String,
     packageNames: Array[Byte],
-    rLibDir: String,
     broadcastVars: Array[Broadcast[Object]])
   extends RDD[U](parent) with Logging {
   protected var dataStream: DataInputStream = _
@@ -60,7 +59,7 @@ private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
 
     // The stdout/stderr is shared by multiple tasks, because we use one daemon
     // to launch child process as worker.
-    val errThread = RRDD.createRWorker(rLibDir, listenPort)
+    val errThread = RRDD.createRWorker(listenPort)
 
     // We use two sockets to separate input and output, then it's easy to manage
     // the lifecycle of them to avoid deadlock.
@@ -235,11 +234,10 @@ private class PairwiseRRDD[T: ClassTag](
     hashFunc: Array[Byte],
     deserializer: String,
     packageNames: Array[Byte],
-    rLibDir: String,
     broadcastVars: Array[Object])
   extends BaseRRDD[T, (Int, Array[Byte])](
     parent, numPartitions, hashFunc, deserializer,
-    SerializationFormats.BYTE, packageNames, rLibDir,
+    SerializationFormats.BYTE, packageNames,
     broadcastVars.map(x => x.asInstanceOf[Broadcast[Object]])) {
 
   override protected def readData(length: Int): (Int, Array[Byte]) = {
@@ -266,10 +264,9 @@ private class RRDD[T: ClassTag](
     deserializer: String,
     serializer: String,
     packageNames: Array[Byte],
-    rLibDir: String,
     broadcastVars: Array[Object])
   extends BaseRRDD[T, Array[Byte]](
-    parent, -1, func, deserializer, serializer, packageNames, rLibDir,
+    parent, -1, func, deserializer, serializer, packageNames,
     broadcastVars.map(x => x.asInstanceOf[Broadcast[Object]])) {
 
   override protected def readData(length: Int): Array[Byte] = {
@@ -293,10 +290,9 @@ private class StringRRDD[T: ClassTag](
     func: Array[Byte],
     deserializer: String,
     packageNames: Array[Byte],
-    rLibDir: String,
     broadcastVars: Array[Object])
   extends BaseRRDD[T, String](
-    parent, -1, func, deserializer, SerializationFormats.STRING, packageNames, rLibDir,
+    parent, -1, func, deserializer, SerializationFormats.STRING, packageNames,
     broadcastVars.map(x => x.asInstanceOf[Broadcast[Object]])) {
 
   override protected def readData(length: Int): String = {
@@ -392,9 +388,10 @@ private[r] object RRDD {
     thread
   }
 
-  private def createRProcess(rLibDir: String, port: Int, script: String): BufferedStreamThread = {
+  private def createRProcess(port: Int, script: String): BufferedStreamThread = {
     val rCommand = SparkEnv.get.conf.get("spark.sparkr.r.command", "Rscript")
     val rOptions = "--vanilla"
+    val rLibDir = RUtils.sparkRPackagePath(isDriver = false)
     val rExecScript = rLibDir + "/SparkR/worker/" + script
     val pb = new ProcessBuilder(List(rCommand, rOptions, rExecScript))
     // Unset the R_TESTS environment variable for workers.
@@ -413,7 +410,7 @@ private[r] object RRDD {
   /**
    * ProcessBuilder used to launch worker R processes.
    */
-  def createRWorker(rLibDir: String, port: Int): BufferedStreamThread = {
+  def createRWorker(port: Int): BufferedStreamThread = {
     val useDaemon = SparkEnv.get.conf.getBoolean("spark.sparkr.use.daemon", true)
     if (!Utils.isWindows && useDaemon) {
       synchronized {
@@ -421,7 +418,7 @@ private[r] object RRDD {
           // we expect one connections
           val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost"))
           val daemonPort = serverSocket.getLocalPort
-          errThread = createRProcess(rLibDir, daemonPort, "daemon.R")
+          errThread = createRProcess(daemonPort, "daemon.R")
           // the socket used to send out the input of task
           serverSocket.setSoTimeout(10000)
           val sock = serverSocket.accept()
@@ -443,7 +440,7 @@ private[r] object RRDD {
         errThread
       }
     } else {
-      createRProcess(rLibDir, port, "worker.R")
+      createRProcess(port, "worker.R")
     }
   }
 
diff --git a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala
new file mode 100644
index 0000000000000..d53abd3408c55
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.r
+
+import java.io.File
+
+import org.apache.spark.{SparkEnv, SparkException}
+
+private[spark] object RUtils {
+  /**
+   * Get the SparkR package path in the local spark distribution.
+   */
+  def localSparkRPackagePath: Option[String] = {
+    val sparkHome = sys.env.get("SPARK_HOME")
+    sparkHome.map(
+      Seq(_, "R", "lib").mkString(File.separator)
+    )
+  }
+
+  /**
+   * Get the SparkR package path in various deployment modes.
+   * This assumes that Spark properties `spark.master` and `spark.submit.deployMode`
+   * and environment variable `SPARK_HOME` are set.
+   */
+  def sparkRPackagePath(isDriver: Boolean): String = {
+    val (master, deployMode) =
+      if (isDriver) {
+        (sys.props("spark.master"), sys.props("spark.submit.deployMode"))
+      } else {
+        val sparkConf = SparkEnv.get.conf
+        (sparkConf.get("spark.master"), sparkConf.get("spark.submit.deployMode"))
+      }
+
+    val isYarnCluster = master.contains("yarn") && deployMode == "cluster"
+    val isYarnClient = master.contains("yarn") && deployMode == "client"
+
+    // In YARN mode, the SparkR package is distributed as an archive symbolically
+    // linked to the "sparkr" file in the current directory. Note that this does not apply
+    // to the driver in client mode because it is run outside of the cluster.
+    if (isYarnCluster || (isYarnClient && !isDriver)) {
+      new File("sparkr").getAbsolutePath
+    } else {
+      // Otherwise, assume the package is local
+      // TODO: support this for Mesos
+      localSparkRPackagePath.getOrElse {
+        throw new SparkException("SPARK_HOME not set. Can't locate SparkR package.")
+      }
+    }
+  }
+}
diff --git a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala
index 4165740312e03..c0cab22fa8252 100644
--- a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala
@@ -24,7 +24,7 @@ import scala.collection.JavaConversions._
 
 import org.apache.hadoop.fs.Path
 
-import org.apache.spark.api.r.RBackend
+import org.apache.spark.api.r.{RBackend, RUtils}
 import org.apache.spark.util.RedirectThread
 
 /**
@@ -71,9 +71,10 @@ object RRunner {
         val builder = new ProcessBuilder(Seq(rCommand, rFileNormalized) ++ otherArgs)
         val env = builder.environment()
         env.put("EXISTING_SPARKR_BACKEND_PORT", sparkRBackendPort.toString)
-        val sparkHome = System.getenv("SPARK_HOME")
+        val rPackageDir = RUtils.sparkRPackagePath(isDriver = true)
+        env.put("SPARKR_PACKAGE_DIR", rPackageDir)
         env.put("R_PROFILE_USER",
-          Seq(sparkHome, "R", "lib", "SparkR", "profile", "general.R").mkString(File.separator))
+          Seq(rPackageDir, "SparkR", "profile", "general.R").mkString(File.separator))
         builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize
         val process = builder.start()
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 4cec9017b8adb..7089a7e26707f 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -37,6 +37,7 @@ import org.apache.ivy.core.settings.IvySettings
 import org.apache.ivy.plugins.matcher.GlobPatternMatcher
 import org.apache.ivy.plugins.repository.file.FileRepository
 import org.apache.ivy.plugins.resolver.{FileSystemResolver, ChainResolver, IBiblioResolver}
+import org.apache.spark.api.r.RUtils
 import org.apache.spark.SPARK_VERSION
 import org.apache.spark.deploy.rest._
 import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils}
@@ -79,6 +80,7 @@ object SparkSubmit {
   private val SPARK_SHELL = "spark-shell"
   private val PYSPARK_SHELL = "pyspark-shell"
   private val SPARKR_SHELL = "sparkr-shell"
+  private val SPARKR_PACKAGE_ARCHIVE = "sparkr.zip"
 
   private val CLASS_NOT_FOUND_EXIT_STATUS = 101
 
@@ -262,6 +264,12 @@ object SparkSubmit {
       }
     }
 
+    // Update args.deployMode if it is null. It will be passed down as a Spark property later.
+    (args.deployMode, deployMode) match {
+      case (null, CLIENT) => args.deployMode = "client"
+      case (null, CLUSTER) => args.deployMode = "cluster"
+      case _ =>
+    }
     val isYarnCluster = clusterManager == YARN && deployMode == CLUSTER
     val isMesosCluster = clusterManager == MESOS && deployMode == CLUSTER
 
@@ -347,6 +355,23 @@ object SparkSubmit {
       }
     }
 
+    // In YARN mode for an R app, add the SparkR package archive to archives
+    // that can be distributed with the job
+    if (args.isR && clusterManager == YARN) {
+      val rPackagePath = RUtils.localSparkRPackagePath
+      if (rPackagePath.isEmpty) {
+        printErrorAndExit("SPARK_HOME does not exist for R application in YARN mode.")
+      }
+      val rPackageFile = new File(rPackagePath.get, SPARKR_PACKAGE_ARCHIVE)
+      if (!rPackageFile.exists()) {
+        printErrorAndExit(s"$SPARKR_PACKAGE_ARCHIVE does not exist for R application in YARN mode.")
+      }
+      val localURI = Utils.resolveURI(rPackageFile.getAbsolutePath)
+
+      // Assigns a symbol link name "sparkr" to the shipped package.
+      args.archives = mergeFileLists(args.archives, localURI.toString + "#sparkr")
+    }
+
     // If we're running a R app, set the main class to our specific R runner
     if (args.isR && deployMode == CLIENT) {
       if (args.primaryResource == SPARKR_SHELL) {
@@ -375,6 +400,8 @@ object SparkSubmit {
 
       // All cluster managers
       OptionAssigner(args.master, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.master"),
+      OptionAssigner(args.deployMode, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES,
+        sysProp = "spark.submit.deployMode"),
       OptionAssigner(args.name, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, sysProp = "spark.app.name"),
       OptionAssigner(args.jars, ALL_CLUSTER_MGRS, CLIENT, sysProp = "spark.jars"),
       OptionAssigner(args.ivyRepoPath, ALL_CLUSTER_MGRS, CLIENT, sysProp = "spark.jars.ivy"),
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 1b64c329b5d4b..e7878bde6fcb0 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -246,7 +246,7 @@ class SparkSubmitSuite
       mainClass should be ("org.apache.spark.deploy.Client")
     }
     classpath should have size 0
-    sysProps should have size 8
+    sysProps should have size 9
     sysProps.keys should contain ("SPARK_SUBMIT")
     sysProps.keys should contain ("spark.master")
     sysProps.keys should contain ("spark.app.name")
@@ -255,6 +255,7 @@ class SparkSubmitSuite
     sysProps.keys should contain ("spark.driver.cores")
     sysProps.keys should contain ("spark.driver.supervise")
     sysProps.keys should contain ("spark.shuffle.spill")
+    sysProps.keys should contain ("spark.submit.deployMode")
     sysProps("spark.shuffle.spill") should be ("false")
   }
 
diff --git a/make-distribution.sh b/make-distribution.sh
index 9f063da3a16c0..cac7032bb2e87 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -219,6 +219,7 @@ cp -r "$SPARK_HOME/ec2" "$DISTDIR"
 if [ -d "$SPARK_HOME"/R/lib/SparkR ]; then
   mkdir -p "$DISTDIR"/R/lib
   cp -r "$SPARK_HOME/R/lib/SparkR" "$DISTDIR"/R/lib
+  cp "$SPARK_HOME/R/lib/sparkr.zip" "$DISTDIR"/R/lib
 fi
 
 # Download and copy in tachyon, if requested
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 79089aae2a37c..4e4e810ec36e3 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -83,6 +83,18 @@ object MimaExcludes {
               "org.apache.spark.streaming.scheduler.InputInfo$"),
             ProblemFilters.exclude[MissingClassProblem](
               "org.apache.spark.streaming.scheduler.InputInfo")
+          ) ++ Seq(
+            // SPARK-6797 Support YARN modes for SparkR
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.api.r.PairwiseRRDD.this"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.api.r.RRDD.createRWorker"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.api.r.RRDD.this"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.api.r.StringRRDD.this"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.api.r.BaseRRDD.this")
           )
 
         case v if v.startsWith("1.4") =>

From 9b62e9375f032548d386aec7468e3d0f7c6da7b2 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Mon, 13 Jul 2015 09:47:53 -0700
Subject: [PATCH 130/549] [SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint
 checks to PySpark

This adds Pylint checks to PySpark.

For now this lazy installs using easy_install to /dev/pylint (similar to the pep8 script).
We still need to figure out what rules to be allowed.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7241 from MechCoder/pylint and squashes the following commits:

8496834 [MechCoder] Silence warnings and make pylint tests fail to check if it works in jenkins
57393a3 [MechCoder] undefined-variable
a8e2547 [MechCoder] Minor changes
7753810 [MechCoder] remove trailing whitespace
75c5d2b [MechCoder] Remove blacklisted arguments and pointless statements check
6bde250 [MechCoder] Disable all checks for now
3464666 [MechCoder] Add pylint configuration file
d28109f [MechCoder] [SPARK-8706] [PySpark] [Project infra] Add pylint checks to PySpark
---
 dev/lint-python                   |  57 ++++-
 pylintrc                          | 404 ++++++++++++++++++++++++++++++
 python/pyspark/ml/param/shared.py |   4 +-
 3 files changed, 455 insertions(+), 10 deletions(-)
 create mode 100644 pylintrc

diff --git a/dev/lint-python b/dev/lint-python
index 0c3586462cb37..e02dff220eb87 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -21,12 +21,14 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
 PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
 PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py"
-PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt"
+PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt"
+PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt"
+PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt"
 
 cd "$SPARK_ROOT_DIR"
 
 # compileall: https://docs.python.org/2/library/compileall.html
-python -B -m compileall -q -l $PATHS_TO_CHECK > "$PYTHON_LINT_REPORT_PATH"
+python -B -m compileall -q -l $PATHS_TO_CHECK > "$PEP8_REPORT_PATH"
 compile_status="${PIPESTATUS[0]}"
 
 # Get pep8 at runtime so that we don't rely on it being installed on the build server.
@@ -47,11 +49,36 @@ if [ ! -e "$PEP8_SCRIPT_PATH" ]; then
     fi
 fi
 
+# Easy install pylint in /dev/pylint. To easy_install into a directory, the PYTHONPATH should
+# be set to the directory.
+# dev/pylint should be appended to the PATH variable as well.
+# Jenkins by default installs the pylint3 version, so for now this just checks the code quality
+# of python3.
+export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint"
+export "PYLINT_HOME=$PYTHONPATH"
+export "PATH=$PYTHONPATH:$PATH"
+
+if [ ! -d "$PYLINT_HOME" ]; then
+    mkdir "$PYLINT_HOME"
+    # Redirect the annoying pylint installation output.
+    easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO"
+    easy_install_status="$?"
+
+    if [ "$easy_install_status" -ne 0 ]; then
+        echo "Unable to install pylint locally in \"$PYTHONPATH\"."
+        cat "$PYLINT_INSTALL_INFO"
+        exit "$easy_install_status"
+    fi
+
+    rm "$PYLINT_INSTALL_INFO"
+
+fi
+
 # There is no need to write this output to a file
 #+ first, but we do so so that the check status can
 #+ be output before the report, like with the
 #+ scalastyle and RAT checks.
-python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PYTHON_LINT_REPORT_PATH"
+python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PEP8_REPORT_PATH"
 pep8_status="${PIPESTATUS[0]}"
 
 if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then
@@ -61,13 +88,27 @@ else
 fi
 
 if [ "$lint_status" -ne 0 ]; then
-    echo "Python lint checks failed."
-    cat "$PYTHON_LINT_REPORT_PATH"
+    echo "PEP8 checks failed."
+    cat "$PEP8_REPORT_PATH"
+else
+    echo "PEP8 checks passed."
+fi
+
+rm "$PEP8_REPORT_PATH"
+
+for to_be_checked in "$PATHS_TO_CHECK"
+do
+    pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH"
+done
+
+if [ "${PIPESTATUS[0]}" -ne 0 ]; then
+    lint_status=1
+    echo "Pylint checks failed."
+    cat "$PYLINT_REPORT_PATH"
 else
-    echo "Python lint checks passed."
+    echo "Pylint checks passed."
 fi
 
-# rm "$PEP8_SCRIPT_PATH"
-rm "$PYTHON_LINT_REPORT_PATH"
+rm "$PYLINT_REPORT_PATH"
 
 exit "$lint_status"
diff --git a/pylintrc b/pylintrc
new file mode 100644
index 0000000000000..061775960393b
--- /dev/null
+++ b/pylintrc
@@ -0,0 +1,404 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+[MASTER]
+
+# Specify a configuration file.
+#rcfile=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Profiled execution.
+profile=no
+
+# Add files or directories to the blacklist. They should be base names, not
+# paths.
+ignore=pyspark.heapq3
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Use multiple processes to speed up Pylint.
+jobs=1
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code
+extension-pkg-whitelist=
+
+# Allow optimization of some AST trees. This will activate a peephole AST
+# optimizer, which will apply various small optimizations. For instance, it can
+# be used to obtain the result of joining multiple strings with the addition
+# operator. Joining a lot of strings can lead to a maximum recursion error in
+# Pylint and this flag can prevent that. It has one side effect, the resulting
+# AST will be different than the one from reality.
+optimize-ast=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+confidence=
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time. See also the "--disable" option for examples.
+enable=
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once).You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use"--disable=all --enable=classes
+# --disable=W"
+
+# These errors are arranged in order of number of warning given in pylint.
+# If you would like to improve the code quality of pyspark, remove any of these disabled errors
+# run ./dev/lint-python and see if the errors raised by pylint can be fixed.
+
+disable=invalid-name,missing-docstring,protected-access,unused-argument,no-member,unused-wildcard-import,redefined-builtin,too-many-arguments,unused-variable,too-few-public-methods,bad-continuation,duplicate-code,redefined-outer-name,too-many-ancestors,import-error,superfluous-parens,unused-import,line-too-long,no-name-in-module,unnecessary-lambda,import-self,no-self-use,unidiomatic-typecheck,fixme,too-many-locals,cyclic-import,too-many-branches,bare-except,wildcard-import,dangerous-default-value,broad-except,too-many-public-methods,deprecated-lambda,anomalous-backslash-in-string,too-many-lines,reimported,too-many-statements,bad-whitespace,unpacking-non-sequence,too-many-instance-attributes,abstract-method,old-style-class,global-statement,attribute-defined-outside-init,arguments-differ,undefined-all-variable,no-init,useless-else-on-loop,super-init-not-called,notimplemented-raised,too-many-return-statements,pointless-string-statement,global-variable-undefined,bad-classmethod-argument,too-many-format-args,parse-error,no-self-argument,pointless-statement,undefined-variable
+
+
+[REPORTS]
+
+# Set the output format. Available formats are text, parseable, colorized, msvs
+# (visual studio) and html. You can also give a reporter class, eg
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Put messages in a separate file for each module / package specified on the
+# command line instead of printing them on stdout. Reports (if any) will be
+# written in a file name "pylint_global.[txt|html]".
+files-output=no
+
+# Tells whether to display a full report or only the messages
+reports=no
+
+# Python expression which should return a note less than 10 (10 is the highest
+# note). You have access to the variables errors warning, statement which
+# respectively contain the number of errors / warnings messages and the total
+# number of statements analyzed. This is used by the global evaluation report
+# (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+# Add a comment according to your evaluation note. This is used by the global
+# evaluation report (RP0004).
+comment=no
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details
+#msg-template=
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,XXX,TODO
+
+
+[BASIC]
+
+# Required attributes for module, separated by a comma
+required-attributes=
+
+# List of builtins function names that should not be used, separated by a comma
+bad-functions=
+
+# Good variable names which should always be accepted, separated by a comma
+good-names=i,j,k,ex,Run,_
+
+# Bad variable names which should always be refused, separated by a comma
+bad-names=baz,toto,tutu,tata
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Include a hint for the correct naming format with invalid-name
+include-naming-hint=no
+
+# Regular expression matching correct function names
+function-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for function names
+function-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct variable names
+variable-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for variable names
+variable-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct constant names
+const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+
+# Naming hint for constant names
+const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+
+# Regular expression matching correct attribute names
+attr-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for attribute names
+attr-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct argument names
+argument-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for argument names
+argument-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct class attribute names
+class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+
+# Naming hint for class attribute names
+class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+
+# Regular expression matching correct inline iteration names
+inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
+
+# Naming hint for inline iteration names
+inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
+
+# Regular expression matching correct class names
+class-rgx=[A-Z_][a-zA-Z0-9]+$
+
+# Naming hint for class names
+class-name-hint=[A-Z_][a-zA-Z0-9]+$
+
+# Regular expression matching correct module names
+module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+
+# Naming hint for module names
+module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+
+# Regular expression matching correct method names
+method-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for method names
+method-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=__.*__
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+
+[FORMAT]
+
+# Maximum number of characters on a single line.
+max-line-length=100
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+# List of optional constructs for which whitespace checking is disabled
+no-space-check=trailing-comma,dict-separator
+
+# Maximum number of lines in a module
+max-module-lines=1000
+
+# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+
+[SIMILARITIES]
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+
+[VARIABLES]
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# A regular expression matching the name of dummy variables (i.e. expectedly
+# not used).
+dummy-variables-rgx=_$|dummy
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid to define new builtins when possible.
+additional-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+
+
+[SPELLING]
+
+# Spelling dictionary name. Available dictionaries: none. To make it working
+# install python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[LOGGING]
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format
+logging-modules=logging
+
+
+[TYPECHECK]
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis
+ignored-modules=
+
+# List of classes names for which member attributes should not be checked
+# (useful for classes with attributes dynamically set).
+ignored-classes=SQLObject
+
+# When zope mode is activated, add a predefined set of Zope acquired attributes
+# to generated-members.
+zope=no
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E0201 when accessed. Python regular
+# expressions are accepted.
+generated-members=REQUEST,acl_users,aq_parent
+
+
+[CLASSES]
+
+# List of interface methods to ignore, separated by a comma. This is used for
+# instance to not check methods defines in Zope's Interface base class.
+ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,__new__,setUp
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,_fields,_replace,_source,_make
+
+
+[IMPORTS]
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=regsub,TERMIOS,Bastion,rexec
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method
+max-args=5
+
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore
+ignored-argument-names=_.*
+
+# Maximum number of locals for function / method body
+max-locals=15
+
+# Maximum number of return / yield for function / method body
+max-returns=6
+
+# Maximum number of branch for function / method body
+max-branches=12
+
+# Maximum number of statements in function / method body
+max-statements=50
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=Exception
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index bc088e4c29e26..595124726366d 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -444,7 +444,7 @@ class DecisionTreeParams(Params):
     minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
     maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
     cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-    
+
 
     def __init__(self):
         super(DecisionTreeParams, self).__init__()
@@ -460,7 +460,7 @@ def __init__(self):
         self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
         #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
         self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-        
+
     def setMaxDepth(self, value):
         """
         Sets the value of :py:attr:`maxDepth`.

From 5ca26fb64de99fa414dc59ce4cf29a0171894793 Mon Sep 17 00:00:00 2001
From: Carson Wang <carson.wang@intel.com>
Date: Mon, 13 Jul 2015 11:20:04 -0700
Subject: [PATCH 131/549] [SPARK-8950] [WEBUI] Correct the calculation of
 SchedulerDelay in StagePage

In StagePage, the SchedulerDelay is calculated as totalExecutionTime - executorRunTime - executorOverhead - gettingResultTime.
But the totalExecutionTime is calculated in the way that doesn't include the gettingResultTime.

Author: Carson Wang <carson.wang@intel.com>

Closes #7319 from carsonwang/SchedulerDelayTime and squashes the following commits:

f66fb6e [Carson Wang] Update the code style
7d971ae [Carson Wang] Correct the calculation of SchedulerDelay
---
 .../org/apache/spark/ui/jobs/StagePage.scala  | 45 +++++++++----------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index 60e3c6343122c..ff0a339a39c65 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -332,7 +332,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
             </td> +: getFormattedTimeQuantiles(serializationTimes)
 
           val gettingResultTimes = validTasks.map { case TaskUIData(info, _, _) =>
-            getGettingResultTime(info).toDouble
+            getGettingResultTime(info, currentTime).toDouble
           }
           val gettingResultQuantiles =
             <td>
@@ -346,7 +346,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
           // machine and to send back the result (but not the time to fetch the task result,
           // if it needed to be fetched from the block manager on the worker).
           val schedulerDelays = validTasks.map { case TaskUIData(info, metrics, _) =>
-            getSchedulerDelay(info, metrics.get).toDouble
+            getSchedulerDelay(info, metrics.get, currentTime).toDouble
           }
           val schedulerDelayTitle = <td><span data-toggle="tooltip"
             title={ToolTips.SCHEDULER_DELAY} data-placement="right">Scheduler Delay</span></td>
@@ -544,7 +544,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         val serializationTimeProportion = toProportion(serializationTime)
         val deserializationTime = metricsOpt.map(_.executorDeserializeTime).getOrElse(0L)
         val deserializationTimeProportion = toProportion(deserializationTime)
-        val gettingResultTime = getGettingResultTime(taskUIData.taskInfo)
+        val gettingResultTime = getGettingResultTime(taskUIData.taskInfo, currentTime)
         val gettingResultTimeProportion = toProportion(gettingResultTime)
         val schedulerDelay = totalExecutionTime -
           (executorComputingTime + shuffleReadTime + shuffleWriteTime +
@@ -685,11 +685,11 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         else metrics.map(_.executorRunTime).getOrElse(1L)
       val formatDuration = if (info.status == "RUNNING") UIUtils.formatDuration(duration)
         else metrics.map(m => UIUtils.formatDuration(m.executorRunTime)).getOrElse("")
-      val schedulerDelay = metrics.map(getSchedulerDelay(info, _)).getOrElse(0L)
+      val schedulerDelay = metrics.map(getSchedulerDelay(info, _, currentTime)).getOrElse(0L)
       val gcTime = metrics.map(_.jvmGCTime).getOrElse(0L)
       val taskDeserializationTime = metrics.map(_.executorDeserializeTime).getOrElse(0L)
       val serializationTime = metrics.map(_.resultSerializationTime).getOrElse(0L)
-      val gettingResultTime = getGettingResultTime(info)
+      val gettingResultTime = getGettingResultTime(info, currentTime)
 
       val maybeAccumulators = info.accumulables
       val accumulatorsReadable = maybeAccumulators.map{acc => s"${acc.name}: ${acc.update.get}"}
@@ -852,32 +852,31 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
     <td>{errorSummary}{details}</td>
   }
 
-  private def getGettingResultTime(info: TaskInfo): Long = {
-    if (info.gettingResultTime > 0) {
-      if (info.finishTime > 0) {
+  private def getGettingResultTime(info: TaskInfo, currentTime: Long): Long = {
+    if (info.gettingResult) {
+      if (info.finished) {
         info.finishTime - info.gettingResultTime
       } else {
         // The task is still fetching the result.
-        System.currentTimeMillis - info.gettingResultTime
+        currentTime - info.gettingResultTime
       }
     } else {
       0L
     }
   }
 
-  private def getSchedulerDelay(info: TaskInfo, metrics: TaskMetrics): Long = {
-    val totalExecutionTime =
-      if (info.gettingResult) {
-        info.gettingResultTime - info.launchTime
-      } else if (info.finished) {
-        info.finishTime - info.launchTime
-      } else {
-        0
-      }
-    val executorOverhead = (metrics.executorDeserializeTime +
-      metrics.resultSerializationTime)
-    math.max(
-      0,
-      totalExecutionTime - metrics.executorRunTime - executorOverhead - getGettingResultTime(info))
+  private def getSchedulerDelay(info: TaskInfo, metrics: TaskMetrics, currentTime: Long): Long = {
+    if (info.finished) {
+      val totalExecutionTime = info.finishTime - info.launchTime
+      val executorOverhead = (metrics.executorDeserializeTime +
+        metrics.resultSerializationTime)
+      math.max(
+        0,
+        totalExecutionTime - metrics.executorRunTime - executorOverhead -
+          getGettingResultTime(info, currentTime))
+    } else {
+      // The task is still running and the metrics like executorRunTime are not available.
+      0L
+    }
   }
 }

From 79c35826e626805c5df776b3d22ed37829d239b6 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies.liu@gmail.com>
Date: Mon, 13 Jul 2015 11:30:36 -0700
Subject: [PATCH 132/549] Revert "[SPARK-8706] [PYSPARK] [PROJECT INFRA] Add
 pylint checks to PySpark"

This reverts commit 9b62e9375f032548d386aec7468e3d0f7c6da7b2.
---
 dev/lint-python                   |  57 +----
 pylintrc                          | 404 ------------------------------
 python/pyspark/ml/param/shared.py |   4 +-
 3 files changed, 10 insertions(+), 455 deletions(-)
 delete mode 100644 pylintrc

diff --git a/dev/lint-python b/dev/lint-python
index e02dff220eb87..0c3586462cb37 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -21,14 +21,12 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
 PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
 PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py"
-PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt"
-PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt"
-PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt"
+PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt"
 
 cd "$SPARK_ROOT_DIR"
 
 # compileall: https://docs.python.org/2/library/compileall.html
-python -B -m compileall -q -l $PATHS_TO_CHECK > "$PEP8_REPORT_PATH"
+python -B -m compileall -q -l $PATHS_TO_CHECK > "$PYTHON_LINT_REPORT_PATH"
 compile_status="${PIPESTATUS[0]}"
 
 # Get pep8 at runtime so that we don't rely on it being installed on the build server.
@@ -49,36 +47,11 @@ if [ ! -e "$PEP8_SCRIPT_PATH" ]; then
     fi
 fi
 
-# Easy install pylint in /dev/pylint. To easy_install into a directory, the PYTHONPATH should
-# be set to the directory.
-# dev/pylint should be appended to the PATH variable as well.
-# Jenkins by default installs the pylint3 version, so for now this just checks the code quality
-# of python3.
-export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint"
-export "PYLINT_HOME=$PYTHONPATH"
-export "PATH=$PYTHONPATH:$PATH"
-
-if [ ! -d "$PYLINT_HOME" ]; then
-    mkdir "$PYLINT_HOME"
-    # Redirect the annoying pylint installation output.
-    easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO"
-    easy_install_status="$?"
-
-    if [ "$easy_install_status" -ne 0 ]; then
-        echo "Unable to install pylint locally in \"$PYTHONPATH\"."
-        cat "$PYLINT_INSTALL_INFO"
-        exit "$easy_install_status"
-    fi
-
-    rm "$PYLINT_INSTALL_INFO"
-
-fi
-
 # There is no need to write this output to a file
 #+ first, but we do so so that the check status can
 #+ be output before the report, like with the
 #+ scalastyle and RAT checks.
-python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PEP8_REPORT_PATH"
+python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PYTHON_LINT_REPORT_PATH"
 pep8_status="${PIPESTATUS[0]}"
 
 if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then
@@ -88,27 +61,13 @@ else
 fi
 
 if [ "$lint_status" -ne 0 ]; then
-    echo "PEP8 checks failed."
-    cat "$PEP8_REPORT_PATH"
-else
-    echo "PEP8 checks passed."
-fi
-
-rm "$PEP8_REPORT_PATH"
-
-for to_be_checked in "$PATHS_TO_CHECK"
-do
-    pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH"
-done
-
-if [ "${PIPESTATUS[0]}" -ne 0 ]; then
-    lint_status=1
-    echo "Pylint checks failed."
-    cat "$PYLINT_REPORT_PATH"
+    echo "Python lint checks failed."
+    cat "$PYTHON_LINT_REPORT_PATH"
 else
-    echo "Pylint checks passed."
+    echo "Python lint checks passed."
 fi
 
-rm "$PYLINT_REPORT_PATH"
+# rm "$PEP8_SCRIPT_PATH"
+rm "$PYTHON_LINT_REPORT_PATH"
 
 exit "$lint_status"
diff --git a/pylintrc b/pylintrc
deleted file mode 100644
index 061775960393b..0000000000000
--- a/pylintrc
+++ /dev/null
@@ -1,404 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-[MASTER]
-
-# Specify a configuration file.
-#rcfile=
-
-# Python code to execute, usually for sys.path manipulation such as
-# pygtk.require().
-#init-hook=
-
-# Profiled execution.
-profile=no
-
-# Add files or directories to the blacklist. They should be base names, not
-# paths.
-ignore=pyspark.heapq3
-
-# Pickle collected data for later comparisons.
-persistent=yes
-
-# List of plugins (as comma separated values of python modules names) to load,
-# usually to register additional checkers.
-load-plugins=
-
-# Use multiple processes to speed up Pylint.
-jobs=1
-
-# Allow loading of arbitrary C extensions. Extensions are imported into the
-# active Python interpreter and may run arbitrary code.
-unsafe-load-any-extension=no
-
-# A comma-separated list of package or module names from where C extensions may
-# be loaded. Extensions are loading into the active Python interpreter and may
-# run arbitrary code
-extension-pkg-whitelist=
-
-# Allow optimization of some AST trees. This will activate a peephole AST
-# optimizer, which will apply various small optimizations. For instance, it can
-# be used to obtain the result of joining multiple strings with the addition
-# operator. Joining a lot of strings can lead to a maximum recursion error in
-# Pylint and this flag can prevent that. It has one side effect, the resulting
-# AST will be different than the one from reality.
-optimize-ast=no
-
-
-[MESSAGES CONTROL]
-
-# Only show warnings with the listed confidence levels. Leave empty to show
-# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
-confidence=
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifier separated by comma (,) or put this option
-# multiple time. See also the "--disable" option for examples.
-enable=
-
-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once).You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use"--disable=all --enable=classes
-# --disable=W"
-
-# These errors are arranged in order of number of warning given in pylint.
-# If you would like to improve the code quality of pyspark, remove any of these disabled errors
-# run ./dev/lint-python and see if the errors raised by pylint can be fixed.
-
-disable=invalid-name,missing-docstring,protected-access,unused-argument,no-member,unused-wildcard-import,redefined-builtin,too-many-arguments,unused-variable,too-few-public-methods,bad-continuation,duplicate-code,redefined-outer-name,too-many-ancestors,import-error,superfluous-parens,unused-import,line-too-long,no-name-in-module,unnecessary-lambda,import-self,no-self-use,unidiomatic-typecheck,fixme,too-many-locals,cyclic-import,too-many-branches,bare-except,wildcard-import,dangerous-default-value,broad-except,too-many-public-methods,deprecated-lambda,anomalous-backslash-in-string,too-many-lines,reimported,too-many-statements,bad-whitespace,unpacking-non-sequence,too-many-instance-attributes,abstract-method,old-style-class,global-statement,attribute-defined-outside-init,arguments-differ,undefined-all-variable,no-init,useless-else-on-loop,super-init-not-called,notimplemented-raised,too-many-return-statements,pointless-string-statement,global-variable-undefined,bad-classmethod-argument,too-many-format-args,parse-error,no-self-argument,pointless-statement,undefined-variable
-
-
-[REPORTS]
-
-# Set the output format. Available formats are text, parseable, colorized, msvs
-# (visual studio) and html. You can also give a reporter class, eg
-# mypackage.mymodule.MyReporterClass.
-output-format=text
-
-# Put messages in a separate file for each module / package specified on the
-# command line instead of printing them on stdout. Reports (if any) will be
-# written in a file name "pylint_global.[txt|html]".
-files-output=no
-
-# Tells whether to display a full report or only the messages
-reports=no
-
-# Python expression which should return a note less than 10 (10 is the highest
-# note). You have access to the variables errors warning, statement which
-# respectively contain the number of errors / warnings messages and the total
-# number of statements analyzed. This is used by the global evaluation report
-# (RP0004).
-evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
-
-# Add a comment according to your evaluation note. This is used by the global
-# evaluation report (RP0004).
-comment=no
-
-# Template used to display messages. This is a python new-style format string
-# used to format the message information. See doc for all details
-#msg-template=
-
-
-[MISCELLANEOUS]
-
-# List of note tags to take in consideration, separated by a comma.
-notes=FIXME,XXX,TODO
-
-
-[BASIC]
-
-# Required attributes for module, separated by a comma
-required-attributes=
-
-# List of builtins function names that should not be used, separated by a comma
-bad-functions=
-
-# Good variable names which should always be accepted, separated by a comma
-good-names=i,j,k,ex,Run,_
-
-# Bad variable names which should always be refused, separated by a comma
-bad-names=baz,toto,tutu,tata
-
-# Colon-delimited sets of names that determine each other's naming style when
-# the name regexes allow several styles.
-name-group=
-
-# Include a hint for the correct naming format with invalid-name
-include-naming-hint=no
-
-# Regular expression matching correct function names
-function-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Naming hint for function names
-function-name-hint=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression matching correct variable names
-variable-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Naming hint for variable names
-variable-name-hint=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression matching correct constant names
-const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
-
-# Naming hint for constant names
-const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
-
-# Regular expression matching correct attribute names
-attr-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Naming hint for attribute names
-attr-name-hint=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression matching correct argument names
-argument-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Naming hint for argument names
-argument-name-hint=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression matching correct class attribute names
-class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
-
-# Naming hint for class attribute names
-class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
-
-# Regular expression matching correct inline iteration names
-inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
-
-# Naming hint for inline iteration names
-inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
-
-# Regular expression matching correct class names
-class-rgx=[A-Z_][a-zA-Z0-9]+$
-
-# Naming hint for class names
-class-name-hint=[A-Z_][a-zA-Z0-9]+$
-
-# Regular expression matching correct module names
-module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
-
-# Naming hint for module names
-module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
-
-# Regular expression matching correct method names
-method-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Naming hint for method names
-method-name-hint=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression which should only match function or class names that do
-# not require a docstring.
-no-docstring-rgx=__.*__
-
-# Minimum line length for functions/classes that require docstrings, shorter
-# ones are exempt.
-docstring-min-length=-1
-
-
-[FORMAT]
-
-# Maximum number of characters on a single line.
-max-line-length=100
-
-# Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=^\s*(# )?<?https?://\S+>?$
-
-# Allow the body of an if to be on the same line as the test if there is no
-# else.
-single-line-if-stmt=no
-
-# List of optional constructs for which whitespace checking is disabled
-no-space-check=trailing-comma,dict-separator
-
-# Maximum number of lines in a module
-max-module-lines=1000
-
-# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
-# tab).
-indent-string='    '
-
-# Number of spaces of indent required inside a hanging or continued line.
-indent-after-paren=4
-
-# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
-expected-line-ending-format=
-
-
-[SIMILARITIES]
-
-# Minimum lines number of a similarity.
-min-similarity-lines=4
-
-# Ignore comments when computing similarities.
-ignore-comments=yes
-
-# Ignore docstrings when computing similarities.
-ignore-docstrings=yes
-
-# Ignore imports when computing similarities.
-ignore-imports=no
-
-
-[VARIABLES]
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# A regular expression matching the name of dummy variables (i.e. expectedly
-# not used).
-dummy-variables-rgx=_$|dummy
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid to define new builtins when possible.
-additional-builtins=
-
-# List of strings which can identify a callback function by name. A callback
-# name must start or end with one of those strings.
-callbacks=cb_,_cb
-
-
-[SPELLING]
-
-# Spelling dictionary name. Available dictionaries: none. To make it working
-# install python-enchant package.
-spelling-dict=
-
-# List of comma separated words that should not be checked.
-spelling-ignore-words=
-
-# A path to a file that contains private dictionary; one word per line.
-spelling-private-dict-file=
-
-# Tells whether to store unknown words to indicated private dictionary in
-# --spelling-private-dict-file option instead of raising a message.
-spelling-store-unknown-words=no
-
-
-[LOGGING]
-
-# Logging modules to check that the string format arguments are in logging
-# function parameter format
-logging-modules=logging
-
-
-[TYPECHECK]
-
-# Tells whether missing members accessed in mixin class should be ignored. A
-# mixin class is detected if its name ends with "mixin" (case insensitive).
-ignore-mixin-members=yes
-
-# List of module names for which member attributes should not be checked
-# (useful for modules/projects where namespaces are manipulated during runtime
-# and thus existing member attributes cannot be deduced by static analysis
-ignored-modules=
-
-# List of classes names for which member attributes should not be checked
-# (useful for classes with attributes dynamically set).
-ignored-classes=SQLObject
-
-# When zope mode is activated, add a predefined set of Zope acquired attributes
-# to generated-members.
-zope=no
-
-# List of members which are set dynamically and missed by pylint inference
-# system, and so shouldn't trigger E0201 when accessed. Python regular
-# expressions are accepted.
-generated-members=REQUEST,acl_users,aq_parent
-
-
-[CLASSES]
-
-# List of interface methods to ignore, separated by a comma. This is used for
-# instance to not check methods defines in Zope's Interface base class.
-ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,__new__,setUp
-
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls
-
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=mcs
-
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,_fields,_replace,_source,_make
-
-
-[IMPORTS]
-
-# Deprecated modules which should not be used, separated by a comma
-deprecated-modules=regsub,TERMIOS,Bastion,rexec
-
-# Create a graph of every (i.e. internal and external) dependencies in the
-# given file (report RP0402 must not be disabled)
-import-graph=
-
-# Create a graph of external dependencies in the given file (report RP0402 must
-# not be disabled)
-ext-import-graph=
-
-# Create a graph of internal dependencies in the given file (report RP0402 must
-# not be disabled)
-int-import-graph=
-
-
-[DESIGN]
-
-# Maximum number of arguments for function / method
-max-args=5
-
-# Argument names that match this expression will be ignored. Default to name
-# with leading underscore
-ignored-argument-names=_.*
-
-# Maximum number of locals for function / method body
-max-locals=15
-
-# Maximum number of return / yield for function / method body
-max-returns=6
-
-# Maximum number of branch for function / method body
-max-branches=12
-
-# Maximum number of statements in function / method body
-max-statements=50
-
-# Maximum number of parents for a class (see R0901).
-max-parents=7
-
-# Maximum number of attributes for a class (see R0902).
-max-attributes=7
-
-# Minimum number of public methods for a class (see R0903).
-min-public-methods=2
-
-# Maximum number of public methods for a class (see R0904).
-max-public-methods=20
-
-
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=Exception
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 595124726366d..bc088e4c29e26 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -444,7 +444,7 @@ class DecisionTreeParams(Params):
     minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
     maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
     cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-
+    
 
     def __init__(self):
         super(DecisionTreeParams, self).__init__()
@@ -460,7 +460,7 @@ def __init__(self):
         self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
         #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
         self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-
+        
     def setMaxDepth(self, value):
         """
         Sets the value of :py:attr:`maxDepth`.

From 5c41691fe4da0017ccc7e372e3f655dc1c815349 Mon Sep 17 00:00:00 2001
From: yongtang <yongtang@users.noreply.github.com>
Date: Mon, 13 Jul 2015 12:01:23 -0700
Subject: [PATCH 133/549] [SPARK-8954] [BUILD] Remove unneeded deb repository
 from Dockerfile to fix build error in docker.

[SPARK-8954] [Build]
1. Remove unneeded deb repository from Dockerfile to fix build error in docker.
2. Remove unneeded /var/lib/apt/lists/* after install to reduce the docker image size (by ~30MB).

Author: yongtang <yongtang@users.noreply.github.com>

Closes #7346 from yongtang/SPARK-8954 and squashes the following commits:

36024a1 [yongtang] [SPARK-8954] [Build] Remove unneeded /var/lib/apt/lists/* after install to reduce the docker image size (by ~30MB)
7084941 [yongtang] [SPARK-8954] [Build] Remove unneeded deb repository from Dockerfile to fix build error in docker.
---
 docker/spark-test/base/Dockerfile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docker/spark-test/base/Dockerfile b/docker/spark-test/base/Dockerfile
index 5956d59130fbf..5dbdb8b22a44f 100644
--- a/docker/spark-test/base/Dockerfile
+++ b/docker/spark-test/base/Dockerfile
@@ -17,13 +17,13 @@
 
 FROM ubuntu:precise
 
-RUN echo "deb http://archive.ubuntu.com/ubuntu precise main universe" > /etc/apt/sources.list
-
 # Upgrade package index
-RUN apt-get update
-
 # install a few other useful packages plus Open Jdk 7
-RUN apt-get install -y less openjdk-7-jre-headless net-tools vim-tiny sudo openssh-server
+# Remove unneeded /var/lib/apt/lists/* after install to reduce the
+# docker image size (by ~30MB)
+RUN apt-get update && \
+    apt-get install -y less openjdk-7-jre-headless net-tools vim-tiny sudo openssh-server && \
+    rm -rf /var/lib/apt/lists/*
 
 ENV SCALA_VERSION 2.10.4
 ENV CDH_VERSION cdh4

From 714fc55f4aadd5e7b7fb1e462910bfb6a82d9154 Mon Sep 17 00:00:00 2001
From: Vinod K C <vinod.kc@huawei.com>
Date: Mon, 13 Jul 2015 12:03:39 -0700
Subject: [PATCH 134/549] [SPARK-8991] [ML] Update SharedParamsCodeGen's
 Generated Documentation

Removed private[ml] from Generated documentation

Author: Vinod K C <vinod.kc@huawei.com>

Closes #7367 from vinodkc/fix_sharedparmascodegen and squashes the following commits:

4fa3c8f [Vinod K C] Adding auto generated code
7e19025 [Vinod K C] Removed private[ml]
---
 .../ml/param/shared/SharedParamsCodeGen.scala |  3 +-
 .../spark/ml/param/shared/sharedParams.scala  | 37 +++++++++----------
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
index 66b751a1b02ee..f7ae1de522e01 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -134,7 +134,7 @@ private[shared] object SharedParamsCodeGen {
 
     s"""
       |/**
-      | * (private[ml]) Trait for shared param $name$defaultValueDoc.
+      | * Trait for shared param $name$defaultValueDoc.
       | */
       |private[ml] trait Has$Name extends Params {
       |
@@ -173,7 +173,6 @@ private[shared] object SharedParamsCodeGen {
         |package org.apache.spark.ml.param.shared
         |
         |import org.apache.spark.ml.param._
-        |import org.apache.spark.util.Utils
         |
         |// DO NOT MODIFY THIS FILE! It was generated by SharedParamsCodeGen.
         |
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index f81bd76c22376..65e48e4ee5083 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -18,14 +18,13 @@
 package org.apache.spark.ml.param.shared
 
 import org.apache.spark.ml.param._
-import org.apache.spark.util.Utils
 
 // DO NOT MODIFY THIS FILE! It was generated by SharedParamsCodeGen.
 
 // scalastyle:off
 
 /**
- * (private[ml]) Trait for shared param regParam.
+ * Trait for shared param regParam.
  */
 private[ml] trait HasRegParam extends Params {
 
@@ -40,7 +39,7 @@ private[ml] trait HasRegParam extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param maxIter.
+ * Trait for shared param maxIter.
  */
 private[ml] trait HasMaxIter extends Params {
 
@@ -55,7 +54,7 @@ private[ml] trait HasMaxIter extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param featuresCol (default: "features").
+ * Trait for shared param featuresCol (default: "features").
  */
 private[ml] trait HasFeaturesCol extends Params {
 
@@ -72,7 +71,7 @@ private[ml] trait HasFeaturesCol extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param labelCol (default: "label").
+ * Trait for shared param labelCol (default: "label").
  */
 private[ml] trait HasLabelCol extends Params {
 
@@ -89,7 +88,7 @@ private[ml] trait HasLabelCol extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param predictionCol (default: "prediction").
+ * Trait for shared param predictionCol (default: "prediction").
  */
 private[ml] trait HasPredictionCol extends Params {
 
@@ -106,7 +105,7 @@ private[ml] trait HasPredictionCol extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param rawPredictionCol (default: "rawPrediction").
+ * Trait for shared param rawPredictionCol (default: "rawPrediction").
  */
 private[ml] trait HasRawPredictionCol extends Params {
 
@@ -123,7 +122,7 @@ private[ml] trait HasRawPredictionCol extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param probabilityCol (default: "probability").
+ * Trait for shared param probabilityCol (default: "probability").
  */
 private[ml] trait HasProbabilityCol extends Params {
 
@@ -140,7 +139,7 @@ private[ml] trait HasProbabilityCol extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param threshold.
+ * Trait for shared param threshold.
  */
 private[ml] trait HasThreshold extends Params {
 
@@ -155,7 +154,7 @@ private[ml] trait HasThreshold extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param inputCol.
+ * Trait for shared param inputCol.
  */
 private[ml] trait HasInputCol extends Params {
 
@@ -170,7 +169,7 @@ private[ml] trait HasInputCol extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param inputCols.
+ * Trait for shared param inputCols.
  */
 private[ml] trait HasInputCols extends Params {
 
@@ -185,7 +184,7 @@ private[ml] trait HasInputCols extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param outputCol (default: uid + "__output").
+ * Trait for shared param outputCol (default: uid + "__output").
  */
 private[ml] trait HasOutputCol extends Params {
 
@@ -202,7 +201,7 @@ private[ml] trait HasOutputCol extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param checkpointInterval.
+ * Trait for shared param checkpointInterval.
  */
 private[ml] trait HasCheckpointInterval extends Params {
 
@@ -217,7 +216,7 @@ private[ml] trait HasCheckpointInterval extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param fitIntercept (default: true).
+ * Trait for shared param fitIntercept (default: true).
  */
 private[ml] trait HasFitIntercept extends Params {
 
@@ -234,7 +233,7 @@ private[ml] trait HasFitIntercept extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param standardization (default: true).
+ * Trait for shared param standardization (default: true).
  */
 private[ml] trait HasStandardization extends Params {
 
@@ -251,7 +250,7 @@ private[ml] trait HasStandardization extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param seed (default: this.getClass.getName.hashCode.toLong).
+ * Trait for shared param seed (default: this.getClass.getName.hashCode.toLong).
  */
 private[ml] trait HasSeed extends Params {
 
@@ -268,7 +267,7 @@ private[ml] trait HasSeed extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param elasticNetParam.
+ * Trait for shared param elasticNetParam.
  */
 private[ml] trait HasElasticNetParam extends Params {
 
@@ -283,7 +282,7 @@ private[ml] trait HasElasticNetParam extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param tol.
+ * Trait for shared param tol.
  */
 private[ml] trait HasTol extends Params {
 
@@ -298,7 +297,7 @@ private[ml] trait HasTol extends Params {
 }
 
 /**
- * (private[ml]) Trait for shared param stepSize.
+ * Trait for shared param stepSize.
  */
 private[ml] trait HasStepSize extends Params {
 

From 4c797f2b0989317a2d004e5f72a0e593919737ea Mon Sep 17 00:00:00 2001
From: Vinod K C <vinod.kc@huawei.com>
Date: Mon, 13 Jul 2015 12:51:33 -0700
Subject: [PATCH 135/549] [SPARK-8636] [SQL] Fix equalNullSafe comparison

Author: Vinod K C <vinod.kc@huawei.com>

Closes #7040 from vinodkc/fix_CaseKeyWhen_equalNullSafe and squashes the following commits:

be5e641 [Vinod K C] Renamed equalNullSafe to threeValueEquals
aac9f67 [Vinod K C] Updated test suite and genCode method
f2d0b53 [Vinod K C]  Fix equalNullSafe comparison
---
 .../spark/sql/catalyst/expressions/conditionals.scala | 11 ++++-------
 .../expressions/ConditionalExpressionSuite.scala      |  4 ++--
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index e6a705fb8055b..84c28c27f934b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -238,7 +238,7 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
     // If all branches fail and an elseVal is not provided, the whole statement
     // defaults to null, according to Hive's semantics.
     while (i < len - 1) {
-      if (equalNullSafe(evaluatedKey, branchesArr(i).eval(input))) {
+      if (threeValueEquals(evaluatedKey, branchesArr(i).eval(input))) {
         return branchesArr(i + 1).eval(input)
       }
       i += 2
@@ -261,8 +261,7 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
       s"""
         if (!$got) {
           ${cond.code}
-          if (${keyEval.isNull} && ${cond.isNull} ||
-            !${keyEval.isNull} && !${cond.isNull}
+          if (!${keyEval.isNull} && !${cond.isNull}
              && ${ctx.genEqual(key.dataType, keyEval.primitive, cond.primitive)}) {
             $got = true;
             ${res.code}
@@ -296,10 +295,8 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
     """
   }
 
-  private def equalNullSafe(l: Any, r: Any) = {
-    if (l == null && r == null) {
-      true
-    } else if (l == null || r == null) {
+  private def threeValueEquals(l: Any, r: Any) = {
+    if (l == null || r == null) {
       false
     } else {
       l == r
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
index aaf40cc83e762..adadc8c54fae3 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
@@ -125,7 +125,7 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     val literalString = Literal("a")
 
     checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, c5)), "b", row)
-    checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, literalNull, c5, c6)), "b", row)
+    checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, literalNull, c5, c6)), "c", row)
     checkEvaluation(CaseKeyWhen(c2, Seq(literalInt, c4, c5)), "a", row)
     checkEvaluation(CaseKeyWhen(c2, Seq(c1, c4, c5)), "b", row)
     checkEvaluation(CaseKeyWhen(c4, Seq(literalString, c2, c3)), 1, row)
@@ -134,7 +134,7 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     checkEvaluation(CaseKeyWhen(literalInt, Seq(c2, c4, c5)), "a", row)
     checkEvaluation(CaseKeyWhen(literalString, Seq(c5, c2, c4, c3)), 2, row)
     checkEvaluation(CaseKeyWhen(c6, Seq(c5, c2, c4, c3)), null, row)
-    checkEvaluation(CaseKeyWhen(literalNull, Seq(c2, c5, c1, c6)), "c", row)
+    checkEvaluation(CaseKeyWhen(literalNull, Seq(c2, c5, c1, c6)), null, row)
   }
 
   test("function least") {

From 0aed38e4498b24d372bfdc7001959e78536369a1 Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Mon, 13 Jul 2015 14:15:31 -0700
Subject: [PATCH 136/549] [SPARK-8533] [STREAMING] Upgrade Flume to 1.6.0

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #6939 from harishreedharan/upgrade-flume-1.6.0 and squashes the following commits:

94b80ae [Hari Shreedharan] [SPARK-8533][Streaming] Upgrade Flume to 1.6.0
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index c2ebc1a11e770..370c95dd03632 100644
--- a/pom.xml
+++ b/pom.xml
@@ -128,7 +128,7 @@
     <yarn.version>${hadoop.version}</yarn.version>
     <hbase.version>0.98.7-hadoop2</hbase.version>
     <hbase.artifact>hbase</hbase.artifact>
-    <flume.version>1.4.0</flume.version>
+    <flume.version>1.6.0</flume.version>
     <zookeeper.version>3.4.5</zookeeper.version>
     <curator.version>2.4.0</curator.version>
     <hive.group>org.spark-project.hive</hive.group>

From b7bcbe25f90ba4e78b548465bc80d4de1d2c4a4a Mon Sep 17 00:00:00 2001
From: Neelesh Srinivas Salian <nsalian@cloudera.com>
Date: Mon, 13 Jul 2015 15:46:51 -0700
Subject: [PATCH 137/549] [SPARK-8743] [STREAMING] Deregister Codahale metrics
 for streaming when StreamingContext is closed

The issue link: https://issues.apache.org/jira/browse/SPARK-8743
Deregister Codahale metrics for streaming when StreamingContext is closed

Design:
Adding the method calls in the appropriate start() and stop () methods for the StreamingContext

Actions in the PullRequest:
1) Added the registerSource method call to the start method for the Streaming Context.
2) Added the removeSource method to the stop method.
3) Added comments for both 1 and 2 and comment to show initialization of the StreamingSource
4) Added a test case to check for both registration and de-registration of metrics

Previous closed PR for reference: https://github.com/apache/spark/pull/7250

Author: Neelesh Srinivas Salian <nsalian@cloudera.com>

Closes #7362 from nssalian/branch-SPARK-8743 and squashes the following commits:

7d998a3 [Neelesh Srinivas Salian] Removed the Thread.sleep() call
8b26397 [Neelesh Srinivas Salian] Moved the scalatest.{} import
0e8007a [Neelesh Srinivas Salian] moved import org.apache.spark{} to correct place
daedaa5 [Neelesh Srinivas Salian] Corrected Ordering of imports
8873180 [Neelesh Srinivas Salian] Removed redundancy in imports
59227a4 [Neelesh Srinivas Salian] Changed the ordering of the imports to classify  scala and spark imports
d8cb577 [Neelesh Srinivas Salian] Added registerSource to start() and removeSource to stop(). Wrote a test to check the registration and de-registration
---
 .../spark/streaming/StreamingContext.scala    | 10 +++--
 .../streaming/StreamingContextSuite.scala     | 41 ++++++++++++++++++-
 2 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index ec49d0f42d122..6b78a82e68c24 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -192,11 +192,8 @@ class StreamingContext private[streaming] (
       None
     }
 
-  /** Register streaming source to metrics system */
+  /* Initializing a streamingSource to register metrics */
   private val streamingSource = new StreamingSource(this)
-  assert(env != null)
-  assert(env.metricsSystem != null)
-  env.metricsSystem.registerSource(streamingSource)
 
   private var state: StreamingContextState = INITIALIZED
 
@@ -606,6 +603,9 @@ class StreamingContext private[streaming] (
         }
         shutdownHookRef = Utils.addShutdownHook(
           StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
+        // Registering Streaming Metrics at the start of the StreamingContext
+        assert(env.metricsSystem != null)
+        env.metricsSystem.registerSource(streamingSource)
         uiTab.foreach(_.attach())
         logInfo("StreamingContext started")
       case ACTIVE =>
@@ -682,6 +682,8 @@ class StreamingContext private[streaming] (
           logWarning("StreamingContext has already been stopped")
         case ACTIVE =>
           scheduler.stop(stopGracefully)
+          // Removing the streamingSource to de-register the metrics on stop()
+          env.metricsSystem.removeSource(streamingSource)
           uiTab.foreach(_.detach())
           StreamingContext.setActiveContext(null)
           waiter.notifyStop()
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index 56b4ce5638a51..289a159d8990a 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -20,20 +20,23 @@ package org.apache.spark.streaming
 import java.io.{File, NotSerializableException}
 import java.util.concurrent.atomic.AtomicInteger
 
+import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable.Queue
 
 import org.apache.commons.io.FileUtils
+import org.scalatest.{Assertions, BeforeAndAfter, PrivateMethodTester}
 import org.scalatest.concurrent.Eventually._
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.exceptions.TestFailedDueToTimeoutException
 import org.scalatest.time.SpanSugar._
-import org.scalatest.{Assertions, BeforeAndAfter}
 
+import org.apache.spark.{Logging, SparkConf, SparkContext, SparkFunSuite}
+import org.apache.spark.metrics.MetricsSystem
+import org.apache.spark.metrics.source.Source
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.streaming.receiver.Receiver
 import org.apache.spark.util.Utils
-import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException, SparkFunSuite}
 
 
 class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeouts with Logging {
@@ -299,6 +302,25 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo
     Thread.sleep(100)
   }
 
+  test ("registering and de-registering of streamingSource") {
+    val conf = new SparkConf().setMaster(master).setAppName(appName)
+    ssc = new StreamingContext(conf, batchDuration)
+    assert(ssc.getState() === StreamingContextState.INITIALIZED)
+    addInputStream(ssc).register()
+    ssc.start()
+
+    val sources = StreamingContextSuite.getSources(ssc.env.metricsSystem)
+    val streamingSource = StreamingContextSuite.getStreamingSource(ssc)
+    assert(sources.contains(streamingSource))
+    assert(ssc.getState() === StreamingContextState.ACTIVE)
+
+    ssc.stop()
+    val sourcesAfterStop = StreamingContextSuite.getSources(ssc.env.metricsSystem)
+    val streamingSourceAfterStop = StreamingContextSuite.getStreamingSource(ssc)
+    assert(ssc.getState() === StreamingContextState.STOPPED)
+    assert(!sourcesAfterStop.contains(streamingSourceAfterStop))
+  }
+
   test("awaitTermination") {
     ssc = new StreamingContext(master, appName, batchDuration)
     val inputStream = addInputStream(ssc)
@@ -811,3 +833,18 @@ package object testPackage extends Assertions {
     }
   }
 }
+
+/**
+ * Helper methods for testing StreamingContextSuite
+ * This includes methods to access private methods and fields in StreamingContext and MetricsSystem
+ */
+private object StreamingContextSuite extends PrivateMethodTester {
+  private val _sources = PrivateMethod[ArrayBuffer[Source]]('sources)
+  private def getSources(metricsSystem: MetricsSystem): ArrayBuffer[Source] = {
+    metricsSystem.invokePrivate(_sources())
+  }
+  private val _streamingSource = PrivateMethod[StreamingSource]('streamingSource)
+  private def getStreamingSource(streamingContext: StreamingContext): StreamingSource = {
+    streamingContext.invokePrivate(_streamingSource())
+  }
+}

From 408b384de96b9dbe94945753f7947fbe84272ae1 Mon Sep 17 00:00:00 2001
From: Cheolsoo Park <cheolsoop@netflix.com>
Date: Mon, 13 Jul 2015 19:45:10 -0700
Subject: [PATCH 138/549] [SPARK-6910] [SQL] Support for pushing predicates
 down to metastore for partition pruning

This PR supersedes my old one #6921. Since my patch has changed quite a bit, I am opening a new PR to make it easier to review.

The changes include-
* Implement `toMetastoreFilter()` function in `HiveShim` that takes `Seq[Expression]` and converts them into a filter string for Hive metastore.
 * This functions matches all the `AttributeReference` + `BinaryComparisonOp` + `Integral/StringType` patterns in `Seq[Expression]` and fold them into a string.
* Change `hiveQlPartitions` field in `MetastoreRelation` to `getHiveQlPartitions()` function that takes a filter string parameter.
* Call `getHiveQlPartitions()` in `HiveTableScan` with a filter string.

But there are some cases in which predicate pushdown is disabled-

Case | Predicate pushdown
------- | -----------------------------
Hive integral and string types | Yes
Hive varchar type | No
Hive 0.13 and newer | Yes
Hive 0.12 and older | No
convertMetastoreParquet=false | Yes
convertMetastoreParquet=true | No

In case of `convertMetastoreParquet=true`, predicates are not pushed down because this conversion happens in an `Analyzer` rule (`HiveMetastoreCatalog.ParquetConversions`). At this point, `HiveTableScan` hasn't run, so predicates are not available. But reading the source code, I think it is intentional to convert the entire Hive table w/ all the partitions into `ParquetRelation` because then `ParquetRelation` can be cached and reused for any query against that table. Please correct me if I am wrong.

cc marmbrus

Author: Cheolsoo Park <cheolsoop@netflix.com>

Closes #7216 from piaozhexiu/SPARK-6910-2 and squashes the following commits:

aa1490f [Cheolsoo Park] Fix ordering of imports
c212c4d [Cheolsoo Park] Incorporate review comments
5e93f9d [Cheolsoo Park] Predicate pushdown into Hive metastore
---
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 58 ++++++++--------
 .../org/apache/spark/sql/hive/HiveShim.scala  |  1 +
 .../spark/sql/hive/HiveStrategies.scala       |  4 +-
 .../sql/hive/client/ClientInterface.scala     | 11 ++-
 .../spark/sql/hive/client/ClientWrapper.scala | 22 +++---
 .../spark/sql/hive/client/HiveShim.scala      | 68 ++++++++++++++++++-
 .../sql/hive/execution/HiveTableScan.scala    |  7 +-
 .../spark/sql/hive/client/VersionsSuite.scala |  8 +++
 .../sql/hive/execution/PruningSuite.scala     |  2 +-
 9 files changed, 137 insertions(+), 44 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 4b7a782c805a0..5bdf68c83fca7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -301,7 +301,9 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
       val partitionColumnDataTypes = partitionSchema.map(_.dataType)
-      val partitions = metastoreRelation.hiveQlPartitions.map { p =>
+      // We're converting the entire table into ParquetRelation, so predicates to Hive metastore
+      // are empty.
+      val partitions = metastoreRelation.getHiveQlPartitions().map { p =>
         val location = p.getLocation
         val values = InternalRow.fromSeq(p.getValues.zip(partitionColumnDataTypes).map {
           case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null)
@@ -644,32 +646,6 @@ private[hive] case class MetastoreRelation
     new Table(tTable)
   }
 
-  @transient val hiveQlPartitions: Seq[Partition] = table.getAllPartitions.map { p =>
-    val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
-    tPartition.setDbName(databaseName)
-    tPartition.setTableName(tableName)
-    tPartition.setValues(p.values)
-
-    val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor()
-    tPartition.setSd(sd)
-    sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)))
-
-    sd.setLocation(p.storage.location)
-    sd.setInputFormat(p.storage.inputFormat)
-    sd.setOutputFormat(p.storage.outputFormat)
-
-    val serdeInfo = new org.apache.hadoop.hive.metastore.api.SerDeInfo
-    sd.setSerdeInfo(serdeInfo)
-    serdeInfo.setSerializationLib(p.storage.serde)
-
-    val serdeParameters = new java.util.HashMap[String, String]()
-    serdeInfo.setParameters(serdeParameters)
-    table.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
-    p.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
-
-    new Partition(hiveQlTable, tPartition)
-  }
-
   @transient override lazy val statistics: Statistics = Statistics(
     sizeInBytes = {
       val totalSize = hiveQlTable.getParameters.get(StatsSetupConst.TOTAL_SIZE)
@@ -690,6 +666,34 @@ private[hive] case class MetastoreRelation
     }
   )
 
+  def getHiveQlPartitions(predicates: Seq[Expression] = Nil): Seq[Partition] = {
+    table.getPartitions(predicates).map { p =>
+      val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
+      tPartition.setDbName(databaseName)
+      tPartition.setTableName(tableName)
+      tPartition.setValues(p.values)
+
+      val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor()
+      tPartition.setSd(sd)
+      sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)))
+
+      sd.setLocation(p.storage.location)
+      sd.setInputFormat(p.storage.inputFormat)
+      sd.setOutputFormat(p.storage.outputFormat)
+
+      val serdeInfo = new org.apache.hadoop.hive.metastore.api.SerDeInfo
+      sd.setSerdeInfo(serdeInfo)
+      serdeInfo.setSerializationLib(p.storage.serde)
+
+      val serdeParameters = new java.util.HashMap[String, String]()
+      serdeInfo.setParameters(serdeParameters)
+      table.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
+      p.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
+
+      new Partition(hiveQlTable, tPartition)
+    }
+  }
+
   /** Only compare database and tablename, not alias. */
   override def sameResult(plan: LogicalPlan): Boolean = {
     plan match {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
index d08c594151654..a357bb39ca7fd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
@@ -27,6 +27,7 @@ import scala.reflect.ClassTag
 
 import com.esotericsoftware.kryo.Kryo
 import com.esotericsoftware.kryo.io.{Input, Output}
+
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.hive.ql.exec.{UDF, Utilities}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index ed359620a5f7f..9638a8201e190 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -125,7 +125,7 @@ private[hive] trait HiveStrategies {
                 InterpretedPredicate.create(castedPredicate)
               }
 
-            val partitions = relation.hiveQlPartitions.filter { part =>
+            val partitions = relation.getHiveQlPartitions(pruningPredicates).filter { part =>
               val partitionValues = part.getValues
               var i = 0
               while (i < partitionValues.size()) {
@@ -213,7 +213,7 @@ private[hive] trait HiveStrategies {
           projectList,
           otherPredicates,
           identity[Seq[Expression]],
-          HiveTableScan(_, relation, pruningPredicates.reduceLeftOption(And))(hiveContext)) :: Nil
+          HiveTableScan(_, relation, pruningPredicates)(hiveContext)) :: Nil
       case _ =>
         Nil
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
index 0a1d761a52f88..1656587d14835 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
@@ -21,6 +21,7 @@ import java.io.PrintStream
 import java.util.{Map => JMap}
 
 import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchTableException}
+import org.apache.spark.sql.catalyst.expressions.Expression
 
 private[hive] case class HiveDatabase(
     name: String,
@@ -71,7 +72,12 @@ private[hive] case class HiveTable(
 
   def isPartitioned: Boolean = partitionColumns.nonEmpty
 
-  def getAllPartitions: Seq[HivePartition] = client.getAllPartitions(this)
+  def getPartitions(predicates: Seq[Expression]): Seq[HivePartition] = {
+    predicates match {
+      case Nil => client.getAllPartitions(this)
+      case _ => client.getPartitionsByFilter(this, predicates)
+    }
+  }
 
   // Hive does not support backticks when passing names to the client.
   def qualifiedName: String = s"$database.$name"
@@ -132,6 +138,9 @@ private[hive] trait ClientInterface {
   /** Returns all partitions for the given table. */
   def getAllPartitions(hTable: HiveTable): Seq[HivePartition]
 
+  /** Returns partitions filtered by predicates for the given table. */
+  def getPartitionsByFilter(hTable: HiveTable, predicates: Seq[Expression]): Seq[HivePartition]
+
   /** Loads a static partition into an existing table. */
   def loadPartition(
       loadPath: String,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index 9d83ca6c113dc..1f280c642979a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -17,27 +17,24 @@
 
 package org.apache.spark.sql.hive.client
 
-import java.io.{BufferedReader, InputStreamReader, File, PrintStream}
-import java.net.URI
-import java.util.{ArrayList => JArrayList, Map => JMap, List => JList, Set => JSet}
+import java.io.{File, PrintStream}
+import java.util.{Map => JMap}
 import javax.annotation.concurrent.GuardedBy
 
+import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.util.CircularBuffer
 
 import scala.collection.JavaConversions._
 import scala.language.reflectiveCalls
 
 import org.apache.hadoop.fs.Path
-import org.apache.hadoop.hive.metastore.api.Database
 import org.apache.hadoop.hive.conf.HiveConf
+import org.apache.hadoop.hive.metastore.api.{Database, FieldSchema}
 import org.apache.hadoop.hive.metastore.{TableType => HTableType}
-import org.apache.hadoop.hive.metastore.api
-import org.apache.hadoop.hive.metastore.api.FieldSchema
-import org.apache.hadoop.hive.ql.metadata
 import org.apache.hadoop.hive.ql.metadata.Hive
-import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.hadoop.hive.ql.processors._
-import org.apache.hadoop.hive.ql.Driver
+import org.apache.hadoop.hive.ql.session.SessionState
+import org.apache.hadoop.hive.ql.{Driver, metadata}
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.execution.QueryExecutionException
@@ -316,6 +313,13 @@ private[hive] class ClientWrapper(
     shim.getAllPartitions(client, qlTable).map(toHivePartition)
   }
 
+  override def getPartitionsByFilter(
+      hTable: HiveTable,
+      predicates: Seq[Expression]): Seq[HivePartition] = withHiveState {
+    val qlTable = toQlTable(hTable)
+    shim.getPartitionsByFilter(client, qlTable, predicates).map(toHivePartition)
+  }
+
   override def listTables(dbName: String): Seq[String] = withHiveState {
     client.getAllTables(dbName)
   }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index 1fa9d278e2a57..5542a521b173d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -31,6 +31,11 @@ import org.apache.hadoop.hive.ql.Driver
 import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table}
 import org.apache.hadoop.hive.ql.processors.{CommandProcessor, CommandProcessorFactory}
 import org.apache.hadoop.hive.ql.session.SessionState
+import org.apache.hadoop.hive.serde.serdeConstants
+
+import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.expressions.{Expression, AttributeReference, BinaryComparison}
+import org.apache.spark.sql.types.{StringType, IntegralType}
 
 /**
  * A shim that defines the interface between ClientWrapper and the underlying Hive library used to
@@ -61,6 +66,8 @@ private[client] sealed abstract class Shim {
 
   def getAllPartitions(hive: Hive, table: Table): Seq[Partition]
 
+  def getPartitionsByFilter(hive: Hive, table: Table, predicates: Seq[Expression]): Seq[Partition]
+
   def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor
 
   def getDriverResults(driver: Driver): Seq[String]
@@ -109,7 +116,7 @@ private[client] sealed abstract class Shim {
 
 }
 
-private[client] class Shim_v0_12 extends Shim {
+private[client] class Shim_v0_12 extends Shim with Logging {
 
   private lazy val startMethod =
     findStaticMethod(
@@ -196,6 +203,17 @@ private[client] class Shim_v0_12 extends Shim {
   override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] =
     getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq
 
+  override def getPartitionsByFilter(
+      hive: Hive,
+      table: Table,
+      predicates: Seq[Expression]): Seq[Partition] = {
+    // getPartitionsByFilter() doesn't support binary comparison ops in Hive 0.12.
+    // See HIVE-4888.
+    logDebug("Hive 0.12 doesn't support predicate pushdown to metastore. " +
+      "Please use Hive 0.13 or higher.")
+    getAllPartitions(hive, table)
+  }
+
   override def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor =
     getCommandProcessorMethod.invoke(null, token, conf).asInstanceOf[CommandProcessor]
 
@@ -267,6 +285,12 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
       classOf[Hive],
       "getAllPartitionsOf",
       classOf[Table])
+  private lazy val getPartitionsByFilterMethod =
+    findMethod(
+      classOf[Hive],
+      "getPartitionsByFilter",
+      classOf[Table],
+      classOf[String])
   private lazy val getCommandProcessorMethod =
     findStaticMethod(
       classOf[CommandProcessorFactory],
@@ -288,6 +312,48 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
   override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] =
     getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq
 
+  override def getPartitionsByFilter(
+      hive: Hive,
+      table: Table,
+      predicates: Seq[Expression]): Seq[Partition] = {
+    // hive varchar is treated as catalyst string, but hive varchar can't be pushed down.
+    val varcharKeys = table.getPartitionKeys
+      .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME))
+      .map(col => col.getName).toSet
+
+    // Hive getPartitionsByFilter() takes a string that represents partition
+    // predicates like "str_key=\"value\" and int_key=1 ..."
+    val filter = predicates.flatMap { expr =>
+      expr match {
+        case op @ BinaryComparison(lhs, rhs) => {
+          lhs match {
+            case AttributeReference(_, _, _, _) => {
+              rhs.dataType match {
+                case _: IntegralType =>
+                  Some(lhs.prettyString + op.symbol + rhs.prettyString)
+                case _: StringType if (!varcharKeys.contains(lhs.prettyString)) =>
+                  Some(lhs.prettyString + op.symbol + "\"" + rhs.prettyString + "\"")
+                case _ => None
+              }
+            }
+            case _ => None
+          }
+        }
+        case _ => None
+      }
+    }.mkString(" and ")
+
+    val partitions =
+      if (filter.isEmpty) {
+        getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]]
+      } else {
+        logDebug(s"Hive metastore filter is '$filter'.")
+        getPartitionsByFilterMethod.invoke(hive, table, filter).asInstanceOf[JArrayList[Partition]]
+      }
+
+    partitions.toSeq
+  }
+
   override def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor =
     getCommandProcessorMethod.invoke(null, Array(token), conf).asInstanceOf[CommandProcessor]
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
index d33da8242cc1d..ba7eb15a1c0c6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
@@ -44,7 +44,7 @@ private[hive]
 case class HiveTableScan(
     requestedAttributes: Seq[Attribute],
     relation: MetastoreRelation,
-    partitionPruningPred: Option[Expression])(
+    partitionPruningPred: Seq[Expression])(
     @transient val context: HiveContext)
   extends LeafNode {
 
@@ -56,7 +56,7 @@ case class HiveTableScan(
 
   // Bind all partition key attribute references in the partition pruning predicate for later
   // evaluation.
-  private[this] val boundPruningPred = partitionPruningPred.map { pred =>
+  private[this] val boundPruningPred = partitionPruningPred.reduceLeftOption(And).map { pred =>
     require(
       pred.dataType == BooleanType,
       s"Data type of predicate $pred must be BooleanType rather than ${pred.dataType}.")
@@ -133,7 +133,8 @@ case class HiveTableScan(
   protected override def doExecute(): RDD[InternalRow] = if (!relation.hiveQlTable.isPartitioned) {
     hadoopReader.makeRDDForTable(relation.hiveQlTable)
   } else {
-    hadoopReader.makeRDDForPartitionedTable(prunePartitions(relation.hiveQlPartitions))
+    hadoopReader.makeRDDForPartitionedTable(
+      prunePartitions(relation.getHiveQlPartitions(partitionPruningPred)))
   }
 
   override def output: Seq[Attribute] = attributes
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index d52e162acbd04..3eb127e23d486 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -20,7 +20,9 @@ package org.apache.spark.sql.hive.client
 import java.io.File
 
 import org.apache.spark.{Logging, SparkFunSuite}
+import org.apache.spark.sql.catalyst.expressions.{NamedExpression, Literal, AttributeReference, EqualTo}
 import org.apache.spark.sql.catalyst.util.quietly
+import org.apache.spark.sql.types.IntegerType
 import org.apache.spark.util.Utils
 
 /**
@@ -151,6 +153,12 @@ class VersionsSuite extends SparkFunSuite with Logging {
       client.getAllPartitions(client.getTable("default", "src_part"))
     }
 
+    test(s"$version: getPartitionsByFilter") {
+      client.getPartitionsByFilter(client.getTable("default", "src_part"), Seq(EqualTo(
+        AttributeReference("key", IntegerType, false)(NamedExpression.newExprId),
+        Literal(1))))
+    }
+
     test(s"$version: loadPartition") {
       client.loadPartition(
         emptyDir,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
index de6a41ce5bfcb..e83a7dc77e329 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
@@ -151,7 +151,7 @@ class PruningSuite extends HiveComparisonTest with BeforeAndAfter {
         case p @ HiveTableScan(columns, relation, _) =>
           val columnNames = columns.map(_.name)
           val partValues = if (relation.table.isPartitioned) {
-            p.prunePartitions(relation.hiveQlPartitions).map(_.getValues)
+            p.prunePartitions(relation.getHiveQlPartitions()).map(_.getValues)
           } else {
             Seq.empty
           }

From 20c1434a8dbb25b98f6b434b158ae88e44ce3057 Mon Sep 17 00:00:00 2001
From: Joseph Gonzalez <joseph.e.gonzalez@gmail.com>
Date: Tue, 14 Jul 2015 00:32:29 -0700
Subject: [PATCH 139/549] [SPARK-9001] Fixing errors in javadocs that lead to
 failed build/sbt doc

These are minor corrections in the documentation of several classes that are preventing:

```bash
build/sbt publish-local
```

I believe this might be an issue associated with running JDK8 as ankurdave does not appear to have this issue in JDK7.

Author: Joseph Gonzalez <joseph.e.gonzalez@gmail.com>

Closes #7354 from jegonzal/FixingJavadocErrors and squashes the following commits:

6664b7e [Joseph Gonzalez] making requested changes
2e16d89 [Joseph Gonzalez] Fixing errors in javadocs that prevents build/sbt publish-local from completing.
---
 .../java/org/apache/spark/launcher/SparkLauncher.java  |  5 +++--
 .../java/org/apache/spark/launcher/package-info.java   | 10 +++++++---
 .../java/org/apache/spark/unsafe/bitset/BitSet.java    |  2 +-
 .../org/apache/spark/unsafe/bitset/BitSetMethods.java  |  2 +-
 .../org/apache/spark/unsafe/map/BytesToBytesMap.java   |  6 +++++-
 .../java/org/apache/spark/unsafe/types/UTF8String.java |  8 ++++----
 6 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java
index d4cfeacb6ef18..c0f89c9230692 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java
@@ -25,11 +25,12 @@
 
 import static org.apache.spark.launcher.CommandBuilderUtils.*;
 
-/**
+/** 
  * Launcher for Spark applications.
- * <p/>
+ * <p>
  * Use this class to start Spark applications programmatically. The class uses a builder pattern
  * to allow clients to configure the Spark application and launch it as a child process.
+ * </p>
  */
 public class SparkLauncher {
 
diff --git a/launcher/src/main/java/org/apache/spark/launcher/package-info.java b/launcher/src/main/java/org/apache/spark/launcher/package-info.java
index 7ed756f4b8591..7c97dba511b28 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/package-info.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/package-info.java
@@ -17,13 +17,17 @@
 
 /**
  * Library for launching Spark applications.
- * <p/>
+ * 
+ * <p>
  * This library allows applications to launch Spark programmatically. There's only one entry
  * point to the library - the {@link org.apache.spark.launcher.SparkLauncher} class.
- * <p/>
+ * </p>
+ *
+ * <p>
  * To launch a Spark application, just instantiate a {@link org.apache.spark.launcher.SparkLauncher}
  * and configure the application to run. For example:
- *
+ * </p>
+ * 
  * <pre>
  * {@code
  *   import org.apache.spark.launcher.SparkLauncher;
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java
index 28e23da108ebe..7c124173b0bbb 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java
@@ -90,7 +90,7 @@ public boolean isSet(int index) {
    * To iterate over the true bits in a BitSet, use the following loop:
    * <pre>
    * <code>
-   *  for (long i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) {
+   *  for (long i = bs.nextSetBit(0); i &gt;= 0; i = bs.nextSetBit(i + 1)) {
    *    // operate on index i here
    *  }
    * </code>
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java
index 0987191c1c636..27462c7fa5e62 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java
@@ -87,7 +87,7 @@ public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidt
    * To iterate over the true bits in a BitSet, use the following loop:
    * <pre>
    * <code>
-   *  for (long i = bs.nextSetBit(0, sizeInWords); i >= 0; i = bs.nextSetBit(i + 1, sizeInWords)) {
+   *  for (long i = bs.nextSetBit(0, sizeInWords); i &gt;= 0; i = bs.nextSetBit(i + 1, sizeInWords)) {
    *    // operate on index i here
    *  }
    * </code>
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
index 8d8c08de52b84..d0bde69cc1068 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
@@ -404,14 +404,17 @@ public int getValueLength() {
      * at the value address.
      * <p>
      * It is only valid to call this method immediately after calling `lookup()` using the same key.
+     * </p>
      * <p>
      * The key and value must be word-aligned (that is, their sizes must multiples of 8).
+     * </p>
      * <p>
      * After calling this method, calls to `get[Key|Value]Address()` and `get[Key|Value]Length`
      * will return information on the data stored by this `putNewKey` call.
+     * </p>
      * <p>
      * As an example usage, here's the proper way to store a new key:
-     * <p>
+     * </p>
      * <pre>
      *   Location loc = map.lookup(keyBaseObject, keyBaseOffset, keyLengthInBytes);
      *   if (!loc.isDefined()) {
@@ -420,6 +423,7 @@ public int getValueLength() {
      * </pre>
      * <p>
      * Unspecified behavior if the key is not defined.
+     * </p>
      */
     public void putNewKey(
         Object keyBaseObject,
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 60d050b0a0c97..e7f9fbb2bc682 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -330,8 +330,8 @@ public int indexOf(UTF8String v, int start) {
   /**
    * Returns str, right-padded with pad to a length of len
    * For example:
-   *   ('hi', 5, '??') => 'hi???'
-   *   ('hi', 1, '??') => 'h'
+   *   ('hi', 5, '??') =&gt; 'hi???'
+   *   ('hi', 1, '??') =&gt; 'h'
    */
   public UTF8String rpad(int len, UTF8String pad) {
     int spaces = len - this.numChars(); // number of char need to pad
@@ -363,8 +363,8 @@ public UTF8String rpad(int len, UTF8String pad) {
   /**
    * Returns str, left-padded with pad to a length of len.
    * For example:
-   *   ('hi', 5, '??') => '???hi'
-   *   ('hi', 1, '??') => 'h'
+   *   ('hi', 5, '??') =&gt; '???hi'
+   *   ('hi', 1, '??') =&gt; 'h'
    */
   public UTF8String lpad(int len, UTF8String pad) {
     int spaces = len - this.numChars(); // number of char need to pad

From c1feebd8fcba985667db8ccdafd2b5ec76dcfae7 Mon Sep 17 00:00:00 2001
From: zhaishidan <zhaishidan@haizhi.com>
Date: Tue, 14 Jul 2015 08:54:30 +0100
Subject: [PATCH 140/549] [SPARK-9010] [DOCUMENTATION] Improve the Spark
 Configuration document about `spark.kryoserializer.buffer`

The meaning of spark.kryoserializer.buffer should be "Initial size of Kryo's serialization buffer. Note that there will be one buffer per core on each worker. This buffer will grow up to spark.kryoserializer.buffer.max if needed.".

The spark.kryoserializer.buffer.max.mb is out-of-date in spark 1.4.

Author: zhaishidan <zhaishidan@haizhi.com>

Closes #7393 from stanzhai/master and squashes the following commits:

69729ef [zhaishidan] fix document error about spark.kryoserializer.buffer.max.mb
---
 docs/configuration.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 443322e1eadf4..8a186ee51c1ca 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -665,7 +665,7 @@ Apart from these, the following properties are also available, and may be useful
   <td>
     Initial size of Kryo's serialization buffer. Note that there will be one buffer
      <i>per core</i> on each worker. This buffer will grow up to
-     <code>spark.kryoserializer.buffer.max.mb</code> if needed.
+     <code>spark.kryoserializer.buffer.max</code> if needed.
   </td>
 </tr>
 <tr>

From 257236c3e17906098f801cbc2059e7a9054e8cab Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Tue, 14 Jul 2015 01:09:33 -0700
Subject: [PATCH 141/549] [SPARK-6851] [SQL] function least/greatest follow up

This is a follow up of remaining comments from #6851

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #7387 from adrian-wang/udflgfollow and squashes the following commits:

6163e62 [Daoyuan Wang] add skipping null values
e8c2e09 [Daoyuan Wang] use seq
8362966 [Daoyuan Wang] pr6851 follow up
---
 .../catalyst/expressions/conditionals.scala   | 16 +++-
 .../ConditionalExpressionSuite.scala          | 79 ++++++++++---------
 .../org/apache/spark/sql/functions.scala      | 16 ++--
 3 files changed, 62 insertions(+), 49 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index 84c28c27f934b..eea7706b9d391 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -311,7 +311,11 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
   }
 }
 
-case class Least(children: Expression*) extends Expression {
+/**
+ * A function that returns the least value of all parameters, skipping null values.
+ * It takes at least 2 parameters, and returns null iff all parameters are null.
+ */
+case class Least(children: Seq[Expression]) extends Expression {
   require(children.length > 1, "LEAST requires at least 2 arguments, got " + children.length)
 
   override def nullable: Boolean = children.forall(_.nullable)
@@ -356,12 +360,16 @@ case class Least(children: Expression*) extends Expression {
       ${evalChildren.map(_.code).mkString("\n")}
       boolean ${ev.isNull} = true;
       ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      ${(0 until children.length).map(updateEval).mkString("\n")}
+      ${children.indices.map(updateEval).mkString("\n")}
     """
   }
 }
 
-case class Greatest(children: Expression*) extends Expression {
+/**
+ * A function that returns the greatest value of all parameters, skipping null values.
+ * It takes at least 2 parameters, and returns null iff all parameters are null.
+ */
+case class Greatest(children: Seq[Expression]) extends Expression {
   require(children.length > 1, "GREATEST requires at least 2 arguments, got " + children.length)
 
   override def nullable: Boolean = children.forall(_.nullable)
@@ -406,7 +414,7 @@ case class Greatest(children: Expression*) extends Expression {
       ${evalChildren.map(_.code).mkString("\n")}
       boolean ${ev.isNull} = true;
       ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      ${(0 until children.length).map(updateEval).mkString("\n")}
+      ${children.indices.map(updateEval).mkString("\n")}
     """
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
index adadc8c54fae3..afa143bd5f331 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
@@ -144,35 +144,35 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     val c3 = 'a.string.at(2)
     val c4 = 'a.string.at(3)
     val c5 = 'a.string.at(4)
-    checkEvaluation(Least(c4, c3, c5), "a", row)
-    checkEvaluation(Least(c1, c2), 1, row)
-    checkEvaluation(Least(c1, c2, Literal(-1)), -1, row)
-    checkEvaluation(Least(c4, c5, c3, c3, Literal("a")), "a", row)
-
-    checkEvaluation(Least(Literal(null), Literal(null)), null, InternalRow.empty)
-    checkEvaluation(Least(Literal(-1.0), Literal(2.5)), -1.0, InternalRow.empty)
-    checkEvaluation(Least(Literal(-1), Literal(2)), -1, InternalRow.empty)
+    checkEvaluation(Least(Seq(c4, c3, c5)), "a", row)
+    checkEvaluation(Least(Seq(c1, c2)), 1, row)
+    checkEvaluation(Least(Seq(c1, c2, Literal(-1))), -1, row)
+    checkEvaluation(Least(Seq(c4, c5, c3, c3, Literal("a"))), "a", row)
+
+    checkEvaluation(Least(Seq(Literal(null), Literal(null))), null, InternalRow.empty)
+    checkEvaluation(Least(Seq(Literal(-1.0), Literal(2.5))), -1.0, InternalRow.empty)
+    checkEvaluation(Least(Seq(Literal(-1), Literal(2))), -1, InternalRow.empty)
     checkEvaluation(
-      Least(Literal((-1.0).toFloat), Literal(2.5.toFloat)), (-1.0).toFloat, InternalRow.empty)
+      Least(Seq(Literal((-1.0).toFloat), Literal(2.5.toFloat))), (-1.0).toFloat, InternalRow.empty)
     checkEvaluation(
-      Least(Literal(Long.MaxValue), Literal(Long.MinValue)), Long.MinValue, InternalRow.empty)
-    checkEvaluation(Least(Literal(1.toByte), Literal(2.toByte)), 1.toByte, InternalRow.empty)
+      Least(Seq(Literal(Long.MaxValue), Literal(Long.MinValue))), Long.MinValue, InternalRow.empty)
+    checkEvaluation(Least(Seq(Literal(1.toByte), Literal(2.toByte))), 1.toByte, InternalRow.empty)
     checkEvaluation(
-      Least(Literal(1.toShort), Literal(2.toByte.toShort)), 1.toShort, InternalRow.empty)
-    checkEvaluation(Least(Literal("abc"), Literal("aaaa")), "aaaa", InternalRow.empty)
-    checkEvaluation(Least(Literal(true), Literal(false)), false, InternalRow.empty)
+      Least(Seq(Literal(1.toShort), Literal(2.toByte.toShort))), 1.toShort, InternalRow.empty)
+    checkEvaluation(Least(Seq(Literal("abc"), Literal("aaaa"))), "aaaa", InternalRow.empty)
+    checkEvaluation(Least(Seq(Literal(true), Literal(false))), false, InternalRow.empty)
     checkEvaluation(
-      Least(
+      Least(Seq(
         Literal(BigDecimal("1234567890987654321123456")),
-        Literal(BigDecimal("1234567890987654321123458"))),
+        Literal(BigDecimal("1234567890987654321123458")))),
       BigDecimal("1234567890987654321123456"), InternalRow.empty)
     checkEvaluation(
-      Least(Literal(Date.valueOf("2015-01-01")), Literal(Date.valueOf("2015-07-01"))),
+      Least(Seq(Literal(Date.valueOf("2015-01-01")), Literal(Date.valueOf("2015-07-01")))),
       Date.valueOf("2015-01-01"), InternalRow.empty)
     checkEvaluation(
-      Least(
+      Least(Seq(
         Literal(Timestamp.valueOf("2015-07-01 08:00:00")),
-        Literal(Timestamp.valueOf("2015-07-01 10:00:00"))),
+        Literal(Timestamp.valueOf("2015-07-01 10:00:00")))),
       Timestamp.valueOf("2015-07-01 08:00:00"), InternalRow.empty)
   }
 
@@ -183,35 +183,36 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     val c3 = 'a.string.at(2)
     val c4 = 'a.string.at(3)
     val c5 = 'a.string.at(4)
-    checkEvaluation(Greatest(c4, c5, c3), "c", row)
-    checkEvaluation(Greatest(c2, c1), 2, row)
-    checkEvaluation(Greatest(c1, c2, Literal(2)), 2, row)
-    checkEvaluation(Greatest(c4, c5, c3, Literal("ccc")), "ccc", row)
-
-    checkEvaluation(Greatest(Literal(null), Literal(null)), null, InternalRow.empty)
-    checkEvaluation(Greatest(Literal(-1.0), Literal(2.5)), 2.5, InternalRow.empty)
-    checkEvaluation(Greatest(Literal(-1), Literal(2)), 2, InternalRow.empty)
+    checkEvaluation(Greatest(Seq(c4, c5, c3)), "c", row)
+    checkEvaluation(Greatest(Seq(c2, c1)), 2, row)
+    checkEvaluation(Greatest(Seq(c1, c2, Literal(2))), 2, row)
+    checkEvaluation(Greatest(Seq(c4, c5, c3, Literal("ccc"))), "ccc", row)
+
+    checkEvaluation(Greatest(Seq(Literal(null), Literal(null))), null, InternalRow.empty)
+    checkEvaluation(Greatest(Seq(Literal(-1.0), Literal(2.5))), 2.5, InternalRow.empty)
+    checkEvaluation(Greatest(Seq(Literal(-1), Literal(2))), 2, InternalRow.empty)
     checkEvaluation(
-      Greatest(Literal((-1.0).toFloat), Literal(2.5.toFloat)), 2.5.toFloat, InternalRow.empty)
+      Greatest(Seq(Literal((-1.0).toFloat), Literal(2.5.toFloat))), 2.5.toFloat, InternalRow.empty)
+    checkEvaluation(Greatest(
+      Seq(Literal(Long.MaxValue), Literal(Long.MinValue))), Long.MaxValue, InternalRow.empty)
     checkEvaluation(
-      Greatest(Literal(Long.MaxValue), Literal(Long.MinValue)), Long.MaxValue, InternalRow.empty)
-    checkEvaluation(Greatest(Literal(1.toByte), Literal(2.toByte)), 2.toByte, InternalRow.empty)
+      Greatest(Seq(Literal(1.toByte), Literal(2.toByte))), 2.toByte, InternalRow.empty)
     checkEvaluation(
-      Greatest(Literal(1.toShort), Literal(2.toByte.toShort)), 2.toShort, InternalRow.empty)
-    checkEvaluation(Greatest(Literal("abc"), Literal("aaaa")), "abc", InternalRow.empty)
-    checkEvaluation(Greatest(Literal(true), Literal(false)), true, InternalRow.empty)
+      Greatest(Seq(Literal(1.toShort), Literal(2.toByte.toShort))), 2.toShort, InternalRow.empty)
+    checkEvaluation(Greatest(Seq(Literal("abc"), Literal("aaaa"))), "abc", InternalRow.empty)
+    checkEvaluation(Greatest(Seq(Literal(true), Literal(false))), true, InternalRow.empty)
     checkEvaluation(
-      Greatest(
+      Greatest(Seq(
         Literal(BigDecimal("1234567890987654321123456")),
-        Literal(BigDecimal("1234567890987654321123458"))),
+        Literal(BigDecimal("1234567890987654321123458")))),
       BigDecimal("1234567890987654321123458"), InternalRow.empty)
-    checkEvaluation(
-      Greatest(Literal(Date.valueOf("2015-01-01")), Literal(Date.valueOf("2015-07-01"))),
+    checkEvaluation(Greatest(
+      Seq(Literal(Date.valueOf("2015-01-01")), Literal(Date.valueOf("2015-07-01")))),
       Date.valueOf("2015-07-01"), InternalRow.empty)
     checkEvaluation(
-      Greatest(
+      Greatest(Seq(
         Literal(Timestamp.valueOf("2015-07-01 08:00:00")),
-        Literal(Timestamp.valueOf("2015-07-01 10:00:00"))),
+        Literal(Timestamp.valueOf("2015-07-01 10:00:00")))),
       Timestamp.valueOf("2015-07-01 10:00:00"), InternalRow.empty)
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index ffa52f62588dc..0d4e160ed8057 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1073,7 +1073,8 @@ object functions {
   def floor(columnName: String): Column = floor(Column(columnName))
 
   /**
-   * Returns the greatest value of the list of values.
+   * Returns the greatest value of the list of values, skipping null values.
+   * This function takes at least 2 parameters. It will return null iff all parameters are null.
    *
    * @group normal_funcs
    * @since 1.5.0
@@ -1082,11 +1083,12 @@ object functions {
   def greatest(exprs: Column*): Column = if (exprs.length < 2) {
     sys.error("GREATEST takes at least 2 parameters")
   } else {
-    Greatest(exprs.map(_.expr): _*)
+    Greatest(exprs.map(_.expr))
   }
 
   /**
-   * Returns the greatest value of the list of column names.
+   * Returns the greatest value of the list of column names, skipping null values.
+   * This function takes at least 2 parameters. It will return null iff all parameters are null.
    *
    * @group normal_funcs
    * @since 1.5.0
@@ -1198,7 +1200,8 @@ object functions {
   def hypot(l: Double, rightName: String): Column = hypot(l, Column(rightName))
 
   /**
-   * Returns the least value of the list of values.
+   * Returns the least value of the list of values, skipping null values.
+   * This function takes at least 2 parameters. It will return null iff all parameters are null.
    *
    * @group normal_funcs
    * @since 1.5.0
@@ -1207,11 +1210,12 @@ object functions {
   def least(exprs: Column*): Column = if (exprs.length < 2) {
     sys.error("LEAST takes at least 2 parameters")
   } else {
-    Least(exprs.map(_.expr): _*)
+    Least(exprs.map(_.expr))
   }
 
   /**
-   * Returns the least value of the list of column names.
+   * Returns the least value of the list of column names, skipping null values.
+   * This function takes at least 2 parameters. It will return null iff all parameters are null.
    *
    * @group normal_funcs
    * @since 1.5.0

From 59d820aa8dec08b744971237860b4c6bef577ddf Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Tue, 14 Jul 2015 10:20:15 -0700
Subject: [PATCH 142/549] [SPARK-9029] [SQL] shortcut CaseKeyWhen if key is
 null

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7389 from cloud-fan/case-when and squashes the following commits:

ea4b6ba [Wenchen Fan] shortcut for case key when
---
 .../catalyst/expressions/conditionals.scala   | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index eea7706b9d391..c7f039ede26b3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -230,24 +230,31 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
     }
   }
 
+  private def evalElse(input: InternalRow): Any = {
+    if (branchesArr.length % 2 == 0) {
+      null
+    } else {
+      branchesArr(branchesArr.length - 1).eval(input)
+    }
+  }
+
   /** Written in imperative fashion for performance considerations. */
   override def eval(input: InternalRow): Any = {
     val evaluatedKey = key.eval(input)
-    val len = branchesArr.length
-    var i = 0
-    // If all branches fail and an elseVal is not provided, the whole statement
-    // defaults to null, according to Hive's semantics.
-    while (i < len - 1) {
-      if (threeValueEquals(evaluatedKey, branchesArr(i).eval(input))) {
-        return branchesArr(i + 1).eval(input)
+    // If key is null, we can just return the else part or null if there is no else.
+    // If key is not null but doesn't match any when part, we need to return
+    // the else part or null if there is no else, according to Hive's semantics.
+    if (evaluatedKey != null) {
+      val len = branchesArr.length
+      var i = 0
+      while (i < len - 1) {
+        if (evaluatedKey ==  branchesArr(i).eval(input)) {
+          return branchesArr(i + 1).eval(input)
+        }
+        i += 2
       }
-      i += 2
     }
-    var res: Any = null
-    if (i == len - 1) {
-      res = branchesArr(i).eval(input)
-    }
-    return res
+    evalElse(input)
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
@@ -261,8 +268,7 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
       s"""
         if (!$got) {
           ${cond.code}
-          if (!${keyEval.isNull} && !${cond.isNull}
-             && ${ctx.genEqual(key.dataType, keyEval.primitive, cond.primitive)}) {
+          if (!${cond.isNull} && ${ctx.genEqual(key.dataType, keyEval.primitive, cond.primitive)}) {
             $got = true;
             ${res.code}
             ${ev.isNull} = ${res.isNull};
@@ -290,19 +296,13 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
       boolean ${ev.isNull} = true;
       ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
       ${keyEval.code}
-      $cases
+      if (!${keyEval.isNull}) {
+        $cases
+      }
       $other
     """
   }
 
-  private def threeValueEquals(l: Any, r: Any) = {
-    if (l == null || r == null) {
-      false
-    } else {
-      l == r
-    }
-  }
-
   override def toString: String = {
     s"CASE $key" + branches.sliding(2, 2).map {
       case Seq(cond, value) => s" WHEN $cond THEN $value"

From 37f2d9635ff874fb8ad9d246e49faf6098d501c3 Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Tue, 14 Jul 2015 11:22:09 -0700
Subject: [PATCH 143/549] [SPARK-9027] [SQL] Generalize metastore predicate
 pushdown

Add support for pushing down metastore filters that are in different orders and add some unit tests.

Author: Michael Armbrust <michael@databricks.com>

Closes #7386 from marmbrus/metastoreFilters and squashes the following commits:

05a4524 [Michael Armbrust] [SPARK-9027][SQL] Generalize metastore predicate pushdown
---
 .../spark/sql/hive/client/HiveShim.scala      | 54 +++++++------
 .../spark/sql/hive/client/FiltersSuite.scala  | 78 +++++++++++++++++++
 2 files changed, 107 insertions(+), 25 deletions(-)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index 5542a521b173d..d12778c7583df 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -34,7 +34,7 @@ import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.hadoop.hive.serde.serdeConstants
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst.expressions.{Expression, AttributeReference, BinaryComparison}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types.{StringType, IntegralType}
 
 /**
@@ -312,37 +312,41 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
   override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] =
     getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq
 
-  override def getPartitionsByFilter(
-      hive: Hive,
-      table: Table,
-      predicates: Seq[Expression]): Seq[Partition] = {
+  /**
+   * Converts catalyst expression to the format that Hive's getPartitionsByFilter() expects, i.e.
+   * a string that represents partition predicates like "str_key=\"value\" and int_key=1 ...".
+   *
+   * Unsupported predicates are skipped.
+   */
+  def convertFilters(table: Table, filters: Seq[Expression]): String = {
     // hive varchar is treated as catalyst string, but hive varchar can't be pushed down.
     val varcharKeys = table.getPartitionKeys
       .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME))
       .map(col => col.getName).toSet
 
-    // Hive getPartitionsByFilter() takes a string that represents partition
-    // predicates like "str_key=\"value\" and int_key=1 ..."
-    val filter = predicates.flatMap { expr =>
-      expr match {
-        case op @ BinaryComparison(lhs, rhs) => {
-          lhs match {
-            case AttributeReference(_, _, _, _) => {
-              rhs.dataType match {
-                case _: IntegralType =>
-                  Some(lhs.prettyString + op.symbol + rhs.prettyString)
-                case _: StringType if (!varcharKeys.contains(lhs.prettyString)) =>
-                  Some(lhs.prettyString + op.symbol + "\"" + rhs.prettyString + "\"")
-                case _ => None
-              }
-            }
-            case _ => None
-          }
-        }
-        case _ => None
-      }
+    filters.collect {
+      case op @ BinaryComparison(a: Attribute, Literal(v, _: IntegralType)) =>
+        s"${a.name} ${op.symbol} $v"
+      case op @ BinaryComparison(Literal(v, _: IntegralType), a: Attribute) =>
+        s"$v ${op.symbol} ${a.name}"
+
+      case op @ BinaryComparison(a: Attribute, Literal(v, _: StringType))
+          if !varcharKeys.contains(a.name) =>
+        s"""${a.name} ${op.symbol} "$v""""
+      case op @ BinaryComparison(Literal(v, _: StringType), a: Attribute)
+          if !varcharKeys.contains(a.name) =>
+        s""""$v" ${op.symbol} ${a.name}"""
     }.mkString(" and ")
+  }
+
+  override def getPartitionsByFilter(
+      hive: Hive,
+      table: Table,
+      predicates: Seq[Expression]): Seq[Partition] = {
 
+    // Hive getPartitionsByFilter() takes a string that represents partition
+    // predicates like "str_key=\"value\" and int_key=1 ..."
+    val filter = convertFilters(table, predicates)
     val partitions =
       if (filter.isEmpty) {
         getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]]
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
new file mode 100644
index 0000000000000..0efcf80bd4ea7
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.client
+
+import scala.collection.JavaConversions._
+
+import org.apache.hadoop.hive.metastore.api.FieldSchema
+import org.apache.hadoop.hive.serde.serdeConstants
+
+import org.apache.spark.{Logging, SparkFunSuite}
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types._
+
+/**
+ * A set of tests for the filter conversion logic used when pushing partition pruning into the
+ * metastore
+ */
+class FiltersSuite extends SparkFunSuite with Logging {
+  private val shim = new Shim_v0_13
+
+  private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test")
+  private val varCharCol = new FieldSchema()
+  varCharCol.setName("varchar")
+  varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME)
+  testTable.setPartCols(varCharCol :: Nil)
+
+  filterTest("string filter",
+    (a("stringcol", StringType) > Literal("test")) :: Nil,
+    "stringcol > \"test\"")
+
+  filterTest("string filter backwards",
+    (Literal("test") > a("stringcol", StringType)) :: Nil,
+    "\"test\" > stringcol")
+
+  filterTest("int filter",
+    (a("intcol", IntegerType) === Literal(1)) :: Nil,
+    "intcol = 1")
+
+  filterTest("int filter backwards",
+    (Literal(1) === a("intcol", IntegerType)) :: Nil,
+    "1 = intcol")
+
+  filterTest("int and string filter",
+    (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil,
+    "1 = intcol and \"a\" = strcol")
+
+  filterTest("skip varchar",
+    (Literal("") === a("varchar", StringType)) :: Nil,
+    "")
+
+  private def filterTest(name: String, filters: Seq[Expression], result: String) = {
+    test(name){
+      val converted = shim.convertFilters(testTable, filters)
+      if (converted != result) {
+        fail(
+          s"Expected filters ${filters.mkString(",")} to convert to '$result' but got '$converted'")
+      }
+    }
+  }
+
+  private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)()
+}

From c4e98ff066cc6f0839d15140eb471d74a0d83e91 Mon Sep 17 00:00:00 2001
From: Brennon York <brennon.york@capitalone.com>
Date: Tue, 14 Jul 2015 11:43:26 -0700
Subject: [PATCH 144/549] [SPARK-8933] [BUILD] Provide a --force flag to
 build/mvn that always uses downloaded maven

added --force flag to manually download, if necessary, and use a built-in version of maven best for spark

Author: Brennon York <brennon.york@capitalone.com>

Closes #7374 from brennonyork/SPARK-8933 and squashes the following commits:

d673127 [Brennon York] added --force flag to manually download, if necessary, and use a built-in version of maven best for spark
---
 build/mvn | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/build/mvn b/build/mvn
index e8364181e8230..f62f61ee1c416 100755
--- a/build/mvn
+++ b/build/mvn
@@ -112,10 +112,17 @@ install_scala() {
 # the environment
 ZINC_PORT=${ZINC_PORT:-"3030"}
 
+# Check for the `--force` flag dictating that `mvn` should be downloaded
+# regardless of whether the system already has a `mvn` install
+if [ "$1" == "--force" ]; then
+  FORCE_MVN=1
+  shift
+fi
+
 # Install Maven if necessary
 MVN_BIN="$(command -v mvn)"
 
-if [ ! "$MVN_BIN" ]; then
+if [ ! "$MVN_BIN" -o -n "$FORCE_MVN" ]; then
   install_mvn
 fi
 
@@ -139,5 +146,7 @@ fi
 # Set any `mvn` options if not already present
 export MAVEN_OPTS=${MAVEN_OPTS:-"$_COMPILE_JVM_OPTS"}
 
+echo "Using \`mvn\` from path: $MVN_BIN"
+
 # Last, call the `mvn` command as usual
 ${MVN_BIN} "$@"

From 8fb3a65cbb714120d612e58ef9d12b0521a83260 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Tue, 14 Jul 2015 12:47:11 -0700
Subject: [PATCH 145/549] [SPARK-8911] Fix local mode endless heartbeats

As of #7173 we expect executors to properly register with the driver before responding to their heartbeats. This behavior is not matched in local mode. This patch adds the missing event that needs to be posted.

Author: Andrew Or <andrew@databricks.com>

Closes #7382 from andrewor14/fix-local-heartbeat and squashes the following commits:

1258bdf [Andrew Or] Post ExecutorAdded event to local executor
---
 .../spark/scheduler/local/LocalBackend.scala  | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
index 776e5d330e3c7..4d48fcfea44e7 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala
@@ -25,7 +25,8 @@ import org.apache.spark.{Logging, SparkConf, SparkContext, SparkEnv, TaskState}
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.executor.{Executor, ExecutorBackend}
 import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint}
-import org.apache.spark.scheduler.{SchedulerBackend, TaskSchedulerImpl, WorkerOffer}
+import org.apache.spark.scheduler._
+import org.apache.spark.scheduler.cluster.ExecutorInfo
 
 private case class ReviveOffers()
 
@@ -50,8 +51,8 @@ private[spark] class LocalEndpoint(
 
   private var freeCores = totalCores
 
-  private val localExecutorId = SparkContext.DRIVER_IDENTIFIER
-  private val localExecutorHostname = "localhost"
+  val localExecutorId = SparkContext.DRIVER_IDENTIFIER
+  val localExecutorHostname = "localhost"
 
   private val executor = new Executor(
     localExecutorId, localExecutorHostname, SparkEnv.get, userClassPath, isLocal = true)
@@ -99,8 +100,9 @@ private[spark] class LocalBackend(
   extends SchedulerBackend with ExecutorBackend with Logging {
 
   private val appId = "local-" + System.currentTimeMillis
-  var localEndpoint: RpcEndpointRef = null
+  private var localEndpoint: RpcEndpointRef = null
   private val userClassPath = getUserClasspath(conf)
+  private val listenerBus = scheduler.sc.listenerBus
 
   /**
    * Returns a list of URLs representing the user classpath.
@@ -113,9 +115,13 @@ private[spark] class LocalBackend(
   }
 
   override def start() {
-    localEndpoint = SparkEnv.get.rpcEnv.setupEndpoint(
-      "LocalBackendEndpoint",
-      new LocalEndpoint(SparkEnv.get.rpcEnv, userClassPath, scheduler, this, totalCores))
+    val rpcEnv = SparkEnv.get.rpcEnv
+    val executorEndpoint = new LocalEndpoint(rpcEnv, userClassPath, scheduler, this, totalCores)
+    localEndpoint = rpcEnv.setupEndpoint("LocalBackendEndpoint", executorEndpoint)
+    listenerBus.post(SparkListenerExecutorAdded(
+      System.currentTimeMillis,
+      executorEndpoint.localExecutorId,
+      new ExecutorInfo(executorEndpoint.localExecutorHostname, totalCores, Map.empty)))
   }
 
   override def stop() {

From d267c2834a639aaebd0559355c6a82613abb689b Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Tue, 14 Jul 2015 12:56:17 -0700
Subject: [PATCH 146/549] [SPARK-9031] Merge BlockObjectWriter and
 DiskBlockObject writer to remove abstract class

BlockObjectWriter has only one concrete non-test class, DiskBlockObjectWriter. In order to simplify the code in preparation for other refactorings, I think that we should remove this base class and have only DiskBlockObjectWriter.

While at one time we may have planned to have multiple BlockObjectWriter implementations, that doesn't seem to have happened, so the extra abstraction seems unnecessary.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7391 from JoshRosen/shuffle-write-interface-refactoring and squashes the following commits:

c418e33 [Josh Rosen] Fix compilation
5047995 [Josh Rosen] Fix comments
d5dc548 [Josh Rosen] Update references in comments
89dc797 [Josh Rosen] Rename test suite.
5755918 [Josh Rosen] Remove unnecessary val in case class
1607c91 [Josh Rosen] Merge BlockObjectWriter and DiskBlockObjectWriter
---
 .../sort/BypassMergeSortShuffleWriter.java    |  8 +-
 .../unsafe/UnsafeShuffleExternalSorter.java   |  2 +-
 .../unsafe/sort/UnsafeSorterSpillWriter.java  |  4 +-
 .../shuffle/FileShuffleBlockResolver.scala    |  8 +-
 .../shuffle/IndexShuffleBlockResolver.scala   |  2 +-
 .../shuffle/hash/HashShuffleWriter.scala      |  4 +-
 .../apache/spark/storage/BlockManager.scala   |  2 +-
 ...iter.scala => DiskBlockObjectWriter.scala} | 96 +++++++------------
 .../spark/util/collection/ChainedBuffer.scala |  2 +-
 .../util/collection/ExternalSorter.scala      |  4 +-
 .../collection/PartitionedPairBuffer.scala    |  1 -
 .../PartitionedSerializedPairBuffer.scala     |  5 +-
 .../WritablePartitionedPairCollection.scala   |  8 +-
 .../BypassMergeSortShuffleWriterSuite.scala   |  4 +-
 ...scala => DiskBlockObjectWriterSuite.scala} |  2 +-
 ...PartitionedSerializedPairBufferSuite.scala | 52 +++++-----
 16 files changed, 90 insertions(+), 114 deletions(-)
 rename core/src/main/scala/org/apache/spark/storage/{BlockObjectWriter.scala => DiskBlockObjectWriter.scala} (83%)
 rename core/src/test/scala/org/apache/spark/storage/{BlockObjectWriterSuite.scala => DiskBlockObjectWriterSuite.scala} (98%)

diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
index d3d6280284beb..0b8b604e18494 100644
--- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
@@ -75,7 +75,7 @@ final class BypassMergeSortShuffleWriter<K, V> implements SortShuffleFileWriter<
   private final Serializer serializer;
 
   /** Array of file writers, one for each partition */
-  private BlockObjectWriter[] partitionWriters;
+  private DiskBlockObjectWriter[] partitionWriters;
 
   public BypassMergeSortShuffleWriter(
       SparkConf conf,
@@ -101,7 +101,7 @@ public void insertAll(Iterator<Product2<K, V>> records) throws IOException {
     }
     final SerializerInstance serInstance = serializer.newInstance();
     final long openStartTime = System.nanoTime();
-    partitionWriters = new BlockObjectWriter[numPartitions];
+    partitionWriters = new DiskBlockObjectWriter[numPartitions];
     for (int i = 0; i < numPartitions; i++) {
       final Tuple2<TempShuffleBlockId, File> tempShuffleBlockIdPlusFile =
         blockManager.diskBlockManager().createTempShuffleBlock();
@@ -121,7 +121,7 @@ public void insertAll(Iterator<Product2<K, V>> records) throws IOException {
       partitionWriters[partitioner.getPartition(key)].write(key, record._2());
     }
 
-    for (BlockObjectWriter writer : partitionWriters) {
+    for (DiskBlockObjectWriter writer : partitionWriters) {
       writer.commitAndClose();
     }
   }
@@ -169,7 +169,7 @@ public void stop() throws IOException {
     if (partitionWriters != null) {
       try {
         final DiskBlockManager diskBlockManager = blockManager.diskBlockManager();
-        for (BlockObjectWriter writer : partitionWriters) {
+        for (DiskBlockObjectWriter writer : partitionWriters) {
           // This method explicitly does _not_ throw exceptions:
           writer.revertPartialWritesAndClose();
           if (!diskBlockManager.getFile(writer.blockId()).delete()) {
diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
index 56289573209fb..1d460432be9ff 100644
--- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
@@ -157,7 +157,7 @@ private void writeSortedFile(boolean isLastFile) throws IOException {
 
     // Currently, we need to open a new DiskBlockObjectWriter for each partition; we can avoid this
     // after SPARK-5581 is fixed.
-    BlockObjectWriter writer;
+    DiskBlockObjectWriter writer;
 
     // Small writes to DiskBlockObjectWriter will be fairly inefficient. Since there doesn't seem to
     // be an API to directly transfer bytes from managed memory to the disk writer, we buffer
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java
index b8d66659804ad..71eed29563d4a 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java
@@ -26,7 +26,7 @@
 import org.apache.spark.serializer.DummySerializerInstance;
 import org.apache.spark.storage.BlockId;
 import org.apache.spark.storage.BlockManager;
-import org.apache.spark.storage.BlockObjectWriter;
+import org.apache.spark.storage.DiskBlockObjectWriter;
 import org.apache.spark.storage.TempLocalBlockId;
 import org.apache.spark.unsafe.PlatformDependent;
 
@@ -47,7 +47,7 @@ final class UnsafeSorterSpillWriter {
   private final File file;
   private final BlockId blockId;
   private final int numRecordsToWrite;
-  private BlockObjectWriter writer;
+  private DiskBlockObjectWriter writer;
   private int numRecordsSpilled = 0;
 
   public UnsafeSorterSpillWriter(
diff --git a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala
index 6c3b3080d2605..f6a96d81e7aa9 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/FileShuffleBlockResolver.scala
@@ -35,7 +35,7 @@ import org.apache.spark.util.collection.{PrimitiveKeyOpenHashMap, PrimitiveVecto
 
 /** A group of writers for a ShuffleMapTask, one writer per reducer. */
 private[spark] trait ShuffleWriterGroup {
-  val writers: Array[BlockObjectWriter]
+  val writers: Array[DiskBlockObjectWriter]
 
   /** @param success Indicates all writes were successful. If false, no blocks will be recorded. */
   def releaseWriters(success: Boolean)
@@ -113,15 +113,15 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf)
 
       val openStartTime = System.nanoTime
       val serializerInstance = serializer.newInstance()
-      val writers: Array[BlockObjectWriter] = if (consolidateShuffleFiles) {
+      val writers: Array[DiskBlockObjectWriter] = if (consolidateShuffleFiles) {
         fileGroup = getUnusedFileGroup()
-        Array.tabulate[BlockObjectWriter](numBuckets) { bucketId =>
+        Array.tabulate[DiskBlockObjectWriter](numBuckets) { bucketId =>
           val blockId = ShuffleBlockId(shuffleId, mapId, bucketId)
           blockManager.getDiskWriter(blockId, fileGroup(bucketId), serializerInstance, bufferSize,
             writeMetrics)
         }
       } else {
-        Array.tabulate[BlockObjectWriter](numBuckets) { bucketId =>
+        Array.tabulate[DiskBlockObjectWriter](numBuckets) { bucketId =>
           val blockId = ShuffleBlockId(shuffleId, mapId, bucketId)
           val blockFile = blockManager.diskBlockManager.getFile(blockId)
           // Because of previous failures, the shuffle file may already exist on this machine.
diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala
index d9c63b6e7bbb9..fae69551e7330 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala
@@ -114,7 +114,7 @@ private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleB
 }
 
 private[spark] object IndexShuffleBlockResolver {
-  // No-op reduce ID used in interactions with disk store and BlockObjectWriter.
+  // No-op reduce ID used in interactions with disk store and DiskBlockObjectWriter.
   // The disk store currently expects puts to relate to a (map, reduce) pair, but in the sort
   // shuffle outputs for several reduces are glommed into a single file.
   // TODO: Avoid this entirely by having the DiskBlockObjectWriter not require a BlockId.
diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala
index eb87cee15903c..41df70c602c30 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala
@@ -22,7 +22,7 @@ import org.apache.spark.executor.ShuffleWriteMetrics
 import org.apache.spark.scheduler.MapStatus
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.shuffle._
-import org.apache.spark.storage.BlockObjectWriter
+import org.apache.spark.storage.DiskBlockObjectWriter
 
 private[spark] class HashShuffleWriter[K, V](
     shuffleBlockResolver: FileShuffleBlockResolver,
@@ -102,7 +102,7 @@ private[spark] class HashShuffleWriter[K, V](
 
   private def commitWritesAndBuildStatus(): MapStatus = {
     // Commit the writes. Get the size of each bucket block (total block size).
-    val sizes: Array[Long] = shuffle.writers.map { writer: BlockObjectWriter =>
+    val sizes: Array[Long] = shuffle.writers.map { writer: DiskBlockObjectWriter =>
       writer.commitAndClose()
       writer.fileSegment().length
     }
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
index 1beafa1771448..86493673d958d 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -648,7 +648,7 @@ private[spark] class BlockManager(
       file: File,
       serializerInstance: SerializerInstance,
       bufferSize: Int,
-      writeMetrics: ShuffleWriteMetrics): BlockObjectWriter = {
+      writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = {
     val compressStream: OutputStream => OutputStream = wrapForCompression(blockId, _)
     val syncWrites = conf.getBoolean("spark.shuffle.sync", false)
     new DiskBlockObjectWriter(blockId, file, serializerInstance, bufferSize, compressStream,
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
similarity index 83%
rename from core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala
rename to core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
index 7eeabd1e0489c..49d9154f95a5b 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
@@ -26,66 +26,25 @@ import org.apache.spark.executor.ShuffleWriteMetrics
 import org.apache.spark.util.Utils
 
 /**
- * An interface for writing JVM objects to some underlying storage. This interface allows
- * appending data to an existing block, and can guarantee atomicity in the case of faults
- * as it allows the caller to revert partial writes.
+ * A class for writing JVM objects directly to a file on disk. This class allows data to be appended
+ * to an existing block and can guarantee atomicity in the case of faults as it allows the caller to
+ * revert partial writes.
  *
- * This interface does not support concurrent writes. Also, once the writer has
- * been opened, it cannot be reopened again.
- */
-private[spark] abstract class BlockObjectWriter(val blockId: BlockId) extends OutputStream {
-
-  def open(): BlockObjectWriter
-
-  def close()
-
-  def isOpen: Boolean
-
-  /**
-   * Flush the partial writes and commit them as a single atomic block.
-   */
-  def commitAndClose(): Unit
-
-  /**
-   * Reverts writes that haven't been flushed yet. Callers should invoke this function
-   * when there are runtime exceptions. This method will not throw, though it may be
-   * unsuccessful in truncating written data.
-   */
-  def revertPartialWritesAndClose()
-
-  /**
-   * Writes a key-value pair.
-   */
-  def write(key: Any, value: Any)
-
-  /**
-   * Notify the writer that a record worth of bytes has been written with OutputStream#write.
-   */
-  def recordWritten()
-
-  /**
-   * Returns the file segment of committed data that this Writer has written.
-   * This is only valid after commitAndClose() has been called.
-   */
-  def fileSegment(): FileSegment
-}
-
-/**
- * BlockObjectWriter which writes directly to a file on disk. Appends to the given file.
+ * This class does not support concurrent writes. Also, once the writer has been opened it cannot be
+ * reopened again.
  */
 private[spark] class DiskBlockObjectWriter(
-    blockId: BlockId,
+    val blockId: BlockId,
     file: File,
     serializerInstance: SerializerInstance,
     bufferSize: Int,
     compressStream: OutputStream => OutputStream,
     syncWrites: Boolean,
-    // These write metrics concurrently shared with other active BlockObjectWriter's who
+    // These write metrics concurrently shared with other active DiskBlockObjectWriters who
     // are themselves performing writes. All updates must be relative.
     writeMetrics: ShuffleWriteMetrics)
-  extends BlockObjectWriter(blockId)
-  with Logging
-{
+  extends OutputStream
+  with Logging {
 
   /** The file channel, used for repositioning / truncating the file. */
   private var channel: FileChannel = null
@@ -122,7 +81,7 @@ private[spark] class DiskBlockObjectWriter(
    */
   private var numRecordsWritten = 0
 
-  override def open(): BlockObjectWriter = {
+  def open(): DiskBlockObjectWriter = {
     if (hasBeenClosed) {
       throw new IllegalStateException("Writer already closed. Cannot be reopened.")
     }
@@ -159,9 +118,12 @@ private[spark] class DiskBlockObjectWriter(
     }
   }
 
-  override def isOpen: Boolean = objOut != null
+  def isOpen: Boolean = objOut != null
 
-  override def commitAndClose(): Unit = {
+  /**
+   * Flush the partial writes and commit them as a single atomic block.
+   */
+  def commitAndClose(): Unit = {
     if (initialized) {
       // NOTE: Because Kryo doesn't flush the underlying stream we explicitly flush both the
       //       serializer stream and the lower level stream.
@@ -177,9 +139,15 @@ private[spark] class DiskBlockObjectWriter(
     commitAndCloseHasBeenCalled = true
   }
 
-  // Discard current writes. We do this by flushing the outstanding writes and then
-  // truncating the file to its initial position.
-  override def revertPartialWritesAndClose() {
+
+  /**
+   * Reverts writes that haven't been flushed yet. Callers should invoke this function
+   * when there are runtime exceptions. This method will not throw, though it may be
+   * unsuccessful in truncating written data.
+   */
+  def revertPartialWritesAndClose() {
+    // Discard current writes. We do this by flushing the outstanding writes and then
+    // truncating the file to its initial position.
     try {
       if (initialized) {
         writeMetrics.decShuffleBytesWritten(reportedPosition - initialPosition)
@@ -201,7 +169,10 @@ private[spark] class DiskBlockObjectWriter(
     }
   }
 
-  override def write(key: Any, value: Any) {
+  /**
+   * Writes a key-value pair.
+   */
+  def write(key: Any, value: Any) {
     if (!initialized) {
       open()
     }
@@ -221,7 +192,10 @@ private[spark] class DiskBlockObjectWriter(
     bs.write(kvBytes, offs, len)
   }
 
-  override def recordWritten(): Unit = {
+  /**
+   * Notify the writer that a record worth of bytes has been written with OutputStream#write.
+   */
+  def recordWritten(): Unit = {
     numRecordsWritten += 1
     writeMetrics.incShuffleRecordsWritten(1)
 
@@ -230,7 +204,11 @@ private[spark] class DiskBlockObjectWriter(
     }
   }
 
-  override def fileSegment(): FileSegment = {
+  /**
+   * Returns the file segment of committed data that this Writer has written.
+   * This is only valid after commitAndClose() has been called.
+   */
+  def fileSegment(): FileSegment = {
     if (!commitAndCloseHasBeenCalled) {
       throw new IllegalStateException(
         "fileSegment() is only valid after commitAndClose() has been called")
diff --git a/core/src/main/scala/org/apache/spark/util/collection/ChainedBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/ChainedBuffer.scala
index 516aaa44d03fc..ae60f3b0cb555 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ChainedBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ChainedBuffer.scala
@@ -37,7 +37,7 @@ private[spark] class ChainedBuffer(chunkSize: Int) {
   private var _size: Long = 0
 
   /**
-   * Feed bytes from this buffer into a BlockObjectWriter.
+   * Feed bytes from this buffer into a DiskBlockObjectWriter.
    *
    * @param pos Offset in the buffer to read from.
    * @param os OutputStream to read into.
diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
index 757dec66c203b..ba7ec834d622d 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
@@ -30,7 +30,7 @@ import org.apache.spark._
 import org.apache.spark.serializer._
 import org.apache.spark.executor.ShuffleWriteMetrics
 import org.apache.spark.shuffle.sort.{SortShuffleFileWriter, SortShuffleWriter}
-import org.apache.spark.storage.{BlockId, BlockObjectWriter}
+import org.apache.spark.storage.{BlockId, DiskBlockObjectWriter}
 
 /**
  * Sorts and potentially merges a number of key-value pairs of type (K, V) to produce key-combiner
@@ -250,7 +250,7 @@ private[spark] class ExternalSorter[K, V, C](
     // These variables are reset after each flush
     var objectsWritten: Long = 0
     var spillMetrics: ShuffleWriteMetrics = null
-    var writer: BlockObjectWriter = null
+    var writer: DiskBlockObjectWriter = null
     def openWriter(): Unit = {
       assert (writer == null && spillMetrics == null)
       spillMetrics = new ShuffleWriteMetrics
diff --git a/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala
index 04bb7fc78c13b..f5844d5353be7 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/PartitionedPairBuffer.scala
@@ -19,7 +19,6 @@ package org.apache.spark.util.collection
 
 import java.util.Comparator
 
-import org.apache.spark.storage.BlockObjectWriter
 import org.apache.spark.util.collection.WritablePartitionedPairCollection._
 
 /**
diff --git a/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
index ae9a48729e201..87a786b02d651 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/PartitionedSerializedPairBuffer.scala
@@ -21,9 +21,8 @@ import java.io.InputStream
 import java.nio.IntBuffer
 import java.util.Comparator
 
-import org.apache.spark.SparkEnv
 import org.apache.spark.serializer.{JavaSerializerInstance, SerializerInstance}
-import org.apache.spark.storage.BlockObjectWriter
+import org.apache.spark.storage.DiskBlockObjectWriter
 import org.apache.spark.util.collection.PartitionedSerializedPairBuffer._
 
 /**
@@ -136,7 +135,7 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
       // current position in the meta buffer in ints
       var pos = 0
 
-      def writeNext(writer: BlockObjectWriter): Unit = {
+      def writeNext(writer: DiskBlockObjectWriter): Unit = {
         val keyStart = getKeyStartPos(metaBuffer, pos)
         val keyValLen = metaBuffer.get(pos + KEY_VAL_LEN)
         pos += RECORD_SIZE
diff --git a/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala b/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala
index 7bc59898658e4..38848e9018c6c 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala
@@ -19,7 +19,7 @@ package org.apache.spark.util.collection
 
 import java.util.Comparator
 
-import org.apache.spark.storage.BlockObjectWriter
+import org.apache.spark.storage.DiskBlockObjectWriter
 
 /**
  * A common interface for size-tracking collections of key-value pairs that
@@ -51,7 +51,7 @@ private[spark] trait WritablePartitionedPairCollection[K, V] {
     new WritablePartitionedIterator {
       private[this] var cur = if (it.hasNext) it.next() else null
 
-      def writeNext(writer: BlockObjectWriter): Unit = {
+      def writeNext(writer: DiskBlockObjectWriter): Unit = {
         writer.write(cur._1._2, cur._2)
         cur = if (it.hasNext) it.next() else null
       }
@@ -91,11 +91,11 @@ private[spark] object WritablePartitionedPairCollection {
 }
 
 /**
- * Iterator that writes elements to a BlockObjectWriter instead of returning them. Each element
+ * Iterator that writes elements to a DiskBlockObjectWriter instead of returning them. Each element
  * has an associated partition.
  */
 private[spark] trait WritablePartitionedIterator {
-  def writeNext(writer: BlockObjectWriter): Unit
+  def writeNext(writer: DiskBlockObjectWriter): Unit
 
   def hasNext(): Boolean
 
diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala
index 542f8f45125a4..cc7342f1ecd78 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala
@@ -68,8 +68,8 @@ class BypassMergeSortShuffleWriterSuite extends SparkFunSuite with BeforeAndAfte
       any[SerializerInstance],
       anyInt(),
       any[ShuffleWriteMetrics]
-    )).thenAnswer(new Answer[BlockObjectWriter] {
-      override def answer(invocation: InvocationOnMock): BlockObjectWriter = {
+    )).thenAnswer(new Answer[DiskBlockObjectWriter] {
+      override def answer(invocation: InvocationOnMock): DiskBlockObjectWriter = {
         val args = invocation.getArguments
         new DiskBlockObjectWriter(
           args(0).asInstanceOf[BlockId],
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
similarity index 98%
rename from core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala
rename to core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
index 7bdea724fea58..66af6e1a79740 100644
--- a/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.executor.ShuffleWriteMetrics
 import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.util.Utils
 
-class BlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach {
+class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach {
 
   var tempDir: File = _
 
diff --git a/core/src/test/scala/org/apache/spark/util/collection/PartitionedSerializedPairBufferSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/PartitionedSerializedPairBufferSuite.scala
index 6d2459d48d326..3b67f6206495a 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/PartitionedSerializedPairBufferSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/PartitionedSerializedPairBufferSuite.scala
@@ -17,15 +17,20 @@
 
 package org.apache.spark.util.collection
 
-import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream}
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
 
 import com.google.common.io.ByteStreams
 
+import org.mockito.Matchers.any
+import org.mockito.Mockito._
+import org.mockito.Mockito.RETURNS_SMART_NULLS
+import org.mockito.invocation.InvocationOnMock
+import org.mockito.stubbing.Answer
 import org.scalatest.Matchers._
 
 import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.KryoSerializer
-import org.apache.spark.storage.{FileSegment, BlockObjectWriter}
+import org.apache.spark.storage.DiskBlockObjectWriter
 
 class PartitionedSerializedPairBufferSuite extends SparkFunSuite {
   test("OrderedInputStream single record") {
@@ -79,13 +84,13 @@ class PartitionedSerializedPairBufferSuite extends SparkFunSuite {
     val struct = SomeStruct("something", 5)
     buffer.insert(4, 10, struct)
     val it = buffer.destructiveSortedWritablePartitionedIterator(None)
-    val writer = new SimpleBlockObjectWriter
+    val (writer, baos) = createMockWriter()
     assert(it.hasNext)
     it.nextPartition should be (4)
     it.writeNext(writer)
     assert(!it.hasNext)
 
-    val stream = serializerInstance.deserializeStream(writer.getInputStream)
+    val stream = serializerInstance.deserializeStream(new ByteArrayInputStream(baos.toByteArray))
     stream.readObject[AnyRef]() should be (10)
     stream.readObject[AnyRef]() should be (struct)
   }
@@ -101,7 +106,7 @@ class PartitionedSerializedPairBufferSuite extends SparkFunSuite {
     buffer.insert(5, 3, struct3)
 
     val it = buffer.destructiveSortedWritablePartitionedIterator(None)
-    val writer = new SimpleBlockObjectWriter
+    val (writer, baos) = createMockWriter()
     assert(it.hasNext)
     it.nextPartition should be (4)
     it.writeNext(writer)
@@ -113,7 +118,7 @@ class PartitionedSerializedPairBufferSuite extends SparkFunSuite {
     it.writeNext(writer)
     assert(!it.hasNext)
 
-    val stream = serializerInstance.deserializeStream(writer.getInputStream)
+    val stream = serializerInstance.deserializeStream(new ByteArrayInputStream(baos.toByteArray))
     val iter = stream.asIterator
     iter.next() should be (2)
     iter.next() should be (struct2)
@@ -123,26 +128,21 @@ class PartitionedSerializedPairBufferSuite extends SparkFunSuite {
     iter.next() should be (struct1)
     assert(!iter.hasNext)
   }
-}
-
-case class SomeStruct(val str: String, val num: Int)
-
-class SimpleBlockObjectWriter extends BlockObjectWriter(null) {
-  val baos = new ByteArrayOutputStream()
 
-  override def write(bytes: Array[Byte], offs: Int, len: Int): Unit = {
-    baos.write(bytes, offs, len)
+  def createMockWriter(): (DiskBlockObjectWriter, ByteArrayOutputStream) = {
+    val writer = mock(classOf[DiskBlockObjectWriter], RETURNS_SMART_NULLS)
+    val baos = new ByteArrayOutputStream()
+    when(writer.write(any(), any(), any())).thenAnswer(new Answer[Unit] {
+      override def answer(invocationOnMock: InvocationOnMock): Unit = {
+        val args = invocationOnMock.getArguments
+        val bytes = args(0).asInstanceOf[Array[Byte]]
+        val offset = args(1).asInstanceOf[Int]
+        val length = args(2).asInstanceOf[Int]
+        baos.write(bytes, offset, length)
+      }
+    })
+    (writer, baos)
   }
-
-  def getInputStream(): InputStream = new ByteArrayInputStream(baos.toByteArray)
-
-  override def open(): BlockObjectWriter = this
-  override def close(): Unit = { }
-  override def isOpen: Boolean = true
-  override def commitAndClose(): Unit = { }
-  override def revertPartialWritesAndClose(): Unit = { }
-  override def fileSegment(): FileSegment = null
-  override def write(key: Any, value: Any): Unit = { }
-  override def recordWritten(): Unit = { }
-  override def write(b: Int): Unit = { }
 }
+
+case class SomeStruct(str: String, num: Int)

From 0a4071eab30db1db80f61ed2cb2e7243291183ce Mon Sep 17 00:00:00 2001
From: Andrew Ray <ray.andrew@gmail.com>
Date: Tue, 14 Jul 2015 13:14:47 -0700
Subject: [PATCH 147/549] [SPARK-8718] [GRAPHX] Improve EdgePartition2D for non
 perfect square number of partitions

See https://github.com/aray/e2d/blob/master/EdgePartition2D.ipynb

Author: Andrew Ray <ray.andrew@gmail.com>

Closes #7104 from aray/edge-partition-2d-improvement and squashes the following commits:

3729f84 [Andrew Ray] correct bounds and remove unneeded comments
97f8464 [Andrew Ray] change less
5141ab4 [Andrew Ray] Merge branch 'master' into edge-partition-2d-improvement
925fd2c [Andrew Ray] use new interface for partitioning
001bfd0 [Andrew Ray] Refactor PartitionStrategy so that we can return a prtition function for a given number of parts. To keep compatibility we define default methods that translate between the two implementation options. Made EdgePartition2D use old strategy when we have a perfect square and implement new interface.
5d42105 [Andrew Ray] % -> /
3560084 [Andrew Ray] Merge branch 'master' into edge-partition-2d-improvement
f006364 [Andrew Ray] remove unneeded comments
cfa2c5e [Andrew Ray] Modifications to EdgePartition2D so that it works for non perfect squares.
---
 .../spark/graphx/PartitionStrategy.scala      | 32 ++++++++++++-------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala b/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala
index 7372dfbd9fe98..70a7592da8ae3 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala
@@ -32,7 +32,7 @@ trait PartitionStrategy extends Serializable {
 object PartitionStrategy {
   /**
    * Assigns edges to partitions using a 2D partitioning of the sparse edge adjacency matrix,
-   * guaranteeing a `2 * sqrt(numParts) - 1` bound on vertex replication.
+   * guaranteeing a `2 * sqrt(numParts)` bound on vertex replication.
    *
    * Suppose we have a graph with 12 vertices that we want to partition
    * over 9 machines.  We can use the following sparse matrix representation:
@@ -61,26 +61,36 @@ object PartitionStrategy {
    * that edges adjacent to `v11` can only be in the first column of blocks `(P0, P3,
    * P6)` or the last
    * row of blocks `(P6, P7, P8)`.  As a consequence we can guarantee that `v11` will need to be
-   * replicated to at most `2 * sqrt(numParts) - 1` machines.
+   * replicated to at most `2 * sqrt(numParts)` machines.
    *
    * Notice that `P0` has many edges and as a consequence this partitioning would lead to poor work
    * balance.  To improve balance we first multiply each vertex id by a large prime to shuffle the
    * vertex locations.
    *
-   * One of the limitations of this approach is that the number of machines must either be a
-   * perfect square. We partially address this limitation by computing the machine assignment to
-   * the next
-   * largest perfect square and then mapping back down to the actual number of machines.
-   * Unfortunately, this can also lead to work imbalance and so it is suggested that a perfect
-   * square is used.
+   * When the number of partitions requested is not a perfect square we use a slightly different
+   * method where the last column can have a different number of rows than the others while still
+   * maintaining the same size per block.
    */
   case object EdgePartition2D extends PartitionStrategy {
     override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = {
       val ceilSqrtNumParts: PartitionID = math.ceil(math.sqrt(numParts)).toInt
       val mixingPrime: VertexId = 1125899906842597L
-      val col: PartitionID = (math.abs(src * mixingPrime) % ceilSqrtNumParts).toInt
-      val row: PartitionID = (math.abs(dst * mixingPrime) % ceilSqrtNumParts).toInt
-      (col * ceilSqrtNumParts + row) % numParts
+      if (numParts == ceilSqrtNumParts * ceilSqrtNumParts) {
+        // Use old method for perfect squared to ensure we get same results
+        val col: PartitionID = (math.abs(src * mixingPrime) % ceilSqrtNumParts).toInt
+        val row: PartitionID = (math.abs(dst * mixingPrime) % ceilSqrtNumParts).toInt
+        (col * ceilSqrtNumParts + row) % numParts
+
+      } else {
+        // Otherwise use new method
+        val cols = ceilSqrtNumParts
+        val rows = (numParts + cols - 1) / cols
+        val lastColRows = numParts - rows * (cols - 1)
+        val col = (math.abs(src * mixingPrime) % numParts / rows).toInt
+        val row = (math.abs(dst * mixingPrime) % (if (col < cols - 1) rows else lastColRows)).toInt
+        col * rows + row
+
+      }
     }
   }
 

From fb1d06fc242ec00320f1a3049673fbb03c4a6eb9 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Tue, 14 Jul 2015 13:58:36 -0700
Subject: [PATCH 148/549] [SPARK-4072] [CORE] Display Streaming blocks in
 Streaming UI

Replace #6634

This PR adds `SparkListenerBlockUpdated` to SparkListener so that it can monitor all block update infos that are sent to `BlockManagerMasaterEndpoint`, and also add new tables in the Storage tab to display the stream block infos.

![screen shot 2015-07-01 at 5 19 46 pm](https://cloud.githubusercontent.com/assets/1000778/8451562/c291a6ec-2016-11e5-890d-0afc174e1f8c.png)

Author: zsxwing <zsxwing@gmail.com>

Closes #6672 from zsxwing/SPARK-4072-2 and squashes the following commits:

df2c1d8 [zsxwing] Use xml query to check the xml elements
54d54af [zsxwing] Add unit tests for StoragePage
e29fb53 [zsxwing] Update as per TD's comments
ccbee07 [zsxwing] Fix the code style
6dc42b4 [zsxwing] Fix the replication level of blocks
450fad1 [zsxwing] Merge branch 'master' into SPARK-4072-2
1e9ef52 [zsxwing] Don't categorize by Executor ID
ca0ab69 [zsxwing] Fix the code style
3de2762 [zsxwing] Make object BlockUpdatedInfo private
e95b594 [zsxwing] Add 'Aggregated Stream Block Metrics by Executor' table
ba5d0d1 [zsxwing] Refactor the unit test to improve the readability
4bbe341 [zsxwing] Revert JsonProtocol and don't log SparkListenerBlockUpdated
b464dd1 [zsxwing] Add onBlockUpdated to EventLoggingListener
5ba014c [zsxwing] Fix the code style
0b1e47b [zsxwing] Add a developer api BlockUpdatedInfo
04838a9 [zsxwing] Fix the code style
2baa161 [zsxwing] Add unit tests
80f6c6d [zsxwing] Address comments
797ee4b [zsxwing] Display Streaming blocks in Streaming UI
---
 .../org/apache/spark/JavaSparkListener.java   |  22 +-
 .../apache/spark/SparkFirehoseListener.java   |   6 +
 .../scheduler/EventLoggingListener.scala      |   3 +
 .../spark/scheduler/SparkListener.scala       |  10 +-
 .../spark/scheduler/SparkListenerBus.scala    |   2 +
 .../storage/BlockManagerMasterEndpoint.scala  |   3 +-
 .../spark/storage/BlockStatusListener.scala   | 105 ++++++++
 .../spark/storage/BlockUpdatedInfo.scala      |  47 ++++
 .../scala/org/apache/spark/ui/UIUtils.scala   |  14 +-
 .../apache/spark/ui/storage/StoragePage.scala | 148 ++++++++++-
 .../apache/spark/ui/storage/StorageTab.scala  |   3 +-
 .../storage/BlockStatusListenerSuite.scala    | 119 +++++++++
 .../spark/ui/storage/StoragePageSuite.scala   | 230 ++++++++++++++++++
 13 files changed, 684 insertions(+), 28 deletions(-)
 create mode 100644 core/src/main/scala/org/apache/spark/storage/BlockStatusListener.scala
 create mode 100644 core/src/main/scala/org/apache/spark/storage/BlockUpdatedInfo.scala
 create mode 100644 core/src/test/scala/org/apache/spark/storage/BlockStatusListenerSuite.scala
 create mode 100644 core/src/test/scala/org/apache/spark/ui/storage/StoragePageSuite.scala

diff --git a/core/src/main/java/org/apache/spark/JavaSparkListener.java b/core/src/main/java/org/apache/spark/JavaSparkListener.java
index 646496f313507..fa9acf0a15b88 100644
--- a/core/src/main/java/org/apache/spark/JavaSparkListener.java
+++ b/core/src/main/java/org/apache/spark/JavaSparkListener.java
@@ -17,23 +17,7 @@
 
 package org.apache.spark;
 
-import org.apache.spark.scheduler.SparkListener;
-import org.apache.spark.scheduler.SparkListenerApplicationEnd;
-import org.apache.spark.scheduler.SparkListenerApplicationStart;
-import org.apache.spark.scheduler.SparkListenerBlockManagerAdded;
-import org.apache.spark.scheduler.SparkListenerBlockManagerRemoved;
-import org.apache.spark.scheduler.SparkListenerEnvironmentUpdate;
-import org.apache.spark.scheduler.SparkListenerExecutorAdded;
-import org.apache.spark.scheduler.SparkListenerExecutorMetricsUpdate;
-import org.apache.spark.scheduler.SparkListenerExecutorRemoved;
-import org.apache.spark.scheduler.SparkListenerJobEnd;
-import org.apache.spark.scheduler.SparkListenerJobStart;
-import org.apache.spark.scheduler.SparkListenerStageCompleted;
-import org.apache.spark.scheduler.SparkListenerStageSubmitted;
-import org.apache.spark.scheduler.SparkListenerTaskEnd;
-import org.apache.spark.scheduler.SparkListenerTaskGettingResult;
-import org.apache.spark.scheduler.SparkListenerTaskStart;
-import org.apache.spark.scheduler.SparkListenerUnpersistRDD;
+import org.apache.spark.scheduler.*;
 
 /**
  * Java clients should extend this class instead of implementing
@@ -94,4 +78,8 @@ public void onExecutorAdded(SparkListenerExecutorAdded executorAdded) { }
 
   @Override
   public void onExecutorRemoved(SparkListenerExecutorRemoved executorRemoved) { }
+
+  @Override
+  public void onBlockUpdated(SparkListenerBlockUpdated blockUpdated) { }
+
 }
diff --git a/core/src/main/java/org/apache/spark/SparkFirehoseListener.java b/core/src/main/java/org/apache/spark/SparkFirehoseListener.java
index fbc5666959055..1214d05ba6063 100644
--- a/core/src/main/java/org/apache/spark/SparkFirehoseListener.java
+++ b/core/src/main/java/org/apache/spark/SparkFirehoseListener.java
@@ -112,4 +112,10 @@ public final void onExecutorAdded(SparkListenerExecutorAdded executorAdded) {
     public final void onExecutorRemoved(SparkListenerExecutorRemoved executorRemoved) {
         onEvent(executorRemoved);
     }
+
+    @Override
+    public void onBlockUpdated(SparkListenerBlockUpdated blockUpdated) {
+        onEvent(blockUpdated);
+    }
+
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
index 62b05033a9281..5a06ef02f5c57 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
@@ -199,6 +199,9 @@ private[spark] class EventLoggingListener(
     logEvent(event, flushLogger = true)
   }
 
+  // No-op because logging every update would be overkill
+  override def onBlockUpdated(event: SparkListenerBlockUpdated): Unit = {}
+
   // No-op because logging every update would be overkill
   override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate): Unit = { }
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
index 9620915f495ab..896f1743332f1 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
@@ -26,7 +26,7 @@ import org.apache.spark.{Logging, TaskEndReason}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.scheduler.cluster.ExecutorInfo
-import org.apache.spark.storage.BlockManagerId
+import org.apache.spark.storage.{BlockManagerId, BlockUpdatedInfo}
 import org.apache.spark.util.{Distribution, Utils}
 
 @DeveloperApi
@@ -98,6 +98,9 @@ case class SparkListenerExecutorAdded(time: Long, executorId: String, executorIn
 case class SparkListenerExecutorRemoved(time: Long, executorId: String, reason: String)
   extends SparkListenerEvent
 
+@DeveloperApi
+case class SparkListenerBlockUpdated(blockUpdatedInfo: BlockUpdatedInfo) extends SparkListenerEvent
+
 /**
  * Periodic updates from executors.
  * @param execId executor id
@@ -215,6 +218,11 @@ trait SparkListener {
    * Called when the driver removes an executor.
    */
   def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved) { }
+
+  /**
+   * Called when the driver receives a block update info.
+   */
+  def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated) { }
 }
 
 /**
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala
index 61e69ecc08387..04afde33f5aad 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala
@@ -58,6 +58,8 @@ private[spark] trait SparkListenerBus extends ListenerBus[SparkListener, SparkLi
         listener.onExecutorAdded(executorAdded)
       case executorRemoved: SparkListenerExecutorRemoved =>
         listener.onExecutorRemoved(executorRemoved)
+      case blockUpdated: SparkListenerBlockUpdated =>
+        listener.onBlockUpdated(blockUpdated)
       case logStart: SparkListenerLogStart => // ignore event log metadata
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
index 68ed9096731c5..5dc0c537cbb62 100644
--- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
+++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala
@@ -60,10 +60,11 @@ class BlockManagerMasterEndpoint(
       register(blockManagerId, maxMemSize, slaveEndpoint)
       context.reply(true)
 
-    case UpdateBlockInfo(
+    case _updateBlockInfo @ UpdateBlockInfo(
       blockManagerId, blockId, storageLevel, deserializedSize, size, externalBlockStoreSize) =>
       context.reply(updateBlockInfo(
         blockManagerId, blockId, storageLevel, deserializedSize, size, externalBlockStoreSize))
+      listenerBus.post(SparkListenerBlockUpdated(BlockUpdatedInfo(_updateBlockInfo)))
 
     case GetLocations(blockId) =>
       context.reply(getLocations(blockId))
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockStatusListener.scala b/core/src/main/scala/org/apache/spark/storage/BlockStatusListener.scala
new file mode 100644
index 0000000000000..2789e25b8d3ab
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/storage/BlockStatusListener.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.storage
+
+import scala.collection.mutable
+
+import org.apache.spark.scheduler._
+
+private[spark] case class BlockUIData(
+    blockId: BlockId,
+    location: String,
+    storageLevel: StorageLevel,
+    memSize: Long,
+    diskSize: Long,
+    externalBlockStoreSize: Long)
+
+/**
+ * The aggregated status of stream blocks in an executor
+ */
+private[spark] case class ExecutorStreamBlockStatus(
+    executorId: String,
+    location: String,
+    blocks: Seq[BlockUIData]) {
+
+  def totalMemSize: Long = blocks.map(_.memSize).sum
+
+  def totalDiskSize: Long = blocks.map(_.diskSize).sum
+
+  def totalExternalBlockStoreSize: Long = blocks.map(_.externalBlockStoreSize).sum
+
+  def numStreamBlocks: Int = blocks.size
+
+}
+
+private[spark] class BlockStatusListener extends SparkListener {
+
+  private val blockManagers =
+    new mutable.HashMap[BlockManagerId, mutable.HashMap[BlockId, BlockUIData]]
+
+  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
+    val blockId = blockUpdated.blockUpdatedInfo.blockId
+    if (!blockId.isInstanceOf[StreamBlockId]) {
+      // Now we only monitor StreamBlocks
+      return
+    }
+    val blockManagerId = blockUpdated.blockUpdatedInfo.blockManagerId
+    val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel
+    val memSize = blockUpdated.blockUpdatedInfo.memSize
+    val diskSize = blockUpdated.blockUpdatedInfo.diskSize
+    val externalBlockStoreSize = blockUpdated.blockUpdatedInfo.externalBlockStoreSize
+
+    synchronized {
+      // Drop the update info if the block manager is not registered
+      blockManagers.get(blockManagerId).foreach { blocksInBlockManager =>
+        if (storageLevel.isValid) {
+          blocksInBlockManager.put(blockId,
+            BlockUIData(
+              blockId,
+              blockManagerId.hostPort,
+              storageLevel,
+              memSize,
+              diskSize,
+              externalBlockStoreSize)
+          )
+        } else {
+          // If isValid is not true, it means we should drop the block.
+          blocksInBlockManager -= blockId
+        }
+      }
+    }
+  }
+
+  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded): Unit = {
+    synchronized {
+      blockManagers.put(blockManagerAdded.blockManagerId, mutable.HashMap())
+    }
+  }
+
+  override def onBlockManagerRemoved(
+      blockManagerRemoved: SparkListenerBlockManagerRemoved): Unit = synchronized {
+    blockManagers -= blockManagerRemoved.blockManagerId
+  }
+
+  def allExecutorStreamBlockStatus: Seq[ExecutorStreamBlockStatus] = synchronized {
+    blockManagers.map { case (blockManagerId, blocks) =>
+      ExecutorStreamBlockStatus(
+        blockManagerId.executorId, blockManagerId.hostPort, blocks.values.toSeq)
+    }.toSeq
+  }
+}
diff --git a/core/src/main/scala/org/apache/spark/storage/BlockUpdatedInfo.scala b/core/src/main/scala/org/apache/spark/storage/BlockUpdatedInfo.scala
new file mode 100644
index 0000000000000..a5790e4454a89
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/storage/BlockUpdatedInfo.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.storage
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.storage.BlockManagerMessages.UpdateBlockInfo
+
+/**
+ * :: DeveloperApi ::
+ * Stores information about a block status in a block manager.
+ */
+@DeveloperApi
+case class BlockUpdatedInfo(
+    blockManagerId: BlockManagerId,
+    blockId: BlockId,
+    storageLevel: StorageLevel,
+    memSize: Long,
+    diskSize: Long,
+    externalBlockStoreSize: Long)
+
+private[spark] object BlockUpdatedInfo {
+
+  private[spark] def apply(updateBlockInfo: UpdateBlockInfo): BlockUpdatedInfo = {
+    BlockUpdatedInfo(
+      updateBlockInfo.blockManagerId,
+      updateBlockInfo.blockId,
+      updateBlockInfo.storageLevel,
+      updateBlockInfo.memSize,
+      updateBlockInfo.diskSize,
+      updateBlockInfo.externalBlockStoreSize)
+  }
+}
diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
index 7898039519201..718aea7e1dc22 100644
--- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
@@ -27,7 +27,7 @@ import org.apache.spark.ui.scope.RDDOperationGraph
 
 /** Utility functions for generating XML pages with spark content. */
 private[spark] object UIUtils extends Logging {
-  val TABLE_CLASS_NOT_STRIPED = "table table-bordered table-condensed sortable"
+  val TABLE_CLASS_NOT_STRIPED = "table table-bordered table-condensed"
   val TABLE_CLASS_STRIPED = TABLE_CLASS_NOT_STRIPED + " table-striped"
 
   // SimpleDateFormat is not thread-safe. Don't expose it to avoid improper use.
@@ -267,9 +267,17 @@ private[spark] object UIUtils extends Logging {
       fixedWidth: Boolean = false,
       id: Option[String] = None,
       headerClasses: Seq[String] = Seq.empty,
-      stripeRowsWithCss: Boolean = true): Seq[Node] = {
+      stripeRowsWithCss: Boolean = true,
+      sortable: Boolean = true): Seq[Node] = {
 
-    val listingTableClass = if (stripeRowsWithCss) TABLE_CLASS_STRIPED else TABLE_CLASS_NOT_STRIPED
+    val listingTableClass = {
+      val _tableClass = if (stripeRowsWithCss) TABLE_CLASS_STRIPED else TABLE_CLASS_NOT_STRIPED
+      if (sortable) {
+        _tableClass + " sortable"
+      } else {
+        _tableClass
+      }
+    }
     val colWidth = 100.toDouble / headers.size
     val colWidthAttr = if (fixedWidth) colWidth + "%" else ""
 
diff --git a/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala b/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala
index 07db783c572cf..04f584621e71e 100644
--- a/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala
@@ -21,7 +21,7 @@ import javax.servlet.http.HttpServletRequest
 
 import scala.xml.Node
 
-import org.apache.spark.storage.RDDInfo
+import org.apache.spark.storage._
 import org.apache.spark.ui.{UIUtils, WebUIPage}
 import org.apache.spark.util.Utils
 
@@ -30,13 +30,25 @@ private[ui] class StoragePage(parent: StorageTab) extends WebUIPage("") {
   private val listener = parent.listener
 
   def render(request: HttpServletRequest): Seq[Node] = {
-    val rdds = listener.rddInfoList
-    val content = UIUtils.listingTable(rddHeader, rddRow, rdds, id = Some("storage-by-rdd-table"))
+    val content = rddTable(listener.rddInfoList) ++
+      receiverBlockTables(listener.allExecutorStreamBlockStatus.sortBy(_.executorId))
     UIUtils.headerSparkPage("Storage", content, parent)
   }
 
+  private[storage] def rddTable(rdds: Seq[RDDInfo]): Seq[Node] = {
+    if (rdds.isEmpty) {
+      // Don't show the rdd table if there is no RDD persisted.
+      Nil
+    } else {
+      <div>
+        <h4>RDDs</h4>
+        {UIUtils.listingTable(rddHeader, rddRow, rdds, id = Some("storage-by-rdd-table"))}
+      </div>
+    }
+  }
+
   /** Header fields for the RDD table */
-  private def rddHeader = Seq(
+  private val rddHeader = Seq(
     "RDD Name",
     "Storage Level",
     "Cached Partitions",
@@ -56,7 +68,7 @@ private[ui] class StoragePage(parent: StorageTab) extends WebUIPage("") {
       </td>
       <td>{rdd.storageLevel.description}
       </td>
-      <td>{rdd.numCachedPartitions}</td>
+      <td>{rdd.numCachedPartitions.toString}</td>
       <td>{"%.0f%%".format(rdd.numCachedPartitions * 100.0 / rdd.numPartitions)}</td>
       <td sorttable_customkey={rdd.memSize.toString}>{Utils.bytesToString(rdd.memSize)}</td>
       <td sorttable_customkey={rdd.externalBlockStoreSize.toString}>{Utils.bytesToString(rdd.externalBlockStoreSize)}</td>
@@ -64,4 +76,130 @@ private[ui] class StoragePage(parent: StorageTab) extends WebUIPage("") {
     </tr>
     // scalastyle:on
   }
+
+  private[storage] def receiverBlockTables(statuses: Seq[ExecutorStreamBlockStatus]): Seq[Node] = {
+    if (statuses.map(_.numStreamBlocks).sum == 0) {
+      // Don't show the tables if there is no stream block
+      Nil
+    } else {
+      val blocks = statuses.flatMap(_.blocks).groupBy(_.blockId).toSeq.sortBy(_._1.toString)
+
+      <div>
+        <h4>Receiver Blocks</h4>
+        {executorMetricsTable(statuses)}
+        {streamBlockTable(blocks)}
+      </div>
+    }
+  }
+
+  private def executorMetricsTable(statuses: Seq[ExecutorStreamBlockStatus]): Seq[Node] = {
+    <div>
+      <h5>Aggregated Block Metrics by Executor</h5>
+      {UIUtils.listingTable(executorMetricsTableHeader, executorMetricsTableRow, statuses,
+        id = Some("storage-by-executor-stream-blocks"))}
+    </div>
+  }
+
+  private val executorMetricsTableHeader = Seq(
+    "Executor ID",
+    "Address",
+    "Total Size in Memory",
+    "Total Size in ExternalBlockStore",
+    "Total Size on Disk",
+    "Stream Blocks")
+
+  private def executorMetricsTableRow(status: ExecutorStreamBlockStatus): Seq[Node] = {
+    <tr>
+      <td>
+        {status.executorId}
+      </td>
+      <td>
+        {status.location}
+      </td>
+      <td sorttable_customkey={status.totalMemSize.toString}>
+        {Utils.bytesToString(status.totalMemSize)}
+      </td>
+      <td sorttable_customkey={status.totalExternalBlockStoreSize.toString}>
+        {Utils.bytesToString(status.totalExternalBlockStoreSize)}
+      </td>
+      <td sorttable_customkey={status.totalDiskSize.toString}>
+        {Utils.bytesToString(status.totalDiskSize)}
+      </td>
+      <td>
+        {status.numStreamBlocks.toString}
+      </td>
+    </tr>
+  }
+
+  private def streamBlockTable(blocks: Seq[(BlockId, Seq[BlockUIData])]): Seq[Node] = {
+    if (blocks.isEmpty) {
+      Nil
+    } else {
+      <div>
+        <h5>Blocks</h5>
+        {UIUtils.listingTable(
+          streamBlockTableHeader,
+          streamBlockTableRow,
+          blocks,
+          id = Some("storage-by-block-table"),
+          sortable = false)}
+      </div>
+    }
+  }
+
+  private val streamBlockTableHeader = Seq(
+    "Block ID",
+    "Replication Level",
+    "Location",
+    "Storage Level",
+    "Size")
+
+  /** Render a stream block */
+  private def streamBlockTableRow(block: (BlockId, Seq[BlockUIData])): Seq[Node] = {
+    val replications = block._2
+    assert(replications.size > 0) // This must be true because it's the result of "groupBy"
+    if (replications.size == 1) {
+      streamBlockTableSubrow(block._1, replications.head, replications.size, true)
+    } else {
+      streamBlockTableSubrow(block._1, replications.head, replications.size, true) ++
+        replications.tail.map(streamBlockTableSubrow(block._1, _, replications.size, false)).flatten
+    }
+  }
+
+  private def streamBlockTableSubrow(
+      blockId: BlockId, block: BlockUIData, replication: Int, firstSubrow: Boolean): Seq[Node] = {
+    val (storageLevel, size) = streamBlockStorageLevelDescriptionAndSize(block)
+
+    <tr>
+      {
+        if (firstSubrow) {
+          <td rowspan={replication.toString}>
+            {block.blockId.toString}
+          </td>
+          <td rowspan={replication.toString}>
+            {replication.toString}
+          </td>
+        }
+      }
+      <td>{block.location}</td>
+      <td>{storageLevel}</td>
+      <td>{Utils.bytesToString(size)}</td>
+    </tr>
+  }
+
+  private[storage] def streamBlockStorageLevelDescriptionAndSize(
+      block: BlockUIData): (String, Long) = {
+    if (block.storageLevel.useDisk) {
+      ("Disk", block.diskSize)
+    } else if (block.storageLevel.useMemory && block.storageLevel.deserialized) {
+      ("Memory", block.memSize)
+    } else if (block.storageLevel.useMemory && !block.storageLevel.deserialized) {
+      ("Memory Serialized", block.memSize)
+    } else if (block.storageLevel.useOffHeap) {
+      ("External", block.externalBlockStoreSize)
+    } else {
+      throw new IllegalStateException(s"Invalid Storage Level: ${block.storageLevel}")
+    }
+  }
+
 }
diff --git a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
index 0351749700962..22e2993b3b5bd 100644
--- a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
+++ b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala
@@ -39,7 +39,8 @@ private[ui] class StorageTab(parent: SparkUI) extends SparkUITab(parent, "storag
  * This class is thread-safe (unlike JobProgressListener)
  */
 @DeveloperApi
-class StorageListener(storageStatusListener: StorageStatusListener) extends SparkListener {
+class StorageListener(storageStatusListener: StorageStatusListener) extends BlockStatusListener {
+
   private[ui] val _rddInfoMap = mutable.Map[Int, RDDInfo]() // exposed for testing
 
   def storageStatusList: Seq[StorageStatus] = storageStatusListener.storageStatusList
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockStatusListenerSuite.scala
new file mode 100644
index 0000000000000..d7ffde1e7864e
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/storage/BlockStatusListenerSuite.scala
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.storage
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.scheduler._
+
+class BlockStatusListenerSuite extends SparkFunSuite {
+
+  test("basic functions") {
+    val blockManagerId = BlockManagerId("0", "localhost", 10000)
+    val listener = new BlockStatusListener()
+
+    // Add a block manager and a new block status
+    listener.onBlockManagerAdded(SparkListenerBlockManagerAdded(0, blockManagerId, 0))
+    listener.onBlockUpdated(SparkListenerBlockUpdated(
+      BlockUpdatedInfo(
+        blockManagerId,
+        StreamBlockId(0, 100),
+        StorageLevel.MEMORY_AND_DISK,
+        memSize = 100,
+        diskSize = 100,
+        externalBlockStoreSize = 0)))
+    // The new block status should be added to the listener
+    val expectedBlock = BlockUIData(
+      StreamBlockId(0, 100),
+      "localhost:10000",
+      StorageLevel.MEMORY_AND_DISK,
+      memSize = 100,
+      diskSize = 100,
+      externalBlockStoreSize = 0
+    )
+    val expectedExecutorStreamBlockStatus = Seq(
+      ExecutorStreamBlockStatus("0", "localhost:10000", Seq(expectedBlock))
+    )
+    assert(listener.allExecutorStreamBlockStatus === expectedExecutorStreamBlockStatus)
+
+    // Add the second block manager
+    val blockManagerId2 = BlockManagerId("1", "localhost", 10001)
+    listener.onBlockManagerAdded(SparkListenerBlockManagerAdded(0, blockManagerId2, 0))
+    // Add a new replication of the same block id from the second manager
+    listener.onBlockUpdated(SparkListenerBlockUpdated(
+      BlockUpdatedInfo(
+        blockManagerId2,
+        StreamBlockId(0, 100),
+        StorageLevel.MEMORY_AND_DISK,
+        memSize = 100,
+        diskSize = 100,
+        externalBlockStoreSize = 0)))
+    val expectedBlock2 = BlockUIData(
+      StreamBlockId(0, 100),
+      "localhost:10001",
+      StorageLevel.MEMORY_AND_DISK,
+      memSize = 100,
+      diskSize = 100,
+      externalBlockStoreSize = 0
+    )
+    // Each block manager should contain one block
+    val expectedExecutorStreamBlockStatus2 = Set(
+      ExecutorStreamBlockStatus("0", "localhost:10000", Seq(expectedBlock)),
+      ExecutorStreamBlockStatus("1", "localhost:10001", Seq(expectedBlock2))
+    )
+    assert(listener.allExecutorStreamBlockStatus.toSet === expectedExecutorStreamBlockStatus2)
+
+    // Remove a replication of the same block
+    listener.onBlockUpdated(SparkListenerBlockUpdated(
+      BlockUpdatedInfo(
+        blockManagerId2,
+        StreamBlockId(0, 100),
+        StorageLevel.NONE, // StorageLevel.NONE means removing it
+        memSize = 0,
+        diskSize = 0,
+        externalBlockStoreSize = 0)))
+    // Only the first block manager contains a block
+    val expectedExecutorStreamBlockStatus3 = Set(
+      ExecutorStreamBlockStatus("0", "localhost:10000", Seq(expectedBlock)),
+      ExecutorStreamBlockStatus("1", "localhost:10001", Seq.empty)
+    )
+    assert(listener.allExecutorStreamBlockStatus.toSet === expectedExecutorStreamBlockStatus3)
+
+    // Remove the second block manager at first but add a new block status
+    // from this removed block manager
+    listener.onBlockManagerRemoved(SparkListenerBlockManagerRemoved(0, blockManagerId2))
+    listener.onBlockUpdated(SparkListenerBlockUpdated(
+      BlockUpdatedInfo(
+        blockManagerId2,
+        StreamBlockId(0, 100),
+        StorageLevel.MEMORY_AND_DISK,
+        memSize = 100,
+        diskSize = 100,
+        externalBlockStoreSize = 0)))
+    // The second block manager is removed so we should not see the new block
+    val expectedExecutorStreamBlockStatus4 = Seq(
+      ExecutorStreamBlockStatus("0", "localhost:10000", Seq(expectedBlock))
+    )
+    assert(listener.allExecutorStreamBlockStatus === expectedExecutorStreamBlockStatus4)
+
+    // Remove the last block manager
+    listener.onBlockManagerRemoved(SparkListenerBlockManagerRemoved(0, blockManagerId))
+    // No block manager now so we should dop all block managers
+    assert(listener.allExecutorStreamBlockStatus.isEmpty)
+  }
+
+}
diff --git a/core/src/test/scala/org/apache/spark/ui/storage/StoragePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/storage/StoragePageSuite.scala
new file mode 100644
index 0000000000000..3dab15a9d4691
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/ui/storage/StoragePageSuite.scala
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ui.storage
+
+import scala.xml.Utility
+
+import org.mockito.Mockito._
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.storage._
+
+class StoragePageSuite extends SparkFunSuite {
+
+  val storageTab = mock(classOf[StorageTab])
+  when(storageTab.basePath).thenReturn("http://localhost:4040")
+  val storagePage = new StoragePage(storageTab)
+
+  test("rddTable") {
+    val rdd1 = new RDDInfo(1,
+      "rdd1",
+      10,
+      StorageLevel.MEMORY_ONLY,
+      Seq.empty)
+    rdd1.memSize = 100
+    rdd1.numCachedPartitions = 10
+
+    val rdd2 = new RDDInfo(2,
+      "rdd2",
+      10,
+      StorageLevel.DISK_ONLY,
+      Seq.empty)
+    rdd2.diskSize = 200
+    rdd2.numCachedPartitions = 5
+
+    val rdd3 = new RDDInfo(3,
+      "rdd3",
+      10,
+      StorageLevel.MEMORY_AND_DISK_SER,
+      Seq.empty)
+    rdd3.memSize = 400
+    rdd3.diskSize = 500
+    rdd3.numCachedPartitions = 10
+
+    val xmlNodes = storagePage.rddTable(Seq(rdd1, rdd2, rdd3))
+
+    val headers = Seq(
+      "RDD Name",
+      "Storage Level",
+      "Cached Partitions",
+      "Fraction Cached",
+      "Size in Memory",
+      "Size in ExternalBlockStore",
+      "Size on Disk")
+    assert((xmlNodes \\ "th").map(_.text) === headers)
+
+    assert((xmlNodes \\ "tr").size === 3)
+    assert(((xmlNodes \\ "tr")(0) \\ "td").map(_.text.trim) ===
+      Seq("rdd1", "Memory Deserialized 1x Replicated", "10", "100%", "100.0 B", "0.0 B", "0.0 B"))
+    // Check the url
+    assert(((xmlNodes \\ "tr")(0) \\ "td" \ "a")(0).attribute("href").map(_.text) ===
+      Some("http://localhost:4040/storage/rdd?id=1"))
+
+    assert(((xmlNodes \\ "tr")(1) \\ "td").map(_.text.trim) ===
+      Seq("rdd2", "Disk Serialized 1x Replicated", "5", "50%", "0.0 B", "0.0 B", "200.0 B"))
+    // Check the url
+    assert(((xmlNodes \\ "tr")(1) \\ "td" \ "a")(0).attribute("href").map(_.text) ===
+      Some("http://localhost:4040/storage/rdd?id=2"))
+
+    assert(((xmlNodes \\ "tr")(2) \\ "td").map(_.text.trim) ===
+      Seq("rdd3", "Disk Memory Serialized 1x Replicated", "10", "100%", "400.0 B", "0.0 B",
+        "500.0 B"))
+    // Check the url
+    assert(((xmlNodes \\ "tr")(2) \\ "td" \ "a")(0).attribute("href").map(_.text) ===
+      Some("http://localhost:4040/storage/rdd?id=3"))
+  }
+
+  test("empty rddTable") {
+    assert(storagePage.rddTable(Seq.empty).isEmpty)
+  }
+
+  test("streamBlockStorageLevelDescriptionAndSize") {
+    val memoryBlock = BlockUIData(StreamBlockId(0, 0),
+      "localhost:1111",
+      StorageLevel.MEMORY_ONLY,
+      memSize = 100,
+      diskSize = 0,
+      externalBlockStoreSize = 0)
+    assert(("Memory", 100) === storagePage.streamBlockStorageLevelDescriptionAndSize(memoryBlock))
+
+    val memorySerializedBlock = BlockUIData(StreamBlockId(0, 0),
+      "localhost:1111",
+      StorageLevel.MEMORY_ONLY_SER,
+      memSize = 100,
+      diskSize = 0,
+      externalBlockStoreSize = 0)
+    assert(("Memory Serialized", 100) ===
+      storagePage.streamBlockStorageLevelDescriptionAndSize(memorySerializedBlock))
+
+    val diskBlock = BlockUIData(StreamBlockId(0, 0),
+      "localhost:1111",
+      StorageLevel.DISK_ONLY,
+      memSize = 0,
+      diskSize = 100,
+      externalBlockStoreSize = 0)
+    assert(("Disk", 100) === storagePage.streamBlockStorageLevelDescriptionAndSize(diskBlock))
+
+    val externalBlock = BlockUIData(StreamBlockId(0, 0),
+      "localhost:1111",
+      StorageLevel.OFF_HEAP,
+      memSize = 0,
+      diskSize = 0,
+      externalBlockStoreSize = 100)
+    assert(("External", 100) ===
+      storagePage.streamBlockStorageLevelDescriptionAndSize(externalBlock))
+  }
+
+  test("receiverBlockTables") {
+    val blocksForExecutor0 = Seq(
+      BlockUIData(StreamBlockId(0, 0),
+        "localhost:10000",
+        StorageLevel.MEMORY_ONLY,
+        memSize = 100,
+        diskSize = 0,
+        externalBlockStoreSize = 0),
+      BlockUIData(StreamBlockId(1, 1),
+        "localhost:10000",
+        StorageLevel.DISK_ONLY,
+        memSize = 0,
+        diskSize = 100,
+        externalBlockStoreSize = 0)
+    )
+    val executor0 = ExecutorStreamBlockStatus("0", "localhost:10000", blocksForExecutor0)
+
+    val blocksForExecutor1 = Seq(
+      BlockUIData(StreamBlockId(0, 0),
+        "localhost:10001",
+        StorageLevel.MEMORY_ONLY,
+        memSize = 100,
+        diskSize = 0,
+        externalBlockStoreSize = 0),
+      BlockUIData(StreamBlockId(2, 2),
+        "localhost:10001",
+        StorageLevel.OFF_HEAP,
+        memSize = 0,
+        diskSize = 0,
+        externalBlockStoreSize = 200),
+      BlockUIData(StreamBlockId(1, 1),
+        "localhost:10001",
+        StorageLevel.MEMORY_ONLY_SER,
+        memSize = 100,
+        diskSize = 0,
+        externalBlockStoreSize = 0)
+    )
+    val executor1 = ExecutorStreamBlockStatus("1", "localhost:10001", blocksForExecutor1)
+    val xmlNodes = storagePage.receiverBlockTables(Seq(executor0, executor1))
+
+    val executorTable = (xmlNodes \\ "table")(0)
+    val executorHeaders = Seq(
+      "Executor ID",
+      "Address",
+      "Total Size in Memory",
+      "Total Size in ExternalBlockStore",
+      "Total Size on Disk",
+      "Stream Blocks")
+    assert((executorTable \\ "th").map(_.text) === executorHeaders)
+
+    assert((executorTable \\ "tr").size === 2)
+    assert(((executorTable \\ "tr")(0) \\ "td").map(_.text.trim) ===
+      Seq("0", "localhost:10000", "100.0 B", "0.0 B", "100.0 B", "2"))
+    assert(((executorTable \\ "tr")(1) \\ "td").map(_.text.trim) ===
+      Seq("1", "localhost:10001", "200.0 B", "200.0 B", "0.0 B", "3"))
+
+    val blockTable = (xmlNodes \\ "table")(1)
+    val blockHeaders = Seq(
+      "Block ID",
+      "Replication Level",
+      "Location",
+      "Storage Level",
+      "Size")
+    assert((blockTable \\ "th").map(_.text) === blockHeaders)
+
+    assert((blockTable \\ "tr").size === 5)
+    assert(((blockTable \\ "tr")(0) \\ "td").map(_.text.trim) ===
+      Seq("input-0-0", "2", "localhost:10000", "Memory", "100.0 B"))
+    // Check "rowspan=2" for the first 2 columns
+    assert(((blockTable \\ "tr")(0) \\ "td")(0).attribute("rowspan").map(_.text) === Some("2"))
+    assert(((blockTable \\ "tr")(0) \\ "td")(1).attribute("rowspan").map(_.text) === Some("2"))
+
+    assert(((blockTable \\ "tr")(1) \\ "td").map(_.text.trim) ===
+      Seq("localhost:10001", "Memory", "100.0 B"))
+
+    assert(((blockTable \\ "tr")(2) \\ "td").map(_.text.trim) ===
+      Seq("input-1-1", "2", "localhost:10000", "Disk", "100.0 B"))
+    // Check "rowspan=2" for the first 2 columns
+    assert(((blockTable \\ "tr")(2) \\ "td")(0).attribute("rowspan").map(_.text) === Some("2"))
+    assert(((blockTable \\ "tr")(2) \\ "td")(1).attribute("rowspan").map(_.text) === Some("2"))
+
+    assert(((blockTable \\ "tr")(3) \\ "td").map(_.text.trim) ===
+      Seq("localhost:10001", "Memory Serialized", "100.0 B"))
+
+    assert(((blockTable \\ "tr")(4) \\ "td").map(_.text.trim) ===
+      Seq("input-2-2", "1", "localhost:10001", "External", "200.0 B"))
+    // Check "rowspan=1" for the first 2 columns
+    assert(((blockTable \\ "tr")(4) \\ "td")(0).attribute("rowspan").map(_.text) === Some("1"))
+    assert(((blockTable \\ "tr")(4) \\ "td")(1).attribute("rowspan").map(_.text) === Some("1"))
+  }
+
+  test("empty receiverBlockTables") {
+    assert(storagePage.receiverBlockTables(Seq.empty).isEmpty)
+
+    val executor0 = ExecutorStreamBlockStatus("0", "localhost:10000", Seq.empty)
+    val executor1 = ExecutorStreamBlockStatus("1", "localhost:10001", Seq.empty)
+    assert(storagePage.receiverBlockTables(Seq(executor0, executor1)).isEmpty)
+  }
+}

From 4b5cfc988f23988c2334882a255d494fc93d252e Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Tue, 14 Jul 2015 14:19:27 -0700
Subject: [PATCH 149/549] [SPARK-8800] [SQL] Fix inaccurate precision/scale of
 Decimal division operation

JIRA: https://issues.apache.org/jira/browse/SPARK-8800

Previously, we turn to Java BigDecimal's divide with specified ROUNDING_MODE to avoid non-terminating decimal expansion problem. However, as JihongMA reported, for the division operation on some specific values, we get inaccurate results.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #7212 from viirya/fix_decimal4 and squashes the following commits:

4205a0a [Liang-Chi Hsieh] Fix inaccuracy precision/scale of Decimal division operation.
---
 .../scala/org/apache/spark/sql/types/Decimal.scala | 14 +++++++++++---
 .../spark/sql/types/decimal/DecimalSuite.scala     | 10 +++++++++-
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
index 5a169488c97eb..f5bd068d60dc4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
@@ -145,6 +145,14 @@ final class Decimal extends Ordered[Decimal] with Serializable {
     }
   }
 
+  def toLimitedBigDecimal: BigDecimal = {
+    if (decimalVal.ne(null)) {
+      decimalVal
+    } else {
+      BigDecimal(longVal, _scale)
+    }
+  }
+
   def toJavaBigDecimal: java.math.BigDecimal = toBigDecimal.underlying()
 
   def toUnscaledLong: Long = {
@@ -269,9 +277,9 @@ final class Decimal extends Ordered[Decimal] with Serializable {
     if (that.isZero) {
       null
     } else {
-      // To avoid non-terminating decimal expansion problem, we turn to Java BigDecimal's divide
-      // with specified ROUNDING_MODE.
-      Decimal(toJavaBigDecimal.divide(that.toJavaBigDecimal, ROUNDING_MODE.id))
+      // To avoid non-terminating decimal expansion problem, we get scala's BigDecimal with limited
+      // precision and scala.
+      Decimal(toLimitedBigDecimal / that.toLimitedBigDecimal)
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
index 5f312964e5bf7..030bb6d21b18b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
@@ -170,6 +170,14 @@ class DecimalSuite extends SparkFunSuite with PrivateMethodTester {
 
   test("fix non-terminating decimal expansion problem") {
     val decimal = Decimal(1.0, 10, 3) / Decimal(3.0, 10, 3)
-    assert(decimal.toString === "0.333")
+    // The difference between decimal should not be more than 0.001.
+    assert(decimal.toDouble - 0.333 < 0.001)
+  }
+
+  test("fix loss of precision/scale when doing division operation") {
+    val a = Decimal(2) / Decimal(3)
+    assert(a.toDouble < 1.0 && a.toDouble > 0.6)
+    val b = Decimal(1) / Decimal(8)
+    assert(b.toDouble === 0.125)
   }
 }

From 740b034f1ca885a386f5a9ef7e0c81c714b047ff Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Tue, 14 Jul 2015 22:44:54 +0100
Subject: [PATCH 150/549] [SPARK-4362] [MLLIB] Make prediction probability
 available in NaiveBayesModel

Add predictProbabilities to Naive Bayes, return class probabilities.

Continues https://github.com/apache/spark/pull/6761

Author: Sean Owen <sowen@cloudera.com>

Closes #7376 from srowen/SPARK-4362 and squashes the following commits:

23d5a76 [Sean Owen] Fix model.labels -> model.theta
95d91fb [Sean Owen] Check that predicted probabilities sum to 1
b32d1c8 [Sean Owen] Add predictProbabilities to Naive Bayes, return class probabilities
---
 .../mllib/classification/NaiveBayes.scala     | 76 +++++++++++++++----
 .../classification/NaiveBayesSuite.scala      | 55 +++++++++++++-
 2 files changed, 113 insertions(+), 18 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index f51ee36d0dfcb..9e379d7d74b2f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -93,26 +93,70 @@ class NaiveBayesModel private[mllib] (
   override def predict(testData: Vector): Double = {
     modelType match {
       case Multinomial =>
-        val prob = thetaMatrix.multiply(testData)
-        BLAS.axpy(1.0, piVector, prob)
-        labels(prob.argmax)
+        labels(multinomialCalculation(testData).argmax)
       case Bernoulli =>
-        testData.foreachActive { (index, value) =>
-          if (value != 0.0 && value != 1.0) {
-            throw new SparkException(
-              s"Bernoulli naive Bayes requires 0 or 1 feature values but found $testData.")
-          }
-        }
-        val prob = thetaMinusNegTheta.get.multiply(testData)
-        BLAS.axpy(1.0, piVector, prob)
-        BLAS.axpy(1.0, negThetaSum.get, prob)
-        labels(prob.argmax)
-      case _ =>
-        // This should never happen.
-        throw new UnknownError(s"Invalid modelType: $modelType.")
+        labels(bernoulliCalculation(testData).argmax)
+    }
+  }
+
+  /**
+   * Predict values for the given data set using the model trained.
+   *
+   * @param testData RDD representing data points to be predicted
+   * @return an RDD[Vector] where each entry contains the predicted posterior class probabilities,
+   *         in the same order as class labels
+   */
+  def predictProbabilities(testData: RDD[Vector]): RDD[Vector] = {
+    val bcModel = testData.context.broadcast(this)
+    testData.mapPartitions { iter =>
+      val model = bcModel.value
+      iter.map(model.predictProbabilities)
     }
   }
 
+  /**
+   * Predict posterior class probabilities for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return predicted posterior class probabilities from the trained model,
+   *         in the same order as class labels
+   */
+  def predictProbabilities(testData: Vector): Vector = {
+    modelType match {
+      case Multinomial =>
+        posteriorProbabilities(multinomialCalculation(testData))
+      case Bernoulli =>
+        posteriorProbabilities(bernoulliCalculation(testData))
+    }
+  }
+
+  private def multinomialCalculation(testData: Vector) = {
+    val prob = thetaMatrix.multiply(testData)
+    BLAS.axpy(1.0, piVector, prob)
+    prob
+  }
+
+  private def bernoulliCalculation(testData: Vector) = {
+    testData.foreachActive((_, value) =>
+      if (value != 0.0 && value != 1.0) {
+        throw new SparkException(
+          s"Bernoulli naive Bayes requires 0 or 1 feature values but found $testData.")
+      }
+    )
+    val prob = thetaMinusNegTheta.get.multiply(testData)
+    BLAS.axpy(1.0, piVector, prob)
+    BLAS.axpy(1.0, negThetaSum.get, prob)
+    prob
+  }
+
+  private def posteriorProbabilities(logProb: DenseVector) = {
+    val logProbArray = logProb.toArray
+    val maxLog = logProbArray.max
+    val scaledProbs = logProbArray.map(lp => math.exp(lp - maxLog))
+    val probSum = scaledProbs.sum
+    new DenseVector(scaledProbs.map(_ / probSum))
+  }
+
   override def save(sc: SparkContext, path: String): Unit = {
     val data = NaiveBayesModel.SaveLoadV2_0.Data(labels, pi, theta, modelType)
     NaiveBayesModel.SaveLoadV2_0.save(sc, path, data)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index f7fc8730606af..cffa1ab700f80 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -19,13 +19,14 @@ package org.apache.spark.mllib.classification
 
 import scala.util.Random
 
-import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum}
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, Vector => BV}
 import breeze.stats.distributions.{Multinomial => BrzMultinomial}
 
 import org.apache.spark.{SparkException, SparkFunSuite}
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
+import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
 
 object NaiveBayesSuite {
@@ -154,6 +155,29 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     // Test prediction on Array.
     validatePrediction(validationData.map(row => model.predict(row.features)), validationData)
+
+    // Test posteriors
+    validationData.map(_.features).foreach { features =>
+      val predicted = model.predictProbabilities(features).toArray
+      assert(predicted.sum ~== 1.0 relTol 1.0e-10)
+      val expected = expectedMultinomialProbabilities(model, features)
+      expected.zip(predicted).foreach { case (e, p) => assert(e ~== p relTol 1.0e-10) }
+    }
+  }
+
+  /**
+   * @param model Multinomial Naive Bayes model
+   * @param testData input to compute posterior probabilities for
+   * @return posterior class probabilities (in order of labels) for input
+   */
+  private def expectedMultinomialProbabilities(model: NaiveBayesModel, testData: Vector) = {
+    val piVector = new BDV(model.pi)
+    // model.theta is row-major; treat it as col-major representation of transpose, and transpose:
+    val thetaMatrix = new BDM(model.theta(0).length, model.theta.length, model.theta.flatten).t
+    val logClassProbs: BV[Double] = piVector + (thetaMatrix * testData.toBreeze)
+    val classProbs = logClassProbs.toArray.map(math.exp)
+    val classProbsSum = classProbs.sum
+    classProbs.map(_ / classProbsSum)
   }
 
   test("Naive Bayes Bernoulli") {
@@ -182,6 +206,33 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     // Test prediction on Array.
     validatePrediction(validationData.map(row => model.predict(row.features)), validationData)
+
+    // Test posteriors
+    validationData.map(_.features).foreach { features =>
+      val predicted = model.predictProbabilities(features).toArray
+      assert(predicted.sum ~== 1.0 relTol 1.0e-10)
+      val expected = expectedBernoulliProbabilities(model, features)
+      expected.zip(predicted).foreach { case (e, p) => assert(e ~== p relTol 1.0e-10) }
+    }
+  }
+
+  /**
+   * @param model Bernoulli Naive Bayes model
+   * @param testData input to compute posterior probabilities for
+   * @return posterior class probabilities (in order of labels) for input
+   */
+  private def expectedBernoulliProbabilities(model: NaiveBayesModel, testData: Vector) = {
+    val piVector = new BDV(model.pi)
+    val thetaMatrix = new BDM(model.theta(0).length, model.theta.length, model.theta.flatten).t
+    val negThetaMatrix = new BDM(model.theta(0).length, model.theta.length,
+      model.theta.flatten.map(v => math.log(1.0 - math.exp(v)))).t
+    val testBreeze = testData.toBreeze
+    val negTestBreeze = new BDV(Array.fill(testBreeze.size)(1.0)) - testBreeze
+    val piTheta: BV[Double] = piVector + (thetaMatrix * testBreeze)
+    val logClassProbs: BV[Double] = piTheta + (negThetaMatrix * negTestBreeze)
+    val classProbs = logClassProbs.toArray.map(math.exp)
+    val classProbsSum = classProbs.sum
+    classProbs.map(_ / classProbsSum)
   }
 
   test("detect negative values") {

From 11e5c372862ec00e57460b37ccfee51c6d93c5f7 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Tue, 14 Jul 2015 16:08:17 -0700
Subject: [PATCH 151/549] [SPARK-8962] Add Scalastyle rule to ban direct use of
 Class.forName; fix existing uses

This pull request adds a Scalastyle regex rule which fails the style check if `Class.forName` is used directly.  `Class.forName` always loads classes from the default / system classloader, but in a majority of cases, we should be using Spark's own `Utils.classForName` instead, which tries to load classes from the current thread's context classloader and falls back to the classloader which loaded Spark when the context classloader is not defined.

<!-- Reviewable:start -->
[<img src="https://reviewable.io/review_button.png" height=40 alt="Review on Reviewable"/>](https://reviewable.io/reviews/apache/spark/7350)
<!-- Reviewable:end -->

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7350 from JoshRosen/ban-Class.forName and squashes the following commits:

e3e96f7 [Josh Rosen] Merge remote-tracking branch 'origin/master' into ban-Class.forName
c0b7885 [Josh Rosen] Hopefully fix the last two cases
d707ba7 [Josh Rosen] Fix uses of Class.forName that I missed in my first cleanup pass
046470d [Josh Rosen] Merge remote-tracking branch 'origin/master' into ban-Class.forName
62882ee [Josh Rosen] Fix uses of Class.forName or add exclusion.
d9abade [Josh Rosen] Add stylechecker rule to ban uses of Class.forName
---
 .../main/scala/org/apache/spark/Logging.scala  |  2 +-
 .../scala/org/apache/spark/SparkContext.scala  | 11 +++++------
 .../main/scala/org/apache/spark/SparkEnv.scala |  2 +-
 .../apache/spark/api/r/RBackendHandler.scala   | 18 ++----------------
 .../spark/broadcast/BroadcastManager.scala     |  3 ++-
 .../apache/spark/deploy/SparkHadoopUtil.scala  |  4 ++--
 .../org/apache/spark/deploy/SparkSubmit.scala  |  2 +-
 .../spark/deploy/SparkSubmitArguments.scala    |  2 +-
 .../spark/deploy/history/HistoryServer.scala   |  2 +-
 .../apache/spark/deploy/master/Master.scala    |  2 +-
 .../rest/SubmitRestProtocolMessage.scala       |  2 +-
 .../spark/deploy/worker/DriverWrapper.scala    |  2 +-
 .../spark/deploy/worker/WorkerArguments.scala  |  2 ++
 .../org/apache/spark/executor/Executor.scala   |  2 +-
 .../org/apache/spark/io/CompressionCodec.scala |  3 +--
 .../spark/mapred/SparkHadoopMapRedUtil.scala   |  5 +++--
 .../mapreduce/SparkHadoopMapReduceUtil.scala   |  9 +++++----
 .../apache/spark/metrics/MetricsSystem.scala   |  6 ++++--
 .../scala/org/apache/spark/rdd/HadoopRDD.scala |  6 +++---
 .../scala/org/apache/spark/rpc/RpcEnv.scala    |  3 +--
 .../spark/serializer/JavaSerializer.scala      |  5 ++++-
 .../spark/serializer/KryoSerializer.scala      |  2 ++
 .../serializer/SerializationDebugger.scala     |  2 ++
 .../spark/storage/ExternalBlockStore.scala     |  2 +-
 .../org/apache/spark/util/ClosureCleaner.scala |  2 ++
 .../org/apache/spark/util/SizeEstimator.scala  |  2 ++
 .../scala/org/apache/spark/util/Utils.scala    | 11 +++++++++--
 .../scala/org/apache/spark/FileSuite.scala     |  2 ++
 .../SparkContextSchedulerCreationSuite.scala   |  3 ++-
 .../apache/spark/deploy/SparkSubmitSuite.scala |  4 ++--
 .../org/apache/spark/rdd/JdbcRDDSuite.scala    |  3 ++-
 .../KryoSerializerDistributedSuite.scala       |  2 ++
 .../util/MutableURLClassLoaderSuite.scala      |  2 ++
 .../spark/streaming/flume/sink/Logging.scala   |  2 ++
 .../spark/graphx/util/BytecodeUtils.scala      |  2 +-
 .../org/apache/spark/repl/SparkIMain.scala     |  2 ++
 scalastyle-config.xml                          | 11 +++++++++++
 .../org/apache/spark/sql/types/DataType.scala  |  3 ++-
 .../org/apache/spark/sql/SQLContext.scala      |  3 +--
 .../spark/sql/parquet/ParquetRelation.scala    |  7 ++++---
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala  |  3 ++-
 .../apache/spark/sql/jdbc/JDBCWriteSuite.scala |  3 ++-
 .../thriftserver/HiveThriftServer2Suites.scala |  2 +-
 .../apache/spark/sql/hive/TableReader.scala    |  4 +---
 .../spark/sql/hive/client/ClientWrapper.scala  |  9 ++++-----
 .../spark/sql/hive/HiveSparkSubmitSuite.scala  |  8 ++++----
 .../streaming/scheduler/JobGenerator.scala     |  6 +++---
 .../spark/tools/GenerateMIMAIgnore.scala       |  2 ++
 .../org/apache/spark/deploy/yarn/Client.scala  |  4 ++--
 49 files changed, 117 insertions(+), 84 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/Logging.scala b/core/src/main/scala/org/apache/spark/Logging.scala
index 87ab099267b2f..f0598816d6c07 100644
--- a/core/src/main/scala/org/apache/spark/Logging.scala
+++ b/core/src/main/scala/org/apache/spark/Logging.scala
@@ -159,7 +159,7 @@ private object Logging {
   try {
     // We use reflection here to handle the case where users remove the
     // slf4j-to-jul bridge order to route their logs to JUL.
-    val bridgeClass = Class.forName("org.slf4j.bridge.SLF4JBridgeHandler")
+    val bridgeClass = Utils.classForName("org.slf4j.bridge.SLF4JBridgeHandler")
     bridgeClass.getMethod("removeHandlersForRootLogger").invoke(null)
     val installed = bridgeClass.getMethod("isInstalled").invoke(null).asInstanceOf[Boolean]
     if (!installed) {
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 82704b1ab2189..bd1cc332a63e7 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1968,7 +1968,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       for (className <- listenerClassNames) {
         // Use reflection to find the right constructor
         val constructors = {
-          val listenerClass = Class.forName(className)
+          val listenerClass = Utils.classForName(className)
           listenerClass.getConstructors.asInstanceOf[Array[Constructor[_ <: SparkListener]]]
         }
         val constructorTakingSparkConf = constructors.find { c =>
@@ -2503,7 +2503,7 @@ object SparkContext extends Logging {
             "\"yarn-standalone\" is deprecated as of Spark 1.0. Use \"yarn-cluster\" instead.")
         }
         val scheduler = try {
-          val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClusterScheduler")
+          val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnClusterScheduler")
           val cons = clazz.getConstructor(classOf[SparkContext])
           cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
         } catch {
@@ -2515,7 +2515,7 @@ object SparkContext extends Logging {
         }
         val backend = try {
           val clazz =
-            Class.forName("org.apache.spark.scheduler.cluster.YarnClusterSchedulerBackend")
+            Utils.classForName("org.apache.spark.scheduler.cluster.YarnClusterSchedulerBackend")
           val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
           cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
         } catch {
@@ -2528,8 +2528,7 @@ object SparkContext extends Logging {
 
       case "yarn-client" =>
         val scheduler = try {
-          val clazz =
-            Class.forName("org.apache.spark.scheduler.cluster.YarnScheduler")
+          val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnScheduler")
           val cons = clazz.getConstructor(classOf[SparkContext])
           cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
 
@@ -2541,7 +2540,7 @@ object SparkContext extends Logging {
 
         val backend = try {
           val clazz =
-            Class.forName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend")
+            Utils.classForName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend")
           val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
           cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
         } catch {
diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala
index d18fc599e9890..adfece4d6e7c0 100644
--- a/core/src/main/scala/org/apache/spark/SparkEnv.scala
+++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -261,7 +261,7 @@ object SparkEnv extends Logging {
 
     // Create an instance of the class with the given name, possibly initializing it with our conf
     def instantiateClass[T](className: String): T = {
-      val cls = Class.forName(className, true, Utils.getContextOrSparkClassLoader)
+      val cls = Utils.classForName(className)
       // Look for a constructor taking a SparkConf and a boolean isDriver, then one taking just
       // SparkConf, then one taking no arguments
       try {
diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
index 4b8f7fe9242e0..9658e9a696ffa 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
@@ -26,6 +26,7 @@ import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler}
 
 import org.apache.spark.Logging
 import org.apache.spark.api.r.SerDe._
+import org.apache.spark.util.Utils
 
 /**
  * Handler for RBackend
@@ -88,21 +89,6 @@ private[r] class RBackendHandler(server: RBackend)
     ctx.close()
   }
 
-  // Looks up a class given a class name. This function first checks the
-  // current class loader and if a class is not found, it looks up the class
-  // in the context class loader. Address [SPARK-5185]
-  def getStaticClass(objId: String): Class[_] = {
-    try {
-      val clsCurrent = Class.forName(objId)
-      clsCurrent
-    } catch {
-      // Use contextLoader if we can't find the JAR in the system class loader
-      case e: ClassNotFoundException =>
-        val clsContext = Class.forName(objId, true, Thread.currentThread().getContextClassLoader)
-        clsContext
-      }
-    }
-
   def handleMethodCall(
       isStatic: Boolean,
       objId: String,
@@ -113,7 +99,7 @@ private[r] class RBackendHandler(server: RBackend)
     var obj: Object = null
     try {
       val cls = if (isStatic) {
-        getStaticClass(objId)
+        Utils.classForName(objId)
       } else {
         JVMObjectTracker.get(objId) match {
           case None => throw new IllegalArgumentException("Object not found " + objId)
diff --git a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala
index 685313ac009ba..fac6666bb3410 100644
--- a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala
+++ b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala
@@ -22,6 +22,7 @@ import java.util.concurrent.atomic.AtomicLong
 import scala.reflect.ClassTag
 
 import org.apache.spark._
+import org.apache.spark.util.Utils
 
 private[spark] class BroadcastManager(
     val isDriver: Boolean,
@@ -42,7 +43,7 @@ private[spark] class BroadcastManager(
           conf.get("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")
 
         broadcastFactory =
-          Class.forName(broadcastFactoryClass).newInstance.asInstanceOf[BroadcastFactory]
+          Utils.classForName(broadcastFactoryClass).newInstance.asInstanceOf[BroadcastFactory]
 
         // Initialize appropriate BroadcastFactory and BroadcastObject
         broadcastFactory.initialize(isDriver, conf, securityManager)
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 6d14590a1d192..9f94118829ff1 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -178,7 +178,7 @@ class SparkHadoopUtil extends Logging {
 
   private def getFileSystemThreadStatisticsMethod(methodName: String): Method = {
     val statisticsDataClass =
-      Class.forName("org.apache.hadoop.fs.FileSystem$Statistics$StatisticsData")
+      Utils.classForName("org.apache.hadoop.fs.FileSystem$Statistics$StatisticsData")
     statisticsDataClass.getDeclaredMethod(methodName)
   }
 
@@ -356,7 +356,7 @@ object SparkHadoopUtil {
         System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE")))
     if (yarnMode) {
       try {
-        Class.forName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil")
+        Utils.classForName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil")
           .newInstance()
           .asInstanceOf[SparkHadoopUtil]
       } catch {
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 7089a7e26707f..036cb6e054791 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -624,7 +624,7 @@ object SparkSubmit {
     var mainClass: Class[_] = null
 
     try {
-      mainClass = Class.forName(childMainClass, true, loader)
+      mainClass = Utils.classForName(childMainClass)
     } catch {
       case e: ClassNotFoundException =>
         e.printStackTrace(printStream)
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index ebb39c354dff1..b3710073e330c 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -576,7 +576,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
       System.setSecurityManager(sm)
 
       try {
-        Class.forName(mainClass).getMethod("main", classOf[Array[String]])
+        Utils.classForName(mainClass).getMethod("main", classOf[Array[String]])
           .invoke(null, Array(HELP))
       } catch {
         case e: InvocationTargetException =>
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
index 10638afb74900..a076a9c3f984d 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
@@ -228,7 +228,7 @@ object HistoryServer extends Logging {
 
     val providerName = conf.getOption("spark.history.provider")
       .getOrElse(classOf[FsHistoryProvider].getName())
-    val provider = Class.forName(providerName)
+    val provider = Utils.classForName(providerName)
       .getConstructor(classOf[SparkConf])
       .newInstance(conf)
       .asInstanceOf[ApplicationHistoryProvider]
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
index 48070768f6edb..245b047e7dfbd 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -172,7 +172,7 @@ private[master] class Master(
           new FileSystemRecoveryModeFactory(conf, SerializationExtension(actorSystem))
         (fsFactory.createPersistenceEngine(), fsFactory.createLeaderElectionAgent(this))
       case "CUSTOM" =>
-        val clazz = Class.forName(conf.get("spark.deploy.recoveryMode.factory"))
+        val clazz = Utils.classForName(conf.get("spark.deploy.recoveryMode.factory"))
         val factory = clazz.getConstructor(classOf[SparkConf], classOf[Serialization])
           .newInstance(conf, SerializationExtension(actorSystem))
           .asInstanceOf[StandaloneRecoveryModeFactory]
diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala
index e6615a3174ce1..ef5a7e35ad562 100644
--- a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala
@@ -128,7 +128,7 @@ private[spark] object SubmitRestProtocolMessage {
    */
   def fromJson(json: String): SubmitRestProtocolMessage = {
     val className = parseAction(json)
-    val clazz = Class.forName(packagePrefix + "." + className)
+    val clazz = Utils.classForName(packagePrefix + "." + className)
       .asSubclass[SubmitRestProtocolMessage](classOf[SubmitRestProtocolMessage])
     fromJson(json, clazz)
   }
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala
index 2d6be3042c905..6799f78ec0c19 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala
@@ -53,7 +53,7 @@ object DriverWrapper {
         Thread.currentThread.setContextClassLoader(loader)
 
         // Delegate to supplied main class
-        val clazz = Class.forName(mainClass, true, loader)
+        val clazz = Utils.classForName(mainClass)
         val mainMethod = clazz.getMethod("main", classOf[Array[String]])
         mainMethod.invoke(null, extraArgs.toArray[String])
 
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala
index e89d076802215..5181142c5f80e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala
@@ -149,6 +149,7 @@ private[worker] class WorkerArguments(args: Array[String], conf: SparkConf) {
     val ibmVendor = System.getProperty("java.vendor").contains("IBM")
     var totalMb = 0
     try {
+      // scalastyle:off classforname
       val bean = ManagementFactory.getOperatingSystemMXBean()
       if (ibmVendor) {
         val beanClass = Class.forName("com.ibm.lang.management.OperatingSystemMXBean")
@@ -159,6 +160,7 @@ private[worker] class WorkerArguments(args: Array[String], conf: SparkConf) {
         val method = beanClass.getDeclaredMethod("getTotalPhysicalMemorySize")
         totalMb = (method.invoke(bean).asInstanceOf[Long] / 1024 / 1024).toInt
       }
+      // scalastyle:on classforname
     } catch {
       case e: Exception => {
         totalMb = 2*1024
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index f7ef92bc80f91..1a02051c87f19 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -356,7 +356,7 @@ private[spark] class Executor(
       logInfo("Using REPL class URI: " + classUri)
       try {
         val _userClassPathFirst: java.lang.Boolean = userClassPathFirst
-        val klass = Class.forName("org.apache.spark.repl.ExecutorClassLoader")
+        val klass = Utils.classForName("org.apache.spark.repl.ExecutorClassLoader")
           .asInstanceOf[Class[_ <: ClassLoader]]
         val constructor = klass.getConstructor(classOf[SparkConf], classOf[String],
           classOf[ClassLoader], classOf[Boolean])
diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
index 0d8ac1f80a9f4..607d5a321efca 100644
--- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
+++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
@@ -63,8 +63,7 @@ private[spark] object CompressionCodec {
   def createCodec(conf: SparkConf, codecName: String): CompressionCodec = {
     val codecClass = shortCompressionCodecNames.getOrElse(codecName.toLowerCase, codecName)
     val codec = try {
-      val ctor = Class.forName(codecClass, true, Utils.getContextOrSparkClassLoader)
-        .getConstructor(classOf[SparkConf])
+      val ctor = Utils.classForName(codecClass).getConstructor(classOf[SparkConf])
       Some(ctor.newInstance(conf).asInstanceOf[CompressionCodec])
     } catch {
       case e: ClassNotFoundException => None
diff --git a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala
index 818f7a4c8d422..87df42748be44 100644
--- a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala
+++ b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala
@@ -26,6 +26,7 @@ import org.apache.hadoop.mapreduce.{OutputCommitter => MapReduceOutputCommitter}
 
 import org.apache.spark.executor.CommitDeniedException
 import org.apache.spark.{Logging, SparkEnv, TaskContext}
+import org.apache.spark.util.{Utils => SparkUtils}
 
 private[spark]
 trait SparkHadoopMapRedUtil {
@@ -64,10 +65,10 @@ trait SparkHadoopMapRedUtil {
 
   private def firstAvailableClass(first: String, second: String): Class[_] = {
     try {
-      Class.forName(first)
+      SparkUtils.classForName(first)
     } catch {
       case e: ClassNotFoundException =>
-        Class.forName(second)
+        SparkUtils.classForName(second)
     }
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala b/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala
index 390d148bc97f9..943ebcb7bd0a1 100644
--- a/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala
+++ b/core/src/main/scala/org/apache/spark/mapreduce/SparkHadoopMapReduceUtil.scala
@@ -21,6 +21,7 @@ import java.lang.{Boolean => JBoolean, Integer => JInteger}
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.mapreduce.{JobContext, JobID, TaskAttemptContext, TaskAttemptID}
+import org.apache.spark.util.Utils
 
 private[spark]
 trait SparkHadoopMapReduceUtil {
@@ -46,7 +47,7 @@ trait SparkHadoopMapReduceUtil {
       isMap: Boolean,
       taskId: Int,
       attemptId: Int): TaskAttemptID = {
-    val klass = Class.forName("org.apache.hadoop.mapreduce.TaskAttemptID")
+    val klass = Utils.classForName("org.apache.hadoop.mapreduce.TaskAttemptID")
     try {
       // First, attempt to use the old-style constructor that takes a boolean isMap
       // (not available in YARN)
@@ -57,7 +58,7 @@ trait SparkHadoopMapReduceUtil {
     } catch {
       case exc: NoSuchMethodException => {
         // If that failed, look for the new constructor that takes a TaskType (not available in 1.x)
-        val taskTypeClass = Class.forName("org.apache.hadoop.mapreduce.TaskType")
+        val taskTypeClass = Utils.classForName("org.apache.hadoop.mapreduce.TaskType")
           .asInstanceOf[Class[Enum[_]]]
         val taskType = taskTypeClass.getMethod("valueOf", classOf[String]).invoke(
           taskTypeClass, if (isMap) "MAP" else "REDUCE")
@@ -71,10 +72,10 @@ trait SparkHadoopMapReduceUtil {
 
   private def firstAvailableClass(first: String, second: String): Class[_] = {
     try {
-      Class.forName(first)
+      Utils.classForName(first)
     } catch {
       case e: ClassNotFoundException =>
-        Class.forName(second)
+        Utils.classForName(second)
     }
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
index ed5131c79fdc5..67f64d5e278de 100644
--- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
@@ -20,6 +20,8 @@ package org.apache.spark.metrics
 import java.util.Properties
 import java.util.concurrent.TimeUnit
 
+import org.apache.spark.util.Utils
+
 import scala.collection.mutable
 
 import com.codahale.metrics.{Metric, MetricFilter, MetricRegistry}
@@ -166,7 +168,7 @@ private[spark] class MetricsSystem private (
     sourceConfigs.foreach { kv =>
       val classPath = kv._2.getProperty("class")
       try {
-        val source = Class.forName(classPath).newInstance()
+        val source = Utils.classForName(classPath).newInstance()
         registerSource(source.asInstanceOf[Source])
       } catch {
         case e: Exception => logError("Source class " + classPath + " cannot be instantiated", e)
@@ -182,7 +184,7 @@ private[spark] class MetricsSystem private (
       val classPath = kv._2.getProperty("class")
       if (null != classPath) {
         try {
-          val sink = Class.forName(classPath)
+          val sink = Utils.classForName(classPath)
             .getConstructor(classOf[Properties], classOf[MetricRegistry], classOf[SecurityManager])
             .newInstance(kv._2, registry, securityMgr)
           if (kv._1 == "servlet") {
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
index bee59a437f120..f1c17369cb48c 100644
--- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -383,11 +383,11 @@ private[spark] object HadoopRDD extends Logging {
 
   private[spark] class SplitInfoReflections {
     val inputSplitWithLocationInfo =
-      Class.forName("org.apache.hadoop.mapred.InputSplitWithLocationInfo")
+      Utils.classForName("org.apache.hadoop.mapred.InputSplitWithLocationInfo")
     val getLocationInfo = inputSplitWithLocationInfo.getMethod("getLocationInfo")
-    val newInputSplit = Class.forName("org.apache.hadoop.mapreduce.InputSplit")
+    val newInputSplit = Utils.classForName("org.apache.hadoop.mapreduce.InputSplit")
     val newGetLocationInfo = newInputSplit.getMethod("getLocationInfo")
-    val splitLocationInfo = Class.forName("org.apache.hadoop.mapred.SplitLocationInfo")
+    val splitLocationInfo = Utils.classForName("org.apache.hadoop.mapred.SplitLocationInfo")
     val isInMemory = splitLocationInfo.getMethod("isInMemory")
     val getLocation = splitLocationInfo.getMethod("getLocation")
   }
diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala
index 1709bdf560b6f..c9fcc7a36cc04 100644
--- a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala
+++ b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala
@@ -39,8 +39,7 @@ private[spark] object RpcEnv {
     val rpcEnvNames = Map("akka" -> "org.apache.spark.rpc.akka.AkkaRpcEnvFactory")
     val rpcEnvName = conf.get("spark.rpc", "akka")
     val rpcEnvFactoryClassName = rpcEnvNames.getOrElse(rpcEnvName.toLowerCase, rpcEnvName)
-    Class.forName(rpcEnvFactoryClassName, true, Utils.getContextOrSparkClassLoader).
-      newInstance().asInstanceOf[RpcEnvFactory]
+    Utils.classForName(rpcEnvFactoryClassName).newInstance().asInstanceOf[RpcEnvFactory]
   }
 
   def create(
diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala
index 698d1384d580d..4a5274b46b7a0 100644
--- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala
@@ -62,8 +62,11 @@ private[spark] class JavaDeserializationStream(in: InputStream, loader: ClassLoa
   extends DeserializationStream {
 
   private val objIn = new ObjectInputStream(in) {
-    override def resolveClass(desc: ObjectStreamClass): Class[_] =
+    override def resolveClass(desc: ObjectStreamClass): Class[_] = {
+      // scalastyle:off classforname
       Class.forName(desc.getName, false, loader)
+      // scalastyle:on classforname
+    }
   }
 
   def readObject[T: ClassTag](): T = objIn.readObject().asInstanceOf[T]
diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
index ed35cffe968f8..7cb6e080533ad 100644
--- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
@@ -102,6 +102,7 @@ class KryoSerializer(conf: SparkConf)
     kryo.register(classOf[PythonBroadcast], new KryoJavaSerializer())
 
     try {
+      // scalastyle:off classforname
       // Use the default classloader when calling the user registrator.
       Thread.currentThread.setContextClassLoader(classLoader)
       // Register classes given through spark.kryo.classesToRegister.
@@ -111,6 +112,7 @@ class KryoSerializer(conf: SparkConf)
       userRegistrator
         .map(Class.forName(_, true, classLoader).newInstance().asInstanceOf[KryoRegistrator])
         .foreach { reg => reg.registerClasses(kryo) }
+      // scalastyle:on classforname
     } catch {
       case e: Exception =>
         throw new SparkException(s"Failed to register classes with Kryo", e)
diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala
index cc2f0506817d3..a1b1e1631eafb 100644
--- a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala
@@ -407,7 +407,9 @@ private[spark] object SerializationDebugger extends Logging {
 
     /** ObjectStreamClass$ClassDataSlot.desc field */
     val DescField: Field = {
+      // scalastyle:off classforname
       val f = Class.forName("java.io.ObjectStreamClass$ClassDataSlot").getDeclaredField("desc")
+      // scalastyle:on classforname
       f.setAccessible(true)
       f
     }
diff --git a/core/src/main/scala/org/apache/spark/storage/ExternalBlockStore.scala b/core/src/main/scala/org/apache/spark/storage/ExternalBlockStore.scala
index 291394ed34816..db965d54bafd6 100644
--- a/core/src/main/scala/org/apache/spark/storage/ExternalBlockStore.scala
+++ b/core/src/main/scala/org/apache/spark/storage/ExternalBlockStore.scala
@@ -192,7 +192,7 @@ private[spark] class ExternalBlockStore(blockManager: BlockManager, executorId:
       .getOrElse(ExternalBlockStore.DEFAULT_BLOCK_MANAGER_NAME)
 
     try {
-      val instance = Class.forName(clsName)
+      val instance = Utils.classForName(clsName)
         .newInstance()
         .asInstanceOf[ExternalBlockManager]
       instance.init(blockManager, executorId)
diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
index 305de4c75539d..43626b4ef4880 100644
--- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
+++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
@@ -448,10 +448,12 @@ private class InnerClosureFinder(output: Set[Class[_]]) extends ClassVisitor(ASM
         if (op == INVOKESPECIAL && name == "<init>" && argTypes.length > 0
             && argTypes(0).toString.startsWith("L") // is it an object?
             && argTypes(0).getInternalName == myName) {
+          // scalastyle:off classforname
           output += Class.forName(
               owner.replace('/', '.'),
               false,
               Thread.currentThread.getContextClassLoader)
+          // scalastyle:on classforname
         }
       }
     }
diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
index 0180399c9dad5..7d84468f62ab1 100644
--- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
+++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
@@ -124,9 +124,11 @@ object SizeEstimator extends Logging {
       val server = ManagementFactory.getPlatformMBeanServer()
 
       // NOTE: This should throw an exception in non-Sun JVMs
+      // scalastyle:off classforname
       val hotSpotMBeanClass = Class.forName("com.sun.management.HotSpotDiagnosticMXBean")
       val getVMMethod = hotSpotMBeanClass.getDeclaredMethod("getVMOption",
           Class.forName("java.lang.String"))
+      // scalastyle:on classforname
 
       val bean = ManagementFactory.newPlatformMXBeanProxy(server,
         hotSpotMBeanName, hotSpotMBeanClass)
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index b6b932104a94d..e6374f17d858f 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -113,8 +113,11 @@ private[spark] object Utils extends Logging {
   def deserialize[T](bytes: Array[Byte], loader: ClassLoader): T = {
     val bis = new ByteArrayInputStream(bytes)
     val ois = new ObjectInputStream(bis) {
-      override def resolveClass(desc: ObjectStreamClass): Class[_] =
+      override def resolveClass(desc: ObjectStreamClass): Class[_] = {
+        // scalastyle:off classforname
         Class.forName(desc.getName, false, loader)
+        // scalastyle:on classforname
+      }
     }
     ois.readObject.asInstanceOf[T]
   }
@@ -177,12 +180,16 @@ private[spark] object Utils extends Logging {
 
   /** Determines whether the provided class is loadable in the current thread. */
   def classIsLoadable(clazz: String): Boolean = {
+    // scalastyle:off classforname
     Try { Class.forName(clazz, false, getContextOrSparkClassLoader) }.isSuccess
+    // scalastyle:on classforname
   }
 
+  // scalastyle:off classforname
   /** Preferred alternative to Class.forName(className) */
   def classForName(className: String): Class[_] = {
     Class.forName(className, true, getContextOrSparkClassLoader)
+    // scalastyle:on classforname
   }
 
   /**
@@ -2266,7 +2273,7 @@ private [util] class SparkShutdownHookManager {
     val hookTask = new Runnable() {
       override def run(): Unit = runAll()
     }
-    Try(Class.forName("org.apache.hadoop.util.ShutdownHookManager")) match {
+    Try(Utils.classForName("org.apache.hadoop.util.ShutdownHookManager")) match {
       case Success(shmClass) =>
         val fsPriority = classOf[FileSystem].getField("SHUTDOWN_HOOK_PRIORITY").get()
           .asInstanceOf[Int]
diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala
index 1d8fade90f398..418763f4e5ffa 100644
--- a/core/src/test/scala/org/apache/spark/FileSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FileSuite.scala
@@ -179,6 +179,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext {
   }
 
   test("object files of classes from a JAR") {
+    // scalastyle:off classforname
     val original = Thread.currentThread().getContextClassLoader
     val className = "FileSuiteObjectFileTest"
     val jar = TestUtils.createJarWithClasses(Seq(className))
@@ -201,6 +202,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext {
     finally {
       Thread.currentThread().setContextClassLoader(original)
     }
+    // scalastyle:on classforname
   }
 
   test("write SequenceFile using new Hadoop API") {
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala
index f89e3d0a49920..dba46f101c580 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark
 
 import org.scalatest.PrivateMethodTester
 
+import org.apache.spark.util.Utils
 import org.apache.spark.scheduler.{SchedulerBackend, TaskScheduler, TaskSchedulerImpl}
 import org.apache.spark.scheduler.cluster.{SimrSchedulerBackend, SparkDeploySchedulerBackend}
 import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, MesosSchedulerBackend}
@@ -131,7 +132,7 @@ class SparkContextSchedulerCreationSuite
   def testYarn(master: String, expectedClassName: String) {
     try {
       val sched = createTaskScheduler(master)
-      assert(sched.getClass === Class.forName(expectedClassName))
+      assert(sched.getClass === Utils.classForName(expectedClassName))
     } catch {
       case e: SparkException =>
         assert(e.getMessage.contains("YARN mode not available"))
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index e7878bde6fcb0..343d28eef8359 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -541,8 +541,8 @@ object JarCreationTest extends Logging {
     val result = sc.makeRDD(1 to 100, 10).mapPartitions { x =>
       var exception: String = null
       try {
-        Class.forName(args(0), true, Thread.currentThread().getContextClassLoader)
-        Class.forName(args(1), true, Thread.currentThread().getContextClassLoader)
+        Utils.classForName(args(0))
+        Utils.classForName(args(1))
       } catch {
         case t: Throwable =>
           exception = t + "\n" + t.getStackTraceString
diff --git a/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala
index 08215a2bafc09..05013fbc49b8e 100644
--- a/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala
@@ -22,11 +22,12 @@ import java.sql._
 import org.scalatest.BeforeAndAfter
 
 import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite}
+import org.apache.spark.util.Utils
 
 class JdbcRDDSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext {
 
   before {
-    Class.forName("org.apache.derby.jdbc.EmbeddedDriver")
+    Utils.classForName("org.apache.derby.jdbc.EmbeddedDriver")
     val conn = DriverManager.getConnection("jdbc:derby:target/JdbcRDDSuiteDb;create=true")
     try {
 
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala
index 63a8480c9b57b..353b97469cd11 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala
@@ -59,7 +59,9 @@ object KryoDistributedTest {
   class AppJarRegistrator extends KryoRegistrator {
     override def registerClasses(k: Kryo) {
       val classLoader = Thread.currentThread.getContextClassLoader
+      // scalastyle:off classforname
       k.register(Class.forName(AppJarRegistrator.customClassName, true, classLoader))
+      // scalastyle:on classforname
     }
   }
 
diff --git a/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala b/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala
index 42125547436cb..d3d464e84ffd7 100644
--- a/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala
@@ -84,7 +84,9 @@ class MutableURLClassLoaderSuite extends SparkFunSuite {
     try {
       sc.makeRDD(1 to 5, 2).mapPartitions { x =>
         val loader = Thread.currentThread().getContextClassLoader
+        // scalastyle:off classforname
         Class.forName(className, true, loader).newInstance()
+        // scalastyle:on classforname
         Seq().iterator
       }.count()
     }
diff --git a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/Logging.scala b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/Logging.scala
index 17cbc6707b5ea..d87b86932dd41 100644
--- a/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/Logging.scala
+++ b/external/flume-sink/src/main/scala/org/apache/spark/streaming/flume/sink/Logging.scala
@@ -113,7 +113,9 @@ private[sink] object Logging {
   try {
     // We use reflection here to handle the case where users remove the
     // slf4j-to-jul bridge order to route their logs to JUL.
+    // scalastyle:off classforname
     val bridgeClass = Class.forName("org.slf4j.bridge.SLF4JBridgeHandler")
+    // scalastyle:on classforname
     bridgeClass.getMethod("removeHandlersForRootLogger").invoke(null)
     val installed = bridgeClass.getMethod("isInstalled").invoke(null).asInstanceOf[Boolean]
     if (!installed) {
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala b/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala
index 5c07b415cd796..74a7de18d4161 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala
@@ -121,7 +121,7 @@ private[graphx] object BytecodeUtils {
           override def visitMethodInsn(op: Int, owner: String, name: String, desc: String) {
             if (op == INVOKEVIRTUAL || op == INVOKESPECIAL || op == INVOKESTATIC) {
               if (!skipClass(owner)) {
-                methodsInvoked.add((Class.forName(owner.replace("/", ".")), name))
+                methodsInvoked.add((Utils.classForName(owner.replace("/", ".")), name))
               }
             }
           }
diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala
index 8791618bd355e..4ee605fd7f11e 100644
--- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala
+++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala
@@ -1079,8 +1079,10 @@ import org.apache.spark.annotation.DeveloperApi
       throw new EvalException("Failed to load '" + path + "': " + ex.getMessage, ex)
 
     private def load(path: String): Class[_] = {
+      // scalastyle:off classforname
       try Class.forName(path, true, classLoader)
       catch { case ex: Throwable => evalError(path, unwrap(ex)) }
+      // scalastyle:on classforname
     }
 
     lazy val evalClass = load(evalPath)
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index 49611703798e8..b5e2e882d2254 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -150,6 +150,17 @@ This file is divided into 3 sections:
       // scalastyle:on println]]></customMessage>
   </check>
 
+  <check customId="classforname" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
+    <parameters><parameter name="regex">Class\.forName</parameter></parameters>
+    <customMessage><![CDATA[
+      Are you sure that you want to use Class.forName? In most cases, you should use Utils.classForName instead.
+      If you must use Class.forName, wrap the code block with
+      // scalastyle:off classforname
+      Class.forName(...)
+      // scalastyle:on classforname
+    ]]></customMessage>
+  </check>
+
   <!-- ================================================================================ -->
   <!--       rules we'd like to enforce, but haven't cleaned up the codebase yet        -->
   <!-- ================================================================================ -->
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 57718228e490f..da83a7f0ba379 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -27,6 +27,7 @@ import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.util.Utils
 
 
 /**
@@ -146,7 +147,7 @@ object DataType {
     ("pyClass", _),
     ("sqlType", _),
     ("type", JString("udt"))) =>
-      Class.forName(udtClass).newInstance().asInstanceOf[UserDefinedType[_]]
+      Utils.classForName(udtClass).newInstance().asInstanceOf[UserDefinedType[_]]
   }
 
   private def parseStructField(json: JValue): StructField = json match {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 477dea9164726..46bd60daa1f78 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -554,8 +554,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
     val className = beanClass.getName
     val rowRdd = rdd.mapPartitions { iter =>
       // BeanInfo is not serializable so we must rediscover it remotely for each partition.
-      val localBeanInfo = Introspector.getBeanInfo(
-        Class.forName(className, true, Utils.getContextOrSparkClassLoader))
+      val localBeanInfo = Introspector.getBeanInfo(Utils.classForName(className))
       val extractors =
         localBeanInfo.getPropertyDescriptors.filterNot(_.getName == "class").map(_.getReadMethod)
       val methodsToConverts = extractors.zip(attributeSeq).map { case (e, attr) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
index 704cf56f38265..e0bea65a15f36 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap}
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.util.Utils
 
 /**
  * Relation that consists of data stored in a Parquet columnar format.
@@ -108,7 +109,7 @@ private[sql] object ParquetRelation {
     //
     // Therefore we need to force the class to be loaded.
     // This should really be resolved by Parquet.
-    Class.forName(classOf[ParquetLog].getName)
+    Utils.classForName(classOf[ParquetLog].getName)
 
     // Note: Logger.getLogger("parquet") has a default logger
     // that appends to Console which needs to be cleared.
@@ -119,12 +120,12 @@ private[sql] object ParquetRelation {
     // Disables a WARN log message in ParquetOutputCommitter.  We first ensure that
     // ParquetOutputCommitter is loaded and the static LOG field gets initialized.
     // See https://issues.apache.org/jira/browse/SPARK-5968 for details
-    Class.forName(classOf[ParquetOutputCommitter].getName)
+    Utils.classForName(classOf[ParquetOutputCommitter].getName)
     JLogger.getLogger(classOf[ParquetOutputCommitter].getName).setLevel(Level.OFF)
 
     // Similar as above, disables a unnecessary WARN log message in ParquetRecordReader.
     // See https://issues.apache.org/jira/browse/PARQUET-220 for details
-    Class.forName(classOf[ParquetRecordReader[_]].getName)
+    Utils.classForName(classOf[ParquetRecordReader[_]].getName)
     JLogger.getLogger(classOf[ParquetRecordReader[_]].getName).setLevel(Level.OFF)
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 566a52dc1b784..0f82f13088d39 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -26,6 +26,7 @@ import org.scalatest.BeforeAndAfter
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
 
 class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
   val url = "jdbc:h2:mem:testdb0"
@@ -46,7 +47,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
   import ctx.sql
 
   before {
-    Class.forName("org.h2.Driver")
+    Utils.classForName("org.h2.Driver")
     // Extra properties that will be specified for our database. We need these to test
     // usage of parameters from OPTIONS clause in queries.
     val properties = new Properties()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
index d949ef42267ec..84b52ca2c733c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala
@@ -25,6 +25,7 @@ import org.scalatest.BeforeAndAfter
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.{SaveMode, Row}
 import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
 
 class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
   val url = "jdbc:h2:mem:testdb2"
@@ -41,7 +42,7 @@ class JDBCWriteSuite extends SparkFunSuite with BeforeAndAfter {
   import ctx.sql
 
   before {
-    Class.forName("org.h2.Driver")
+    Utils.classForName("org.h2.Driver")
     conn = DriverManager.getConnection(url)
     conn.prepareStatement("create schema test").executeUpdate()
 
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
index 301aa5a6411e2..39b31523e07cb 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
@@ -417,7 +417,7 @@ object ServerMode extends Enumeration {
 }
 
 abstract class HiveThriftJdbcTest extends HiveThriftServer2Test {
-  Class.forName(classOf[HiveDriver].getCanonicalName)
+  Utils.classForName(classOf[HiveDriver].getCanonicalName)
 
   private def jdbcUri = if (mode == ServerMode.http) {
     s"""jdbc:hive2://localhost:$serverPort/
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index d65d29daacf31..dc355690852bd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -78,9 +78,7 @@ class HadoopTableReader(
   override def makeRDDForTable(hiveTable: HiveTable): RDD[InternalRow] =
     makeRDDForTable(
       hiveTable,
-      Class.forName(
-        relation.tableDesc.getSerdeClassName, true, Utils.getContextOrSparkClassLoader)
-        .asInstanceOf[Class[Deserializer]],
+      Utils.classForName(relation.tableDesc.getSerdeClassName).asInstanceOf[Class[Deserializer]],
       filterOpt = None)
 
   /**
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index 1f280c642979a..8adda54754230 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -21,9 +21,6 @@ import java.io.{File, PrintStream}
 import java.util.{Map => JMap}
 import javax.annotation.concurrent.GuardedBy
 
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.util.CircularBuffer
-
 import scala.collection.JavaConversions._
 import scala.language.reflectiveCalls
 
@@ -37,7 +34,9 @@ import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.hadoop.hive.ql.{Driver, metadata}
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.execution.QueryExecutionException
+import org.apache.spark.util.{CircularBuffer, Utils}
 
 
 /**
@@ -249,10 +248,10 @@ private[hive] class ClientWrapper(
   }
 
   private def toInputFormat(name: String) =
-    Class.forName(name).asInstanceOf[Class[_ <: org.apache.hadoop.mapred.InputFormat[_, _]]]
+    Utils.classForName(name).asInstanceOf[Class[_ <: org.apache.hadoop.mapred.InputFormat[_, _]]]
 
   private def toOutputFormat(name: String) =
-    Class.forName(name)
+    Utils.classForName(name)
       .asInstanceOf[Class[_ <: org.apache.hadoop.hive.ql.io.HiveOutputFormat[_, _]]]
 
   private def toQlTable(table: HiveTable): metadata.Table = {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
index 917900e5f46dc..bee2ecbedb244 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
@@ -120,8 +120,8 @@ object SparkSubmitClassLoaderTest extends Logging {
     logInfo("Testing load classes at the driver side.")
     // First, we load classes at driver side.
     try {
-      Class.forName(args(0), true, Thread.currentThread().getContextClassLoader)
-      Class.forName(args(1), true, Thread.currentThread().getContextClassLoader)
+      Utils.classForName(args(0))
+      Utils.classForName(args(1))
     } catch {
       case t: Throwable =>
         throw new Exception("Could not load user class from jar:\n", t)
@@ -131,8 +131,8 @@ object SparkSubmitClassLoaderTest extends Logging {
     val result = df.mapPartitions { x =>
       var exception: String = null
       try {
-        Class.forName(args(0), true, Thread.currentThread().getContextClassLoader)
-        Class.forName(args(1), true, Thread.currentThread().getContextClassLoader)
+        Utils.classForName(args(0))
+        Utils.classForName(args(1))
       } catch {
         case t: Throwable =>
           exception = t + "\n" + t.getStackTraceString
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala
index f5d41858646e4..9f2117ada61c0 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala
@@ -22,7 +22,7 @@ import scala.util.{Failure, Success, Try}
 import org.apache.spark.{SparkEnv, Logging}
 import org.apache.spark.streaming.{Checkpoint, CheckpointWriter, Time}
 import org.apache.spark.streaming.util.RecurringTimer
-import org.apache.spark.util.{Clock, EventLoop, ManualClock}
+import org.apache.spark.util.{Utils, Clock, EventLoop, ManualClock}
 
 /** Event classes for JobGenerator */
 private[scheduler] sealed trait JobGeneratorEvent
@@ -47,11 +47,11 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging {
     val clockClass = ssc.sc.conf.get(
       "spark.streaming.clock", "org.apache.spark.util.SystemClock")
     try {
-      Class.forName(clockClass).newInstance().asInstanceOf[Clock]
+      Utils.classForName(clockClass).newInstance().asInstanceOf[Clock]
     } catch {
       case e: ClassNotFoundException if clockClass.startsWith("org.apache.spark.streaming") =>
         val newClockClass = clockClass.replace("org.apache.spark.streaming", "org.apache.spark")
-        Class.forName(newClockClass).newInstance().asInstanceOf[Clock]
+        Utils.classForName(newClockClass).newInstance().asInstanceOf[Clock]
     }
   }
 
diff --git a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala
index 9483d2b692ab5..9418beb6b3e3a 100644
--- a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala
+++ b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// scalastyle:off classforname
 package org.apache.spark.tools
 
 import java.io.File
@@ -188,3 +189,4 @@ object GenerateMIMAIgnore {
     classes
   }
 }
+// scalastyle:on classforname
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index f0af6f875f523..f86b6d1e5d7bc 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -732,9 +732,9 @@ private[spark] class Client(
       }
     val amClass =
       if (isClusterMode) {
-        Class.forName("org.apache.spark.deploy.yarn.ApplicationMaster").getName
+        Utils.classForName("org.apache.spark.deploy.yarn.ApplicationMaster").getName
       } else {
-        Class.forName("org.apache.spark.deploy.yarn.ExecutorLauncher").getName
+        Utils.classForName("org.apache.spark.deploy.yarn.ExecutorLauncher").getName
       }
     if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
       args.userArgs = ArrayBuffer(args.primaryRFile) ++ args.userArgs

From e965a798d09a9fba61b104c5cc0b65cdc28d27f6 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Tue, 14 Jul 2015 17:21:48 -0700
Subject: [PATCH 152/549] [SPARK-9045] Fix Scala 2.11 build break in
 UnsafeExternalRowSorter

This fixes a compilation break in under Scala 2.11:

```
[error] /home/jenkins/workspace/Spark-Master-Scala211-Compile/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java:135: error: <anonymous org.apache.spark.sql.execution.UnsafeExternalRowSorter$1> is not abstract and does not override abstract method <B>minBy(Function1<InternalRow,B>,Ordering<B>) in TraversableOnce
[error]       return new AbstractScalaRowIterator() {
[error]                                             ^
[error]   where B,A are type-variables:
[error]     B extends Object declared in method <B>minBy(Function1<A,B>,Ordering<B>)
[error]     A extends Object declared in interface TraversableOnce
[error] 1 error
```

The workaround for this is to make `AbstractScalaRowIterator` into a concrete class.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7405 from JoshRosen/SPARK-9045 and squashes the following commits:

cbcbb4c [Josh Rosen] Forgot that we can't use the ??? operator anymore
577ba60 [Josh Rosen] [SPARK-9045] Fix Scala 2.11 build break in UnsafeExternalRowSorter.
---
 .../apache/spark/sql/AbstractScalaRowIterator.scala   | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/AbstractScalaRowIterator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/AbstractScalaRowIterator.scala
index cfefb13e7721e..1090bdb5a4bd3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/AbstractScalaRowIterator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/AbstractScalaRowIterator.scala
@@ -17,11 +17,14 @@
 
 package org.apache.spark.sql
 
-import org.apache.spark.sql.catalyst.InternalRow
-
 /**
  * Shim to allow us to implement [[scala.Iterator]] in Java. Scala 2.11+ has an AbstractIterator
  * class for this, but that class is `private[scala]` in 2.10. We need to explicitly fix this to
- * `Row` in order to work around a spurious IntelliJ compiler error.
+ * `Row` in order to work around a spurious IntelliJ compiler error. This cannot be an abstract
+ * class because that leads to compilation errors under Scala 2.11.
  */
-private[spark] abstract class AbstractScalaRowIterator extends Iterator[InternalRow]
+private[spark] class AbstractScalaRowIterator[T] extends Iterator[T] {
+  override def hasNext: Boolean = throw new NotImplementedError
+
+  override def next(): T = throw new NotImplementedError
+}

From cc57d705e732aefc2f3d3f438e84d71705b2eb65 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Tue, 14 Jul 2015 18:55:34 -0700
Subject: [PATCH 153/549] [SPARK-9050] [SQL] Remove unused newOrdering argument
 from Exchange (cleanup after SPARK-8317)

SPARK-8317 changed the SQL Exchange operator so that it no longer pushed sorting into Spark's shuffle layer, a change which allowed more efficient SQL-specific sorters to be used.

This patch performs some leftover cleanup based on those changes:

- Exchange's constructor should no longer accept a `newOrdering` since it's no longer used and no longer works as expected.
- `addOperatorsIfNecessary` looked at shuffle input's output ordering to decide whether to sort, but this is the wrong node to be examining: it needs to look at whether the post-shuffle node has the right ordering, since shuffling will not preserve row orderings.  Thanks to davies for spotting this.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7407 from JoshRosen/SPARK-9050 and squashes the following commits:

e70be50 [Josh Rosen] No need to wrap line
e866494 [Josh Rosen] Refactor addOperatorsIfNecessary to make code clearer
2e467da [Josh Rosen] Remove `newOrdering` from Exchange.
---
 .../apache/spark/sql/execution/Exchange.scala | 37 ++++++++-----------
 .../spark/sql/execution/SparkStrategies.scala |  3 +-
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index 4b783e30d95e1..feea4f239c04d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -35,21 +35,13 @@ import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkEn
 
 /**
  * :: DeveloperApi ::
- * Performs a shuffle that will result in the desired `newPartitioning`.  Optionally sorts each
- * resulting partition based on expressions from the partition key.  It is invalid to construct an
- * exchange operator with a `newOrdering` that cannot be calculated using the partitioning key.
+ * Performs a shuffle that will result in the desired `newPartitioning`.
  */
 @DeveloperApi
-case class Exchange(
-    newPartitioning: Partitioning,
-    newOrdering: Seq[SortOrder],
-    child: SparkPlan)
-  extends UnaryNode {
+case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends UnaryNode {
 
   override def outputPartitioning: Partitioning = newPartitioning
 
-  override def outputOrdering: Seq[SortOrder] = newOrdering
-
   override def output: Seq[Attribute] = child.output
 
   /**
@@ -279,23 +271,24 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
           partitioning: Partitioning,
           rowOrdering: Seq[SortOrder],
           child: SparkPlan): SparkPlan = {
-        val needSort = rowOrdering.nonEmpty && child.outputOrdering != rowOrdering
-        val needsShuffle = child.outputPartitioning != partitioning
 
-        val withShuffle = if (needsShuffle) {
-          Exchange(partitioning, Nil, child)
-        } else {
-          child
+        def addShuffleIfNecessary(child: SparkPlan): SparkPlan = {
+          if (child.outputPartitioning != partitioning) {
+            Exchange(partitioning, child)
+          } else {
+            child
+          }
         }
 
-        val withSort = if (needSort) {
-          sqlContext.planner.BasicOperators.getSortOperator(
-            rowOrdering, global = false, withShuffle)
-        } else {
-          withShuffle
+        def addSortIfNecessary(child: SparkPlan): SparkPlan = {
+          if (rowOrdering.nonEmpty && child.outputOrdering != rowOrdering) {
+            sqlContext.planner.BasicOperators.getSortOperator(rowOrdering, global = false, child)
+          } else {
+            child
+          }
         }
 
-        withSort
+        addSortIfNecessary(addShuffleIfNecessary(child))
       }
 
       if (meetsRequirements && compatible && !needsAnySort) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index ce25af58b6cab..73b463471ec5a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -360,8 +360,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case logical.OneRowRelation =>
         execution.PhysicalRDD(Nil, singleRowRdd) :: Nil
       case logical.RepartitionByExpression(expressions, child) =>
-        execution.Exchange(
-          HashPartitioning(expressions, numPartitions), Nil, planLater(child)) :: Nil
+        execution.Exchange(HashPartitioning(expressions, numPartitions), planLater(child)) :: Nil
       case e @ EvaluatePython(udf, child, _) =>
         BatchPythonEvaluation(udf, e.output, planLater(child)) :: Nil
       case LogicalRDD(output, rdd) => PhysicalRDD(output, rdd) :: Nil

From f957796c4b3c3cd95edfc64500a045f7e810ee87 Mon Sep 17 00:00:00 2001
From: huangzhaowei <carlmartinmax@gmail.com>
Date: Tue, 14 Jul 2015 19:20:49 -0700
Subject: [PATCH 154/549] [SPARK-8820] [STREAMING] Add a configuration to set
 checkpoint dir.

Add a configuration to set checkpoint directory  for convenience to user.
[Jira Address](https://issues.apache.org/jira/browse/SPARK-8820)

Author: huangzhaowei <carlmartinmax@gmail.com>

Closes #7218 from SaintBacchus/SPARK-8820 and squashes the following commits:

d49fe4b [huangzhaowei] Rename the configuration name
66ea47c [huangzhaowei] Add the unit test.
dd0acc1 [huangzhaowei] [SPARK-8820][Streaming] Add a configuration to set checkpoint dir.
---
 .../org/apache/spark/streaming/StreamingContext.scala    | 2 ++
 .../apache/spark/streaming/StreamingContextSuite.scala   | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 6b78a82e68c24..92438f1b1fbf7 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -201,6 +201,8 @@ class StreamingContext private[streaming] (
 
   private var shutdownHookRef: AnyRef = _
 
+  conf.getOption("spark.streaming.checkpoint.directory").foreach(checkpoint)
+
   /**
    * Return the associated Spark context
    */
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index 289a159d8990a..f588cf5bc1e7c 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -115,6 +115,15 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo
     assert(ssc.conf.getTimeAsSeconds("spark.cleaner.ttl", "-1") === 10)
   }
 
+  test("checkPoint from conf") {
+    val checkpointDirectory = Utils.createTempDir().getAbsolutePath()
+
+    val myConf = SparkContext.updatedConf(new SparkConf(false), master, appName)
+    myConf.set("spark.streaming.checkpoint.directory", checkpointDirectory)
+    val ssc = new StreamingContext(myConf, batchDuration)
+    assert(ssc.checkpointDir != null)
+  }
+
   test("state matching") {
     import StreamingContextState._
     assert(INITIALIZED === INITIALIZED)

From bb870e72f42b6ce8d056df259f6fcf41808d7ed2 Mon Sep 17 00:00:00 2001
From: jerryshao <saisai.shao@intel.com>
Date: Tue, 14 Jul 2015 19:54:02 -0700
Subject: [PATCH 155/549] [SPARK-5523] [CORE] [STREAMING] Add a cache for
 hostname in TaskMetrics to decrease the memory usage and GC overhead

Hostname in TaskMetrics will be created through deserialization, mostly the number of hostname is only the order of number of cluster node, so adding a cache layer to dedup the object could reduce the memory usage and alleviate GC overhead, especially for long-running and fast job generation applications like Spark Streaming.

Author: jerryshao <saisai.shao@intel.com>
Author: Saisai Shao <saisai.shao@intel.com>

Closes #5064 from jerryshao/SPARK-5523 and squashes the following commits:

3e2412a [jerryshao] Address the comments
b092a81 [Saisai Shao] Add a pool to cache the hostname
---
 .../apache/spark/executor/TaskMetrics.scala   | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
index a3b4561b07e7f..e80feeeab4142 100644
--- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
+++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
@@ -17,11 +17,15 @@
 
 package org.apache.spark.executor
 
+import java.io.{IOException, ObjectInputStream}
+import java.util.concurrent.ConcurrentHashMap
+
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.executor.DataReadMethod.DataReadMethod
 import org.apache.spark.storage.{BlockId, BlockStatus}
+import org.apache.spark.util.Utils
 
 /**
  * :: DeveloperApi ::
@@ -210,10 +214,26 @@ class TaskMetrics extends Serializable {
   private[spark] def updateInputMetrics(): Unit = synchronized {
     inputMetrics.foreach(_.updateBytesRead())
   }
+
+  @throws(classOf[IOException])
+  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
+    in.defaultReadObject()
+    // Get the hostname from cached data, since hostname is the order of number of nodes in
+    // cluster, so using cached hostname will decrease the object number and alleviate the GC
+    // overhead.
+    _hostname = TaskMetrics.getCachedHostName(_hostname)
+  }
 }
 
 private[spark] object TaskMetrics {
+  private val hostNameCache = new ConcurrentHashMap[String, String]()
+
   def empty: TaskMetrics = new TaskMetrics
+
+  def getCachedHostName(host: String): String = {
+    val canonicalHost = hostNameCache.putIfAbsent(host, host)
+    if (canonicalHost != null) canonicalHost else host
+  }
 }
 
 /**

From 5572fd0c518acd2e4483ff41bea1eb1cffd543ce Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Tue, 14 Jul 2015 21:44:47 -0700
Subject: [PATCH 156/549] [HOTFIX] Adding new names to known contributors

---
 dev/create-release/known_translations | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations
index 5f2671a6e5053..e462302f28423 100644
--- a/dev/create-release/known_translations
+++ b/dev/create-release/known_translations
@@ -129,3 +129,12 @@ yongtang - Yong Tang
 ypcat - Pei-Lun Lee
 zhichao-li - Zhichao Li
 zzcclp - Zhichao Zhang
+979969786 - Yuming Wang
+Rosstin - Rosstin Murphy
+ameyc - Amey Chaugule
+animeshbaranawal - Animesh Baranawal
+cafreeman - Chris Freeman
+lee19 - Lee
+lockwobr - Brian Lockwood
+navis - Navis Ryu
+pparkkin - Paavo Parkkinen

From f650a005e03ecd800c9005a496cc6a0d8eb68c93 Mon Sep 17 00:00:00 2001
From: Sun Rui <rui.sun@intel.com>
Date: Tue, 14 Jul 2015 22:21:01 -0700
Subject: [PATCH 157/549] [SPARK-8808] [SPARKR] Fix assignments in SparkR.

Author: Sun Rui <rui.sun@intel.com>

Closes #7395 from sun-rui/SPARK-8808 and squashes the following commits:

ce603bc [Sun Rui] Use '<-' instead of '='.
88590b1 [Sun Rui] Use '<-' instead of '='.
---
 R/pkg/R/DataFrame.R                     | 2 +-
 R/pkg/R/client.R                        | 4 ++--
 R/pkg/R/group.R                         | 4 ++--
 R/pkg/R/utils.R                         | 4 ++--
 R/pkg/inst/tests/test_binaryFile.R      | 2 +-
 R/pkg/inst/tests/test_binary_function.R | 2 +-
 R/pkg/inst/tests/test_rdd.R             | 4 ++--
 R/pkg/inst/tests/test_textFile.R        | 2 +-
 R/pkg/inst/tests/test_utils.R           | 2 +-
 9 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 60702824acb46..208813768e264 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1328,7 +1328,7 @@ setMethod("write.df",
             jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
             options <- varargsToEnv(...)
             if (!is.null(path)) {
-                options[['path']] = path
+                options[['path']] <- path
             }
             callJMethod(df@sdf, "save", source, jmode, options)
           })
diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R
index 78c7a3037ffac..6f772158ddfe8 100644
--- a/R/pkg/R/client.R
+++ b/R/pkg/R/client.R
@@ -36,9 +36,9 @@ connectBackend <- function(hostname, port, timeout = 6000) {
 
 determineSparkSubmitBin <- function() {
   if (.Platform$OS.type == "unix") {
-    sparkSubmitBinName = "spark-submit"
+    sparkSubmitBinName <- "spark-submit"
   } else {
-    sparkSubmitBinName = "spark-submit.cmd"
+    sparkSubmitBinName <- "spark-submit.cmd"
   }
   sparkSubmitBinName
 }
diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R
index 8f1c68f7c4d28..576ac72f40fc0 100644
--- a/R/pkg/R/group.R
+++ b/R/pkg/R/group.R
@@ -87,7 +87,7 @@ setMethod("count",
 setMethod("agg",
           signature(x = "GroupedData"),
           function(x, ...) {
-            cols = list(...)
+            cols <- list(...)
             stopifnot(length(cols) > 0)
             if (is.character(cols[[1]])) {
               cols <- varargsToEnv(...)
@@ -97,7 +97,7 @@ setMethod("agg",
               if (!is.null(ns)) {
                 for (n in ns) {
                   if (n != "") {
-                    cols[[n]] = alias(cols[[n]], n)
+                    cols[[n]] <- alias(cols[[n]], n)
                   }
                 }
               }
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index ea629a64f7158..950ba74dbe017 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -41,8 +41,8 @@ convertJListToRList <- function(jList, flatten, logicalUpperBound = NULL,
               if (isInstanceOf(obj, "scala.Tuple2")) {
                 # JavaPairRDD[Array[Byte], Array[Byte]].
 
-                keyBytes = callJMethod(obj, "_1")
-                valBytes = callJMethod(obj, "_2")
+                keyBytes <- callJMethod(obj, "_1")
+                valBytes <- callJMethod(obj, "_2")
                 res <- list(unserialize(keyBytes),
                   unserialize(valBytes))
               } else {
diff --git a/R/pkg/inst/tests/test_binaryFile.R b/R/pkg/inst/tests/test_binaryFile.R
index ccaea18ecab2a..f2452ed97d2ea 100644
--- a/R/pkg/inst/tests/test_binaryFile.R
+++ b/R/pkg/inst/tests/test_binaryFile.R
@@ -20,7 +20,7 @@ context("functions on binary files")
 # JavaSparkContext handle
 sc <- sparkR.init()
 
-mockFile = c("Spark is pretty.", "Spark is awesome.")
+mockFile <- c("Spark is pretty.", "Spark is awesome.")
 
 test_that("saveAsObjectFile()/objectFile() following textFile() works", {
   fileName1 <- tempfile(pattern="spark-test", fileext=".tmp")
diff --git a/R/pkg/inst/tests/test_binary_function.R b/R/pkg/inst/tests/test_binary_function.R
index 3be8c65a6c1a0..dca0657c57e0d 100644
--- a/R/pkg/inst/tests/test_binary_function.R
+++ b/R/pkg/inst/tests/test_binary_function.R
@@ -76,7 +76,7 @@ test_that("zipPartitions() on RDDs", {
   expect_equal(actual,
                list(list(1, c(1,2), c(1,2,3)), list(2, c(3,4), c(4,5,6))))
 
-  mockFile = c("Spark is pretty.", "Spark is awesome.")
+  mockFile <- c("Spark is pretty.", "Spark is awesome.")
   fileName <- tempfile(pattern="spark-test", fileext=".tmp")
   writeLines(mockFile, fileName)
 
diff --git a/R/pkg/inst/tests/test_rdd.R b/R/pkg/inst/tests/test_rdd.R
index b79692873cec3..6c3aaab8c711e 100644
--- a/R/pkg/inst/tests/test_rdd.R
+++ b/R/pkg/inst/tests/test_rdd.R
@@ -447,7 +447,7 @@ test_that("zipRDD() on RDDs", {
   expect_equal(actual,
                list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004)))
 
-  mockFile = c("Spark is pretty.", "Spark is awesome.")
+  mockFile <- c("Spark is pretty.", "Spark is awesome.")
   fileName <- tempfile(pattern="spark-test", fileext=".tmp")
   writeLines(mockFile, fileName)
 
@@ -483,7 +483,7 @@ test_that("cartesian() on RDDs", {
   actual <- collect(cartesian(rdd, emptyRdd))
   expect_equal(actual, list())
 
-  mockFile = c("Spark is pretty.", "Spark is awesome.")
+  mockFile <- c("Spark is pretty.", "Spark is awesome.")
   fileName <- tempfile(pattern="spark-test", fileext=".tmp")
   writeLines(mockFile, fileName)
 
diff --git a/R/pkg/inst/tests/test_textFile.R b/R/pkg/inst/tests/test_textFile.R
index 58318dfef71ab..a9cf83dbdbdb1 100644
--- a/R/pkg/inst/tests/test_textFile.R
+++ b/R/pkg/inst/tests/test_textFile.R
@@ -20,7 +20,7 @@ context("the textFile() function")
 # JavaSparkContext handle
 sc <- sparkR.init()
 
-mockFile = c("Spark is pretty.", "Spark is awesome.")
+mockFile <- c("Spark is pretty.", "Spark is awesome.")
 
 test_that("textFile() on a local file returns an RDD", {
   fileName <- tempfile(pattern="spark-test", fileext=".tmp")
diff --git a/R/pkg/inst/tests/test_utils.R b/R/pkg/inst/tests/test_utils.R
index aa0d2a66b9082..12df4cf4f65b7 100644
--- a/R/pkg/inst/tests/test_utils.R
+++ b/R/pkg/inst/tests/test_utils.R
@@ -119,7 +119,7 @@ test_that("cleanClosure on R functions", {
   # Test for overriding variables in base namespace (Issue: SparkR-196).
   nums <- as.list(1:10)
   rdd <- parallelize(sc, nums, 2L)
-  t = 4  # Override base::t in .GlobalEnv.
+  t <- 4  # Override base::t in .GlobalEnv.
   f <- function(x) { x > t }
   newF <- cleanClosure(f)
   env <- environment(newF)

From f23a721c10b64ec5c6768634fc5e9e7b60ee7ca8 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 14 Jul 2015 22:52:53 -0700
Subject: [PATCH 158/549] [SPARK-8993][SQL] More comprehensive type checking in
 expressions.

This patch makes the following changes:

1. ExpectsInputTypes only defines expected input types, but does not perform any implicit type casting.
2. ImplicitCastInputTypes is a new trait that defines both expected input types, as well as performs implicit type casting.
3. BinaryOperator has a new abstract function "inputType", which defines the expected input type for both left/right. Concrete BinaryOperator expressions no longer perform any implicit type casting.
4. For BinaryOperators, convert NullType (i.e. null literals) into some accepted type so BinaryOperators don't need to handle NullTypes.

TODOs needed: fix unit tests for error reporting.

I'm intentionally not changing anything in aggregate expressions because yhuai is doing a big refactoring on that right now.

Author: Reynold Xin <rxin@databricks.com>

Closes #7348 from rxin/typecheck and squashes the following commits:

8fcf814 [Reynold Xin] Fixed ordering of cases.
3bb63e7 [Reynold Xin] Style fix.
f45408f [Reynold Xin] Comment update.
aa7790e [Reynold Xin] Moved RemoveNullTypes into ImplicitTypeCasts.
438ea07 [Reynold Xin] space
d55c9e5 [Reynold Xin] Removes NullTypes.
360d124 [Reynold Xin] Fixed the rule.
fb66657 [Reynold Xin] Convert NullType into some accepted type for BinaryOperators.
2e22330 [Reynold Xin] Fixed unit tests.
4932d57 [Reynold Xin] Style fix.
d061691 [Reynold Xin] Rename existing ExpectsInputTypes -> ImplicitCastInputTypes.
e4727cc [Reynold Xin] BinaryOperator should not be doing implicit cast.
d017861 [Reynold Xin] Improve expression type checking.
---
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../catalyst/analysis/HiveTypeCoercion.scala  | 43 ++++++----
 .../expressions/ExpectsInputTypes.scala       | 17 +++-
 .../sql/catalyst/expressions/Expression.scala | 44 +++++++++-
 .../sql/catalyst/expressions/ScalaUDF.scala   |  2 +-
 .../sql/catalyst/expressions/arithmetic.scala | 84 ++++++++-----------
 .../sql/catalyst/expressions/bitwise.scala    | 30 +++----
 .../spark/sql/catalyst/expressions/math.scala | 18 ++--
 .../spark/sql/catalyst/expressions/misc.scala |  8 +-
 .../sql/catalyst/expressions/predicates.scala | 83 ++++++++++--------
 .../expressions/stringOperations.scala        | 36 ++++----
 .../spark/sql/catalyst/util/TypeUtils.scala   |  8 --
 .../spark/sql/types/AbstractDataType.scala    | 35 ++++++++
 .../analysis/AnalysisErrorSuite.scala         |  2 +-
 .../ExpressionTypeCheckingSuite.scala         |  6 +-
 .../analysis/HiveTypeCoercionSuite.scala      | 56 +++++++++++++
 .../spark/sql/MathExpressionsSuite.scala      |  1 -
 17 files changed, 309 insertions(+), 165 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index ed69c42dcb825..6b1a94e4b2ad4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import scala.language.existentials
 import scala.reflect.ClassTag
 import scala.util.{Failure, Success, Try}
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 8cb71995eb818..15da5eecc8d3c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -214,19 +214,6 @@ object HiveTypeCoercion {
           }
 
         Union(newLeft, newRight)
-
-      // Also widen types for BinaryOperator.
-      case q: LogicalPlan => q transformExpressions {
-        // Skip nodes who's children have not been resolved yet.
-        case e if !e.childrenResolved => e
-
-        case b @ BinaryOperator(left, right) if left.dataType != right.dataType =>
-          findTightestCommonTypeOfTwo(left.dataType, right.dataType).map { widestType =>
-            val newLeft = if (left.dataType == widestType) left else Cast(left, widestType)
-            val newRight = if (right.dataType == widestType) right else Cast(right, widestType)
-            b.makeCopy(Array(newLeft, newRight))
-          }.getOrElse(b)  // If there is no applicable conversion, leave expression unchanged.
-      }
     }
   }
 
@@ -672,20 +659,44 @@ object HiveTypeCoercion {
   }
 
   /**
-   * Casts types according to the expected input types for Expressions that have the trait
-   * [[ExpectsInputTypes]].
+   * Casts types according to the expected input types for [[Expression]]s.
    */
   object ImplicitTypeCasts extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
       // Skip nodes who's children have not been resolved yet.
       case e if !e.childrenResolved => e
 
-      case e: ExpectsInputTypes if (e.inputTypes.nonEmpty) =>
+      case b @ BinaryOperator(left, right) if left.dataType != right.dataType =>
+        findTightestCommonTypeOfTwo(left.dataType, right.dataType).map { commonType =>
+          if (b.inputType.acceptsType(commonType)) {
+            // If the expression accepts the tighest common type, cast to that.
+            val newLeft = if (left.dataType == commonType) left else Cast(left, commonType)
+            val newRight = if (right.dataType == commonType) right else Cast(right, commonType)
+            b.makeCopy(Array(newLeft, newRight))
+          } else {
+            // Otherwise, don't do anything with the expression.
+            b
+          }
+        }.getOrElse(b)  // If there is no applicable conversion, leave expression unchanged.
+
+      case e: ImplicitCastInputTypes if e.inputTypes.nonEmpty =>
         val children: Seq[Expression] = e.children.zip(e.inputTypes).map { case (in, expected) =>
           // If we cannot do the implicit cast, just use the original input.
           implicitCast(in, expected).getOrElse(in)
         }
         e.withNewChildren(children)
+
+      case e: ExpectsInputTypes if e.inputTypes.nonEmpty =>
+        // Convert NullType into some specific target type for ExpectsInputTypes that don't do
+        // general implicit casting.
+        val children: Seq[Expression] = e.children.zip(e.inputTypes).map { case (in, expected) =>
+          if (in.dataType == NullType && !expected.acceptsType(NullType)) {
+            Cast(in, expected.defaultConcreteType)
+          } else {
+            in
+          }
+        }
+        e.withNewChildren(children)
     }
 
     /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
index 3eb0eb195c80d..ded89e85dea79 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
@@ -19,10 +19,15 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.types.AbstractDataType
-
+import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion.ImplicitTypeCasts
 
 /**
  * An trait that gets mixin to define the expected input types of an expression.
+ *
+ * This trait is typically used by operator expressions (e.g. [[Add]], [[Subtract]]) to define
+ * expected input types without any implicit casting.
+ *
+ * Most function expressions (e.g. [[Substring]] should extends [[ImplicitCastInputTypes]]) instead.
  */
 trait ExpectsInputTypes { self: Expression =>
 
@@ -40,7 +45,7 @@ trait ExpectsInputTypes { self: Expression =>
     val mismatches = children.zip(inputTypes).zipWithIndex.collect {
       case ((child, expected), idx) if !expected.acceptsType(child.dataType) =>
         s"argument ${idx + 1} is expected to be of type ${expected.simpleString}, " +
-        s"however, '${child.prettyString}' is of type ${child.dataType.simpleString}."
+          s"however, '${child.prettyString}' is of type ${child.dataType.simpleString}."
     }
 
     if (mismatches.isEmpty) {
@@ -50,3 +55,11 @@ trait ExpectsInputTypes { self: Expression =>
     }
   }
 }
+
+
+/**
+ * A mixin for the analyzer to perform implicit type casting using [[ImplicitTypeCasts]].
+ */
+trait ImplicitCastInputTypes extends ExpectsInputTypes { self: Expression =>
+  // No other methods
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 54ec10444c4f3..3f19ac2b592b5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -24,8 +24,20 @@ import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types._
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// This file defines the basic expression abstract classes in Catalyst, including:
+// Expression: the base expression abstract class
+// LeafExpression
+// UnaryExpression
+// BinaryExpression
+// BinaryOperator
+//
+// For details, see their classdocs.
+////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /**
+ * An expression in Catalyst.
+ *
  * If an expression wants to be exposed in the function registry (so users can call it with
  * "name(arguments...)", the concrete implementation must be a case class whose constructor
  * arguments are all Expressions types.
@@ -335,15 +347,41 @@ abstract class BinaryExpression extends Expression with trees.BinaryNode[Express
 
 
 /**
- * An expression that has two inputs that are expected to the be same type. If the two inputs have
- * different types, the analyzer will find the tightest common type and do the proper type casting.
+ * A [[BinaryExpression]] that is an operator, with two properties:
+ *
+ * 1. The string representation is "x symbol y", rather than "funcName(x, y)".
+ * 2. Two inputs are expected to the be same type. If the two inputs have different types,
+ *    the analyzer will find the tightest common type and do the proper type casting.
  */
-abstract class BinaryOperator extends BinaryExpression {
+abstract class BinaryOperator extends BinaryExpression with ExpectsInputTypes {
   self: Product =>
 
+  /**
+   * Expected input type from both left/right child expressions, similar to the
+   * [[ImplicitCastInputTypes]] trait.
+   */
+  def inputType: AbstractDataType
+
   def symbol: String
 
   override def toString: String = s"($left $symbol $right)"
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(inputType, inputType)
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    // First call the checker for ExpectsInputTypes, and then check whether left and right have
+    // the same type.
+    super.checkInputDataTypes() match {
+      case TypeCheckResult.TypeCheckSuccess =>
+        if (left.dataType != right.dataType) {
+          TypeCheckResult.TypeCheckFailure(s"differing types in '$prettyString' " +
+            s"(${left.dataType.simpleString} and ${right.dataType.simpleString}).")
+        } else {
+          TypeCheckResult.TypeCheckSuccess
+        }
+      case TypeCheckResult.TypeCheckFailure(msg) => TypeCheckResult.TypeCheckFailure(msg)
+    }
+  }
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
index 6fb3343bb63f2..22687acd68a97 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
@@ -29,7 +29,7 @@ case class ScalaUDF(
     function: AnyRef,
     dataType: DataType,
     children: Seq[Expression],
-    inputTypes: Seq[DataType] = Nil) extends Expression with ExpectsInputTypes {
+    inputTypes: Seq[DataType] = Nil) extends Expression with ImplicitCastInputTypes {
 
   override def nullable: Boolean = true
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 8476af4a5d8d6..1a55a0876f303 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -18,23 +18,19 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
-abstract class UnaryArithmetic extends UnaryExpression {
-  self: Product =>
+
+case class UnaryMinus(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)
 
   override def dataType: DataType = child.dataType
-}
 
-case class UnaryMinus(child: Expression) extends UnaryArithmetic {
   override def toString: String = s"-$child"
 
-  override def checkInputDataTypes(): TypeCheckResult =
-    TypeUtils.checkForNumericExpr(child.dataType, "operator -")
-
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = dataType match {
@@ -45,9 +41,13 @@ case class UnaryMinus(child: Expression) extends UnaryArithmetic {
   protected override def nullSafeEval(input: Any): Any = numeric.negate(input)
 }
 
-case class UnaryPositive(child: Expression) extends UnaryArithmetic {
+case class UnaryPositive(child: Expression) extends UnaryExpression with ExpectsInputTypes {
   override def prettyName: String = "positive"
 
+  override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)
+
+  override def dataType: DataType = child.dataType
+
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
     defineCodeGen(ctx, ev, c => c)
 
@@ -57,9 +57,11 @@ case class UnaryPositive(child: Expression) extends UnaryArithmetic {
 /**
  * A function that get the absolute value of the numeric value.
  */
-case class Abs(child: Expression) extends UnaryArithmetic {
-  override def checkInputDataTypes(): TypeCheckResult =
-    TypeUtils.checkForNumericExpr(child.dataType, "function abs")
+case class Abs(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)
+
+  override def dataType: DataType = child.dataType
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
@@ -71,18 +73,6 @@ abstract class BinaryArithmetic extends BinaryOperator {
 
   override def dataType: DataType = left.dataType
 
-  override def checkInputDataTypes(): TypeCheckResult = {
-    if (left.dataType != right.dataType) {
-      TypeCheckResult.TypeCheckFailure(
-        s"differing types in ${this.getClass.getSimpleName} " +
-        s"(${left.dataType} and ${right.dataType}).")
-    } else {
-      checkTypesInternal(dataType)
-    }
-  }
-
-  protected def checkTypesInternal(t: DataType): TypeCheckResult
-
   /** Name of the function for this expression on a [[Decimal]] type. */
   def decimalMethod: String =
     sys.error("BinaryArithmetics must override either decimalMethod or genCode")
@@ -104,62 +94,61 @@ private[sql] object BinaryArithmetic {
 }
 
 case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
+
+  override def inputType: AbstractDataType = NumericType
+
   override def symbol: String = "+"
   override def decimalMethod: String = "$plus"
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
-
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = numeric.plus(input1, input2)
 }
 
 case class Subtract(left: Expression, right: Expression) extends BinaryArithmetic {
+
+  override def inputType: AbstractDataType = NumericType
+
   override def symbol: String = "-"
   override def decimalMethod: String = "$minus"
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
-
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = numeric.minus(input1, input2)
 }
 
 case class Multiply(left: Expression, right: Expression) extends BinaryArithmetic {
+
+  override def inputType: AbstractDataType = NumericType
+
   override def symbol: String = "*"
   override def decimalMethod: String = "$times"
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
-
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = numeric.times(input1, input2)
 }
 
 case class Divide(left: Expression, right: Expression) extends BinaryArithmetic {
+
+  override def inputType: AbstractDataType = NumericType
+
   override def symbol: String = "/"
   override def decimalMethod: String = "$div"
-
   override def nullable: Boolean = true
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
-
   private lazy val div: (Any, Any) => Any = dataType match {
     case ft: FractionalType => ft.fractional.asInstanceOf[Fractional[Any]].div
     case it: IntegralType => it.integral.asInstanceOf[Integral[Any]].quot
@@ -215,17 +204,16 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
 }
 
 case class Remainder(left: Expression, right: Expression) extends BinaryArithmetic {
+
+  override def inputType: AbstractDataType = NumericType
+
   override def symbol: String = "%"
   override def decimalMethod: String = "remainder"
-
   override def nullable: Boolean = true
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForNumericExpr(t, "operator " + symbol)
-
   private lazy val integral = dataType match {
     case i: IntegralType => i.integral.asInstanceOf[Integral[Any]]
     case i: FractionalType => i.asIntegral.asInstanceOf[Integral[Any]]
@@ -281,10 +269,11 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
 }
 
 case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
-  override def nullable: Boolean = left.nullable && right.nullable
+  // TODO: Remove MaxOf and MinOf, and replace its usage with Greatest and Least.
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForOrderingExpr(t, "function maxOf")
+  override def inputType: AbstractDataType = TypeCollection.Ordered
+
+  override def nullable: Boolean = left.nullable && right.nullable
 
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
@@ -335,10 +324,11 @@ case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
 }
 
 case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
-  override def nullable: Boolean = left.nullable && right.nullable
+  // TODO: Remove MaxOf and MinOf, and replace its usage with Greatest and Least.
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForOrderingExpr(t, "function minOf")
+  override def inputType: AbstractDataType = TypeCollection.Ordered
+
+  override def nullable: Boolean = left.nullable && right.nullable
 
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
index 2d47124d247e7..af1abbcd2239b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
@@ -17,9 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen._
-import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
 
@@ -29,10 +27,10 @@ import org.apache.spark.sql.types._
  * Code generation inherited from BinaryArithmetic.
  */
 case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithmetic {
-  override def symbol: String = "&"
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
+  override def inputType: AbstractDataType = TypeCollection.Bitwise
+
+  override def symbol: String = "&"
 
   private lazy val and: (Any, Any) => Any = dataType match {
     case ByteType =>
@@ -54,10 +52,10 @@ case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithme
  * Code generation inherited from BinaryArithmetic.
  */
 case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmetic {
-  override def symbol: String = "|"
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
+  override def inputType: AbstractDataType = TypeCollection.Bitwise
+
+  override def symbol: String = "|"
 
   private lazy val or: (Any, Any) => Any = dataType match {
     case ByteType =>
@@ -79,10 +77,10 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet
  * Code generation inherited from BinaryArithmetic.
  */
 case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithmetic {
-  override def symbol: String = "^"
 
-  protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForBitwiseExpr(t, "operator " + symbol)
+  override def inputType: AbstractDataType = TypeCollection.Bitwise
+
+  override def symbol: String = "^"
 
   private lazy val xor: (Any, Any) => Any = dataType match {
     case ByteType =>
@@ -101,11 +99,13 @@ case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithme
 /**
  * A function that calculates bitwise not(~) of a number.
  */
-case class BitwiseNot(child: Expression) extends UnaryArithmetic {
-  override def toString: String = s"~$child"
+case class BitwiseNot(child: Expression) extends UnaryExpression with ExpectsInputTypes {
 
-  override def checkInputDataTypes(): TypeCheckResult =
-    TypeUtils.checkForBitwiseExpr(child.dataType, "operator ~")
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection.Bitwise)
+
+  override def dataType: DataType = child.dataType
+
+  override def toString: String = s"~$child"
 
   private lazy val not: (Any) => Any = dataType match {
     case ByteType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index c31890e27fb54..4b7fe05dd4980 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -55,7 +55,7 @@ abstract class LeafMathExpression(c: Double, name: String)
  * @param name The short name of the function
  */
 abstract class UnaryMathExpression(f: Double => Double, name: String)
-  extends UnaryExpression with Serializable with ExpectsInputTypes { self: Product =>
+  extends UnaryExpression with Serializable with ImplicitCastInputTypes { self: Product =>
 
   override def inputTypes: Seq[DataType] = Seq(DoubleType)
   override def dataType: DataType = DoubleType
@@ -89,7 +89,7 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
  * @param name The short name of the function
  */
 abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
-  extends BinaryExpression with Serializable with ExpectsInputTypes { self: Product =>
+  extends BinaryExpression with Serializable with ImplicitCastInputTypes { self: Product =>
 
   override def inputTypes: Seq[DataType] = Seq(DoubleType, DoubleType)
 
@@ -174,7 +174,7 @@ object Factorial {
   )
 }
 
-case class Factorial(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Factorial(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[DataType] = Seq(IntegerType)
 
@@ -251,7 +251,7 @@ case class ToRadians(child: Expression) extends UnaryMathExpression(math.toRadia
 }
 
 case class Bin(child: Expression)
-  extends UnaryExpression with Serializable with ExpectsInputTypes {
+  extends UnaryExpression with Serializable with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[DataType] = Seq(LongType)
   override def dataType: DataType = StringType
@@ -285,7 +285,7 @@ object Hex {
  * Otherwise if the number is a STRING, it converts each character into its hex representation
  * and returns the resulting STRING. Negative numbers would be treated as two's complement.
  */
-case class Hex(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Hex(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
   // TODO: Create code-gen version.
 
   override def inputTypes: Seq[AbstractDataType] =
@@ -329,7 +329,7 @@ case class Hex(child: Expression) extends UnaryExpression with ExpectsInputTypes
  * Performs the inverse operation of HEX.
  * Resulting characters are returned as a byte array.
  */
-case class Unhex(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Unhex(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
   // TODO: Create code-gen version.
 
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
@@ -416,7 +416,7 @@ case class Pow(left: Expression, right: Expression)
  * @param right number of bits to left shift.
  */
 case class ShiftLeft(left: Expression, right: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] =
     Seq(TypeCollection(IntegerType, LongType), IntegerType)
@@ -442,7 +442,7 @@ case class ShiftLeft(left: Expression, right: Expression)
  * @param right number of bits to left shift.
  */
 case class ShiftRight(left: Expression, right: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] =
     Seq(TypeCollection(IntegerType, LongType), IntegerType)
@@ -468,7 +468,7 @@ case class ShiftRight(left: Expression, right: Expression)
  * @param right the number of bits to right shift.
  */
 case class ShiftRightUnsigned(left: Expression, right: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] =
     Seq(TypeCollection(IntegerType, LongType), IntegerType)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
index 3b59cd431b871..a269ec4a1e6dc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
@@ -31,7 +31,7 @@ import org.apache.spark.unsafe.types.UTF8String
  * A function that calculates an MD5 128-bit checksum and returns it as a hex string
  * For input of type [[BinaryType]]
  */
-case class Md5(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Md5(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def dataType: DataType = StringType
 
@@ -55,7 +55,7 @@ case class Md5(child: Expression) extends UnaryExpression with ExpectsInputTypes
  * the hash length is not one of the permitted values, the return value is NULL.
  */
 case class Sha2(left: Expression, right: Expression)
-  extends BinaryExpression with Serializable with ExpectsInputTypes {
+  extends BinaryExpression with Serializable with ImplicitCastInputTypes {
 
   override def dataType: DataType = StringType
 
@@ -118,7 +118,7 @@ case class Sha2(left: Expression, right: Expression)
  * A function that calculates a sha1 hash value and returns it as a hex string
  * For input of type [[BinaryType]] or [[StringType]]
  */
-case class Sha1(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Sha1(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def dataType: DataType = StringType
 
@@ -138,7 +138,7 @@ case class Sha1(child: Expression) extends UnaryExpression with ExpectsInputType
  * A function that computes a cyclic redundancy check value and returns it as a bigint
  * For input of type [[BinaryType]]
  */
-case class Crc32(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Crc32(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def dataType: DataType = LongType
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index f74fd04619714..aa6c30e2f79f2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -33,12 +33,17 @@ object InterpretedPredicate {
   }
 }
 
+
+/**
+ * An [[Expression]] that returns a boolean value.
+ */
 trait Predicate extends Expression {
   self: Product =>
 
   override def dataType: DataType = BooleanType
 }
 
+
 trait PredicateHelper {
   protected def splitConjunctivePredicates(condition: Expression): Seq[Expression] = {
     condition match {
@@ -70,7 +75,10 @@ trait PredicateHelper {
     expr.references.subsetOf(plan.outputSet)
 }
 
-case class Not(child: Expression) extends UnaryExpression with Predicate with ExpectsInputTypes {
+
+case class Not(child: Expression)
+  extends UnaryExpression with Predicate with ImplicitCastInputTypes {
+
   override def toString: String = s"NOT $child"
 
   override def inputTypes: Seq[DataType] = Seq(BooleanType)
@@ -82,6 +90,7 @@ case class Not(child: Expression) extends UnaryExpression with Predicate with Ex
   }
 }
 
+
 /**
  * Evaluates to `true` if `list` contains `value`.
  */
@@ -97,6 +106,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate {
   }
 }
 
+
 /**
  * Optimized version of In clause, when all filter values of In clause are
  * static.
@@ -112,12 +122,12 @@ case class InSet(child: Expression, hset: Set[Any])
   }
 }
 
-case class And(left: Expression, right: Expression)
-  extends BinaryExpression with Predicate with ExpectsInputTypes {
 
-  override def toString: String = s"($left && $right)"
+case class And(left: Expression, right: Expression) extends BinaryOperator with Predicate {
+
+  override def inputType: AbstractDataType = BooleanType
 
-  override def inputTypes: Seq[DataType] = Seq(BooleanType, BooleanType)
+  override def symbol: String = "&&"
 
   override def eval(input: InternalRow): Any = {
     val input1 = left.eval(input)
@@ -161,12 +171,12 @@ case class And(left: Expression, right: Expression)
   }
 }
 
-case class Or(left: Expression, right: Expression)
-  extends BinaryExpression with Predicate with ExpectsInputTypes {
 
-  override def toString: String = s"($left || $right)"
+case class Or(left: Expression, right: Expression) extends BinaryOperator with Predicate {
 
-  override def inputTypes: Seq[DataType] = Seq(BooleanType, BooleanType)
+  override def inputType: AbstractDataType = BooleanType
+
+  override def symbol: String = "||"
 
   override def eval(input: InternalRow): Any = {
     val input1 = left.eval(input)
@@ -210,21 +220,10 @@ case class Or(left: Expression, right: Expression)
   }
 }
 
+
 abstract class BinaryComparison extends BinaryOperator with Predicate {
   self: Product =>
 
-  override def checkInputDataTypes(): TypeCheckResult = {
-    if (left.dataType != right.dataType) {
-      TypeCheckResult.TypeCheckFailure(
-        s"differing types in ${this.getClass.getSimpleName} " +
-        s"(${left.dataType} and ${right.dataType}).")
-    } else {
-      checkTypesInternal(dataType)
-    }
-  }
-
-  protected def checkTypesInternal(t: DataType): TypeCheckResult
-
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     if (ctx.isPrimitiveType(left.dataType)) {
       // faster version
@@ -235,10 +234,12 @@ abstract class BinaryComparison extends BinaryOperator with Predicate {
   }
 }
 
+
 private[sql] object BinaryComparison {
   def unapply(e: BinaryComparison): Option[(Expression, Expression)] = Some((e.left, e.right))
 }
 
+
 /** An extractor that matches both standard 3VL equality and null-safe equality. */
 private[sql] object Equality {
   def unapply(e: BinaryComparison): Option[(Expression, Expression)] = e match {
@@ -248,10 +249,12 @@ private[sql] object Equality {
   }
 }
 
+
 case class EqualTo(left: Expression, right: Expression) extends BinaryComparison {
-  override def symbol: String = "="
 
-  override protected def checkTypesInternal(t: DataType) = TypeCheckResult.TypeCheckSuccess
+  override def inputType: AbstractDataType = AnyDataType
+
+  override def symbol: String = "="
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
     if (left.dataType != BinaryType) input1 == input2
@@ -263,13 +266,15 @@ case class EqualTo(left: Expression, right: Expression) extends BinaryComparison
   }
 }
 
+
 case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComparison {
+
+  override def inputType: AbstractDataType = AnyDataType
+
   override def symbol: String = "<=>"
 
   override def nullable: Boolean = false
 
-  override protected def checkTypesInternal(t: DataType) = TypeCheckResult.TypeCheckSuccess
-
   override def eval(input: InternalRow): Any = {
     val input1 = left.eval(input)
     val input2 = right.eval(input)
@@ -298,44 +303,48 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
   }
 }
 
+
 case class LessThan(left: Expression, right: Expression) extends BinaryComparison {
-  override def symbol: String = "<"
 
-  override protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForOrderingExpr(left.dataType, "operator " + symbol)
+  override def inputType: AbstractDataType = TypeCollection.Ordered
+
+  override def symbol: String = "<"
 
   private lazy val ordering = TypeUtils.getOrdering(left.dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.lt(input1, input2)
 }
 
+
 case class LessThanOrEqual(left: Expression, right: Expression) extends BinaryComparison {
-  override def symbol: String = "<="
 
-  override protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForOrderingExpr(left.dataType, "operator " + symbol)
+  override def inputType: AbstractDataType = TypeCollection.Ordered
+
+  override def symbol: String = "<="
 
   private lazy val ordering = TypeUtils.getOrdering(left.dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.lteq(input1, input2)
 }
 
+
 case class GreaterThan(left: Expression, right: Expression) extends BinaryComparison {
-  override def symbol: String = ">"
 
-  override protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForOrderingExpr(left.dataType, "operator " + symbol)
+  override def inputType: AbstractDataType = TypeCollection.Ordered
+
+  override def symbol: String = ">"
 
   private lazy val ordering = TypeUtils.getOrdering(left.dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.gt(input1, input2)
 }
 
+
 case class GreaterThanOrEqual(left: Expression, right: Expression) extends BinaryComparison {
-  override def symbol: String = ">="
 
-  override protected def checkTypesInternal(t: DataType) =
-    TypeUtils.checkForOrderingExpr(left.dataType, "operator " + symbol)
+  override def inputType: AbstractDataType = TypeCollection.Ordered
+
+  override def symbol: String = ">="
 
   private lazy val ordering = TypeUtils.getOrdering(left.dataType)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index f64899c1ed84c..03b55ce5fe7cc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -29,7 +29,7 @@ import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
 
-trait StringRegexExpression extends ExpectsInputTypes {
+trait StringRegexExpression extends ImplicitCastInputTypes {
   self: BinaryExpression =>
 
   def escape(v: String): String
@@ -105,7 +105,7 @@ case class RLike(left: Expression, right: Expression)
   override def toString: String = s"$left RLIKE $right"
 }
 
-trait String2StringExpression extends ExpectsInputTypes {
+trait String2StringExpression extends ImplicitCastInputTypes {
   self: UnaryExpression =>
 
   def convert(v: UTF8String): UTF8String
@@ -142,7 +142,7 @@ case class Lower(child: Expression) extends UnaryExpression with String2StringEx
 }
 
 /** A base trait for functions that compare two strings, returning a boolean. */
-trait StringComparison extends ExpectsInputTypes {
+trait StringComparison extends ImplicitCastInputTypes {
   self: BinaryExpression =>
 
   def compare(l: UTF8String, r: UTF8String): Boolean
@@ -241,7 +241,7 @@ case class StringTrimRight(child: Expression)
  * NOTE: that this is not zero based, but 1-based index. The first character in str has index 1.
  */
 case class StringInstr(str: Expression, substr: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def left: Expression = str
   override def right: Expression = substr
@@ -265,7 +265,7 @@ case class StringInstr(str: Expression, substr: Expression)
  * in given string after position pos.
  */
 case class StringLocate(substr: Expression, str: Expression, start: Expression)
-  extends Expression with ExpectsInputTypes {
+  extends Expression with ImplicitCastInputTypes {
 
   def this(substr: Expression, str: Expression) = {
     this(substr, str, Literal(0))
@@ -306,7 +306,7 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression)
  * Returns str, left-padded with pad to a length of len.
  */
 case class StringLPad(str: Expression, len: Expression, pad: Expression)
-  extends Expression with ExpectsInputTypes {
+  extends Expression with ImplicitCastInputTypes {
 
   override def children: Seq[Expression] = str :: len :: pad :: Nil
   override def foldable: Boolean = children.forall(_.foldable)
@@ -344,7 +344,7 @@ case class StringLPad(str: Expression, len: Expression, pad: Expression)
  * Returns str, right-padded with pad to a length of len.
  */
 case class StringRPad(str: Expression, len: Expression, pad: Expression)
-  extends Expression with ExpectsInputTypes {
+  extends Expression with ImplicitCastInputTypes {
 
   override def children: Seq[Expression] = str :: len :: pad :: Nil
   override def foldable: Boolean = children.forall(_.foldable)
@@ -413,7 +413,7 @@ case class StringFormat(children: Expression*) extends Expression {
  * Returns the string which repeat the given string value n times.
  */
 case class StringRepeat(str: Expression, times: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def left: Expression = str
   override def right: Expression = times
@@ -447,7 +447,7 @@ case class StringReverse(child: Expression) extends UnaryExpression with String2
 /**
  * Returns a n spaces string.
  */
-case class StringSpace(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class StringSpace(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(IntegerType)
@@ -467,7 +467,7 @@ case class StringSpace(child: Expression) extends UnaryExpression with ExpectsIn
  * Splits str around pat (pattern is a regular expression).
  */
 case class StringSplit(str: Expression, pattern: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def left: Expression = str
   override def right: Expression = pattern
@@ -488,7 +488,7 @@ case class StringSplit(str: Expression, pattern: Expression)
  * Defined for String and Binary types.
  */
 case class Substring(str: Expression, pos: Expression, len: Expression)
-  extends Expression with ExpectsInputTypes {
+  extends Expression with ImplicitCastInputTypes {
 
   def this(str: Expression, pos: Expression) = {
     this(str, pos, Literal(Integer.MAX_VALUE))
@@ -555,7 +555,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
 /**
  * A function that return the length of the given string expression.
  */
-case class StringLength(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class StringLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
@@ -573,7 +573,7 @@ case class StringLength(child: Expression) extends UnaryExpression with ExpectsI
  * A function that return the Levenshtein distance between the two given strings.
  */
 case class Levenshtein(left: Expression, right: Expression) extends BinaryExpression
-    with ExpectsInputTypes {
+    with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
 
@@ -591,7 +591,7 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
 /**
  * Returns the numeric value of the first character of str.
  */
-case class Ascii(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
@@ -608,7 +608,7 @@ case class Ascii(child: Expression) extends UnaryExpression with ExpectsInputTyp
 /**
  * Converts the argument from binary to a base 64 string.
  */
-case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Base64(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
@@ -622,7 +622,7 @@ case class Base64(child: Expression) extends UnaryExpression with ExpectsInputTy
 /**
  * Converts the argument from a base 64 string to BINARY.
  */
-case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class UnBase64(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
   override def dataType: DataType = BinaryType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
@@ -636,7 +636,7 @@ case class UnBase64(child: Expression) extends UnaryExpression with ExpectsInput
  * If either argument is null, the result will also be null.
  */
 case class Decode(bin: Expression, charset: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def left: Expression = bin
   override def right: Expression = charset
@@ -655,7 +655,7 @@ case class Decode(bin: Expression, charset: Expression)
  * If either argument is null, the result will also be null.
 */
 case class Encode(value: Expression, charset: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def left: Expression = value
   override def right: Expression = charset
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala
index 3148309a2166f..0103ddcf9cfb7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala
@@ -32,14 +32,6 @@ object TypeUtils {
     }
   }
 
-  def checkForBitwiseExpr(t: DataType, caller: String): TypeCheckResult = {
-    if (t.isInstanceOf[IntegralType] || t == NullType) {
-      TypeCheckResult.TypeCheckSuccess
-    } else {
-      TypeCheckResult.TypeCheckFailure(s"$caller accepts integral types, not $t")
-    }
-  }
-
   def checkForOrderingExpr(t: DataType, caller: String): TypeCheckResult = {
     if (t.isInstanceOf[AtomicType] || t == NullType) {
       TypeCheckResult.TypeCheckSuccess
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
index 32f87440b4e37..f5715f7a829ff 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
@@ -96,6 +96,24 @@ private[sql] class TypeCollection(private val types: Seq[AbstractDataType])
 
 private[sql] object TypeCollection {
 
+  /**
+   * Types that can be ordered/compared. In the long run we should probably make this a trait
+   * that can be mixed into each data type, and perhaps create an [[AbstractDataType]].
+   */
+  val Ordered = TypeCollection(
+    BooleanType,
+    ByteType, ShortType, IntegerType, LongType,
+    FloatType, DoubleType, DecimalType,
+    TimestampType, DateType,
+    StringType, BinaryType)
+
+  /**
+   * Types that can be used in bitwise operations.
+   */
+  val Bitwise = TypeCollection(
+    BooleanType,
+    ByteType, ShortType, IntegerType, LongType)
+
   def apply(types: AbstractDataType*): TypeCollection = new TypeCollection(types)
 
   def unapply(typ: AbstractDataType): Option[Seq[AbstractDataType]] = typ match {
@@ -105,6 +123,23 @@ private[sql] object TypeCollection {
 }
 
 
+/**
+ * An [[AbstractDataType]] that matches any concrete data types.
+ */
+protected[sql] object AnyDataType extends AbstractDataType {
+
+  // Note that since AnyDataType matches any concrete types, defaultConcreteType should never
+  // be invoked.
+  override private[sql] def defaultConcreteType: DataType = throw new UnsupportedOperationException
+
+  override private[sql] def simpleString: String = "any"
+
+  override private[sql] def isSameType(other: DataType): Boolean = false
+
+  override private[sql] def acceptsType(other: DataType): Boolean = true
+}
+
+
 /**
  * An internal type used to represent everything that is not null, UDTs, arrays, structs, and maps.
  */
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
index 9d0c69a2451d1..f0f17103991ef 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.dsl.plans._
 
 case class TestFunction(
     children: Seq[Expression],
-    inputTypes: Seq[AbstractDataType]) extends Expression with ExpectsInputTypes {
+    inputTypes: Seq[AbstractDataType]) extends Expression with ImplicitCastInputTypes {
   override def nullable: Boolean = true
   override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
   override def dataType: DataType = StringType
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
index 8e0551b23eea6..5958acbe009ca 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
@@ -49,7 +49,7 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
 
   def assertErrorForDifferingTypes(expr: Expression): Unit = {
     assertError(expr,
-      s"differing types in ${expr.getClass.getSimpleName} (IntegerType and BooleanType).")
+      s"differing types in '${expr.prettyString}' (int and boolean)")
   }
 
   test("check types for unary arithmetic") {
@@ -58,7 +58,7 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
     assertError(BitwiseNot('stringField), "operator ~ accepts integral type")
   }
 
-  test("check types for binary arithmetic") {
+  ignore("check types for binary arithmetic") {
     // We will cast String to Double for binary arithmetic
     assertSuccess(Add('intField, 'stringField))
     assertSuccess(Subtract('intField, 'stringField))
@@ -92,7 +92,7 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
     assertError(MinOf('complexField, 'complexField), "function minOf accepts non-complex type")
   }
 
-  test("check types for predicates") {
+  ignore("check types for predicates") {
     // We will cast String to Double for binary comparison
     assertSuccess(EqualTo('intField, 'stringField))
     assertSuccess(EqualNullSafe('intField, 'stringField))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index acb9a433de903..8e9b20a3ebe42 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -194,6 +194,32 @@ class HiveTypeCoercionSuite extends PlanTest {
       Project(Seq(Alias(transformed, "a")()), testRelation))
   }
 
+  test("cast NullType for expresions that implement ExpectsInputTypes") {
+    import HiveTypeCoercionSuite._
+
+    ruleTest(HiveTypeCoercion.ImplicitTypeCasts,
+      AnyTypeUnaryExpression(Literal.create(null, NullType)),
+      AnyTypeUnaryExpression(Literal.create(null, NullType)))
+
+    ruleTest(HiveTypeCoercion.ImplicitTypeCasts,
+      NumericTypeUnaryExpression(Literal.create(null, NullType)),
+      NumericTypeUnaryExpression(Cast(Literal.create(null, NullType), DoubleType)))
+  }
+
+  test("cast NullType for binary operators") {
+    import HiveTypeCoercionSuite._
+
+    ruleTest(HiveTypeCoercion.ImplicitTypeCasts,
+      AnyTypeBinaryOperator(Literal.create(null, NullType), Literal.create(null, NullType)),
+      AnyTypeBinaryOperator(Literal.create(null, NullType), Literal.create(null, NullType)))
+
+    ruleTest(HiveTypeCoercion.ImplicitTypeCasts,
+      NumericTypeBinaryOperator(Literal.create(null, NullType), Literal.create(null, NullType)),
+      NumericTypeBinaryOperator(
+        Cast(Literal.create(null, NullType), DoubleType),
+        Cast(Literal.create(null, NullType), DoubleType)))
+  }
+
   test("coalesce casts") {
     ruleTest(HiveTypeCoercion.FunctionArgumentConversion,
       Coalesce(Literal(1.0)
@@ -302,3 +328,33 @@ class HiveTypeCoercionSuite extends PlanTest {
     )
   }
 }
+
+
+object HiveTypeCoercionSuite {
+
+  case class AnyTypeUnaryExpression(child: Expression)
+    extends UnaryExpression with ExpectsInputTypes {
+    override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)
+    override def dataType: DataType = NullType
+  }
+
+  case class NumericTypeUnaryExpression(child: Expression)
+    extends UnaryExpression with ExpectsInputTypes {
+    override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)
+    override def dataType: DataType = NullType
+  }
+
+  case class AnyTypeBinaryOperator(left: Expression, right: Expression)
+    extends BinaryOperator with ExpectsInputTypes {
+    override def dataType: DataType = NullType
+    override def inputType: AbstractDataType = AnyDataType
+    override def symbol: String = "anytype"
+  }
+
+  case class NumericTypeBinaryOperator(left: Expression, right: Expression)
+    extends BinaryOperator with ExpectsInputTypes {
+    override def dataType: DataType = NullType
+    override def inputType: AbstractDataType = NumericType
+    override def symbol: String = "numerictype"
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index 24bef21b999ea..b30b9f12258b9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -375,6 +375,5 @@ class MathExpressionsSuite extends QueryTest {
     val df = Seq((1, -1, "abc")).toDF("a", "b", "c")
     checkAnswer(df.selectExpr("positive(a)"), Row(1))
     checkAnswer(df.selectExpr("positive(b)"), Row(-1))
-    checkAnswer(df.selectExpr("positive(c)"), Row("abc"))
   }
 }

From c6b1a9e74e34267dc198e57a184c41498ca9d6a3 Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Tue, 14 Jul 2015 22:57:39 -0700
Subject: [PATCH 159/549] Revert SPARK-6910 and SPARK-9027

Revert #7216 and #7386.  These patch seems to be causing quite a few test failures:

```
Caused by: java.lang.reflect.InvocationTargetException
	at sun.reflect.GeneratedMethodAccessor322.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at org.apache.spark.sql.hive.client.Shim_v0_13.getPartitionsByFilter(HiveShim.scala:351)
	at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$getPartitionsByFilter$1.apply(ClientWrapper.scala:320)
	at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$getPartitionsByFilter$1.apply(ClientWrapper.scala:318)
	at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$withHiveState$1.apply(ClientWrapper.scala:180)
	at org.apache.spark.sql.hive.client.ClientWrapper.retryLocked(ClientWrapper.scala:135)
	at org.apache.spark.sql.hive.client.ClientWrapper.withHiveState(ClientWrapper.scala:172)
	at org.apache.spark.sql.hive.client.ClientWrapper.getPartitionsByFilter(ClientWrapper.scala:318)
	at org.apache.spark.sql.hive.client.HiveTable.getPartitions(ClientInterface.scala:78)
	at org.apache.spark.sql.hive.MetastoreRelation.getHiveQlPartitions(HiveMetastoreCatalog.scala:670)
	at org.apache.spark.sql.hive.execution.HiveTableScan.doExecute(HiveTableScan.scala:137)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:90)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:90)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:89)
	at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:164)
	at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:151)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
	... 85 more
Caused by: MetaException(message:Filtering is supported only on partition keys of type string)
	at org.apache.hadoop.hive.metastore.parser.ExpressionTree$FilterBuilder.setError(ExpressionTree.java:185)
	at org.apache.hadoop.hive.metastore.parser.ExpressionTree$LeafNode.getJdoFilterPushdownParam(ExpressionTree.java:452)
	at org.apache.hadoop.hive.metastore.parser.ExpressionTree$LeafNode.generateJDOFilterOverPartitions(ExpressionTree.java:357)
	at org.apache.hadoop.hive.metastore.parser.ExpressionTree$LeafNode.generateJDOFilter(ExpressionTree.java:279)
	at org.apache.hadoop.hive.metastore.parser.ExpressionTree$TreeNode.generateJDOFilter(ExpressionTree.java:243)
	at org.apache.hadoop.hive.metastore.parser.ExpressionTree.generateJDOFilterFragment(ExpressionTree.java:590)
	at org.apache.hadoop.hive.metastore.ObjectStore.makeQueryFilterString(ObjectStore.java:2417)
	at org.apache.hadoop.hive.metastore.ObjectStore.getPartitionsViaOrmFilter(ObjectStore.java:2029)
	at org.apache.hadoop.hive.metastore.ObjectStore.access$500(ObjectStore.java:146)
	at org.apache.hadoop.hive.metastore.ObjectStore$4.getJdoResult(ObjectStore.java:2332)
```
https://amplab.cs.berkeley.edu/jenkins/view/Spark-QA-Test/job/Spark-Master-Maven-with-YARN/2945/HADOOP_PROFILE=hadoop-2.4,label=centos/testReport/junit/org.apache.spark.sql.hive.execution/SortMergeCompatibilitySuite/auto_sortmerge_join_16/

Author: Michael Armbrust <michael@databricks.com>

Closes #7409 from marmbrus/revertMetastorePushdown and squashes the following commits:

92fabd3 [Michael Armbrust] Revert SPARK-6910 and SPARK-9027
5d3bdf2 [Michael Armbrust] Revert "[SPARK-9027] [SQL] Generalize metastore predicate pushdown"
---
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 58 +++++++-------
 .../org/apache/spark/sql/hive/HiveShim.scala  |  1 -
 .../spark/sql/hive/HiveStrategies.scala       |  4 +-
 .../sql/hive/client/ClientInterface.scala     | 11 +--
 .../spark/sql/hive/client/ClientWrapper.scala | 21 +++--
 .../spark/sql/hive/client/HiveShim.scala      | 72 +----------------
 .../sql/hive/execution/HiveTableScan.scala    |  7 +-
 .../spark/sql/hive/client/FiltersSuite.scala  | 78 -------------------
 .../spark/sql/hive/client/VersionsSuite.scala |  8 --
 .../sql/hive/execution/PruningSuite.scala     |  2 +-
 10 files changed, 44 insertions(+), 218 deletions(-)
 delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 5bdf68c83fca7..4b7a782c805a0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -301,9 +301,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
       val partitionColumnDataTypes = partitionSchema.map(_.dataType)
-      // We're converting the entire table into ParquetRelation, so predicates to Hive metastore
-      // are empty.
-      val partitions = metastoreRelation.getHiveQlPartitions().map { p =>
+      val partitions = metastoreRelation.hiveQlPartitions.map { p =>
         val location = p.getLocation
         val values = InternalRow.fromSeq(p.getValues.zip(partitionColumnDataTypes).map {
           case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null)
@@ -646,6 +644,32 @@ private[hive] case class MetastoreRelation
     new Table(tTable)
   }
 
+  @transient val hiveQlPartitions: Seq[Partition] = table.getAllPartitions.map { p =>
+    val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
+    tPartition.setDbName(databaseName)
+    tPartition.setTableName(tableName)
+    tPartition.setValues(p.values)
+
+    val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor()
+    tPartition.setSd(sd)
+    sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)))
+
+    sd.setLocation(p.storage.location)
+    sd.setInputFormat(p.storage.inputFormat)
+    sd.setOutputFormat(p.storage.outputFormat)
+
+    val serdeInfo = new org.apache.hadoop.hive.metastore.api.SerDeInfo
+    sd.setSerdeInfo(serdeInfo)
+    serdeInfo.setSerializationLib(p.storage.serde)
+
+    val serdeParameters = new java.util.HashMap[String, String]()
+    serdeInfo.setParameters(serdeParameters)
+    table.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
+    p.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
+
+    new Partition(hiveQlTable, tPartition)
+  }
+
   @transient override lazy val statistics: Statistics = Statistics(
     sizeInBytes = {
       val totalSize = hiveQlTable.getParameters.get(StatsSetupConst.TOTAL_SIZE)
@@ -666,34 +690,6 @@ private[hive] case class MetastoreRelation
     }
   )
 
-  def getHiveQlPartitions(predicates: Seq[Expression] = Nil): Seq[Partition] = {
-    table.getPartitions(predicates).map { p =>
-      val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
-      tPartition.setDbName(databaseName)
-      tPartition.setTableName(tableName)
-      tPartition.setValues(p.values)
-
-      val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor()
-      tPartition.setSd(sd)
-      sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)))
-
-      sd.setLocation(p.storage.location)
-      sd.setInputFormat(p.storage.inputFormat)
-      sd.setOutputFormat(p.storage.outputFormat)
-
-      val serdeInfo = new org.apache.hadoop.hive.metastore.api.SerDeInfo
-      sd.setSerdeInfo(serdeInfo)
-      serdeInfo.setSerializationLib(p.storage.serde)
-
-      val serdeParameters = new java.util.HashMap[String, String]()
-      serdeInfo.setParameters(serdeParameters)
-      table.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
-      p.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
-
-      new Partition(hiveQlTable, tPartition)
-    }
-  }
-
   /** Only compare database and tablename, not alias. */
   override def sameResult(plan: LogicalPlan): Boolean = {
     plan match {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
index a357bb39ca7fd..d08c594151654 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
@@ -27,7 +27,6 @@ import scala.reflect.ClassTag
 
 import com.esotericsoftware.kryo.Kryo
 import com.esotericsoftware.kryo.io.{Input, Output}
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.hive.ql.exec.{UDF, Utilities}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index 9638a8201e190..ed359620a5f7f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -125,7 +125,7 @@ private[hive] trait HiveStrategies {
                 InterpretedPredicate.create(castedPredicate)
               }
 
-            val partitions = relation.getHiveQlPartitions(pruningPredicates).filter { part =>
+            val partitions = relation.hiveQlPartitions.filter { part =>
               val partitionValues = part.getValues
               var i = 0
               while (i < partitionValues.size()) {
@@ -213,7 +213,7 @@ private[hive] trait HiveStrategies {
           projectList,
           otherPredicates,
           identity[Seq[Expression]],
-          HiveTableScan(_, relation, pruningPredicates)(hiveContext)) :: Nil
+          HiveTableScan(_, relation, pruningPredicates.reduceLeftOption(And))(hiveContext)) :: Nil
       case _ =>
         Nil
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
index 1656587d14835..0a1d761a52f88 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
@@ -21,7 +21,6 @@ import java.io.PrintStream
 import java.util.{Map => JMap}
 
 import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchTableException}
-import org.apache.spark.sql.catalyst.expressions.Expression
 
 private[hive] case class HiveDatabase(
     name: String,
@@ -72,12 +71,7 @@ private[hive] case class HiveTable(
 
   def isPartitioned: Boolean = partitionColumns.nonEmpty
 
-  def getPartitions(predicates: Seq[Expression]): Seq[HivePartition] = {
-    predicates match {
-      case Nil => client.getAllPartitions(this)
-      case _ => client.getPartitionsByFilter(this, predicates)
-    }
-  }
+  def getAllPartitions: Seq[HivePartition] = client.getAllPartitions(this)
 
   // Hive does not support backticks when passing names to the client.
   def qualifiedName: String = s"$database.$name"
@@ -138,9 +132,6 @@ private[hive] trait ClientInterface {
   /** Returns all partitions for the given table. */
   def getAllPartitions(hTable: HiveTable): Seq[HivePartition]
 
-  /** Returns partitions filtered by predicates for the given table. */
-  def getPartitionsByFilter(hTable: HiveTable, predicates: Seq[Expression]): Seq[HivePartition]
-
   /** Loads a static partition into an existing table. */
   def loadPartition(
       loadPath: String,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index 8adda54754230..53f457ad4f3cc 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -17,21 +17,25 @@
 
 package org.apache.spark.sql.hive.client
 
-import java.io.{File, PrintStream}
-import java.util.{Map => JMap}
+import java.io.{BufferedReader, InputStreamReader, File, PrintStream}
+import java.net.URI
+import java.util.{ArrayList => JArrayList, Map => JMap, List => JList, Set => JSet}
 import javax.annotation.concurrent.GuardedBy
 
 import scala.collection.JavaConversions._
 import scala.language.reflectiveCalls
 
 import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.metastore.api.Database
 import org.apache.hadoop.hive.conf.HiveConf
-import org.apache.hadoop.hive.metastore.api.{Database, FieldSchema}
 import org.apache.hadoop.hive.metastore.{TableType => HTableType}
+import org.apache.hadoop.hive.metastore.api
+import org.apache.hadoop.hive.metastore.api.FieldSchema
+import org.apache.hadoop.hive.ql.metadata
 import org.apache.hadoop.hive.ql.metadata.Hive
-import org.apache.hadoop.hive.ql.processors._
 import org.apache.hadoop.hive.ql.session.SessionState
-import org.apache.hadoop.hive.ql.{Driver, metadata}
+import org.apache.hadoop.hive.ql.processors._
+import org.apache.hadoop.hive.ql.Driver
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions.Expression
@@ -312,13 +316,6 @@ private[hive] class ClientWrapper(
     shim.getAllPartitions(client, qlTable).map(toHivePartition)
   }
 
-  override def getPartitionsByFilter(
-      hTable: HiveTable,
-      predicates: Seq[Expression]): Seq[HivePartition] = withHiveState {
-    val qlTable = toQlTable(hTable)
-    shim.getPartitionsByFilter(client, qlTable, predicates).map(toHivePartition)
-  }
-
   override def listTables(dbName: String): Seq[String] = withHiveState {
     client.getAllTables(dbName)
   }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index d12778c7583df..1fa9d278e2a57 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -31,11 +31,6 @@ import org.apache.hadoop.hive.ql.Driver
 import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table}
 import org.apache.hadoop.hive.ql.processors.{CommandProcessor, CommandProcessorFactory}
 import org.apache.hadoop.hive.ql.session.SessionState
-import org.apache.hadoop.hive.serde.serdeConstants
-
-import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.{StringType, IntegralType}
 
 /**
  * A shim that defines the interface between ClientWrapper and the underlying Hive library used to
@@ -66,8 +61,6 @@ private[client] sealed abstract class Shim {
 
   def getAllPartitions(hive: Hive, table: Table): Seq[Partition]
 
-  def getPartitionsByFilter(hive: Hive, table: Table, predicates: Seq[Expression]): Seq[Partition]
-
   def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor
 
   def getDriverResults(driver: Driver): Seq[String]
@@ -116,7 +109,7 @@ private[client] sealed abstract class Shim {
 
 }
 
-private[client] class Shim_v0_12 extends Shim with Logging {
+private[client] class Shim_v0_12 extends Shim {
 
   private lazy val startMethod =
     findStaticMethod(
@@ -203,17 +196,6 @@ private[client] class Shim_v0_12 extends Shim with Logging {
   override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] =
     getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq
 
-  override def getPartitionsByFilter(
-      hive: Hive,
-      table: Table,
-      predicates: Seq[Expression]): Seq[Partition] = {
-    // getPartitionsByFilter() doesn't support binary comparison ops in Hive 0.12.
-    // See HIVE-4888.
-    logDebug("Hive 0.12 doesn't support predicate pushdown to metastore. " +
-      "Please use Hive 0.13 or higher.")
-    getAllPartitions(hive, table)
-  }
-
   override def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor =
     getCommandProcessorMethod.invoke(null, token, conf).asInstanceOf[CommandProcessor]
 
@@ -285,12 +267,6 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
       classOf[Hive],
       "getAllPartitionsOf",
       classOf[Table])
-  private lazy val getPartitionsByFilterMethod =
-    findMethod(
-      classOf[Hive],
-      "getPartitionsByFilter",
-      classOf[Table],
-      classOf[String])
   private lazy val getCommandProcessorMethod =
     findStaticMethod(
       classOf[CommandProcessorFactory],
@@ -312,52 +288,6 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
   override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] =
     getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq
 
-  /**
-   * Converts catalyst expression to the format that Hive's getPartitionsByFilter() expects, i.e.
-   * a string that represents partition predicates like "str_key=\"value\" and int_key=1 ...".
-   *
-   * Unsupported predicates are skipped.
-   */
-  def convertFilters(table: Table, filters: Seq[Expression]): String = {
-    // hive varchar is treated as catalyst string, but hive varchar can't be pushed down.
-    val varcharKeys = table.getPartitionKeys
-      .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME))
-      .map(col => col.getName).toSet
-
-    filters.collect {
-      case op @ BinaryComparison(a: Attribute, Literal(v, _: IntegralType)) =>
-        s"${a.name} ${op.symbol} $v"
-      case op @ BinaryComparison(Literal(v, _: IntegralType), a: Attribute) =>
-        s"$v ${op.symbol} ${a.name}"
-
-      case op @ BinaryComparison(a: Attribute, Literal(v, _: StringType))
-          if !varcharKeys.contains(a.name) =>
-        s"""${a.name} ${op.symbol} "$v""""
-      case op @ BinaryComparison(Literal(v, _: StringType), a: Attribute)
-          if !varcharKeys.contains(a.name) =>
-        s""""$v" ${op.symbol} ${a.name}"""
-    }.mkString(" and ")
-  }
-
-  override def getPartitionsByFilter(
-      hive: Hive,
-      table: Table,
-      predicates: Seq[Expression]): Seq[Partition] = {
-
-    // Hive getPartitionsByFilter() takes a string that represents partition
-    // predicates like "str_key=\"value\" and int_key=1 ..."
-    val filter = convertFilters(table, predicates)
-    val partitions =
-      if (filter.isEmpty) {
-        getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]]
-      } else {
-        logDebug(s"Hive metastore filter is '$filter'.")
-        getPartitionsByFilterMethod.invoke(hive, table, filter).asInstanceOf[JArrayList[Partition]]
-      }
-
-    partitions.toSeq
-  }
-
   override def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor =
     getCommandProcessorMethod.invoke(null, Array(token), conf).asInstanceOf[CommandProcessor]
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
index ba7eb15a1c0c6..d33da8242cc1d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
@@ -44,7 +44,7 @@ private[hive]
 case class HiveTableScan(
     requestedAttributes: Seq[Attribute],
     relation: MetastoreRelation,
-    partitionPruningPred: Seq[Expression])(
+    partitionPruningPred: Option[Expression])(
     @transient val context: HiveContext)
   extends LeafNode {
 
@@ -56,7 +56,7 @@ case class HiveTableScan(
 
   // Bind all partition key attribute references in the partition pruning predicate for later
   // evaluation.
-  private[this] val boundPruningPred = partitionPruningPred.reduceLeftOption(And).map { pred =>
+  private[this] val boundPruningPred = partitionPruningPred.map { pred =>
     require(
       pred.dataType == BooleanType,
       s"Data type of predicate $pred must be BooleanType rather than ${pred.dataType}.")
@@ -133,8 +133,7 @@ case class HiveTableScan(
   protected override def doExecute(): RDD[InternalRow] = if (!relation.hiveQlTable.isPartitioned) {
     hadoopReader.makeRDDForTable(relation.hiveQlTable)
   } else {
-    hadoopReader.makeRDDForPartitionedTable(
-      prunePartitions(relation.getHiveQlPartitions(partitionPruningPred)))
+    hadoopReader.makeRDDForPartitionedTable(prunePartitions(relation.hiveQlPartitions))
   }
 
   override def output: Seq[Attribute] = attributes
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
deleted file mode 100644
index 0efcf80bd4ea7..0000000000000
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive.client
-
-import scala.collection.JavaConversions._
-
-import org.apache.hadoop.hive.metastore.api.FieldSchema
-import org.apache.hadoop.hive.serde.serdeConstants
-
-import org.apache.spark.{Logging, SparkFunSuite}
-import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types._
-
-/**
- * A set of tests for the filter conversion logic used when pushing partition pruning into the
- * metastore
- */
-class FiltersSuite extends SparkFunSuite with Logging {
-  private val shim = new Shim_v0_13
-
-  private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test")
-  private val varCharCol = new FieldSchema()
-  varCharCol.setName("varchar")
-  varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME)
-  testTable.setPartCols(varCharCol :: Nil)
-
-  filterTest("string filter",
-    (a("stringcol", StringType) > Literal("test")) :: Nil,
-    "stringcol > \"test\"")
-
-  filterTest("string filter backwards",
-    (Literal("test") > a("stringcol", StringType)) :: Nil,
-    "\"test\" > stringcol")
-
-  filterTest("int filter",
-    (a("intcol", IntegerType) === Literal(1)) :: Nil,
-    "intcol = 1")
-
-  filterTest("int filter backwards",
-    (Literal(1) === a("intcol", IntegerType)) :: Nil,
-    "1 = intcol")
-
-  filterTest("int and string filter",
-    (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil,
-    "1 = intcol and \"a\" = strcol")
-
-  filterTest("skip varchar",
-    (Literal("") === a("varchar", StringType)) :: Nil,
-    "")
-
-  private def filterTest(name: String, filters: Seq[Expression], result: String) = {
-    test(name){
-      val converted = shim.convertFilters(testTable, filters)
-      if (converted != result) {
-        fail(
-          s"Expected filters ${filters.mkString(",")} to convert to '$result' but got '$converted'")
-      }
-    }
-  }
-
-  private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)()
-}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index 3eb127e23d486..d52e162acbd04 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -20,9 +20,7 @@ package org.apache.spark.sql.hive.client
 import java.io.File
 
 import org.apache.spark.{Logging, SparkFunSuite}
-import org.apache.spark.sql.catalyst.expressions.{NamedExpression, Literal, AttributeReference, EqualTo}
 import org.apache.spark.sql.catalyst.util.quietly
-import org.apache.spark.sql.types.IntegerType
 import org.apache.spark.util.Utils
 
 /**
@@ -153,12 +151,6 @@ class VersionsSuite extends SparkFunSuite with Logging {
       client.getAllPartitions(client.getTable("default", "src_part"))
     }
 
-    test(s"$version: getPartitionsByFilter") {
-      client.getPartitionsByFilter(client.getTable("default", "src_part"), Seq(EqualTo(
-        AttributeReference("key", IntegerType, false)(NamedExpression.newExprId),
-        Literal(1))))
-    }
-
     test(s"$version: loadPartition") {
       client.loadPartition(
         emptyDir,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
index e83a7dc77e329..de6a41ce5bfcb 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
@@ -151,7 +151,7 @@ class PruningSuite extends HiveComparisonTest with BeforeAndAfter {
         case p @ HiveTableScan(columns, relation, _) =>
           val columnNames = columns.map(_.name)
           val partValues = if (relation.table.isPartitioned) {
-            p.prunePartitions(relation.getHiveQlPartitions()).map(_.getValues)
+            p.prunePartitions(relation.hiveQlPartitions).map(_.getValues)
           } else {
             Seq.empty
           }

From 4692769655e09d129a62a89a8ffb5d635675aa4d Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA <yuu.ishikawa@gmail.com>
Date: Tue, 14 Jul 2015 23:27:42 -0700
Subject: [PATCH 160/549] [SPARK-6259] [MLLIB] Python API for LDA

I implemented the Python API for LDA. But I didn't implemented a method for `LDAModel.describeTopics()`, beause it's a little hard to implement it now. And adding document about that and an example code would fit for another issue.

TODO: LDAModel.describeTopics() in Python must be also implemented. But it would be nice to fit for another issue. Implementing it is a little hard, since the return value of `describeTopics` in Scala consists of Tuple classes.

Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com>

Closes #6791 from yu-iskw/SPARK-6259 and squashes the following commits:

6855f59 [Yu ISHIKAWA] LDA inherits object
28bd165 [Yu ISHIKAWA] Change the place of testing code
d7a332a [Yu ISHIKAWA] Remove the doc comment about the optimizer's default value
083e226 [Yu ISHIKAWA] Add the comment about the supported values and the default value of `optimizer`
9f8bed8 [Yu ISHIKAWA] Simplify casting
faa9764 [Yu ISHIKAWA] Add some comments for the LDA paramters
98f645a [Yu ISHIKAWA] Remove the interface for `describeTopics`. Because it is not implemented.
57ac03d [Yu ISHIKAWA] Remove the unnecessary import in Python unit testing
73412c3 [Yu ISHIKAWA] Fix the typo
2278829 [Yu ISHIKAWA] Fix the indentation
39514ec [Yu ISHIKAWA] Modify how to cast the input data
8117e18 [Yu ISHIKAWA] Fix the validation problems by `lint-scala`
77fd1b7 [Yu ISHIKAWA] Not use LabeledPoint
68f0653 [Yu ISHIKAWA] Support some parameters for `ALS.train()` in Python
25ef2ac [Yu ISHIKAWA] Resolve conflicts with rebasing
---
 .../mllib/api/python/PythonMLLibAPI.scala     | 33 ++++++++++
 python/pyspark/mllib/clustering.py            | 66 ++++++++++++++++++-
 2 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index e628059c4af8e..c58a64001d9a0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -502,6 +502,39 @@ private[python] class PythonMLLibAPI extends Serializable {
     new MatrixFactorizationModelWrapper(model)
   }
 
+  /**
+   * Java stub for Python mllib LDA.run()
+   */
+  def trainLDAModel(
+      data: JavaRDD[java.util.List[Any]],
+      k: Int,
+      maxIterations: Int,
+      docConcentration: Double,
+      topicConcentration: Double,
+      seed: java.lang.Long,
+      checkpointInterval: Int,
+      optimizer: String): LDAModel = {
+    val algo = new LDA()
+      .setK(k)
+      .setMaxIterations(maxIterations)
+      .setDocConcentration(docConcentration)
+      .setTopicConcentration(topicConcentration)
+      .setCheckpointInterval(checkpointInterval)
+      .setOptimizer(optimizer)
+
+    if (seed != null) algo.setSeed(seed)
+
+    val documents = data.rdd.map(_.asScala.toArray).map { r =>
+      r(0) match {
+        case i: java.lang.Integer => (i.toLong, r(1).asInstanceOf[Vector])
+        case i: java.lang.Long => (i.toLong, r(1).asInstanceOf[Vector])
+        case _ => throw new IllegalArgumentException("input values contains invalid type value.")
+      }
+    }
+    algo.run(documents)
+  }
+
+
   /**
    * Java stub for Python mllib FPGrowth.train().  This stub returns a handle
    * to the Java object instead of the content of the Java object.  Extra care
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index ed4d78a2c6788..8a92f6911c24b 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -31,13 +31,15 @@
 from pyspark.rdd import RDD, ignore_unicode_prefix
 from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py
 from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector
+from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.stat.distribution import MultivariateGaussian
 from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable
 from pyspark.streaming import DStream
 
 __all__ = ['KMeansModel', 'KMeans', 'GaussianMixtureModel', 'GaussianMixture',
            'PowerIterationClusteringModel', 'PowerIterationClustering',
-           'StreamingKMeans', 'StreamingKMeansModel']
+           'StreamingKMeans', 'StreamingKMeansModel',
+           'LDA', 'LDAModel']
 
 
 @inherit_doc
@@ -563,6 +565,68 @@ def predictOnValues(self, dstream):
         return dstream.mapValues(lambda x: self._model.predict(x))
 
 
+class LDAModel(JavaModelWrapper):
+
+    """ A clustering model derived from the LDA method.
+
+    Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
+    Terminology
+    - "word" = "term": an element of the vocabulary
+    - "token": instance of a term appearing in a document
+    - "topic": multinomial distribution over words representing some concept
+    References:
+    - Original LDA paper (journal version):
+    Blei, Ng, and Jordan.  "Latent Dirichlet Allocation."  JMLR, 2003.
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> from numpy.testing import assert_almost_equal
+    >>> data = [
+    ...     [1, Vectors.dense([0.0, 1.0])],
+    ...     [2, SparseVector(2, {0: 1.0})],
+    ... ]
+    >>> rdd =  sc.parallelize(data)
+    >>> model = LDA.train(rdd, k=2)
+    >>> model.vocabSize()
+    2
+    >>> topics = model.topicsMatrix()
+    >>> topics_expect = array([[0.5,  0.5], [0.5, 0.5]])
+    >>> assert_almost_equal(topics, topics_expect, 1)
+    """
+
+    def topicsMatrix(self):
+        """Inferred topics, where each topic is represented by a distribution over terms."""
+        return self.call("topicsMatrix").toArray()
+
+    def vocabSize(self):
+        """Vocabulary size (number of terms or terms in the vocabulary)"""
+        return self.call("vocabSize")
+
+
+class LDA(object):
+
+    @classmethod
+    def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0,
+              topicConcentration=-1.0, seed=None, checkpointInterval=10, optimizer="em"):
+        """Train a LDA model.
+
+        :param rdd:                 RDD of data points
+        :param k:                   Number of clusters you want
+        :param maxIterations:       Number of iterations. Default to 20
+        :param docConcentration:    Concentration parameter (commonly named "alpha")
+            for the prior placed on documents' distributions over topics ("theta").
+        :param topicConcentration:  Concentration parameter (commonly named "beta" or "eta")
+            for the prior placed on topics' distributions over terms.
+        :param seed:                Random Seed
+        :param checkpointInterval:  Period (in iterations) between checkpoints.
+        :param optimizer:           LDAOptimizer used to perform the actual calculation.
+            Currently "em", "online" are supported. Default to "em".
+        """
+        model = callMLlibFunc("trainLDAModel", rdd, k, maxIterations,
+                              docConcentration, topicConcentration, seed,
+                              checkpointInterval, optimizer)
+        return LDAModel(model)
+
+
 def _test():
     import doctest
     import pyspark.mllib.clustering

From 3f6296fed4ee10f53e728eb1e02f13338839b94d Mon Sep 17 00:00:00 2001
From: FlytxtRnD <meethu.mathew@flytxt.com>
Date: Tue, 14 Jul 2015 23:29:02 -0700
Subject: [PATCH 161/549] [SPARK-8018] [MLLIB] KMeans should accept initial
 cluster centers as param

 This allows Kmeans to be initialized using an existing set of cluster centers provided as  a KMeansModel object. This mode of initialization performs a single run.

Author: FlytxtRnD <meethu.mathew@flytxt.com>

Closes #6737 from FlytxtRnD/Kmeans-8018 and squashes the following commits:

94b56df [FlytxtRnD] style correction
ef95ee2 [FlytxtRnD] style correction
c446c58 [FlytxtRnD] documentation and numRuns warning change
06d13ef [FlytxtRnD] numRuns corrected
d12336e [FlytxtRnD] numRuns variable modifications
07f8554 [FlytxtRnD] remove setRuns from setIntialModel
e721dfe [FlytxtRnD] Merge remote-tracking branch 'upstream/master' into Kmeans-8018
242ead1 [FlytxtRnD] corrected == to === in assert
714acb5 [FlytxtRnD] added numRuns
60c8ce2 [FlytxtRnD] ignore runs parameter and initialModel test suite changed
582e6d9 [FlytxtRnD] Merge remote-tracking branch 'upstream/master' into Kmeans-8018
3f5fc8e [FlytxtRnD] test case modified and one runs condition added
cd5dc5c [FlytxtRnD] Merge remote-tracking branch 'upstream/master' into Kmeans-8018
16f1b53 [FlytxtRnD] Merge branch 'Kmeans-8018', remote-tracking branch 'upstream/master' into Kmeans-8018
e9c35d7 [FlytxtRnD] Remove getInitialModel and match cluster count criteria
6959861 [FlytxtRnD] Accept initial cluster centers in KMeans
---
 docs/mllib-clustering.md                      |  1 +
 .../spark/mllib/clustering/KMeans.scala       | 41 ++++++++++++++++---
 .../spark/mllib/clustering/KMeansSuite.scala  | 22 ++++++++++
 3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index d72dc20a5ad6e..0fc7036bffeaf 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -33,6 +33,7 @@ guaranteed to find a globally optimal solution, and when run multiple times on
 a given dataset, the algorithm returns the best clustering result).
 * *initializationSteps* determines the number of steps in the k-means\|\| algorithm.
 * *epsilon* determines the distance threshold within which we consider k-means to have converged.
+* *initialModel* is an optional set of cluster centers used for initialization. If this parameter is supplied, only one run is performed.
 
 **Examples**
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 0f8d6a399682d..68297130a7b03 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -156,6 +156,21 @@ class KMeans private (
     this
   }
 
+  // Initial cluster centers can be provided as a KMeansModel object rather than using the
+  // random or k-means|| initializationMode
+  private var initialModel: Option[KMeansModel] = None
+
+  /**
+   * Set the initial starting point, bypassing the random initialization or k-means||
+   * The condition model.k == this.k must be met, failure results
+   * in an IllegalArgumentException.
+   */
+  def setInitialModel(model: KMeansModel): this.type = {
+    require(model.k == k, "mismatched cluster count")
+    initialModel = Some(model)
+    this
+  }
+
   /**
    * Train a K-means model on the given set of points; `data` should be cached for high
    * performance, because this is an iterative algorithm.
@@ -193,20 +208,34 @@ class KMeans private (
 
     val initStartTime = System.nanoTime()
 
-    val centers = if (initializationMode == KMeans.RANDOM) {
-      initRandom(data)
+    // Only one run is allowed when initialModel is given
+    val numRuns = if (initialModel.nonEmpty) {
+      if (runs > 1) logWarning("Ignoring runs; one run is allowed when initialModel is given.")
+      1
     } else {
-      initKMeansParallel(data)
+      runs
     }
 
+    val centers = initialModel match {
+      case Some(kMeansCenters) => {
+        Array(kMeansCenters.clusterCenters.map(s => new VectorWithNorm(s)))
+      }
+      case None => {
+        if (initializationMode == KMeans.RANDOM) {
+          initRandom(data)
+        } else {
+          initKMeansParallel(data)
+        }
+      }
+    }
     val initTimeInSeconds = (System.nanoTime() - initStartTime) / 1e9
     logInfo(s"Initialization with $initializationMode took " + "%.3f".format(initTimeInSeconds) +
       " seconds.")
 
-    val active = Array.fill(runs)(true)
-    val costs = Array.fill(runs)(0.0)
+    val active = Array.fill(numRuns)(true)
+    val costs = Array.fill(numRuns)(0.0)
 
-    var activeRuns = new ArrayBuffer[Int] ++ (0 until runs)
+    var activeRuns = new ArrayBuffer[Int] ++ (0 until numRuns)
     var iteration = 0
 
     val iterationStartTime = System.nanoTime()
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
index 0dbbd7127444f..3003c62d9876c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
@@ -278,6 +278,28 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext {
       }
     }
   }
+
+  test("Initialize using given cluster centers") {
+    val points = Seq(
+      Vectors.dense(0.0, 0.0),
+      Vectors.dense(1.0, 0.0),
+      Vectors.dense(0.0, 1.0),
+      Vectors.dense(1.0, 1.0)
+    )
+    val rdd = sc.parallelize(points, 3)
+    // creating an initial model
+    val initialModel = new KMeansModel(Array(points(0), points(2)))
+
+    val returnModel = new KMeans()
+      .setK(2)
+      .setMaxIterations(0)
+      .setInitialModel(initialModel)
+      .run(rdd)
+   // comparing the returned model and the initial model
+    assert(returnModel.clusterCenters(0) === initialModel.clusterCenters(0))
+    assert(returnModel.clusterCenters(1) === initialModel.clusterCenters(1))
+  }
+
 }
 
 object KMeansSuite extends SparkFunSuite {

From f0e129740dc2442a21dfa7fbd97360df87291095 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Tue, 14 Jul 2015 23:30:41 -0700
Subject: [PATCH 162/549] [SPARK-8279][SQL]Add math function round

JIRA: https://issues.apache.org/jira/browse/SPARK-8279

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #6938 from yijieshen/udf_round_3 and squashes the following commits:

07a124c [Yijie Shen] remove useless def children
392b65b [Yijie Shen] add negative scale test in DecimalSuite
61760ee [Yijie Shen] address reviews
302a78a [Yijie Shen] Add dataframe function test
31dfe7c [Yijie Shen] refactor round to make it readable
8c7a949 [Yijie Shen] rebase & inputTypes update
9555e35 [Yijie Shen] tiny style fix
d10be4a [Yijie Shen] use TypeCollection to specify wanted input and implicit cast
c3b9839 [Yijie Shen] rely on implict cast to handle string input
b0bff79 [Yijie Shen] make round's inner method's name more meaningful
9bd6930 [Yijie Shen] revert accidental change
e6f44c4 [Yijie Shen] refactor eval and genCode
1b87540 [Yijie Shen] modify checkInputDataTypes using foldable
5486b2d [Yijie Shen] DataFrame API modification
2077888 [Yijie Shen] codegen versioned eval
6cd9a64 [Yijie Shen] refactor Round's constructor
9be894e [Yijie Shen] add round functions in o.a.s.sql.functions
7c83e13 [Yijie Shen] more tests on round
56db4bb [Yijie Shen] Add decimal support to Round
7e163ae [Yijie Shen] style fix
653d047 [Yijie Shen] Add math function round
---
 .../catalyst/analysis/FunctionRegistry.scala  |   1 +
 .../spark/sql/catalyst/expressions/math.scala | 203 +++++++++++++++++-
 .../ExpressionTypeCheckingSuite.scala         |  17 ++
 .../expressions/MathFunctionsSuite.scala      |  44 ++++
 .../sql/types/decimal/DecimalSuite.scala      |  23 +-
 .../org/apache/spark/sql/functions.scala      |  32 +++
 .../spark/sql/MathExpressionsSuite.scala      |  15 ++
 .../execution/HiveCompatibilitySuite.scala    |   7 +-
 8 files changed, 329 insertions(+), 13 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 6b1a94e4b2ad4..ec75f51d5e4ff 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -117,6 +117,7 @@ object FunctionRegistry {
     expression[Pow]("power"),
     expression[UnaryPositive]("positive"),
     expression[Rint]("rint"),
+    expression[Round]("round"),
     expression[ShiftLeft]("shiftleft"),
     expression[ShiftRight]("shiftright"),
     expression[ShiftRightUnsigned]("shiftrightunsigned"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 4b7fe05dd4980..a7ad452ef4943 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -19,8 +19,10 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.{lang => jl}
 
-import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckSuccess, TypeCheckFailure}
 import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -520,3 +522,202 @@ case class Logarithm(left: Expression, right: Expression)
     """
   }
 }
+
+/**
+ * Round the `child`'s result to `scale` decimal place when `scale` >= 0
+ * or round at integral part when `scale` < 0.
+ * For example, round(31.415, 2) would eval to 31.42 and round(31.415, -1) would eval to 30.
+ *
+ * Child of IntegralType would eval to itself when `scale` >= 0.
+ * Child of FractionalType whose value is NaN or Infinite would always eval to itself.
+ *
+ * Round's dataType would always equal to `child`'s dataType except for [[DecimalType.Fixed]],
+ * which leads to scale update in DecimalType's [[PrecisionInfo]]
+ *
+ * @param child expr to be round, all [[NumericType]] is allowed as Input
+ * @param scale new scale to be round to, this should be a constant int at runtime
+ */
+case class Round(child: Expression, scale: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  import BigDecimal.RoundingMode.HALF_UP
+
+  def this(child: Expression) = this(child, Literal(0))
+
+  override def left: Expression = child
+  override def right: Expression = scale
+
+  // round of Decimal would eval to null if it fails to `changePrecision`
+  override def nullable: Boolean = true
+
+  override def foldable: Boolean = child.foldable
+
+  override lazy val dataType: DataType = child.dataType match {
+    // if the new scale is bigger which means we are scaling up,
+    // keep the original scale as `Decimal` does
+    case DecimalType.Fixed(p, s) => DecimalType(p, if (_scale > s) s else _scale)
+    case t => t
+  }
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(NumericType, IntegerType)
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    super.checkInputDataTypes() match {
+      case TypeCheckSuccess =>
+        if (scale.foldable) {
+          TypeCheckSuccess
+        } else {
+          TypeCheckFailure("Only foldable Expression is allowed for scale arguments")
+        }
+      case f => f
+    }
+  }
+
+  // Avoid repeated evaluation since `scale` is a constant int,
+  // avoid unnecessary `child` evaluation in both codegen and non-codegen eval
+  // by checking if scaleV == null as well.
+  private lazy val scaleV: Any = scale.eval(EmptyRow)
+  private lazy val _scale: Int = scaleV.asInstanceOf[Int]
+
+  override def eval(input: InternalRow): Any = {
+    if (scaleV == null) { // if scale is null, no need to eval its child at all
+      null
+    } else {
+      val evalE = child.eval(input)
+      if (evalE == null) {
+        null
+      } else {
+        nullSafeEval(evalE)
+      }
+    }
+  }
+
+  // not overriding since _scale is a constant int at runtime
+  def nullSafeEval(input1: Any): Any = {
+    child.dataType match {
+      case _: DecimalType =>
+        val decimal = input1.asInstanceOf[Decimal]
+        if (decimal.changePrecision(decimal.precision, _scale)) decimal else null
+      case ByteType =>
+        BigDecimal(input1.asInstanceOf[Byte]).setScale(_scale, HALF_UP).toByte
+      case ShortType =>
+        BigDecimal(input1.asInstanceOf[Short]).setScale(_scale, HALF_UP).toShort
+      case IntegerType =>
+        BigDecimal(input1.asInstanceOf[Int]).setScale(_scale, HALF_UP).toInt
+      case LongType =>
+        BigDecimal(input1.asInstanceOf[Long]).setScale(_scale, HALF_UP).toLong
+      case FloatType =>
+        val f = input1.asInstanceOf[Float]
+        if (f.isNaN || f.isInfinite) {
+          f
+        } else {
+          BigDecimal(f).setScale(_scale, HALF_UP).toFloat
+        }
+      case DoubleType =>
+        val d = input1.asInstanceOf[Double]
+        if (d.isNaN || d.isInfinite) {
+          d
+        } else {
+          BigDecimal(d).setScale(_scale, HALF_UP).toDouble
+        }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val ce = child.gen(ctx)
+
+    val evaluationCode = child.dataType match {
+      case _: DecimalType =>
+        s"""
+        if (${ce.primitive}.changePrecision(${ce.primitive}.precision(), ${_scale})) {
+          ${ev.primitive} = ${ce.primitive};
+        } else {
+          ${ev.isNull} = true;
+        }"""
+      case ByteType =>
+        if (_scale < 0) {
+          s"""
+          ${ev.primitive} = new java.math.BigDecimal(${ce.primitive}).
+            setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).byteValue();"""
+        } else {
+          s"${ev.primitive} = ${ce.primitive};"
+        }
+      case ShortType =>
+        if (_scale < 0) {
+          s"""
+          ${ev.primitive} = new java.math.BigDecimal(${ce.primitive}).
+            setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).shortValue();"""
+        } else {
+          s"${ev.primitive} = ${ce.primitive};"
+        }
+      case IntegerType =>
+        if (_scale < 0) {
+          s"""
+          ${ev.primitive} = new java.math.BigDecimal(${ce.primitive}).
+            setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).intValue();"""
+        } else {
+          s"${ev.primitive} = ${ce.primitive};"
+        }
+      case LongType =>
+        if (_scale < 0) {
+          s"""
+          ${ev.primitive} = new java.math.BigDecimal(${ce.primitive}).
+            setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).longValue();"""
+        } else {
+          s"${ev.primitive} = ${ce.primitive};"
+        }
+      case FloatType => // if child eval to NaN or Infinity, just return it.
+        if (_scale == 0) {
+          s"""
+            if (Float.isNaN(${ce.primitive}) || Float.isInfinite(${ce.primitive})){
+              ${ev.primitive} = ${ce.primitive};
+            } else {
+              ${ev.primitive} = Math.round(${ce.primitive});
+            }"""
+        } else {
+          s"""
+            if (Float.isNaN(${ce.primitive}) || Float.isInfinite(${ce.primitive})){
+              ${ev.primitive} = ${ce.primitive};
+            } else {
+              ${ev.primitive} = java.math.BigDecimal.valueOf(${ce.primitive}).
+                setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).floatValue();
+            }"""
+        }
+      case DoubleType => // if child eval to NaN or Infinity, just return it.
+        if (_scale == 0) {
+          s"""
+            if (Double.isNaN(${ce.primitive}) || Double.isInfinite(${ce.primitive})){
+              ${ev.primitive} = ${ce.primitive};
+            } else {
+              ${ev.primitive} = Math.round(${ce.primitive});
+            }"""
+        } else {
+          s"""
+            if (Double.isNaN(${ce.primitive}) || Double.isInfinite(${ce.primitive})){
+              ${ev.primitive} = ${ce.primitive};
+            } else {
+              ${ev.primitive} = java.math.BigDecimal.valueOf(${ce.primitive}).
+                setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).doubleValue();
+            }"""
+        }
+    }
+
+    if (scaleV == null) { // if scale is null, no need to eval its child at all
+      s"""
+        boolean ${ev.isNull} = true;
+        ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      """
+    } else {
+      s"""
+        ${ce.code}
+        boolean ${ev.isNull} = ${ce.isNull};
+        ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+        if (!${ev.isNull}) {
+          $evaluationCode
+        }
+      """
+    }
+  }
+
+  override def prettyName: String = "round"
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
index 5958acbe009ca..e885a18254ea0 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
@@ -52,6 +52,13 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
       s"differing types in '${expr.prettyString}' (int and boolean)")
   }
 
+  def assertErrorWithImplicitCast(expr: Expression, errorMessage: String): Unit = {
+    val e = intercept[AnalysisException] {
+      assertSuccess(expr)
+    }
+    assert(e.getMessage.contains(errorMessage))
+  }
+
   test("check types for unary arithmetic") {
     assertError(UnaryMinus('stringField), "operator - accepts numeric type")
     assertError(Abs('stringField), "function abs accepts numeric type")
@@ -171,4 +178,14 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
       CreateNamedStruct(Seq('a.string.at(0), "a", "b", 2.0)),
         "Odd position only allow foldable and not-null StringType expressions")
   }
+
+  test("check types for ROUND") {
+    assertErrorWithImplicitCast(Round(Literal(null), 'booleanField),
+      "data type mismatch: argument 2 is expected to be of type int")
+    assertErrorWithImplicitCast(Round(Literal(null), 'complexField),
+      "data type mismatch: argument 2 is expected to be of type int")
+    assertSuccess(Round(Literal(null), Literal(null)))
+    assertError(Round('booleanField, 'intField),
+      "data type mismatch: argument 1 is expected to be of type numeric")
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 7ca9e30b2bcd5..52a874a9d89ef 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import scala.math.BigDecimal.RoundingMode
+
 import com.google.common.math.LongMath
 
 import org.apache.spark.SparkFunSuite
@@ -336,4 +338,46 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       null,
       create_row(null))
   }
+
+  test("round") {
+    val domain = -6 to 6
+    val doublePi: Double = math.Pi
+    val shortPi: Short = 31415
+    val intPi: Int = 314159265
+    val longPi: Long = 31415926535897932L
+    val bdPi: BigDecimal = BigDecimal(31415927L, 7)
+
+    val doubleResults: Seq[Double] = Seq(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 3.1, 3.14, 3.142,
+      3.1416, 3.14159, 3.141593)
+
+    val shortResults: Seq[Short] = Seq[Short](0, 0, 30000, 31000, 31400, 31420) ++
+      Seq.fill[Short](7)(31415)
+
+    val intResults: Seq[Int] = Seq(314000000, 314200000, 314160000, 314159000, 314159300,
+      314159270) ++ Seq.fill(7)(314159265)
+
+    val longResults: Seq[Long] = Seq(31415926536000000L, 31415926535900000L,
+      31415926535900000L, 31415926535898000L, 31415926535897900L, 31415926535897930L) ++
+      Seq.fill(7)(31415926535897932L)
+
+    val bdResults: Seq[BigDecimal] = Seq(BigDecimal(3.0), BigDecimal(3.1), BigDecimal(3.14),
+      BigDecimal(3.142), BigDecimal(3.1416), BigDecimal(3.14159),
+      BigDecimal(3.141593), BigDecimal(3.1415927))
+
+    domain.zipWithIndex.foreach { case (scale, i) =>
+      checkEvaluation(Round(doublePi, scale), doubleResults(i), EmptyRow)
+      checkEvaluation(Round(shortPi, scale), shortResults(i), EmptyRow)
+      checkEvaluation(Round(intPi, scale), intResults(i), EmptyRow)
+      checkEvaluation(Round(longPi, scale), longResults(i), EmptyRow)
+    }
+
+    // round_scale > current_scale would result in precision increase
+    // and not allowed by o.a.s.s.types.Decimal.changePrecision, therefore null
+    (0 to 7).foreach { i =>
+      checkEvaluation(Round(bdPi, i), bdResults(i), EmptyRow)
+    }
+    (8 to 10).foreach { scale =>
+      checkEvaluation(Round(bdPi, scale), null, EmptyRow)
+    }
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
index 030bb6d21b18b..f0c849d1a1564 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
@@ -24,14 +24,14 @@ import org.scalatest.PrivateMethodTester
 import scala.language.postfixOps
 
 class DecimalSuite extends SparkFunSuite with PrivateMethodTester {
-  test("creating decimals") {
-    /** Check that a Decimal has the given string representation, precision and scale */
-    def checkDecimal(d: Decimal, string: String, precision: Int, scale: Int): Unit = {
-      assert(d.toString === string)
-      assert(d.precision === precision)
-      assert(d.scale === scale)
-    }
+  /** Check that a Decimal has the given string representation, precision and scale */
+  private def checkDecimal(d: Decimal, string: String, precision: Int, scale: Int): Unit = {
+    assert(d.toString === string)
+    assert(d.precision === precision)
+    assert(d.scale === scale)
+  }
 
+  test("creating decimals") {
     checkDecimal(new Decimal(), "0", 1, 0)
     checkDecimal(Decimal(BigDecimal("10.030")), "10.030", 5, 3)
     checkDecimal(Decimal(BigDecimal("10.030"), 4, 1), "10.0", 4, 1)
@@ -53,6 +53,15 @@ class DecimalSuite extends SparkFunSuite with PrivateMethodTester {
     intercept[IllegalArgumentException](Decimal(1e17.toLong, 17, 0))
   }
 
+  test("creating decimals with negative scale") {
+    checkDecimal(Decimal(BigDecimal("98765"), 5, -3), "9.9E+4", 5, -3)
+    checkDecimal(Decimal(BigDecimal("314.159"), 6, -2), "3E+2", 6, -2)
+    checkDecimal(Decimal(BigDecimal(1.579e12), 4, -9), "1.579E+12", 4, -9)
+    checkDecimal(Decimal(BigDecimal(1.579e12), 4, -10), "1.58E+12", 4, -10)
+    checkDecimal(Decimal(103050709L, 9, -10), "1.03050709E+18", 9, -10)
+    checkDecimal(Decimal(1e8.toLong, 10, -10), "1.00000000E+18", 10, -10)
+  }
+
   test("double and long values") {
     /** Check that a Decimal converts to the given double and long values */
     def checkValues(d: Decimal, doubleValue: Double, longValue: Long): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 0d4e160ed8057..5119ee31d852d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1389,6 +1389,38 @@ object functions {
    */
   def rint(columnName: String): Column = rint(Column(columnName))
 
+  /**
+   * Returns the value of the column `e` rounded to 0 decimal places.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def round(e: Column): Column = round(e.expr, 0)
+
+  /**
+   * Returns the value of the given column rounded to 0 decimal places.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def round(columnName: String): Column = round(Column(columnName), 0)
+
+  /**
+   * Returns the value of `e` rounded to `scale` decimal places.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def round(e: Column, scale: Int): Column = Round(e.expr, Literal(scale))
+
+  /**
+   * Returns the value of the given column rounded to `scale` decimal places.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def round(columnName: String, scale: Int): Column = round(Column(columnName), scale)
+
   /**
    * Shift the the given value numBits left. If the given value is a long value, this function
    * will return a long value else it will return an integer value.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index b30b9f12258b9..087126bb2e513 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -198,6 +198,21 @@ class MathExpressionsSuite extends QueryTest {
     testOneToOneMathFunction(rint, math.rint)
   }
 
+  test("round") {
+    val df = Seq(5, 55, 555).map(Tuple1(_)).toDF("a")
+    checkAnswer(
+      df.select(round('a), round('a, -1), round('a, -2)),
+      Seq(Row(5, 10, 0), Row(55, 60, 100), Row(555, 560, 600))
+    )
+
+    val pi = 3.1415
+    checkAnswer(
+      ctx.sql(s"SELECT round($pi, -3), round($pi, -2), round($pi, -1), " +
+        s"round($pi, 0), round($pi, 1), round($pi, 2), round($pi, 3)"),
+      Seq(Row(0.0, 0.0, 0.0, 3.0, 3.1, 3.14, 3.142))
+    )
+  }
+
   test("exp") {
     testOneToOneMathFunction(exp, math.exp)
   }
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index c884c399281a8..4ada64bc21966 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -221,9 +221,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_when",
     "udf_case",
 
-    // Needs constant object inspectors
-    "udf_round",
-
     // the table src(key INT, value STRING) is not the same as HIVE unittest. In Hive
     // is src(key STRING, value STRING), and in the reflect.q, it failed in
     // Integer.valueOf, which expect the first argument passed as STRING type not INT.
@@ -918,8 +915,8 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_regexp_replace",
     "udf_repeat",
     "udf_rlike",
-    "udf_round",
-    //  "udf_round_3",  TODO: FIX THIS failed due to cast exception
+    // "udf_round",  turn this on after we figure out null vs nan vs infinity
+    "udf_round_3",
     "udf_rpad",
     "udf_rtrim",
     "udf_second",

From 1bb8accbc95a0f0856a8bb715f1e94c3ff96a8c7 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Tue, 14 Jul 2015 23:50:57 -0700
Subject: [PATCH 163/549] [SPARK-8997] [MLLIB] Performance improvements in
 LocalPrefixSpan

Improves the performance of LocalPrefixSpan by implementing optimizations proposed in [SPARK-8997](https://issues.apache.org/jira/browse/SPARK-8997)

Author: Feynman Liang <fliang@databricks.com>
Author: Feynman Liang <feynman.liang@gmail.com>
Author: Xiangrui Meng <meng@databricks.com>

Closes #7360 from feynmanliang/SPARK-8997-improve-prefixspan and squashes the following commits:

59db2f5 [Feynman Liang] Merge pull request #1 from mengxr/SPARK-8997
91e4357 [Xiangrui Meng] update LocalPrefixSpan impl
9212256 [Feynman Liang] MengXR code review comments
f055d82 [Feynman Liang] Fix failing scalatest
2e00cba [Feynman Liang] Depth first projections
70b93e3 [Feynman Liang] Performance improvements in LocalPrefixSpan, fix tests
---
 .../spark/mllib/fpm/LocalPrefixSpan.scala     | 95 ++++++++-----------
 .../apache/spark/mllib/fpm/PrefixSpan.scala   |  5 +-
 .../spark/mllib/fpm/PrefixSpanSuite.scala     | 14 +--
 3 files changed, 44 insertions(+), 70 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
index 39c48b084e550..7ead6327486cc 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
@@ -17,58 +17,49 @@
 
 package org.apache.spark.mllib.fpm
 
+import scala.collection.mutable
+
 import org.apache.spark.Logging
-import org.apache.spark.annotation.Experimental
 
 /**
- *
- * :: Experimental ::
- *
  * Calculate all patterns of a projected database in local.
  */
-@Experimental
 private[fpm] object LocalPrefixSpan extends Logging with Serializable {
 
   /**
    * Calculate all patterns of a projected database.
    * @param minCount minimum count
    * @param maxPatternLength maximum pattern length
-   * @param prefix prefix
-   * @param projectedDatabase the projected dabase
+   * @param prefixes prefixes in reversed order
+   * @param database the projected database
    * @return a set of sequential pattern pairs,
-   *         the key of pair is sequential pattern (a list of items),
+   *         the key of pair is sequential pattern (a list of items in reversed order),
    *         the value of pair is the pattern's count.
    */
   def run(
       minCount: Long,
       maxPatternLength: Int,
-      prefix: Array[Int],
-      projectedDatabase: Array[Array[Int]]): Array[(Array[Int], Long)] = {
-    val frequentPrefixAndCounts = getFreqItemAndCounts(minCount, projectedDatabase)
-    val frequentPatternAndCounts = frequentPrefixAndCounts
-      .map(x => (prefix ++ Array(x._1), x._2))
-    val prefixProjectedDatabases = getPatternAndProjectedDatabase(
-      prefix, frequentPrefixAndCounts.map(_._1), projectedDatabase)
-
-    val continueProcess = prefixProjectedDatabases.nonEmpty && prefix.length + 1 < maxPatternLength
-    if (continueProcess) {
-      val nextPatterns = prefixProjectedDatabases
-        .map(x => run(minCount, maxPatternLength, x._1, x._2))
-        .reduce(_ ++ _)
-      frequentPatternAndCounts ++ nextPatterns
-    } else {
-      frequentPatternAndCounts
+      prefixes: List[Int],
+      database: Array[Array[Int]]): Iterator[(List[Int], Long)] = {
+    if (prefixes.length == maxPatternLength || database.isEmpty) return Iterator.empty
+    val frequentItemAndCounts = getFreqItemAndCounts(minCount, database)
+    val filteredDatabase = database.map(x => x.filter(frequentItemAndCounts.contains))
+    frequentItemAndCounts.iterator.flatMap { case (item, count) =>
+      val newPrefixes = item :: prefixes
+      val newProjected = project(filteredDatabase, item)
+      Iterator.single((newPrefixes, count)) ++
+        run(minCount, maxPatternLength, newPrefixes, newProjected)
     }
   }
 
   /**
-   * calculate suffix sequence following a prefix in a sequence
-   * @param prefix prefix
-   * @param sequence sequence
+   * Calculate suffix sequence immediately after the first occurrence of an item.
+   * @param item item to get suffix after
+   * @param sequence sequence to extract suffix from
    * @return suffix sequence
    */
-  def getSuffix(prefix: Int, sequence: Array[Int]): Array[Int] = {
-    val index = sequence.indexOf(prefix)
+  def getSuffix(item: Int, sequence: Array[Int]): Array[Int] = {
+    val index = sequence.indexOf(item)
     if (index == -1) {
       Array()
     } else {
@@ -76,38 +67,28 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
     }
   }
 
+  def project(database: Array[Array[Int]], prefix: Int): Array[Array[Int]] = {
+    database
+      .map(getSuffix(prefix, _))
+      .filter(_.nonEmpty)
+  }
+
   /**
    * Generates frequent items by filtering the input data using minimal count level.
-   * @param minCount the absolute minimum count
-   * @param sequences sequences data
-   * @return array of item and count pair
+   * @param minCount the minimum count for an item to be frequent
+   * @param database database of sequences
+   * @return freq item to count map
    */
   private def getFreqItemAndCounts(
       minCount: Long,
-      sequences: Array[Array[Int]]): Array[(Int, Long)] = {
-    sequences.flatMap(_.distinct)
-      .groupBy(x => x)
-      .mapValues(_.length.toLong)
-      .filter(_._2 >= minCount)
-      .toArray
-  }
-
-  /**
-   * Get the frequent prefixes' projected database.
-   * @param prePrefix the frequent prefixes' prefix
-   * @param frequentPrefixes frequent prefixes
-   * @param sequences sequences data
-   * @return prefixes and projected database
-   */
-  private def getPatternAndProjectedDatabase(
-      prePrefix: Array[Int],
-      frequentPrefixes: Array[Int],
-      sequences: Array[Array[Int]]): Array[(Array[Int], Array[Array[Int]])] = {
-    val filteredProjectedDatabase = sequences
-      .map(x => x.filter(frequentPrefixes.contains(_)))
-    frequentPrefixes.map { x =>
-      val sub = filteredProjectedDatabase.map(y => getSuffix(x, y)).filter(_.nonEmpty)
-      (prePrefix ++ Array(x), sub)
-    }.filter(x => x._2.nonEmpty)
+      database: Array[Array[Int]]): mutable.Map[Int, Long] = {
+    // TODO: use PrimitiveKeyOpenHashMap
+    val counts = mutable.Map[Int, Long]().withDefaultValue(0L)
+    database.foreach { sequence =>
+      sequence.distinct.foreach { item =>
+        counts(item) += 1L
+      }
+    }
+    counts.filter(_._2 >= minCount)
   }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
index 9d8c60ef0fc45..6f52db7b073ae 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -150,8 +150,9 @@ class PrefixSpan private (
   private def getPatternsInLocal(
       minCount: Long,
       data: RDD[(Array[Int], Array[Array[Int]])]): RDD[(Array[Int], Long)] = {
-    data.flatMap { x =>
-      LocalPrefixSpan.run(minCount, maxPatternLength, x._1, x._2)
+    data.flatMap { case (prefix, projDB) =>
+      LocalPrefixSpan.run(minCount, maxPatternLength, prefix.toList, projDB)
+        .map { case (pattern: List[Int], count: Long) => (pattern.toArray.reverse, count) }
     }
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
index 413436d3db85f..9f107c89f6d80 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
@@ -18,9 +18,8 @@ package org.apache.spark.mllib.fpm
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.rdd.RDD
 
-class PrefixspanSuite extends SparkFunSuite with MLlibTestSparkContext {
+class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("PrefixSpan using Integer type") {
 
@@ -48,15 +47,8 @@ class PrefixspanSuite extends SparkFunSuite with MLlibTestSparkContext {
     def compareResult(
         expectedValue: Array[(Array[Int], Long)],
         actualValue: Array[(Array[Int], Long)]): Boolean = {
-      val sortedExpectedValue = expectedValue.sortWith{ (x, y) =>
-        x._1.mkString(",") + ":" + x._2 < y._1.mkString(",") + ":" + y._2
-      }
-      val sortedActualValue = actualValue.sortWith{ (x, y) =>
-        x._1.mkString(",") + ":" + x._2 < y._1.mkString(",") + ":" + y._2
-      }
-      sortedExpectedValue.zip(sortedActualValue)
-        .map(x => x._1._1.mkString(",") == x._2._1.mkString(",") && x._1._2 == x._2._2)
-        .reduce(_&&_)
+      expectedValue.map(x => (x._1.toSeq, x._2)).toSet ==
+        actualValue.map(x => (x._1.toSeq, x._2)).toSet
     }
 
     val prefixspan = new PrefixSpan()

From 14935d846a4f6bcd4d2a448a8f112fa5dee769ba Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 15 Jul 2015 00:12:21 -0700
Subject: [PATCH 164/549] [HOTFIX][SQL] Unit test breaking.

---
 .../sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
index e885a18254ea0..a4ce1825cab28 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
@@ -60,9 +60,9 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
   }
 
   test("check types for unary arithmetic") {
-    assertError(UnaryMinus('stringField), "operator - accepts numeric type")
-    assertError(Abs('stringField), "function abs accepts numeric type")
-    assertError(BitwiseNot('stringField), "operator ~ accepts integral type")
+    assertError(UnaryMinus('stringField), "expected to be of type numeric")
+    assertError(Abs('stringField), "expected to be of type numeric")
+    assertError(BitwiseNot('stringField), "type (boolean or tinyint or smallint or int or bigint)")
   }
 
   ignore("check types for binary arithmetic") {

From adb33d3665770daf2ccb8915d19e198be9dc3b47 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 15 Jul 2015 17:30:57 +0900
Subject: [PATCH 165/549] [SPARK-9012] [WEBUI] Escape Accumulators in the task
 table

If running the following codes, the task table will be broken because accumulators aren't escaped.
```
val a = sc.accumulator(1, "<table>")
sc.parallelize(1 to 10).foreach(i => a += i)
```

Before this fix,

<img width="1348" alt="screen shot 2015-07-13 at 8 02 44 pm" src="https://cloud.githubusercontent.com/assets/1000778/8649295/b17c491e-299b-11e5-97ee-4e6a64074c4f.png">

After this fix,

<img width="1355" alt="screen shot 2015-07-13 at 8 14 32 pm" src="https://cloud.githubusercontent.com/assets/1000778/8649337/f9e9c9ec-299b-11e5-927e-35c0a2f897f5.png">

Author: zsxwing <zsxwing@gmail.com>

Closes #7369 from zsxwing/SPARK-9012 and squashes the following commits:

a83c9b6 [zsxwing] Escape Accumulators in the task table
---
 core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index ff0a339a39c65..27b82aaddd2e4 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -692,7 +692,9 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
       val gettingResultTime = getGettingResultTime(info, currentTime)
 
       val maybeAccumulators = info.accumulables
-      val accumulatorsReadable = maybeAccumulators.map{acc => s"${acc.name}: ${acc.update.get}"}
+      val accumulatorsReadable = maybeAccumulators.map { acc =>
+        StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update.get}")
+      }
 
       val maybeInput = metrics.flatMap(_.inputMetrics)
       val inputSortable = maybeInput.map(_.bytesRead.toString).getOrElse("")

From 20bb10f8644a92a57496b5df639008832b30e34d Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Wed, 15 Jul 2015 08:25:53 -0700
Subject: [PATCH 166/549] [SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint
 checks to PySpark

This adds Pylint checks to PySpark.

For now this lazy installs using easy_install to /dev/pylint (similar to the pep8 script).
We still need to figure out what rules to be allowed.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7241 from MechCoder/pylint and squashes the following commits:

2fc7291 [MechCoder] Remove pylint test fail
6d883a2 [MechCoder] Silence warnings and make pylint tests fail to check if it works in jenkins
f3a5e17 [MechCoder] undefined-variable
ca8b749 [MechCoder] Minor changes
71629f8 [MechCoder] remove trailing whitespace
8498ff9 [MechCoder] Remove blacklisted arguments and pointless statements check
1dbd094 [MechCoder] Disable all checks for now
8b8aa8a [MechCoder] Add pylint configuration file
7871bb1 [MechCoder] [SPARK-8706] [PySpark] [Project infra] Add pylint checks to PySpark
---
 dev/lint-python                   |  57 ++++-
 pylintrc                          | 404 ++++++++++++++++++++++++++++++
 python/pyspark/ml/param/shared.py |   4 +-
 python/pyspark/tests.py           |   3 +-
 4 files changed, 457 insertions(+), 11 deletions(-)
 create mode 100644 pylintrc

diff --git a/dev/lint-python b/dev/lint-python
index 0c3586462cb37..e02dff220eb87 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -21,12 +21,14 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
 PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
 PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py"
-PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt"
+PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt"
+PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt"
+PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt"
 
 cd "$SPARK_ROOT_DIR"
 
 # compileall: https://docs.python.org/2/library/compileall.html
-python -B -m compileall -q -l $PATHS_TO_CHECK > "$PYTHON_LINT_REPORT_PATH"
+python -B -m compileall -q -l $PATHS_TO_CHECK > "$PEP8_REPORT_PATH"
 compile_status="${PIPESTATUS[0]}"
 
 # Get pep8 at runtime so that we don't rely on it being installed on the build server.
@@ -47,11 +49,36 @@ if [ ! -e "$PEP8_SCRIPT_PATH" ]; then
     fi
 fi
 
+# Easy install pylint in /dev/pylint. To easy_install into a directory, the PYTHONPATH should
+# be set to the directory.
+# dev/pylint should be appended to the PATH variable as well.
+# Jenkins by default installs the pylint3 version, so for now this just checks the code quality
+# of python3.
+export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint"
+export "PYLINT_HOME=$PYTHONPATH"
+export "PATH=$PYTHONPATH:$PATH"
+
+if [ ! -d "$PYLINT_HOME" ]; then
+    mkdir "$PYLINT_HOME"
+    # Redirect the annoying pylint installation output.
+    easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO"
+    easy_install_status="$?"
+
+    if [ "$easy_install_status" -ne 0 ]; then
+        echo "Unable to install pylint locally in \"$PYTHONPATH\"."
+        cat "$PYLINT_INSTALL_INFO"
+        exit "$easy_install_status"
+    fi
+
+    rm "$PYLINT_INSTALL_INFO"
+
+fi
+
 # There is no need to write this output to a file
 #+ first, but we do so so that the check status can
 #+ be output before the report, like with the
 #+ scalastyle and RAT checks.
-python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PYTHON_LINT_REPORT_PATH"
+python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PEP8_REPORT_PATH"
 pep8_status="${PIPESTATUS[0]}"
 
 if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then
@@ -61,13 +88,27 @@ else
 fi
 
 if [ "$lint_status" -ne 0 ]; then
-    echo "Python lint checks failed."
-    cat "$PYTHON_LINT_REPORT_PATH"
+    echo "PEP8 checks failed."
+    cat "$PEP8_REPORT_PATH"
+else
+    echo "PEP8 checks passed."
+fi
+
+rm "$PEP8_REPORT_PATH"
+
+for to_be_checked in "$PATHS_TO_CHECK"
+do
+    pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH"
+done
+
+if [ "${PIPESTATUS[0]}" -ne 0 ]; then
+    lint_status=1
+    echo "Pylint checks failed."
+    cat "$PYLINT_REPORT_PATH"
 else
-    echo "Python lint checks passed."
+    echo "Pylint checks passed."
 fi
 
-# rm "$PEP8_SCRIPT_PATH"
-rm "$PYTHON_LINT_REPORT_PATH"
+rm "$PYLINT_REPORT_PATH"
 
 exit "$lint_status"
diff --git a/pylintrc b/pylintrc
new file mode 100644
index 0000000000000..061775960393b
--- /dev/null
+++ b/pylintrc
@@ -0,0 +1,404 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+[MASTER]
+
+# Specify a configuration file.
+#rcfile=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Profiled execution.
+profile=no
+
+# Add files or directories to the blacklist. They should be base names, not
+# paths.
+ignore=pyspark.heapq3
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Use multiple processes to speed up Pylint.
+jobs=1
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code
+extension-pkg-whitelist=
+
+# Allow optimization of some AST trees. This will activate a peephole AST
+# optimizer, which will apply various small optimizations. For instance, it can
+# be used to obtain the result of joining multiple strings with the addition
+# operator. Joining a lot of strings can lead to a maximum recursion error in
+# Pylint and this flag can prevent that. It has one side effect, the resulting
+# AST will be different than the one from reality.
+optimize-ast=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+confidence=
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time. See also the "--disable" option for examples.
+enable=
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once).You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use"--disable=all --enable=classes
+# --disable=W"
+
+# These errors are arranged in order of number of warning given in pylint.
+# If you would like to improve the code quality of pyspark, remove any of these disabled errors
+# run ./dev/lint-python and see if the errors raised by pylint can be fixed.
+
+disable=invalid-name,missing-docstring,protected-access,unused-argument,no-member,unused-wildcard-import,redefined-builtin,too-many-arguments,unused-variable,too-few-public-methods,bad-continuation,duplicate-code,redefined-outer-name,too-many-ancestors,import-error,superfluous-parens,unused-import,line-too-long,no-name-in-module,unnecessary-lambda,import-self,no-self-use,unidiomatic-typecheck,fixme,too-many-locals,cyclic-import,too-many-branches,bare-except,wildcard-import,dangerous-default-value,broad-except,too-many-public-methods,deprecated-lambda,anomalous-backslash-in-string,too-many-lines,reimported,too-many-statements,bad-whitespace,unpacking-non-sequence,too-many-instance-attributes,abstract-method,old-style-class,global-statement,attribute-defined-outside-init,arguments-differ,undefined-all-variable,no-init,useless-else-on-loop,super-init-not-called,notimplemented-raised,too-many-return-statements,pointless-string-statement,global-variable-undefined,bad-classmethod-argument,too-many-format-args,parse-error,no-self-argument,pointless-statement,undefined-variable
+
+
+[REPORTS]
+
+# Set the output format. Available formats are text, parseable, colorized, msvs
+# (visual studio) and html. You can also give a reporter class, eg
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Put messages in a separate file for each module / package specified on the
+# command line instead of printing them on stdout. Reports (if any) will be
+# written in a file name "pylint_global.[txt|html]".
+files-output=no
+
+# Tells whether to display a full report or only the messages
+reports=no
+
+# Python expression which should return a note less than 10 (10 is the highest
+# note). You have access to the variables errors warning, statement which
+# respectively contain the number of errors / warnings messages and the total
+# number of statements analyzed. This is used by the global evaluation report
+# (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+# Add a comment according to your evaluation note. This is used by the global
+# evaluation report (RP0004).
+comment=no
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details
+#msg-template=
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,XXX,TODO
+
+
+[BASIC]
+
+# Required attributes for module, separated by a comma
+required-attributes=
+
+# List of builtins function names that should not be used, separated by a comma
+bad-functions=
+
+# Good variable names which should always be accepted, separated by a comma
+good-names=i,j,k,ex,Run,_
+
+# Bad variable names which should always be refused, separated by a comma
+bad-names=baz,toto,tutu,tata
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Include a hint for the correct naming format with invalid-name
+include-naming-hint=no
+
+# Regular expression matching correct function names
+function-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for function names
+function-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct variable names
+variable-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for variable names
+variable-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct constant names
+const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+
+# Naming hint for constant names
+const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+
+# Regular expression matching correct attribute names
+attr-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for attribute names
+attr-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct argument names
+argument-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for argument names
+argument-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression matching correct class attribute names
+class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+
+# Naming hint for class attribute names
+class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+
+# Regular expression matching correct inline iteration names
+inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
+
+# Naming hint for inline iteration names
+inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
+
+# Regular expression matching correct class names
+class-rgx=[A-Z_][a-zA-Z0-9]+$
+
+# Naming hint for class names
+class-name-hint=[A-Z_][a-zA-Z0-9]+$
+
+# Regular expression matching correct module names
+module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+
+# Naming hint for module names
+module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+
+# Regular expression matching correct method names
+method-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Naming hint for method names
+method-name-hint=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=__.*__
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+
+[FORMAT]
+
+# Maximum number of characters on a single line.
+max-line-length=100
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+# List of optional constructs for which whitespace checking is disabled
+no-space-check=trailing-comma,dict-separator
+
+# Maximum number of lines in a module
+max-module-lines=1000
+
+# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+
+[SIMILARITIES]
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+
+[VARIABLES]
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# A regular expression matching the name of dummy variables (i.e. expectedly
+# not used).
+dummy-variables-rgx=_$|dummy
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid to define new builtins when possible.
+additional-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+
+
+[SPELLING]
+
+# Spelling dictionary name. Available dictionaries: none. To make it working
+# install python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[LOGGING]
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format
+logging-modules=logging
+
+
+[TYPECHECK]
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis
+ignored-modules=
+
+# List of classes names for which member attributes should not be checked
+# (useful for classes with attributes dynamically set).
+ignored-classes=SQLObject
+
+# When zope mode is activated, add a predefined set of Zope acquired attributes
+# to generated-members.
+zope=no
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E0201 when accessed. Python regular
+# expressions are accepted.
+generated-members=REQUEST,acl_users,aq_parent
+
+
+[CLASSES]
+
+# List of interface methods to ignore, separated by a comma. This is used for
+# instance to not check methods defines in Zope's Interface base class.
+ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,__new__,setUp
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,_fields,_replace,_source,_make
+
+
+[IMPORTS]
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=regsub,TERMIOS,Bastion,rexec
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method
+max-args=5
+
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore
+ignored-argument-names=_.*
+
+# Maximum number of locals for function / method body
+max-locals=15
+
+# Maximum number of return / yield for function / method body
+max-returns=6
+
+# Maximum number of branch for function / method body
+max-branches=12
+
+# Maximum number of statements in function / method body
+max-statements=50
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=Exception
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index bc088e4c29e26..595124726366d 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -444,7 +444,7 @@ class DecisionTreeParams(Params):
     minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
     maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
     cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-    
+
 
     def __init__(self):
         super(DecisionTreeParams, self).__init__()
@@ -460,7 +460,7 @@ def __init__(self):
         self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
         #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
         self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-        
+
     def setMaxDepth(self, value):
         """
         Sets the value of :py:attr:`maxDepth`.
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index c5c0add49d02c..21225016805bc 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -893,7 +893,8 @@ def test_pipe_functions(self):
             self.assertRaises(Py4JJavaError, rdd.pipe('cc', checkCode=True).collect)
         result = rdd.pipe('cat').collect()
         result.sort()
-        [self.assertEqual(x, y) for x, y in zip(data, result)]
+        for x, y in zip(data, result):
+            self.assertEqual(x, y)
         self.assertRaises(Py4JJavaError, rdd.pipe('grep 4', checkCode=True).collect)
         self.assertEqual([], rdd.pipe('grep 4').collect())
 

From 6f6902597d5d687049c103bc0cf6da30919b92d8 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Wed, 15 Jul 2015 09:48:33 -0700
Subject: [PATCH 167/549] [SPARK-8840] [SPARKR] Add float coercion on SparkR

JIRA: https://issues.apache.org/jira/browse/SPARK-8840

Currently the type coercion rules don't include float type. This PR simply adds it.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7280 from viirya/add_r_float_coercion and squashes the following commits:

c86dc0e [Liang-Chi Hsieh] For comments.
dbf0c1b [Liang-Chi Hsieh] Implicitly convert Double to Float based on provided schema.
733015a [Liang-Chi Hsieh] Add test case for DataFrame with float type.
30c2a40 [Liang-Chi Hsieh] Update test case.
52b5294 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into add_r_float_coercion
6f9159d [Liang-Chi Hsieh] Add another test case.
8db3244 [Liang-Chi Hsieh] schema also needs to support float. add test case.
0dcc992 [Liang-Chi Hsieh] Add float coercion on SparkR.
---
 R/pkg/R/deserialize.R                         |  1 +
 R/pkg/R/schema.R                              |  1 +
 R/pkg/inst/tests/test_sparkSQL.R              | 26 +++++++++++++++++++
 .../scala/org/apache/spark/api/r/SerDe.scala  |  4 +++
 .../org/apache/spark/sql/api/r/SQLUtils.scala | 15 ++++++++---
 5 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R
index d961bbc383688..7d1f6b0819ed0 100644
--- a/R/pkg/R/deserialize.R
+++ b/R/pkg/R/deserialize.R
@@ -23,6 +23,7 @@
 # Int -> integer
 # String -> character
 # Boolean -> logical
+# Float -> double
 # Double -> double
 # Long -> double
 # Array[Byte] -> raw
diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R
index 15e2bdbd55d79..06df430687682 100644
--- a/R/pkg/R/schema.R
+++ b/R/pkg/R/schema.R
@@ -123,6 +123,7 @@ structField.character <- function(x, type, nullable = TRUE) {
   }
   options <- c("byte",
                "integer",
+               "float",
                "double",
                "numeric",
                "character",
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index b0ea38854304e..76f74f80834a9 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -108,6 +108,32 @@ test_that("create DataFrame from RDD", {
   expect_equal(count(df), 10)
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+
+  df <- jsonFile(sqlContext, jsonPathNa)
+  hiveCtx <- tryCatch({
+    newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc)
+  }, error = function(err) {
+    skip("Hive is not build with SparkSQL, skipped")
+  })
+  sql(hiveCtx, "CREATE TABLE people (name string, age double, height float)")
+  insertInto(df, "people")
+  expect_equal(sql(hiveCtx, "SELECT age from people WHERE name = 'Bob'"), c(16))
+  expect_equal(sql(hiveCtx, "SELECT height from people WHERE name ='Bob'"), c(176.5))
+
+  schema <- structType(structField("name", "string"), structField("age", "integer"),
+                       structField("height", "float"))
+  df2 <- createDataFrame(sqlContext, df.toRDD, schema)
+  expect_equal(columns(df2), c("name", "age", "height"))
+  expect_equal(dtypes(df2), list(c("name", "string"), c("age", "int"), c("height", "float")))
+  expect_equal(collect(where(df2, df2$name == "Bob")), c("Bob", 16, 176.5))
+
+  localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18), height=c(164.10, 181.4, 173.7))
+  df <- createDataFrame(sqlContext, localDF, schema)
+  expect_is(df, "DataFrame")
+  expect_equal(count(df), 3)
+  expect_equal(columns(df), c("name", "age", "height"))
+  expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"), c("height", "float")))
+  expect_equal(collect(where(df, df$name == "John")), c("John", 19, 164.10))
 })
 
 test_that("convert NAs to null type in DataFrames", {
diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
index 56adc857d4ce0..d5b4260bf4529 100644
--- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala
@@ -179,6 +179,7 @@ private[spark] object SerDe {
   // Int -> integer
   // String -> character
   // Boolean -> logical
+  // Float -> double
   // Double -> double
   // Long -> double
   // Array[Byte] -> raw
@@ -215,6 +216,9 @@ private[spark] object SerDe {
         case "long" | "java.lang.Long" =>
           writeType(dos, "double")
           writeDouble(dos, value.asInstanceOf[Long].toDouble)
+        case "float" | "java.lang.Float" =>
+          writeType(dos, "double")
+          writeDouble(dos, value.asInstanceOf[Float].toDouble)
         case "double" | "java.lang.Double" =>
           writeType(dos, "double")
           writeDouble(dos, value.asInstanceOf[Double])
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
index 43b62f0e822f8..92861ab038f19 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
@@ -47,6 +47,7 @@ private[r] object SQLUtils {
     dataType match {
       case "byte" => org.apache.spark.sql.types.ByteType
       case "integer" => org.apache.spark.sql.types.IntegerType
+      case "float" => org.apache.spark.sql.types.FloatType
       case "double" => org.apache.spark.sql.types.DoubleType
       case "numeric" => org.apache.spark.sql.types.DoubleType
       case "character" => org.apache.spark.sql.types.StringType
@@ -68,7 +69,7 @@ private[r] object SQLUtils {
 
   def createDF(rdd: RDD[Array[Byte]], schema: StructType, sqlContext: SQLContext): DataFrame = {
     val num = schema.fields.size
-    val rowRDD = rdd.map(bytesToRow)
+    val rowRDD = rdd.map(bytesToRow(_, schema))
     sqlContext.createDataFrame(rowRDD, schema)
   }
 
@@ -76,12 +77,20 @@ private[r] object SQLUtils {
     df.map(r => rowToRBytes(r))
   }
 
-  private[this] def bytesToRow(bytes: Array[Byte]): Row = {
+  private[this] def doConversion(data: Object, dataType: DataType): Object = {
+    data match {
+      case d: java.lang.Double if dataType == FloatType =>
+        new java.lang.Float(d)
+      case _ => data
+    }
+  }
+
+  private[this] def bytesToRow(bytes: Array[Byte], schema: StructType): Row = {
     val bis = new ByteArrayInputStream(bytes)
     val dis = new DataInputStream(bis)
     val num = SerDe.readInt(dis)
     Row.fromSeq((0 until num).map { i =>
-      SerDe.readObject(dis)
+      doConversion(SerDe.readObject(dis), schema.fields(i).dataType)
     }.toSeq)
   }
 

From fa4ec3606a965238423f977808163983c9d56e0a Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Wed, 15 Jul 2015 10:31:39 -0700
Subject: [PATCH 168/549] [SPARK-9020][SQL] Support mutable state in code gen
 expressions

We can keep expressions' mutable states in generated class(like `SpecificProjection`) as member variables, so that we can read and modify them inside codegened expressions.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7392 from cloud-fan/mutable-state and squashes the following commits:

eb3a221 [Wenchen Fan] fix order
73144d8 [Wenchen Fan] naming improvement
318f41d [Wenchen Fan] address more comments
d43b65d [Wenchen Fan] address comments
fd45c7a [Wenchen Fan] Support mutable state in code gen expressions
---
 .../scala/org/apache/spark/TaskContext.scala  | 15 ++-
 .../expressions/codegen/CodeGenerator.scala   | 17 +++-
 .../codegen/GenerateMutableProjection.scala   |  4 +
 .../codegen/GenerateOrdering.scala            | 38 +++++--
 .../codegen/GeneratePredicate.scala           |  4 +
 .../codegen/GenerateProjection.scala          | 99 ++++++++++---------
 .../sql/catalyst/expressions/random.scala     | 29 +++++-
 .../MonotonicallyIncreasingID.scala           | 19 +++-
 .../expressions/SparkPartitionID.scala        | 12 ++-
 9 files changed, 171 insertions(+), 66 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala
index d09e17dea0911..248339148d9b7 100644
--- a/core/src/main/scala/org/apache/spark/TaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContext.scala
@@ -32,7 +32,20 @@ object TaskContext {
    */
   def get(): TaskContext = taskContext.get
 
-  private val taskContext: ThreadLocal[TaskContext] = new ThreadLocal[TaskContext]
+  /**
+   * Returns the partition id of currently active TaskContext. It will return 0
+   * if there is no active TaskContext for cases like local execution.
+   */
+  def getPartitionId(): Int = {
+    val tc = taskContext.get()
+    if (tc == null) {
+      0
+    } else {
+      tc.partitionId()
+    }
+  }
+
+  private[this] val taskContext: ThreadLocal[TaskContext] = new ThreadLocal[TaskContext]
 
   // Note: protected[spark] instead of private[spark] to prevent the following two from
   // showing up in JavaDoc.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 9f6329bbda4ec..328d635de8743 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -56,6 +56,18 @@ class CodeGenContext {
    */
   val references: mutable.ArrayBuffer[Expression] = new mutable.ArrayBuffer[Expression]()
 
+  /**
+   * Holding expressions' mutable states like `MonotonicallyIncreasingID.count` as a
+   * 3-tuple: java type, variable name, code to init it.
+   * They will be kept as member variables in generated classes like `SpecificProjection`.
+   */
+  val mutableStates: mutable.ArrayBuffer[(String, String, String)] =
+    mutable.ArrayBuffer.empty[(String, String, String)]
+
+  def addMutableState(javaType: String, variableName: String, initialValue: String): Unit = {
+    mutableStates += ((javaType, variableName, initialValue))
+  }
+
   val stringType: String = classOf[UTF8String].getName
   val decimalType: String = classOf[Decimal].getName
 
@@ -203,7 +215,10 @@ class CodeGenContext {
   def isPrimitiveType(dt: DataType): Boolean = isPrimitiveType(javaType(dt))
 }
 
-
+/**
+ * A wrapper for generated class, defines a `generate` method so that we can pass extra objects
+ * into generated class.
+ */
 abstract class GeneratedClass {
   def generate(expressions: Array[Expression]): Any
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index addb8023d9c0b..71e47d4f9b620 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -46,6 +46,9 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
             ${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)};
         """
     }.mkString("\n")
+    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
+      s"private $javaType $variableName = $initialValue;"
+    }.mkString("\n      ")
     val code = s"""
       public Object generate($exprType[] expr) {
         return new SpecificProjection(expr);
@@ -55,6 +58,7 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
 
         private $exprType[] expressions = null;
         private $mutableRowType mutableRow = null;
+        $mutableStates
 
         public SpecificProjection($exprType[] expr) {
           expressions = expr;
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
index d05dfc108e63a..856ff9f1f96f8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
@@ -46,30 +46,47 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalR
   protected def create(ordering: Seq[SortOrder]): Ordering[InternalRow] = {
     val ctx = newCodeGenContext()
 
-    val comparisons = ordering.zipWithIndex.map { case (order, i) =>
-      val evalA = order.child.gen(ctx)
-      val evalB = order.child.gen(ctx)
+    val comparisons = ordering.map { order =>
+      val eval = order.child.gen(ctx)
       val asc = order.direction == Ascending
+      val isNullA = ctx.freshName("isNullA")
+      val primitiveA = ctx.freshName("primitiveA")
+      val isNullB = ctx.freshName("isNullB")
+      val primitiveB = ctx.freshName("primitiveB")
       s"""
           i = a;
-          ${evalA.code}
+          boolean $isNullA;
+          ${ctx.javaType(order.child.dataType)} $primitiveA;
+          {
+            ${eval.code}
+            $isNullA = ${eval.isNull};
+            $primitiveA = ${eval.primitive};
+          }
           i = b;
-          ${evalB.code}
-          if (${evalA.isNull} && ${evalB.isNull}) {
+          boolean $isNullB;
+          ${ctx.javaType(order.child.dataType)} $primitiveB;
+          {
+            ${eval.code}
+            $isNullB = ${eval.isNull};
+            $primitiveB = ${eval.primitive};
+          }
+          if ($isNullA && $isNullB) {
             // Nothing
-          } else if (${evalA.isNull}) {
+          } else if ($isNullA) {
             return ${if (order.direction == Ascending) "-1" else "1"};
-          } else if (${evalB.isNull}) {
+          } else if ($isNullB) {
             return ${if (order.direction == Ascending) "1" else "-1"};
           } else {
-            int comp = ${ctx.genComp(order.child.dataType, evalA.primitive, evalB.primitive)};
+            int comp = ${ctx.genComp(order.child.dataType, primitiveA, primitiveB)};
             if (comp != 0) {
               return ${if (asc) "comp" else "-comp"};
             }
           }
       """
     }.mkString("\n")
-
+    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
+      s"private $javaType $variableName = $initialValue;"
+    }.mkString("\n      ")
     val code = s"""
       public SpecificOrdering generate($exprType[] expr) {
         return new SpecificOrdering(expr);
@@ -78,6 +95,7 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalR
       class SpecificOrdering extends ${classOf[BaseOrdering].getName} {
 
         private $exprType[] expressions = null;
+        $mutableStates
 
         public SpecificOrdering($exprType[] expr) {
           expressions = expr;
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
index 274a42cb69087..9e5a745d512e9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
@@ -40,6 +40,9 @@ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Bool
   protected def create(predicate: Expression): ((InternalRow) => Boolean) = {
     val ctx = newCodeGenContext()
     val eval = predicate.gen(ctx)
+    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
+      s"private $javaType $variableName = $initialValue;"
+    }.mkString("\n      ")
     val code = s"""
       public SpecificPredicate generate($exprType[] expr) {
         return new SpecificPredicate(expr);
@@ -47,6 +50,7 @@ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Bool
 
       class SpecificPredicate extends ${classOf[Predicate].getName} {
         private final $exprType[] expressions;
+        $mutableStates
         public SpecificPredicate($exprType[] expr) {
           expressions = expr;
         }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 3c7ee9cc16599..3e5ca308dc31d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -151,6 +151,10 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
         s"""if (!nullBits[$i]) arr[$i] = c$i;"""
     }.mkString("\n      ")
 
+    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
+      s"private $javaType $variableName = $initialValue;"
+    }.mkString("\n      ")
+
     val code = s"""
     public SpecificProjection generate($exprType[] expr) {
       return new SpecificProjection(expr);
@@ -158,6 +162,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
 
     class SpecificProjection extends ${classOf[BaseProject].getName} {
       private $exprType[] expressions = null;
+      $mutableStates
 
       public SpecificProjection($exprType[] expr) {
         expressions = expr;
@@ -165,65 +170,65 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
 
       @Override
       public Object apply(Object r) {
-        return new SpecificRow(expressions, (InternalRow) r);
+        return new SpecificRow((InternalRow) r);
       }
-    }
 
-    final class SpecificRow extends ${classOf[MutableRow].getName} {
+      final class SpecificRow extends ${classOf[MutableRow].getName} {
 
-      $columns
+        $columns
 
-      public SpecificRow($exprType[] expressions, InternalRow i) {
-        $initColumns
-      }
+        public SpecificRow(InternalRow i) {
+          $initColumns
+        }
 
-      public int length() { return ${expressions.length};}
-      protected boolean[] nullBits = new boolean[${expressions.length}];
-      public void setNullAt(int i) { nullBits[i] = true; }
-      public boolean isNullAt(int i) { return nullBits[i]; }
+        public int length() { return ${expressions.length};}
+        protected boolean[] nullBits = new boolean[${expressions.length}];
+        public void setNullAt(int i) { nullBits[i] = true; }
+        public boolean isNullAt(int i) { return nullBits[i]; }
 
-      public Object get(int i) {
-        if (isNullAt(i)) return null;
-        switch (i) {
-        $getCases
+        public Object get(int i) {
+          if (isNullAt(i)) return null;
+          switch (i) {
+          $getCases
+          }
+          return null;
         }
-        return null;
-      }
-      public void update(int i, Object value) {
-        if (value == null) {
-          setNullAt(i);
-          return;
+        public void update(int i, Object value) {
+          if (value == null) {
+            setNullAt(i);
+            return;
+          }
+          nullBits[i] = false;
+          switch (i) {
+          $updateCases
+          }
         }
-        nullBits[i] = false;
-        switch (i) {
-        $updateCases
+        $specificAccessorFunctions
+        $specificMutatorFunctions
+
+        @Override
+        public int hashCode() {
+          int result = 37;
+          $hashUpdates
+          return result;
         }
-      }
-      $specificAccessorFunctions
-      $specificMutatorFunctions
-
-      @Override
-      public int hashCode() {
-        int result = 37;
-        $hashUpdates
-        return result;
-      }
 
-      @Override
-      public boolean equals(Object other) {
-        if (other instanceof SpecificRow) {
-          SpecificRow row = (SpecificRow) other;
-          $columnChecks
-          return true;
+        @Override
+        public boolean equals(Object other) {
+          if (other instanceof SpecificRow) {
+            SpecificRow row = (SpecificRow) other;
+            $columnChecks
+            return true;
+          }
+          return super.equals(other);
         }
-        return super.equals(other);
-      }
 
-      @Override
-      public InternalRow copy() {
-        Object[] arr = new Object[${expressions.length}];
-        ${copyColumns}
-        return new ${classOf[GenericInternalRow].getName}(arr);
+        @Override
+        public InternalRow copy() {
+          Object[] arr = new Object[${expressions.length}];
+          ${copyColumns}
+          return new ${classOf[GenericInternalRow].getName}(arr);
+        }
       }
     }
     """
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index 6cdc3000382e2..e10ba55396664 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.types.{DataType, DoubleType}
 import org.apache.spark.util.Utils
 import org.apache.spark.util.random.XORShiftRandom
@@ -38,11 +39,7 @@ abstract class RDG(seed: Long) extends LeafExpression with Serializable {
    * Record ID within each partition. By being transient, the Random Number Generator is
    * reset every time we serialize and deserialize it.
    */
-  @transient protected lazy val partitionId = TaskContext.get() match {
-    case null => 0
-    case _ => TaskContext.get().partitionId()
-  }
-  @transient protected lazy val rng = new XORShiftRandom(seed + partitionId)
+  @transient protected lazy val rng = new XORShiftRandom(seed + TaskContext.getPartitionId)
 
   override def deterministic: Boolean = false
 
@@ -61,6 +58,17 @@ case class Rand(seed: Long) extends RDG(seed) {
     case IntegerLiteral(s) => s
     case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
   })
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val rngTerm = ctx.freshName("rng")
+    val className = classOf[XORShiftRandom].getCanonicalName
+    ctx.addMutableState(className, rngTerm,
+      s"new $className($seed + org.apache.spark.TaskContext.getPartitionId())")
+    ev.isNull = "false"
+    s"""
+      final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextDouble();
+    """
+  }
 }
 
 /** Generate a random column with i.i.d. gaussian random distribution. */
@@ -73,4 +81,15 @@ case class Randn(seed: Long) extends RDG(seed) {
     case IntegerLiteral(s) => s
     case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
   })
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val rngTerm = ctx.freshName("rng")
+    val className = classOf[XORShiftRandom].getCanonicalName
+    ctx.addMutableState(className, rngTerm,
+      s"new $className($seed + org.apache.spark.TaskContext.getPartitionId())")
+    ev.isNull = "false"
+    s"""
+      final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextGaussian();
+    """
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
index 437d143e53f3f..69a37750d7525 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.expressions
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.LeafExpression
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types.{LongType, DataType}
 
 /**
@@ -40,6 +41,8 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
    */
   @transient private[this] var count: Long = 0L
 
+  @transient private lazy val partitionMask = TaskContext.getPartitionId.toLong << 33
+
   override def nullable: Boolean = false
 
   override def dataType: DataType = LongType
@@ -47,6 +50,20 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
   override def eval(input: InternalRow): Long = {
     val currentCount = count
     count += 1
-    (TaskContext.get().partitionId().toLong << 33) + currentCount
+    partitionMask + currentCount
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val countTerm = ctx.freshName("count")
+    val partitionMaskTerm = ctx.freshName("partitionMask")
+    ctx.addMutableState(ctx.JAVA_LONG, countTerm, "0L")
+    ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm,
+      "((long) org.apache.spark.TaskContext.getPartitionId()) << 33")
+
+    ev.isNull = "false"
+    s"""
+      final ${ctx.javaType(dataType)} ${ev.primitive} = $partitionMaskTerm + $countTerm;
+      $countTerm++;
+    """
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
index 822d3d8c9108d..5f1b514f2cff2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.expressions
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.LeafExpression
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types.{IntegerType, DataType}
 
 
@@ -32,5 +33,14 @@ private[sql] case object SparkPartitionID extends LeafExpression {
 
   override def dataType: DataType = IntegerType
 
-  override def eval(input: InternalRow): Int = TaskContext.get().partitionId()
+  @transient private lazy val partitionId = TaskContext.getPartitionId
+
+  override def eval(input: InternalRow): Int = partitionId
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val idTerm = ctx.freshName("partitionId")
+    ctx.addMutableState(ctx.JAVA_INT, idTerm, "org.apache.spark.TaskContext.getPartitionId()")
+    ev.isNull = "false"
+    s"final ${ctx.javaType(dataType)} ${ev.primitive} = $idTerm;"
+  }
 }

From a9385271a9f6b97ec6aa619cf56ee556ba2fb0de Mon Sep 17 00:00:00 2001
From: "zhichao.li" <zhichao.li@intel.com>
Date: Wed, 15 Jul 2015 10:43:38 -0700
Subject: [PATCH 169/549] [SPARK-8221][SQL]Add pmod function

https://issues.apache.org/jira/browse/SPARK-8221

One concern is the result would be negative if the divisor is not positive( i.e pmod(7, -3) ), but the behavior is the same as hive.

Author: zhichao.li <zhichao.li@intel.com>

Closes #6783 from zhichao-li/pmod2 and squashes the following commits:

7083eb9 [zhichao.li] update to the latest type checking
d26dba7 [zhichao.li] add pmod
---
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../catalyst/analysis/HiveTypeCoercion.scala  |  6 ++
 .../sql/catalyst/expressions/arithmetic.scala | 94 +++++++++++++++++++
 .../ArithmeticExpressionSuite.scala           | 16 +++-
 .../org/apache/spark/sql/functions.scala      | 17 ++++
 .../spark/sql/DataFrameFunctionsSuite.scala   | 37 ++++++++
 6 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index ec75f51d5e4ff..d2678ce860701 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -115,6 +115,7 @@ object FunctionRegistry {
     expression[Log2]("log2"),
     expression[Pow]("pow"),
     expression[Pow]("power"),
+    expression[Pmod]("pmod"),
     expression[UnaryPositive]("positive"),
     expression[Rint]("rint"),
     expression[Round]("round"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 15da5eecc8d3c..25087915b5c35 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -426,6 +426,12 @@ object HiveTypeCoercion {
             DecimalType(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2))
           )
 
+        case Pmod(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
+          Cast(
+            Pmod(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)),
+            DecimalType(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2))
+          )
+
         // When we compare 2 decimal types with different precisions, cast them to the smallest
         // common precision.
         case b @ BinaryComparison(e1 @ DecimalType.Expression(p1, s1),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 1a55a0876f303..394ef556e04a2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -377,3 +377,97 @@ case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
   override def symbol: String = "min"
   override def prettyName: String = symbol
 }
+
+case class Pmod(left: Expression, right: Expression) extends BinaryArithmetic {
+
+  override def toString: String = s"pmod($left, $right)"
+
+  override def symbol: String = "pmod"
+
+  protected def checkTypesInternal(t: DataType) =
+    TypeUtils.checkForNumericExpr(t, "pmod")
+
+  override def inputType: AbstractDataType = NumericType
+
+  protected override def nullSafeEval(left: Any, right: Any) =
+    dataType match {
+      case IntegerType => pmod(left.asInstanceOf[Int], right.asInstanceOf[Int])
+      case LongType => pmod(left.asInstanceOf[Long], right.asInstanceOf[Long])
+      case ShortType => pmod(left.asInstanceOf[Short], right.asInstanceOf[Short])
+      case ByteType => pmod(left.asInstanceOf[Byte], right.asInstanceOf[Byte])
+      case FloatType => pmod(left.asInstanceOf[Float], right.asInstanceOf[Float])
+      case DoubleType => pmod(left.asInstanceOf[Double], right.asInstanceOf[Double])
+      case _: DecimalType => pmod(left.asInstanceOf[Decimal], right.asInstanceOf[Decimal])
+    }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
+      dataType match {
+        case dt: DecimalType =>
+          val decimalAdd = "$plus"
+          s"""
+            ${ctx.javaType(dataType)} r = $eval1.remainder($eval2);
+            if (r.compare(new org.apache.spark.sql.types.Decimal().set(0)) < 0) {
+              ${ev.primitive} = (r.$decimalAdd($eval2)).remainder($eval2);
+            } else {
+              ${ev.primitive} = r;
+            }
+          """
+        // byte and short are casted into int when add, minus, times or divide
+        case ByteType | ShortType =>
+          s"""
+            ${ctx.javaType(dataType)} r = (${ctx.javaType(dataType)})($eval1 % $eval2);
+            if (r < 0) {
+              ${ev.primitive} = (${ctx.javaType(dataType)})((r + $eval2) % $eval2);
+            } else {
+              ${ev.primitive} = r;
+            }
+          """
+        case _ =>
+          s"""
+            ${ctx.javaType(dataType)} r = $eval1 % $eval2;
+            if (r < 0) {
+              ${ev.primitive} = (r + $eval2) % $eval2;
+            } else {
+              ${ev.primitive} = r;
+            }
+          """
+      }
+    })
+  }
+
+  private def pmod(a: Int, n: Int): Int = {
+    val r = a % n
+    if (r < 0) {(r + n) % n} else r
+  }
+
+  private def pmod(a: Long, n: Long): Long = {
+    val r = a % n
+    if (r < 0) {(r + n) % n} else r
+  }
+
+  private def pmod(a: Byte, n: Byte): Byte = {
+    val r = a % n
+    if (r < 0) {((r + n) % n).toByte} else r.toByte
+  }
+
+  private def pmod(a: Double, n: Double): Double = {
+    val r = a % n
+    if (r < 0) {(r + n) % n} else r
+  }
+
+  private def pmod(a: Short, n: Short): Short = {
+    val r = a % n
+    if (r < 0) {((r + n) % n).toShort} else r.toShort
+  }
+
+  private def pmod(a: Float, n: Float): Float = {
+    val r = a % n
+    if (r < 0) {(r + n) % n} else r
+  }
+
+  private def pmod(a: Decimal, n: Decimal): Decimal = {
+    val r = a % n
+    if (r.compare(Decimal(0)) < 0) {(r + n) % n} else r
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
index 6c93698f8017b..e7e5231d32c9e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
@@ -21,7 +21,6 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types.Decimal
 
-
 class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   /**
@@ -158,4 +157,19 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     checkEvaluation(MinOf(Array(1.toByte, 2.toByte), Array(1.toByte, 3.toByte)),
       Array(1.toByte, 2.toByte))
   }
+
+  test("pmod") {
+    testNumericDataTypes { convert =>
+      val left = Literal(convert(7))
+      val right = Literal(convert(3))
+      checkEvaluation(Pmod(left, right), convert(1))
+      checkEvaluation(Pmod(Literal.create(null, left.dataType), right), null)
+      checkEvaluation(Pmod(left, Literal.create(null, right.dataType)), null)
+      checkEvaluation(Remainder(left, Literal(convert(0))), null)  // mod by 0
+    }
+    checkEvaluation(Pmod(-7, 3), 2)
+    checkEvaluation(Pmod(7.2D, 4.1D), 3.1000000000000005)
+    checkEvaluation(Pmod(Decimal(0.7), Decimal(0.2)), Decimal(0.1))
+    checkEvaluation(Pmod(2L, Long.MaxValue), 2)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5119ee31d852d..c7deaca8437a1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1371,6 +1371,23 @@ object functions {
    */
   def pow(l: Double, rightName: String): Column = pow(l, Column(rightName))
 
+  /**
+   * Returns the positive value of dividend mod divisor.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def pmod(dividend: Column, divisor: Column): Column = Pmod(dividend.expr, divisor.expr)
+
+  /**
+   * Returns the positive value of dividend mod divisor.
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def pmod(dividendColName: String, divisorColName: String): Column =
+    pmod(Column(dividendColName), Column(divisorColName))
+
   /**
    * Returns the double value that is closest in value to the argument and
    * is equal to a mathematical integer.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 6cebec95d2850..70bd78737f69c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -403,4 +403,41 @@ class DataFrameFunctionsSuite extends QueryTest {
       Seq(Row(2), Row(2), Row(2), Row(2), Row(3), Row(3))
     )
   }
+
+  test("pmod") {
+    val intData = Seq((7, 3), (-7, 3)).toDF("a", "b")
+    checkAnswer(
+      intData.select(pmod('a, 'b)),
+      Seq(Row(1), Row(2))
+    )
+    checkAnswer(
+      intData.select(pmod('a, lit(3))),
+      Seq(Row(1), Row(2))
+    )
+    checkAnswer(
+      intData.select(pmod(lit(-7), 'b)),
+      Seq(Row(2), Row(2))
+    )
+    checkAnswer(
+      intData.selectExpr("pmod(a, b)"),
+      Seq(Row(1), Row(2))
+    )
+    checkAnswer(
+      intData.selectExpr("pmod(a, 3)"),
+      Seq(Row(1), Row(2))
+    )
+    checkAnswer(
+      intData.selectExpr("pmod(-7, b)"),
+      Seq(Row(2), Row(2))
+    )
+    val doubleData = Seq((7.2, 4.1)).toDF("a", "b")
+    checkAnswer(
+      doubleData.select(pmod('a, 'b)),
+      Seq(Row(3.1000000000000005))  // same as hive
+    )
+    checkAnswer(
+      doubleData.select(pmod(lit(2), lit(Int.MaxValue))),
+      Seq(Row(2))
+    )
+  }
 }

From 9716a727fb2d11380794549039e12e53c771e120 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Wed, 15 Jul 2015 10:46:22 -0700
Subject: [PATCH 170/549] [Minor][SQL] Allow spaces in the beginning and ending
 of string for Interval

This is a minor fixing for #7355 to allow spaces in the beginning and ending of string parsed to `Interval`.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7390 from viirya/fix_interval_string and squashes the following commits:

9eb6831 [Liang-Chi Hsieh] Use trim instead of modifying regex.
57861f7 [Liang-Chi Hsieh] Fix scala style.
815a9cb [Liang-Chi Hsieh] Slightly modify regex to allow spaces in the beginning and ending of string.
---
 .../main/java/org/apache/spark/unsafe/types/Interval.java   | 1 +
 .../java/org/apache/spark/unsafe/types/IntervalSuite.java   | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
index eb7475e9df869..905ea0b7b878c 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
@@ -62,6 +62,7 @@ public static Interval fromString(String s) {
     if (s == null) {
       return null;
     }
+    s = s.trim();
     Matcher m = p.matcher(s);
     if (!m.matches() || s.equals("interval")) {
       return null;
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
index 44a949a371f2b..1832d0bc65551 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
@@ -75,6 +75,12 @@ public void fromStringTest() {
     Interval result = new Interval(-5 * 12 + 23, 0);
     assertEquals(Interval.fromString(input), result);
 
+    input = "interval   -5  years  23   month   ";
+    assertEquals(Interval.fromString(input), result);
+
+    input = "  interval   -5  years  23   month   ";
+    assertEquals(Interval.fromString(input), result);
+
     // Error cases
     input = "interval   3month 1 hour";
     assertEquals(Interval.fromString(input), null);

From 303c1201c468d360a5f600ce37b8bee75a77a0e6 Mon Sep 17 00:00:00 2001
From: Shuo Xiang <shuoxiangpub@gmail.com>
Date: Wed, 15 Jul 2015 12:10:53 -0700
Subject: [PATCH 171/549] [SPARK-7555] [DOCS] Add doc for elastic net in
 ml-guide and mllib-guide

jkbradley I put the elastic net under the **Algorithm guide** section. Also add the formula of elastic net in mllib-linear `mllib-linear-methods#regularizers`.

dbtsai I left the code tab for you to add example code. Do you think it is the right place?

Author: Shuo Xiang <shuoxiangpub@gmail.com>

Closes #6504 from coderxiang/elasticnet and squashes the following commits:

f6061ee [Shuo Xiang] typo
90a7c88 [Shuo Xiang] Merge remote-tracking branch 'upstream/master' into elasticnet
0610a36 [Shuo Xiang] move out the elastic net to ml-linear-methods
8747190 [Shuo Xiang] merge master
706d3f7 [Shuo Xiang] add python code
9bc2b4c [Shuo Xiang] typo
db32a60 [Shuo Xiang] java code sample
aab3b3a [Shuo Xiang] Merge remote-tracking branch 'upstream/master' into elasticnet
a0dae07 [Shuo Xiang] simplify code
d8616fd [Shuo Xiang] Update the definition of elastic net. Add scala code; Mention Lasso and Ridge
df5bd14 [Shuo Xiang] use wikipeida page in ml-linear-methods.md
78d9366 [Shuo Xiang] address comments
8ce37c2 [Shuo Xiang] Merge branch 'elasticnet' of github.com:coderxiang/spark into elasticnet
8f24848 [Shuo Xiang] Merge branch 'elastic-net-doc' of github.com:coderxiang/spark into elastic-net-doc
998d766 [Shuo Xiang] Merge branch 'elastic-net-doc' of github.com:coderxiang/spark into elastic-net-doc
89f10e4 [Shuo Xiang] Merge remote-tracking branch 'upstream/master' into elastic-net-doc
9262a72 [Shuo Xiang] update
7e07d12 [Shuo Xiang] update
b32f21a [Shuo Xiang] add doc for elastic net in sparkml
937eef1 [Shuo Xiang] Merge remote-tracking branch 'upstream/master' into elastic-net-doc
180b496 [Shuo Xiang] Merge remote-tracking branch 'upstream/master'
aa0717d [Shuo Xiang] Merge remote-tracking branch 'upstream/master'
5f109b4 [Shuo Xiang] Merge remote-tracking branch 'upstream/master'
c5c5bfe [Shuo Xiang] Merge remote-tracking branch 'upstream/master'
98804c9 [Shuo Xiang] fix bug in topBykey and update test
---
 docs/ml-guide.md             |  31 +++++++++
 docs/ml-linear-methods.md    | 129 +++++++++++++++++++++++++++++++++++
 docs/mllib-linear-methods.md |  53 +++++++-------
 3 files changed, 188 insertions(+), 25 deletions(-)
 create mode 100644 docs/ml-linear-methods.md

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index c74cb1f1ef8ea..8c46adf256a9a 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -3,6 +3,24 @@ layout: global
 title: Spark ML Programming Guide
 ---
 
+`\[
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\E}{\mathbb{E}}
+\newcommand{\x}{\mathbf{x}}
+\newcommand{\y}{\mathbf{y}}
+\newcommand{\wv}{\mathbf{w}}
+\newcommand{\av}{\mathbf{\alpha}}
+\newcommand{\bv}{\mathbf{b}}
+\newcommand{\N}{\mathbb{N}}
+\newcommand{\id}{\mathbf{I}}
+\newcommand{\ind}{\mathbf{1}}
+\newcommand{\0}{\mathbf{0}}
+\newcommand{\unit}{\mathbf{e}}
+\newcommand{\one}{\mathbf{1}}
+\newcommand{\zero}{\mathbf{0}}
+\]`
+
+
 Spark 1.2 introduced a new package called `spark.ml`, which aims to provide a uniform set of
 high-level APIs that help users create and tune practical machine learning pipelines.
 
@@ -154,6 +172,19 @@ Parameters belong to specific instances of `Estimator`s and `Transformer`s.
 For example, if we have two `LogisticRegression` instances `lr1` and `lr2`, then we can build a `ParamMap` with both `maxIter` parameters specified: `ParamMap(lr1.maxIter -> 10, lr2.maxIter -> 20)`.
 This is useful if there are two algorithms with the `maxIter` parameter in a `Pipeline`.
 
+# Algorithm Guides
+
+There are now several algorithms in the Pipelines API which are not in the lower-level MLlib API, so we link to documentation for them here.  These algorithms are mostly feature transformers, which fit naturally into the `Transformer` abstraction in Pipelines, and ensembles, which fit naturally into the `Estimator` abstraction in the Pipelines.
+
+**Pipelines API Algorithm Guides**
+
+* [Feature Extraction, Transformation, and Selection](ml-features.html)
+* [Ensembles](ml-ensembles.html)
+
+**Algorithms in `spark.ml`**
+
+* [Linear methods with elastic net regularization](ml-linear-methods.html)
+
 # Code Examples
 
 This section gives code examples illustrating the functionality discussed above.
diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
new file mode 100644
index 0000000000000..1ac83d94c9e81
--- /dev/null
+++ b/docs/ml-linear-methods.md
@@ -0,0 +1,129 @@
+---
+layout: global
+title: Linear Methods - ML
+displayTitle: <a href="ml-guide.html">ML</a> - Linear Methods
+---
+
+
+`\[
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\E}{\mathbb{E}}
+\newcommand{\x}{\mathbf{x}}
+\newcommand{\y}{\mathbf{y}}
+\newcommand{\wv}{\mathbf{w}}
+\newcommand{\av}{\mathbf{\alpha}}
+\newcommand{\bv}{\mathbf{b}}
+\newcommand{\N}{\mathbb{N}}
+\newcommand{\id}{\mathbf{I}}
+\newcommand{\ind}{\mathbf{1}}
+\newcommand{\0}{\mathbf{0}}
+\newcommand{\unit}{\mathbf{e}}
+\newcommand{\one}{\mathbf{1}}
+\newcommand{\zero}{\mathbf{0}}
+\]`
+
+
+In MLlib, we implement popular linear methods such as logistic regression and linear least squares with L1 or L2 regularization. Refer to [the linear methods in mllib](mllib-linear-methods.html) for details. In `spark.ml`, we also include Pipelines API for [Elastic net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid of L1 and L2 regularization proposed in [this paper](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). Mathematically it is defined as a linear combination of the L1-norm and the L2-norm:
+`\[
+\alpha \|\wv\|_1 + (1-\alpha) \frac{1}{2}\|\wv\|_2^2, \alpha \in [0, 1].
+\]`
+By setting $\alpha$ properly, it contains both L1 and L2 regularization as special cases. For example, if a [linear regression](https://en.wikipedia.org/wiki/Linear_regression) model is trained with the elastic net parameter $\alpha$ set to $1$, it is equivalent to a [Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model. On the other hand, if $\alpha$ is set to $0$, the trained model reduces to a [ridge regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model. We implement Pipelines API for both linear regression and logistic regression with elastic net regularization.
+
+**Examples**
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+
+{% highlight scala %}
+
+import org.apache.spark.ml.classification.LogisticRegression
+import org.apache.spark.mllib.util.MLUtils
+
+// Load training data
+val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+
+val lr = new LogisticRegression()
+  .setMaxIter(10)
+  .setRegParam(0.3)
+  .setElasticNetParam(0.8)
+
+// Fit the model
+val lrModel = lr.fit(training)
+
+// Print the weights and intercept for logistic regression
+println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}")
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="java" markdown="1">
+
+{% highlight java %}
+
+import org.apache.spark.ml.classification.LogisticRegression;
+import org.apache.spark.ml.classification.LogisticRegressionModel;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+
+public class LogisticRegressionWithElasticNetExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf()
+      .setAppName("Logistic Regression with Elastic Net Example");
+
+    SparkContext sc = new SparkContext(conf);
+    SQLContext sql = new SQLContext(sc);
+    String path = "sample_libsvm_data.txt";
+
+    // Load training data
+    DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class);
+
+    LogisticRegression lr = new LogisticRegression()
+      .setMaxIter(10)
+      .setRegParam(0.3)
+      .setElasticNetParam(0.8)
+
+    // Fit the model
+    LogisticRegressionModel lrModel = lr.fit(training);
+
+    // Print the weights and intercept for logistic regression
+    System.out.println("Weights: " + lrModel.weights() + " Intercept: " + lrModel.intercept());
+  }
+}
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+{% highlight python %}
+
+from pyspark.ml.classification import LogisticRegression
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils
+
+# Load training data
+training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+
+lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
+
+# Fit the model
+lrModel = lr.fit(training)
+
+# Print the weights and intercept for logistic regression
+print("Weights: " + str(lrModel.weights))
+print("Intercept: " + str(lrModel.intercept))
+{% endhighlight %}
+
+</div>
+
+</div>
+
+### Optimization
+
+The optimization algorithm underlies the implementation is called [Orthant-Wise Limited-memory QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
+(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 regularization and elastic net.
diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md
index 3927d65fbf8fb..07655baa414b5 100644
--- a/docs/mllib-linear-methods.md
+++ b/docs/mllib-linear-methods.md
@@ -10,7 +10,7 @@ displayTitle: <a href="mllib-guide.html">MLlib</a> - Linear Methods
 
 `\[
 \newcommand{\R}{\mathbb{R}}
-\newcommand{\E}{\mathbb{E}} 
+\newcommand{\E}{\mathbb{E}}
 \newcommand{\x}{\mathbf{x}}
 \newcommand{\y}{\mathbf{y}}
 \newcommand{\wv}{\mathbf{w}}
@@ -18,10 +18,10 @@ displayTitle: <a href="mllib-guide.html">MLlib</a> - Linear Methods
 \newcommand{\bv}{\mathbf{b}}
 \newcommand{\N}{\mathbb{N}}
 \newcommand{\id}{\mathbf{I}}
-\newcommand{\ind}{\mathbf{1}} 
-\newcommand{\0}{\mathbf{0}} 
-\newcommand{\unit}{\mathbf{e}} 
-\newcommand{\one}{\mathbf{1}} 
+\newcommand{\ind}{\mathbf{1}}
+\newcommand{\0}{\mathbf{0}}
+\newcommand{\unit}{\mathbf{e}}
+\newcommand{\one}{\mathbf{1}}
 \newcommand{\zero}{\mathbf{0}}
 \]`
 
@@ -29,7 +29,7 @@ displayTitle: <a href="mllib-guide.html">MLlib</a> - Linear Methods
 
 Many standard *machine learning* methods can be formulated as a convex optimization problem, i.e.
 the task of finding a minimizer of a convex function `$f$` that depends on a variable vector
-`$\wv$` (called `weights` in the code), which has `$d$` entries. 
+`$\wv$` (called `weights` in the code), which has `$d$` entries.
 Formally, we can write this as the optimization problem `$\min_{\wv \in\R^d} \; f(\wv)$`, where
 the objective function is of the form
 `\begin{equation}
@@ -39,7 +39,7 @@ the objective function is of the form
     \ .
 \end{equation}`
 Here the vectors `$\x_i\in\R^d$` are the training data examples, for `$1\le i\le n$`, and
-`$y_i\in\R$` are their corresponding labels, which we want to predict. 
+`$y_i\in\R$` are their corresponding labels, which we want to predict.
 We call the method *linear* if $L(\wv; \x, y)$ can be expressed as a function of $\wv^T x$ and $y$.
 Several of MLlib's classification and regression algorithms fall into this category,
 and are discussed here.
@@ -99,6 +99,9 @@ regularizers in MLlib:
     <tr>
       <td>L1</td><td>$\|\wv\|_1$</td><td>$\mathrm{sign}(\wv)$</td>
     </tr>
+    <tr>
+      <td>elastic net</td><td>$\alpha \|\wv\|_1 + (1-\alpha)\frac{1}{2}\|\wv\|_2^2$</td><td>$\alpha \mathrm{sign}(\wv) + (1-\alpha) \wv$</td>
+    </tr>
   </tbody>
 </table>
 
@@ -107,7 +110,7 @@ of `$\wv$`.
 
 L2-regularized problems are generally easier to solve than L1-regularized due to smoothness.
 However, L1 regularization can help promote sparsity in weights leading to smaller and more interpretable models, the latter of which can be useful for feature selection.
-It is not recommended to train models without any regularization,
+[Elastic net](http://en.wikipedia.org/wiki/Elastic_net_regularization) is a combination of L1 and L2 regularization. It is not recommended to train models without any regularization,
 especially when the number of training examples is small.
 
 ### Optimization
@@ -531,7 +534,7 @@ sameModel = LogisticRegressionModel.load(sc, "myModelPath")
 ### Linear least squares, Lasso, and ridge regression
 
 
-Linear least squares is the most common formulation for regression problems. 
+Linear least squares is the most common formulation for regression problems.
 It is a linear method as described above in equation `$\eqref{eq:regPrimal}$`, with the loss
 function in the formulation given by the squared loss:
 `\[
@@ -539,8 +542,8 @@ L(\wv;\x,y) :=  \frac{1}{2} (\wv^T \x - y)^2.
 \]`
 
 Various related regression methods are derived by using different types of regularization:
-[*ordinary least squares*](http://en.wikipedia.org/wiki/Ordinary_least_squares) or 
-[*linear least squares*](http://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)) uses 
+[*ordinary least squares*](http://en.wikipedia.org/wiki/Ordinary_least_squares) or
+[*linear least squares*](http://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)) uses
  no regularization; [*ridge regression*](http://en.wikipedia.org/wiki/Ridge_regression) uses L2
 regularization; and [*Lasso*](http://en.wikipedia.org/wiki/Lasso_(statistics)) uses L1
 regularization.  For all of these models, the average loss or training error, $\frac{1}{n} \sum_{i=1}^n (\wv^T x_i - y_i)^2$, is
@@ -552,7 +555,7 @@ known as the [mean squared error](http://en.wikipedia.org/wiki/Mean_squared_erro
 
 <div data-lang="scala" markdown="1">
 The following example demonstrate how to load training data, parse it as an RDD of LabeledPoint.
-The example then uses LinearRegressionWithSGD to build a simple linear model to predict label 
+The example then uses LinearRegressionWithSGD to build a simple linear model to predict label
 values. We compute the mean squared error at the end to evaluate
 [goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).
 
@@ -614,7 +617,7 @@ public class LinearRegression {
   public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("Linear Regression Example");
     JavaSparkContext sc = new JavaSparkContext(conf);
-    
+
     // Load and parse the data
     String path = "data/mllib/ridge-data/lpsa.data";
     JavaRDD<String> data = sc.textFile(path);
@@ -634,7 +637,7 @@ public class LinearRegression {
 
     // Building the model
     int numIterations = 100;
-    final LinearRegressionModel model = 
+    final LinearRegressionModel model =
       LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations);
 
     // Evaluate model on training examples and compute training error
@@ -665,7 +668,7 @@ public class LinearRegression {
 
 <div data-lang="python" markdown="1">
 The following example demonstrate how to load training data, parse it as an RDD of LabeledPoint.
-The example then uses LinearRegressionWithSGD to build a simple linear model to predict label 
+The example then uses LinearRegressionWithSGD to build a simple linear model to predict label
 values. We compute the mean squared error at the end to evaluate
 [goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).
 
@@ -706,8 +709,8 @@ a dependency.
 
 ###Streaming linear regression
 
-When data arrive in a streaming fashion, it is useful to fit regression models online, 
-updating the parameters of the model as new data arrives. MLlib currently supports 
+When data arrive in a streaming fashion, it is useful to fit regression models online,
+updating the parameters of the model as new data arrives. MLlib currently supports
 streaming linear regression using ordinary least squares. The fitting is similar
 to that performed offline, except fitting occurs on each batch of data, so that
 the model continually updates to reflect the data from the stream.
@@ -722,7 +725,7 @@ online to the first stream, and make predictions on the second stream.
 
 <div data-lang="scala" markdown="1">
 
-First, we import the necessary classes for parsing our input data and creating the model. 
+First, we import the necessary classes for parsing our input data and creating the model.
 
 {% highlight scala %}
 
@@ -734,7 +737,7 @@ import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
 
 Then we make input streams for training and testing data. We assume a StreamingContext `ssc`
 has already been created, see [Spark Streaming Programming Guide](streaming-programming-guide.html#initializing)
-for more info. For this example, we use labeled points in training and testing streams, 
+for more info. For this example, we use labeled points in training and testing streams,
 but in practice you will likely want to use unlabeled vectors for test data.
 
 {% highlight scala %}
@@ -754,7 +757,7 @@ val model = new StreamingLinearRegressionWithSGD()
 
 {% endhighlight %}
 
-Now we register the streams for training and testing and start the job. 
+Now we register the streams for training and testing and start the job.
 Printing predictions alongside true labels lets us easily see the result.
 
 {% highlight scala %}
@@ -764,14 +767,14 @@ model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
 
 ssc.start()
 ssc.awaitTermination()
- 
+
 {% endhighlight %}
 
 We can now save text files with data to the training or testing folders.
-Each line should be a data point formatted as `(y,[x1,x2,x3])` where `y` is the label 
-and `x1,x2,x3` are the features. Anytime a text file is placed in `/training/data/dir` 
-the model will update. Anytime a text file is placed in `/testing/data/dir` you will see predictions. 
-As you feed more data to the training directory, the predictions 
+Each line should be a data point formatted as `(y,[x1,x2,x3])` where `y` is the label
+and `x1,x2,x3` are the features. Anytime a text file is placed in `/training/data/dir`
+the model will update. Anytime a text file is placed in `/testing/data/dir` you will see predictions.
+As you feed more data to the training directory, the predictions
 will get better!
 
 </div>

From ec9b621647b893abae3afef219bceab382b99564 Mon Sep 17 00:00:00 2001
From: Steve Loughran <stevel@hortonworks.com>
Date: Wed, 15 Jul 2015 12:15:35 -0700
Subject: [PATCH 172/549] SPARK-9070 JavaDataFrameSuite teardown NPEs if setup
 failed

fix teardown to skip table delete if hive context is null

Author: Steve Loughran <stevel@hortonworks.com>

Closes #7425 from steveloughran/stevel/patches/SPARK-9070-JavaDataFrameSuite-NPE and squashes the following commits:

1982d38 [Steve Loughran] SPARK-9070 JavaDataFrameSuite teardown NPEs if setup failed
---
 .../test/org/apache/spark/sql/hive/JavaDataFrameSuite.java    | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java
index c4828c4717643..741a3cd31c603 100644
--- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java
+++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/JavaDataFrameSuite.java
@@ -61,7 +61,9 @@ public void setUp() throws IOException {
   @After
   public void tearDown() throws IOException {
     // Clean up tables.
-    hc.sql("DROP TABLE IF EXISTS window_table");
+    if (hc != null) {
+      hc.sql("DROP TABLE IF EXISTS window_table");
+    }
   }
 
   @Test

From 536533cad83a26f8fa7c60042904a31057ab56c2 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Wed, 15 Jul 2015 13:32:25 -0700
Subject: [PATCH 173/549] [SPARK-9005] [MLLIB] Fix RegressionMetrics
 computation of explainedVariance

Fixes implementation of `explainedVariance` and `r2` to be consistent with their definitions as described in [SPARK-9005](https://issues.apache.org/jira/browse/SPARK-9005).

Author: Feynman Liang <fliang@databricks.com>

Closes #7361 from feynmanliang/SPARK-9005-RegressionMetrics-bugs and squashes the following commits:

f1112fc [Feynman Liang] Add explainedVariance formula
1a3d098 [Feynman Liang] SROwen code review comments
08a0e1b [Feynman Liang] Fix pyspark tests
db8605a [Feynman Liang] Style fix
bde9761 [Feynman Liang] Fix RegressionMetrics tests, relax assumption predictor is unbiased
c235de0 [Feynman Liang] Fix RegressionMetrics tests
4c4e56f [Feynman Liang] Fix RegressionMetrics computation of explainedVariance and r2
---
 .../mllib/evaluation/RegressionMetrics.scala  | 27 +++++---
 .../evaluation/RegressionMetricsSuite.scala   | 69 +++++++++++++++++--
 python/pyspark/mllib/evaluation.py            |  2 +-
 3 files changed, 83 insertions(+), 15 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
index e577bf87f885e..408847afa800d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
@@ -53,14 +53,22 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend
       )
     summary
   }
+  private lazy val SSerr = math.pow(summary.normL2(1), 2)
+  private lazy val SStot = summary.variance(0) * (summary.count - 1)
+  private lazy val SSreg = {
+    val yMean = summary.mean(0)
+    predictionAndObservations.map {
+      case (prediction, _) => math.pow(prediction - yMean, 2)
+    }.sum()
+  }
 
   /**
-   * Returns the explained variance regression score.
-   * explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
-   * Reference: [[http://en.wikipedia.org/wiki/Explained_variation]]
+   * Returns the variance explained by regression.
+   * explainedVariance = \sum_i (\hat{y_i} - \bar{y})^2 / n
+   * @see [[https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained]]
    */
   def explainedVariance: Double = {
-    1 - summary.variance(1) / summary.variance(0)
+    SSreg / summary.count
   }
 
   /**
@@ -76,8 +84,7 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend
    * expected value of the squared error loss or quadratic loss.
    */
   def meanSquaredError: Double = {
-    val rmse = summary.normL2(1) / math.sqrt(summary.count)
-    rmse * rmse
+    SSerr / summary.count
   }
 
   /**
@@ -85,14 +92,14 @@ class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extend
    * the mean squared error.
    */
   def rootMeanSquaredError: Double = {
-    summary.normL2(1) / math.sqrt(summary.count)
+    math.sqrt(this.meanSquaredError)
   }
 
   /**
-   * Returns R^2^, the coefficient of determination.
-   * Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
+   * Returns R^2^, the unadjusted coefficient of determination.
+   * @see [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
    */
   def r2: Double = {
-    1 - math.pow(summary.normL2(1), 2) / (summary.variance(0) * (summary.count - 1))
+    1 - SSerr / SStot
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
index 9de2bdb6d7246..4b7f1be58f99b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RegressionMetricsSuite.scala
@@ -23,24 +23,85 @@ import org.apache.spark.mllib.util.TestingUtils._
 
 class RegressionMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
 
-  test("regression metrics") {
+  test("regression metrics for unbiased (includes intercept term) predictor") {
+    /* Verify results in R:
+       preds = c(2.25, -0.25, 1.75, 7.75)
+       obs = c(3.0, -0.5, 2.0, 7.0)
+
+       SStot = sum((obs - mean(obs))^2)
+       SSreg = sum((preds - mean(obs))^2)
+       SSerr = sum((obs - preds)^2)
+
+       explainedVariance = SSreg / length(obs)
+       explainedVariance
+       > [1] 8.796875
+       meanAbsoluteError = mean(abs(preds - obs))
+       meanAbsoluteError
+       > [1] 0.5
+       meanSquaredError = mean((preds - obs)^2)
+       meanSquaredError
+       > [1] 0.3125
+       rmse = sqrt(meanSquaredError)
+       rmse
+       > [1] 0.559017
+       r2 = 1 - SSerr / SStot
+       r2
+       > [1] 0.9571734
+     */
+    val predictionAndObservations = sc.parallelize(
+      Seq((2.25, 3.0), (-0.25, -0.5), (1.75, 2.0), (7.75, 7.0)), 2)
+    val metrics = new RegressionMetrics(predictionAndObservations)
+    assert(metrics.explainedVariance ~== 8.79687 absTol 1E-5,
+      "explained variance regression score mismatch")
+    assert(metrics.meanAbsoluteError ~== 0.5 absTol 1E-5, "mean absolute error mismatch")
+    assert(metrics.meanSquaredError ~== 0.3125 absTol 1E-5, "mean squared error mismatch")
+    assert(metrics.rootMeanSquaredError ~== 0.55901 absTol 1E-5,
+      "root mean squared error mismatch")
+    assert(metrics.r2 ~== 0.95717 absTol 1E-5, "r2 score mismatch")
+  }
+
+  test("regression metrics for biased (no intercept term) predictor") {
+    /* Verify results in R:
+       preds = c(2.5, 0.0, 2.0, 8.0)
+       obs = c(3.0, -0.5, 2.0, 7.0)
+
+       SStot = sum((obs - mean(obs))^2)
+       SSreg = sum((preds - mean(obs))^2)
+       SSerr = sum((obs - preds)^2)
+
+       explainedVariance = SSreg / length(obs)
+       explainedVariance
+       > [1] 8.859375
+       meanAbsoluteError = mean(abs(preds - obs))
+       meanAbsoluteError
+       > [1] 0.5
+       meanSquaredError = mean((preds - obs)^2)
+       meanSquaredError
+       > [1] 0.375
+       rmse = sqrt(meanSquaredError)
+       rmse
+       > [1] 0.6123724
+       r2 = 1 - SSerr / SStot
+       r2
+       > [1] 0.9486081
+     */
     val predictionAndObservations = sc.parallelize(
       Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)), 2)
     val metrics = new RegressionMetrics(predictionAndObservations)
-    assert(metrics.explainedVariance ~== 0.95717 absTol 1E-5,
+    assert(metrics.explainedVariance ~== 8.85937 absTol 1E-5,
       "explained variance regression score mismatch")
     assert(metrics.meanAbsoluteError ~== 0.5 absTol 1E-5, "mean absolute error mismatch")
     assert(metrics.meanSquaredError ~== 0.375 absTol 1E-5, "mean squared error mismatch")
     assert(metrics.rootMeanSquaredError ~== 0.61237 absTol 1E-5,
       "root mean squared error mismatch")
-    assert(metrics.r2 ~== 0.94861 absTol 1E-5, "r2 score mismatch")
+    assert(metrics.r2 ~== 0.94860 absTol 1E-5, "r2 score mismatch")
   }
 
   test("regression metrics with complete fitting") {
     val predictionAndObservations = sc.parallelize(
       Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0)), 2)
     val metrics = new RegressionMetrics(predictionAndObservations)
-    assert(metrics.explainedVariance ~== 1.0 absTol 1E-5,
+    assert(metrics.explainedVariance ~== 8.6875 absTol 1E-5,
       "explained variance regression score mismatch")
     assert(metrics.meanAbsoluteError ~== 0.0 absTol 1E-5, "mean absolute error mismatch")
     assert(metrics.meanSquaredError ~== 0.0 absTol 1E-5, "mean squared error mismatch")
diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index f21403707e12a..4398ca86f2ec2 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -82,7 +82,7 @@ class RegressionMetrics(JavaModelWrapper):
     ...     (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)])
     >>> metrics = RegressionMetrics(predictionAndObservations)
     >>> metrics.explainedVariance
-    0.95...
+    8.859...
     >>> metrics.meanAbsoluteError
     0.5...
     >>> metrics.meanSquaredError

From b9a922e260bec1b211437f020be37fab46a85db0 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Wed, 15 Jul 2015 14:02:23 -0700
Subject: [PATCH 174/549] [SPARK-6602][Core]Replace Akka Serialization with
 Spark Serializer

Replace Akka Serialization with Spark Serializer and add unit tests.

Author: zsxwing <zsxwing@gmail.com>

Closes #7159 from zsxwing/remove-akka-serialization and squashes the following commits:

fc0fca3 [zsxwing] Merge branch 'master' into remove-akka-serialization
cf81a58 [zsxwing] Fix the code style
73251c6 [zsxwing] Add test scope
9ef4af9 [zsxwing] Add AkkaRpcEndpointRef.hashCode
433115c [zsxwing] Remove final
be3edb0 [zsxwing] Support deserializing RpcEndpointRef
ecec410 [zsxwing] Replace Akka Serialization with Spark Serializer
---
 core/pom.xml                                  |   5 +
 .../master/FileSystemPersistenceEngine.scala  |  35 ++---
 .../apache/spark/deploy/master/Master.scala   |  18 +--
 .../deploy/master/PersistenceEngine.scala     |   8 +-
 .../deploy/master/RecoveryModeFactory.scala   |   9 +-
 .../master/ZooKeeperPersistenceEngine.scala   |  16 +--
 .../scala/org/apache/spark/rpc/RpcEnv.scala   |   6 +
 .../apache/spark/rpc/akka/AkkaRpcEnv.scala    |  14 +-
 .../master/CustomRecoveryModeFactory.scala    |  31 ++---
 .../spark/deploy/master/MasterSuite.scala     |   2 +-
 .../master/PersistenceEngineSuite.scala       | 126 ++++++++++++++++++
 pom.xml                                       |   6 +
 12 files changed, 214 insertions(+), 62 deletions(-)
 create mode 100644 core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala

diff --git a/core/pom.xml b/core/pom.xml
index 558cc3fb9f2f3..73f7a75cab9d3 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -372,6 +372,11 @@
       <artifactId>junit-interface</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.curator</groupId>
+      <artifactId>curator-test</artifactId>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>net.razorvine</groupId>
       <artifactId>pyrolite</artifactId>
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala
index f459ed5b3a1a1..aa379d4cd61e7 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala
@@ -21,9 +21,8 @@ import java.io._
 
 import scala.reflect.ClassTag
 
-import akka.serialization.Serialization
-
 import org.apache.spark.Logging
+import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer}
 import org.apache.spark.util.Utils
 
 
@@ -32,11 +31,11 @@ import org.apache.spark.util.Utils
  * Files are deleted when applications and workers are removed.
  *
  * @param dir Directory to store files. Created if non-existent (but not recursively).
- * @param serialization Used to serialize our objects.
+ * @param serializer Used to serialize our objects.
  */
 private[master] class FileSystemPersistenceEngine(
     val dir: String,
-    val serialization: Serialization)
+    val serializer: Serializer)
   extends PersistenceEngine with Logging {
 
   new File(dir).mkdir()
@@ -57,27 +56,31 @@ private[master] class FileSystemPersistenceEngine(
   private def serializeIntoFile(file: File, value: AnyRef) {
     val created = file.createNewFile()
     if (!created) { throw new IllegalStateException("Could not create file: " + file) }
-    val serializer = serialization.findSerializerFor(value)
-    val serialized = serializer.toBinary(value)
-    val out = new FileOutputStream(file)
+    val fileOut = new FileOutputStream(file)
+    var out: SerializationStream = null
     Utils.tryWithSafeFinally {
-      out.write(serialized)
+      out = serializer.newInstance().serializeStream(fileOut)
+      out.writeObject(value)
     } {
-      out.close()
+      fileOut.close()
+      if (out != null) {
+        out.close()
+      }
     }
   }
 
   private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = {
-    val fileData = new Array[Byte](file.length().asInstanceOf[Int])
-    val dis = new DataInputStream(new FileInputStream(file))
+    val fileIn = new FileInputStream(file)
+    var in: DeserializationStream = null
     try {
-      dis.readFully(fileData)
+      in = serializer.newInstance().deserializeStream(fileIn)
+      in.readObject[T]()
     } finally {
-      dis.close()
+      fileIn.close()
+      if (in != null) {
+        in.close()
+      }
     }
-    val clazz = m.runtimeClass.asInstanceOf[Class[T]]
-    val serializer = serialization.serializerFor(clazz)
-    serializer.fromBinary(fileData).asInstanceOf[T]
   }
 
 }
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
index 245b047e7dfbd..4615febf17d24 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -27,11 +27,8 @@ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
 import scala.language.postfixOps
 import scala.util.Random
 
-import akka.serialization.Serialization
-import akka.serialization.SerializationExtension
 import org.apache.hadoop.fs.Path
 
-import org.apache.spark.rpc.akka.AkkaRpcEnv
 import org.apache.spark.rpc._
 import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException}
 import org.apache.spark.deploy.{ApplicationDescription, DriverDescription,
@@ -44,6 +41,7 @@ import org.apache.spark.deploy.master.ui.MasterWebUI
 import org.apache.spark.deploy.rest.StandaloneRestServer
 import org.apache.spark.metrics.MetricsSystem
 import org.apache.spark.scheduler.{EventLoggingListener, ReplayListenerBus}
+import org.apache.spark.serializer.{JavaSerializer, Serializer}
 import org.apache.spark.ui.SparkUI
 import org.apache.spark.util.{ThreadUtils, SignalLogger, Utils}
 
@@ -58,9 +56,6 @@ private[master] class Master(
   private val forwardMessageThread =
     ThreadUtils.newDaemonSingleThreadScheduledExecutor("master-forward-message-thread")
 
-  // TODO Remove it once we don't use akka.serialization.Serialization
-  private val actorSystem = rpcEnv.asInstanceOf[AkkaRpcEnv].actorSystem
-
   private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
 
   private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs
@@ -161,20 +156,21 @@ private[master] class Master(
     masterMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
     applicationMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
 
+    val serializer = new JavaSerializer(conf)
     val (persistenceEngine_, leaderElectionAgent_) = RECOVERY_MODE match {
       case "ZOOKEEPER" =>
         logInfo("Persisting recovery state to ZooKeeper")
         val zkFactory =
-          new ZooKeeperRecoveryModeFactory(conf, SerializationExtension(actorSystem))
+          new ZooKeeperRecoveryModeFactory(conf, serializer)
         (zkFactory.createPersistenceEngine(), zkFactory.createLeaderElectionAgent(this))
       case "FILESYSTEM" =>
         val fsFactory =
-          new FileSystemRecoveryModeFactory(conf, SerializationExtension(actorSystem))
+          new FileSystemRecoveryModeFactory(conf, serializer)
         (fsFactory.createPersistenceEngine(), fsFactory.createLeaderElectionAgent(this))
       case "CUSTOM" =>
         val clazz = Utils.classForName(conf.get("spark.deploy.recoveryMode.factory"))
-        val factory = clazz.getConstructor(classOf[SparkConf], classOf[Serialization])
-          .newInstance(conf, SerializationExtension(actorSystem))
+        val factory = clazz.getConstructor(classOf[SparkConf], classOf[Serializer])
+          .newInstance(conf, serializer)
           .asInstanceOf[StandaloneRecoveryModeFactory]
         (factory.createPersistenceEngine(), factory.createLeaderElectionAgent(this))
       case _ =>
@@ -213,7 +209,7 @@ private[master] class Master(
 
   override def receive: PartialFunction[Any, Unit] = {
     case ElectedLeader => {
-      val (storedApps, storedDrivers, storedWorkers) = persistenceEngine.readPersistedData()
+      val (storedApps, storedDrivers, storedWorkers) = persistenceEngine.readPersistedData(rpcEnv)
       state = if (storedApps.isEmpty && storedDrivers.isEmpty && storedWorkers.isEmpty) {
         RecoveryState.ALIVE
       } else {
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/PersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/PersistenceEngine.scala
index a03d460509e03..58a00bceee6af 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/PersistenceEngine.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/PersistenceEngine.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.deploy.master
 
 import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.rpc.RpcEnv
 
 import scala.reflect.ClassTag
 
@@ -80,8 +81,11 @@ abstract class PersistenceEngine {
    * Returns the persisted data sorted by their respective ids (which implies that they're
    * sorted by time of creation).
    */
-  final def readPersistedData(): (Seq[ApplicationInfo], Seq[DriverInfo], Seq[WorkerInfo]) = {
-    (read[ApplicationInfo]("app_"), read[DriverInfo]("driver_"), read[WorkerInfo]("worker_"))
+  final def readPersistedData(
+      rpcEnv: RpcEnv): (Seq[ApplicationInfo], Seq[DriverInfo], Seq[WorkerInfo]) = {
+    rpcEnv.deserialize { () =>
+      (read[ApplicationInfo]("app_"), read[DriverInfo]("driver_"), read[WorkerInfo]("worker_"))
+    }
   }
 
   def close() {}
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/RecoveryModeFactory.scala b/core/src/main/scala/org/apache/spark/deploy/master/RecoveryModeFactory.scala
index 351db8fab2041..c4c3283fb73f7 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/RecoveryModeFactory.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/RecoveryModeFactory.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.deploy.master
 
-import akka.serialization.Serialization
-
 import org.apache.spark.{Logging, SparkConf}
 import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.serializer.Serializer
 
 /**
  * ::DeveloperApi::
@@ -30,7 +29,7 @@ import org.apache.spark.annotation.DeveloperApi
  *
  */
 @DeveloperApi
-abstract class StandaloneRecoveryModeFactory(conf: SparkConf, serializer: Serialization) {
+abstract class StandaloneRecoveryModeFactory(conf: SparkConf, serializer: Serializer) {
 
   /**
    * PersistenceEngine defines how the persistent data(Information about worker, driver etc..)
@@ -49,7 +48,7 @@ abstract class StandaloneRecoveryModeFactory(conf: SparkConf, serializer: Serial
  * LeaderAgent in this case is a no-op. Since leader is forever leader as the actual
  * recovery is made by restoring from filesystem.
  */
-private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serialization)
+private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
   extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {
 
   val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")
@@ -64,7 +63,7 @@ private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer:
   }
 }
 
-private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serialization)
+private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
   extends StandaloneRecoveryModeFactory(conf, serializer) {
 
   def createPersistenceEngine(): PersistenceEngine = {
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala
index 328d95a7a0c68..563831cc6b8dd 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.deploy.master
 
-import akka.serialization.Serialization
+import java.nio.ByteBuffer
 
 import scala.collection.JavaConversions._
 import scala.reflect.ClassTag
@@ -27,9 +27,10 @@ import org.apache.zookeeper.CreateMode
 
 import org.apache.spark.{Logging, SparkConf}
 import org.apache.spark.deploy.SparkCuratorUtil
+import org.apache.spark.serializer.Serializer
 
 
-private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serialization: Serialization)
+private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer)
   extends PersistenceEngine
   with Logging {
 
@@ -57,17 +58,16 @@ private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializat
   }
 
   private def serializeIntoFile(path: String, value: AnyRef) {
-    val serializer = serialization.findSerializerFor(value)
-    val serialized = serializer.toBinary(value)
-    zk.create().withMode(CreateMode.PERSISTENT).forPath(path, serialized)
+    val serialized = serializer.newInstance().serialize(value)
+    val bytes = new Array[Byte](serialized.remaining())
+    serialized.get(bytes)
+    zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes)
   }
 
   private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = {
     val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename)
-    val clazz = m.runtimeClass.asInstanceOf[Class[T]]
-    val serializer = serialization.serializerFor(clazz)
     try {
-      Some(serializer.fromBinary(fileData).asInstanceOf[T])
+      Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData)))
     } catch {
       case e: Exception => {
         logWarning("Exception while reading persisted file, deleting", e)
diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala
index c9fcc7a36cc04..29debe8081308 100644
--- a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala
+++ b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala
@@ -139,6 +139,12 @@ private[spark] abstract class RpcEnv(conf: SparkConf) {
    * creating it manually because different [[RpcEnv]] may have different formats.
    */
   def uriOf(systemName: String, address: RpcAddress, endpointName: String): String
+
+  /**
+   * [[RpcEndpointRef]] cannot be deserialized without [[RpcEnv]]. So when deserializing any object
+   * that contains [[RpcEndpointRef]]s, the deserialization codes should be wrapped by this method.
+   */
+  def deserialize[T](deserializationAction: () => T): T
 }
 
 
diff --git a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala
index f2d87f68341af..fc17542abf81d 100644
--- a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala
+++ b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala
@@ -28,7 +28,7 @@ import akka.actor.{ActorSystem, ExtendedActorSystem, Actor, ActorRef, Props, Add
 import akka.event.Logging.Error
 import akka.pattern.{ask => akkaAsk}
 import akka.remote.{AssociationEvent, AssociatedEvent, DisassociatedEvent, AssociationErrorEvent}
-import com.google.common.util.concurrent.MoreExecutors
+import akka.serialization.JavaSerializer
 
 import org.apache.spark.{SparkException, Logging, SparkConf}
 import org.apache.spark.rpc._
@@ -239,6 +239,12 @@ private[spark] class AkkaRpcEnv private[akka] (
   }
 
   override def toString: String = s"${getClass.getSimpleName}($actorSystem)"
+
+  override def deserialize[T](deserializationAction: () => T): T = {
+    JavaSerializer.currentSystem.withValue(actorSystem.asInstanceOf[ExtendedActorSystem]) {
+      deserializationAction()
+    }
+  }
 }
 
 private[spark] class AkkaRpcEnvFactory extends RpcEnvFactory {
@@ -315,6 +321,12 @@ private[akka] class AkkaRpcEndpointRef(
 
   override def toString: String = s"${getClass.getSimpleName}($actorRef)"
 
+  final override def equals(that: Any): Boolean = that match {
+    case other: AkkaRpcEndpointRef => actorRef == other.actorRef
+    case _ => false
+  }
+
+  final override def hashCode(): Int = if (actorRef == null) 0 else actorRef.hashCode()
 }
 
 /**
diff --git a/core/src/test/scala/org/apache/spark/deploy/master/CustomRecoveryModeFactory.scala b/core/src/test/scala/org/apache/spark/deploy/master/CustomRecoveryModeFactory.scala
index f4e56632e426a..8c96b0e71dfdd 100644
--- a/core/src/test/scala/org/apache/spark/deploy/master/CustomRecoveryModeFactory.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/master/CustomRecoveryModeFactory.scala
@@ -19,18 +19,19 @@
 // when they are outside of org.apache.spark.
 package other.supplier
 
+import java.nio.ByteBuffer
+
 import scala.collection.mutable
 import scala.reflect.ClassTag
 
-import akka.serialization.Serialization
-
 import org.apache.spark.SparkConf
 import org.apache.spark.deploy.master._
+import org.apache.spark.serializer.Serializer
 
 class CustomRecoveryModeFactory(
   conf: SparkConf,
-  serialization: Serialization
-) extends StandaloneRecoveryModeFactory(conf, serialization) {
+  serializer: Serializer
+) extends StandaloneRecoveryModeFactory(conf, serializer) {
 
   CustomRecoveryModeFactory.instantiationAttempts += 1
 
@@ -40,7 +41,7 @@ class CustomRecoveryModeFactory(
    *
    */
   override def createPersistenceEngine(): PersistenceEngine =
-    new CustomPersistenceEngine(serialization)
+    new CustomPersistenceEngine(serializer)
 
   /**
    * Create an instance of LeaderAgent that decides who gets elected as master.
@@ -53,7 +54,7 @@ object CustomRecoveryModeFactory {
   @volatile var instantiationAttempts = 0
 }
 
-class CustomPersistenceEngine(serialization: Serialization) extends PersistenceEngine {
+class CustomPersistenceEngine(serializer: Serializer) extends PersistenceEngine {
   val data = mutable.HashMap[String, Array[Byte]]()
 
   CustomPersistenceEngine.lastInstance = Some(this)
@@ -64,10 +65,10 @@ class CustomPersistenceEngine(serialization: Serialization) extends PersistenceE
    */
   override def persist(name: String, obj: Object): Unit = {
     CustomPersistenceEngine.persistAttempts += 1
-    serialization.serialize(obj) match {
-      case util.Success(bytes) => data += name -> bytes
-      case util.Failure(cause) => throw new RuntimeException(cause)
-    }
+    val serialized = serializer.newInstance().serialize(obj)
+    val bytes = new Array[Byte](serialized.remaining())
+    serialized.get(bytes)
+    data += name -> bytes
   }
 
   /**
@@ -84,15 +85,9 @@ class CustomPersistenceEngine(serialization: Serialization) extends PersistenceE
    */
   override def read[T: ClassTag](prefix: String): Seq[T] = {
     CustomPersistenceEngine.readAttempts += 1
-    val clazz = implicitly[ClassTag[T]].runtimeClass.asInstanceOf[Class[T]]
     val results = for ((name, bytes) <- data; if name.startsWith(prefix))
-      yield serialization.deserialize(bytes, clazz)
-
-    results.find(_.isFailure).foreach {
-      case util.Failure(cause) => throw new RuntimeException(cause)
-    }
-
-    results.flatMap(_.toOption).toSeq
+      yield serializer.newInstance().deserialize[T](ByteBuffer.wrap(bytes))
+    results.toSeq
   }
 }
 
diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
index 9cb6dd43bac47..a8fbaf1d9da0a 100644
--- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
@@ -105,7 +105,7 @@ class MasterSuite extends SparkFunSuite with Matchers with Eventually {
       persistenceEngine.addDriver(driverToPersist)
       persistenceEngine.addWorker(workerToPersist)
 
-      val (apps, drivers, workers) = persistenceEngine.readPersistedData()
+      val (apps, drivers, workers) = persistenceEngine.readPersistedData(rpcEnv)
 
       apps.map(_.id) should contain(appToPersist.id)
       drivers.map(_.id) should contain(driverToPersist.id)
diff --git a/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala
new file mode 100644
index 0000000000000..11e87bd1dd8eb
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/deploy/master/PersistenceEngineSuite.scala
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.spark.deploy.master
+
+import java.net.ServerSocket
+
+import org.apache.commons.lang3.RandomUtils
+import org.apache.curator.test.TestingServer
+
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
+import org.apache.spark.rpc.{RpcEndpoint, RpcEnv}
+import org.apache.spark.serializer.{Serializer, JavaSerializer}
+import org.apache.spark.util.Utils
+
+class PersistenceEngineSuite extends SparkFunSuite {
+
+  test("FileSystemPersistenceEngine") {
+    val dir = Utils.createTempDir()
+    try {
+      val conf = new SparkConf()
+      testPersistenceEngine(conf, serializer =>
+        new FileSystemPersistenceEngine(dir.getAbsolutePath, serializer)
+      )
+    } finally {
+      Utils.deleteRecursively(dir)
+    }
+  }
+
+  test("ZooKeeperPersistenceEngine") {
+    val conf = new SparkConf()
+    // TestingServer logs the port conflict exception rather than throwing an exception.
+    // So we have to find a free port by ourselves. This approach cannot guarantee always starting
+    // zkTestServer successfully because there is a time gap between finding a free port and
+    // starting zkTestServer. But the failure possibility should be very low.
+    val zkTestServer = new TestingServer(findFreePort(conf))
+    try {
+      testPersistenceEngine(conf, serializer => {
+        conf.set("spark.deploy.zookeeper.url", zkTestServer.getConnectString)
+        new ZooKeeperPersistenceEngine(conf, serializer)
+      })
+    } finally {
+      zkTestServer.stop()
+    }
+  }
+
+  private def testPersistenceEngine(
+      conf: SparkConf, persistenceEngineCreator: Serializer => PersistenceEngine): Unit = {
+    val serializer = new JavaSerializer(conf)
+    val persistenceEngine = persistenceEngineCreator(serializer)
+    persistenceEngine.persist("test_1", "test_1_value")
+    assert(Seq("test_1_value") === persistenceEngine.read[String]("test_"))
+    persistenceEngine.persist("test_2", "test_2_value")
+    assert(Set("test_1_value", "test_2_value") === persistenceEngine.read[String]("test_").toSet)
+    persistenceEngine.unpersist("test_1")
+    assert(Seq("test_2_value") === persistenceEngine.read[String]("test_"))
+    persistenceEngine.unpersist("test_2")
+    assert(persistenceEngine.read[String]("test_").isEmpty)
+
+    // Test deserializing objects that contain RpcEndpointRef
+    val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf))
+    try {
+      // Create a real endpoint so that we can test RpcEndpointRef deserialization
+      val workerEndpoint = rpcEnv.setupEndpoint("worker", new RpcEndpoint {
+        override val rpcEnv: RpcEnv = rpcEnv
+      })
+
+      val workerToPersist = new WorkerInfo(
+        id = "test_worker",
+        host = "127.0.0.1",
+        port = 10000,
+        cores = 0,
+        memory = 0,
+        endpoint = workerEndpoint,
+        webUiPort = 0,
+        publicAddress = ""
+      )
+
+      persistenceEngine.addWorker(workerToPersist)
+
+      val (storedApps, storedDrivers, storedWorkers) = persistenceEngine.readPersistedData(rpcEnv)
+
+      assert(storedApps.isEmpty)
+      assert(storedDrivers.isEmpty)
+
+      // Check deserializing WorkerInfo
+      assert(storedWorkers.size == 1)
+      val recoveryWorkerInfo = storedWorkers.head
+      assert(workerToPersist.id === recoveryWorkerInfo.id)
+      assert(workerToPersist.host === recoveryWorkerInfo.host)
+      assert(workerToPersist.port === recoveryWorkerInfo.port)
+      assert(workerToPersist.cores === recoveryWorkerInfo.cores)
+      assert(workerToPersist.memory === recoveryWorkerInfo.memory)
+      assert(workerToPersist.endpoint === recoveryWorkerInfo.endpoint)
+      assert(workerToPersist.webUiPort === recoveryWorkerInfo.webUiPort)
+      assert(workerToPersist.publicAddress === recoveryWorkerInfo.publicAddress)
+    } finally {
+      rpcEnv.shutdown()
+      rpcEnv.awaitTermination()
+    }
+  }
+
+  private def findFreePort(conf: SparkConf): Int = {
+    val candidatePort = RandomUtils.nextInt(1024, 65536)
+    Utils.startServiceOnPort(candidatePort, (trialPort: Int) => {
+      val socket = new ServerSocket(trialPort)
+      socket.close()
+      (null, trialPort)
+    }, conf)._2
+  }
+}
diff --git a/pom.xml b/pom.xml
index 370c95dd03632..aa49e2ab7294b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -748,6 +748,12 @@
         <artifactId>curator-framework</artifactId>
         <version>${curator.version}</version>
       </dependency>
+      <dependency>
+        <groupId>org.apache.curator</groupId>
+        <artifactId>curator-test</artifactId>
+        <version>${curator.version}</version>
+        <scope>test</scope>
+      </dependency>
       <dependency>
         <groupId>org.apache.hadoop</groupId>
         <artifactId>hadoop-client</artifactId>

From 674eb2a4c3ff595760f990daf369ba75d2547593 Mon Sep 17 00:00:00 2001
From: KaiXinXiaoLei <huleilei1@huawei.com>
Date: Wed, 15 Jul 2015 22:31:10 +0100
Subject: [PATCH 175/549] [SPARK-8974] Catch exceptions in allocation schedule
 task.

I meet a problem. When I submit some tasks, the thread spark-dynamic-executor-allocation should seed the message about "requestTotalExecutors", and the new executor should start. But I meet a problem about this thread, like:

2015-07-14 19:02:17,461 | WARN  | [spark-dynamic-executor-allocation] | Error sending message [message = RequestExecutors(1)] in 1 attempts
java.util.concurrent.TimeoutException: Futures timed out after [120 seconds]
        at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:219)
        at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223)
        at scala.concurrent.Await$$anonfun$result$1.apply(package.scala:107)
        at scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53)
        at scala.concurrent.Await$.result(package.scala:107)
        at org.apache.spark.rpc.RpcEndpointRef.askWithRetry(RpcEndpointRef.scala:102)
        at org.apache.spark.rpc.RpcEndpointRef.askWithRetry(RpcEndpointRef.scala:78)
        at org.apache.spark.scheduler.cluster.YarnSchedulerBackend.doRequestTotalExecutors(YarnSchedulerBackend.scala:57)
        at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.requestTotalExecutors(CoarseGrainedSchedulerBackend.scala:351)
        at org.apache.spark.SparkContext.requestTotalExecutors(SparkContext.scala:1382)
        at org.apache.spark.ExecutorAllocationManager.addExecutors(ExecutorAllocationManager.scala:343)
        at org.apache.spark.ExecutorAllocationManager.updateAndSyncNumExecutorsTarget(ExecutorAllocationManager.scala:295)
        at org.apache.spark.ExecutorAllocationManager.org$apache$spark$ExecutorAllocationManager$$schedule(ExecutorAllocationManager.scala:248)

when after some minutes, I find a new ApplicationMaster start,  and tasks submitted start to run. The tasks Completed. And after long time (eg, ten minutes), the number of executor  does not reduce to zero.  I use the default value of "spark.dynamicAllocation.minExecutors".

Author: KaiXinXiaoLei <huleilei1@huawei.com>

Closes #7352 from KaiXinXiaoLei/dym and squashes the following commits:

3603631 [KaiXinXiaoLei] change logError to logWarning
efc4f24 [KaiXinXiaoLei] change file
---
 .../org/apache/spark/ExecutorAllocationManager.scala | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
index 0c50b4002cf7b..648bcfe28cad2 100644
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
@@ -20,6 +20,7 @@ package org.apache.spark
 import java.util.concurrent.TimeUnit
 
 import scala.collection.mutable
+import scala.util.control.ControlThrowable
 
 import com.codahale.metrics.{Gauge, MetricRegistry}
 
@@ -211,7 +212,16 @@ private[spark] class ExecutorAllocationManager(
     listenerBus.addListener(listener)
 
     val scheduleTask = new Runnable() {
-      override def run(): Unit = Utils.logUncaughtExceptions(schedule())
+      override def run(): Unit = {
+        try {
+          schedule()
+        } catch {
+          case ct: ControlThrowable =>
+            throw ct
+          case t: Throwable =>
+            logWarning(s"Uncaught exception in thread ${Thread.currentThread().getName}", t)
+        }
+      }
     }
     executor.scheduleAtFixedRate(scheduleTask, 0, intervalMillis, TimeUnit.MILLISECONDS)
   }

From affbe329ae0100bd50a3c3fb081b0f2b07efce33 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 15 Jul 2015 14:52:02 -0700
Subject: [PATCH 176/549] [SPARK-9071][SQL] MonotonicallyIncreasingID and
 SparkPartitionID should be marked as nondeterministic.

I also took the chance to more explicitly define the semantics of deterministic.

Author: Reynold Xin <rxin@databricks.com>

Closes #7428 from rxin/non-deterministic and squashes the following commits:

a760827 [Reynold Xin] [SPARK-9071][SQL] MonotonicallyIncreasingID and SparkPartitionID should be marked as nondeterministic.
---
 .../spark/sql/catalyst/expressions/Expression.scala    | 10 ++++++++--
 .../expressions/MonotonicallyIncreasingID.scala        |  4 +++-
 .../sql/execution/expressions/SparkPartitionID.scala   |  4 +++-
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 3f19ac2b592b5..7b37ae7335253 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -61,9 +61,15 @@ abstract class Expression extends TreeNode[Expression] {
   def foldable: Boolean = false
 
   /**
-   * Returns true when the current expression always return the same result for fixed input values.
+   * Returns true when the current expression always return the same result for fixed inputs from
+   * children.
+   *
+   * Note that this means that an expression should be considered as non-deterministic if:
+   * - if it relies on some mutable internal state, or
+   * - if it relies on some implicit input that is not part of the children expression list.
+   *
+   * An example would be `SparkPartitionID` that relies on the partition id returned by TaskContext.
    */
-  // TODO: Need to define explicit input values vs implicit input values.
   def deterministic: Boolean = true
 
   def nullable: Boolean
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
index 69a37750d7525..fec403fe2d348 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
@@ -41,7 +41,9 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
    */
   @transient private[this] var count: Long = 0L
 
-  @transient private lazy val partitionMask = TaskContext.getPartitionId.toLong << 33
+  @transient private lazy val partitionMask = TaskContext.getPartitionId().toLong << 33
+
+  override def deterministic: Boolean = false
 
   override def nullable: Boolean = false
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
index 5f1b514f2cff2..7c790c549a5d8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
@@ -29,11 +29,13 @@ import org.apache.spark.sql.types.{IntegerType, DataType}
  */
 private[sql] case object SparkPartitionID extends LeafExpression {
 
+  override def deterministic: Boolean = false
+
   override def nullable: Boolean = false
 
   override def dataType: DataType = IntegerType
 
-  @transient private lazy val partitionId = TaskContext.getPartitionId
+  @transient private lazy val partitionId = TaskContext.getPartitionId()
 
   override def eval(input: InternalRow): Int = partitionId
 

From b0645195d0da57065885e078e08bd6c42f4f19b0 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 15 Jul 2015 17:50:11 -0700
Subject: [PATCH 177/549] [SPARK-9086][SQL] Remove BinaryNode from TreeNode.

These traits are not super useful, and yet cause problems with toString in expressions due to the orders they are mixed in.

Author: Reynold Xin <rxin@databricks.com>

Closes #7433 from rxin/remove-binary-node and squashes the following commits:

1881f78 [Reynold Xin] [SPARK-9086][SQL] Remove BinaryNode from TreeNode.
---
 .../sql/catalyst/expressions/Expression.scala   | 17 ++++++++++++++---
 .../catalyst/plans/logical/LogicalPlan.scala    |  7 ++++++-
 .../spark/sql/catalyst/trees/TreeNode.scala     |  9 ---------
 .../apache/spark/sql/execution/SparkPlan.scala  |  7 ++++++-
 4 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 7b37ae7335253..87667316aca67 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -187,8 +187,10 @@ abstract class Expression extends TreeNode[Expression] {
 /**
  * A leaf expression, i.e. one without any child expressions.
  */
-abstract class LeafExpression extends Expression with trees.LeafNode[Expression] {
+abstract class LeafExpression extends Expression {
   self: Product =>
+
+  def children: Seq[Expression] = Nil
 }
 
 
@@ -196,9 +198,13 @@ abstract class LeafExpression extends Expression with trees.LeafNode[Expression]
  * An expression with one input and one output. The output is by default evaluated to null
  * if the input is evaluated to null.
  */
-abstract class UnaryExpression extends Expression with trees.UnaryNode[Expression] {
+abstract class UnaryExpression extends Expression {
   self: Product =>
 
+  def child: Expression
+
+  override def children: Seq[Expression] = child :: Nil
+
   override def foldable: Boolean = child.foldable
   override def nullable: Boolean = child.nullable
 
@@ -271,9 +277,14 @@ abstract class UnaryExpression extends Expression with trees.UnaryNode[Expressio
  * An expression with two inputs and one output. The output is by default evaluated to null
  * if any input is evaluated to null.
  */
-abstract class BinaryExpression extends Expression with trees.BinaryNode[Expression] {
+abstract class BinaryExpression extends Expression {
   self: Product =>
 
+  def left: Expression
+  def right: Expression
+
+  override def children: Seq[Expression] = Seq(left, right)
+
   override def foldable: Boolean = left.foldable && right.foldable
 
   override def nullable: Boolean = left.nullable || right.nullable
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index e911b907e8536..d7077a0ec907a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -291,6 +291,11 @@ abstract class UnaryNode extends LogicalPlan with trees.UnaryNode[LogicalPlan] {
 /**
  * A logical plan node with a left and right child.
  */
-abstract class BinaryNode extends LogicalPlan with trees.BinaryNode[LogicalPlan] {
+abstract class BinaryNode extends LogicalPlan {
   self: Product =>
+
+  def left: LogicalPlan
+  def right: LogicalPlan
+
+  override def children: Seq[LogicalPlan] = Seq(left, right)
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 09f6c6b0ec423..16844b2f4b680 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -453,15 +453,6 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
   }
 }
 
-/**
- * A [[TreeNode]] that has two children, [[left]] and [[right]].
- */
-trait BinaryNode[BaseType <: TreeNode[BaseType]] {
-  def left: BaseType
-  def right: BaseType
-
-  def children: Seq[BaseType] = Seq(left, right)
-}
 
 /**
  * A [[TreeNode]] with no children.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index 4d7d8626a0ecc..9dc7879fa4a1a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -247,6 +247,11 @@ private[sql] trait UnaryNode extends SparkPlan with trees.UnaryNode[SparkPlan] {
   override def outputPartitioning: Partitioning = child.outputPartitioning
 }
 
-private[sql] trait BinaryNode extends SparkPlan with trees.BinaryNode[SparkPlan] {
+private[sql] trait BinaryNode extends SparkPlan {
   self: Product =>
+
+  def left: SparkPlan
+  def right: SparkPlan
+
+  override def children: Seq[SparkPlan] = Seq(left, right)
 }

From 6960a7938c61cc07f181ca85e0d8152ceeb453d9 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 15 Jul 2015 20:33:06 -0700
Subject: [PATCH 178/549] [SPARK-8774] [ML] Add R model formula with basic
 support as a transformer

This implements minimal R formula support as a feature transformer. Both numeric and string labels are supported, but features must be numeric for now.

cc mengxr

Author: Eric Liang <ekl@databricks.com>

Closes #7381 from ericl/spark-8774-1 and squashes the following commits:

d1959d2 [Eric Liang] clarify comment
2db68aa [Eric Liang] second round of comments
dc3c943 [Eric Liang] address comments
5765ec6 [Eric Liang] fix style checks
1f361b0 [Eric Liang] doc
fb0826b [Eric Liang] [SPARK-8774] Add R model formula with basic support as a transformer
---
 .../apache/spark/ml/feature/RFormula.scala    | 151 ++++++++++++++++++
 .../spark/ml/feature/VectorAssembler.scala    |   2 +-
 .../ml/feature/RFormulaParserSuite.scala      |  34 ++++
 .../spark/ml/feature/RFormulaSuite.scala      |  93 +++++++++++
 4 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
new file mode 100644
index 0000000000000..d9a36bda386b3
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import scala.util.parsing.combinator.RegexParsers
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.param.{Param, ParamMap}
+import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
+
+/**
+ * :: Experimental ::
+ * Implements the transforms required for fitting a dataset against an R model formula. Currently
+ * we support a limited subset of the R operators, including '~' and '+'. Also see the R formula
+ * docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
+ */
+@Experimental
+class RFormula(override val uid: String)
+  extends Transformer with HasFeaturesCol with HasLabelCol {
+
+  def this() = this(Identifiable.randomUID("rFormula"))
+
+  /**
+   * R formula parameter. The formula is provided in string form.
+   * @group setParam
+   */
+  val formula: Param[String] = new Param(this, "formula", "R model formula")
+
+  private var parsedFormula: Option[ParsedRFormula] = None
+
+  /**
+   * Sets the formula to use for this transformer. Must be called before use.
+   * @group setParam
+   * @param value an R formula in string form (e.g. "y ~ x + z")
+   */
+  def setFormula(value: String): this.type = {
+    parsedFormula = Some(RFormulaParser.parse(value))
+    set(formula, value)
+    this
+  }
+
+  /** @group getParam */
+  def getFormula: String = $(formula)
+
+  /** @group getParam */
+  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
+
+  /** @group getParam */
+  def setLabelCol(value: String): this.type = set(labelCol, value)
+
+  override def transformSchema(schema: StructType): StructType = {
+    checkCanTransform(schema)
+    val withFeatures = transformFeatures.transformSchema(schema)
+    if (hasLabelCol(schema)) {
+      withFeatures
+    } else {
+      val nullable = schema(parsedFormula.get.label).dataType match {
+        case _: NumericType | BooleanType => false
+        case _ => true
+      }
+      StructType(withFeatures.fields :+ StructField($(labelCol), DoubleType, nullable))
+    }
+  }
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    checkCanTransform(dataset.schema)
+    transformLabel(transformFeatures.transform(dataset))
+  }
+
+  override def copy(extra: ParamMap): RFormula = defaultCopy(extra)
+
+  override def toString: String = s"RFormula(${get(formula)})"
+
+  private def transformLabel(dataset: DataFrame): DataFrame = {
+    if (hasLabelCol(dataset.schema)) {
+      dataset
+    } else {
+      val labelName = parsedFormula.get.label
+      dataset.schema(labelName).dataType match {
+        case _: NumericType | BooleanType =>
+          dataset.withColumn($(labelCol), dataset(labelName).cast(DoubleType))
+        // TODO(ekl) add support for string-type labels
+        case other =>
+          throw new IllegalArgumentException("Unsupported type for label: " + other)
+      }
+    }
+  }
+
+  private def transformFeatures: Transformer = {
+    // TODO(ekl) add support for non-numeric features and feature interactions
+    new VectorAssembler(uid)
+      .setInputCols(parsedFormula.get.terms.toArray)
+      .setOutputCol($(featuresCol))
+  }
+
+  private def checkCanTransform(schema: StructType) {
+    require(parsedFormula.isDefined, "Must call setFormula() first.")
+    val columnNames = schema.map(_.name)
+    require(!columnNames.contains($(featuresCol)), "Features column already exists.")
+    require(
+      !columnNames.contains($(labelCol)) || schema($(labelCol)).dataType == DoubleType,
+      "Label column already exists and is not of type DoubleType.")
+  }
+
+  private def hasLabelCol(schema: StructType): Boolean = {
+    schema.map(_.name).contains($(labelCol))
+  }
+}
+
+/**
+ * Represents a parsed R formula.
+ */
+private[ml] case class ParsedRFormula(label: String, terms: Seq[String])
+
+/**
+ * Limited implementation of R formula parsing. Currently supports: '~', '+'.
+ */
+private[ml] object RFormulaParser extends RegexParsers {
+  def term: Parser[String] = "([a-zA-Z]|\\.[a-zA-Z_])[a-zA-Z0-9._]*".r
+
+  def expr: Parser[List[String]] = term ~ rep("+" ~> term) ^^ { case a ~ list => a :: list }
+
+  def formula: Parser[ParsedRFormula] =
+    (term ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t) }
+
+  def parse(value: String): ParsedRFormula = parseAll(formula, value) match {
+    case Success(result, _) => result
+    case failure: NoSuccess => throw new IllegalArgumentException(
+      "Could not parse formula: " + value)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 9f83c2ee16178..086917fa680f8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -116,7 +116,7 @@ class VectorAssembler(override val uid: String)
     if (schema.fieldNames.contains(outputColName)) {
       throw new IllegalArgumentException(s"Output column $outputColName already exists.")
     }
-    StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, false))
+    StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true))
   }
 
   override def copy(extra: ParamMap): VectorAssembler = defaultCopy(extra)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
new file mode 100644
index 0000000000000..c8d065f37a605
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkFunSuite
+
+class RFormulaParserSuite extends SparkFunSuite {
+  private def checkParse(formula: String, label: String, terms: Seq[String]) {
+    val parsed = RFormulaParser.parse(formula)
+    assert(parsed.label == label)
+    assert(parsed.terms == terms)
+  }
+
+  test("parse simple formulas") {
+    checkParse("y ~ x", "y", Seq("x"))
+    checkParse("y ~   ._foo  ", "y", Seq("._foo"))
+    checkParse("resp ~ A_VAR + B + c123", "resp", Seq("A_VAR", "B", "c123"))
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
new file mode 100644
index 0000000000000..fa8611b243a9f
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+
+class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext {
+  test("params") {
+    ParamsSuite.checkParams(new RFormula())
+  }
+
+  test("transform numeric data") {
+    val formula = new RFormula().setFormula("id ~ v1 + v2")
+    val original = sqlContext.createDataFrame(
+      Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")
+    val result = formula.transform(original)
+    val resultSchema = formula.transformSchema(original.schema)
+    val expected = sqlContext.createDataFrame(
+      Seq(
+        (0, 1.0, 3.0, Vectors.dense(Array(1.0, 3.0)), 0.0),
+        (2, 2.0, 5.0, Vectors.dense(Array(2.0, 5.0)), 2.0))
+      ).toDF("id", "v1", "v2", "features", "label")
+    // TODO(ekl) make schema comparisons ignore metadata, to avoid .toString
+    assert(result.schema.toString == resultSchema.toString)
+    assert(resultSchema == expected.schema)
+    assert(result.collect().toSeq == expected.collect().toSeq)
+  }
+
+  test("features column already exists") {
+    val formula = new RFormula().setFormula("y ~ x").setFeaturesCol("x")
+    val original = sqlContext.createDataFrame(Seq((0, 1.0), (2, 2.0))).toDF("x", "y")
+    intercept[IllegalArgumentException] {
+      formula.transformSchema(original.schema)
+    }
+    intercept[IllegalArgumentException] {
+      formula.transform(original)
+    }
+  }
+
+  test("label column already exists") {
+    val formula = new RFormula().setFormula("y ~ x").setLabelCol("y")
+    val original = sqlContext.createDataFrame(Seq((0, 1.0), (2, 2.0))).toDF("x", "y")
+    val resultSchema = formula.transformSchema(original.schema)
+    assert(resultSchema.length == 3)
+    assert(resultSchema.toString == formula.transform(original).schema.toString)
+  }
+
+  test("label column already exists but is not double type") {
+    val formula = new RFormula().setFormula("y ~ x").setLabelCol("y")
+    val original = sqlContext.createDataFrame(Seq((0, 1), (2, 2))).toDF("x", "y")
+    intercept[IllegalArgumentException] {
+      formula.transformSchema(original.schema)
+    }
+    intercept[IllegalArgumentException] {
+      formula.transform(original)
+    }
+  }
+
+// TODO(ekl) enable after we implement string label support
+//  test("transform string label") {
+//    val formula = new RFormula().setFormula("name ~ id")
+//    val original = sqlContext.createDataFrame(
+//      Seq((1, "foo"), (2, "bar"), (3, "bar"))).toDF("id", "name")
+//    val result = formula.transform(original)
+//    val resultSchema = formula.transformSchema(original.schema)
+//    val expected = sqlContext.createDataFrame(
+//      Seq(
+//        (1, "foo", Vectors.dense(Array(1.0)), 1.0),
+//        (2, "bar", Vectors.dense(Array(2.0)), 0.0),
+//        (3, "bar", Vectors.dense(Array(3.0)), 0.0))
+//      ).toDF("id", "name", "features", "label")
+//    assert(result.schema.toString == resultSchema.toString)
+//    assert(result.collect().toSeq == expected.collect().toSeq)
+//  }
+}

From 73d92b00b9a6f5dfc2f8116447d17b381cd74f80 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Wed, 15 Jul 2015 21:02:42 -0700
Subject: [PATCH 179/549] [SPARK-9018] [MLLIB] add stopwatches

Add stopwatches for easy instrumentation of MLlib algorithms. This is based on the `TimeTracker` used in decision trees. The distributed version uses Spark accumulator. jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #7415 from mengxr/SPARK-9018 and squashes the following commits:

40b4347 [Xiangrui Meng] == -> ===
c477745 [Xiangrui Meng] address Joseph's comments
f981a49 [Xiangrui Meng] add stopwatches
---
 .../apache/spark/ml/util/stopwatches.scala    | 151 ++++++++++++++++++
 .../apache/spark/ml/util/StopwatchSuite.scala | 109 +++++++++++++
 2 files changed, 260 insertions(+)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/util/stopwatches.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/util/StopwatchSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/stopwatches.scala b/mllib/src/main/scala/org/apache/spark/ml/util/stopwatches.scala
new file mode 100644
index 0000000000000..5fdf878a3df72
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/stopwatches.scala
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.util
+
+import scala.collection.mutable
+
+import org.apache.spark.{Accumulator, SparkContext}
+
+/**
+ * Abstract class for stopwatches.
+ */
+private[spark] abstract class Stopwatch extends Serializable {
+
+  @transient private var running: Boolean = false
+  private var startTime: Long = _
+
+  /**
+   * Name of the stopwatch.
+   */
+  val name: String
+
+  /**
+   * Starts the stopwatch.
+   * Throws an exception if the stopwatch is already running.
+   */
+  def start(): Unit = {
+    assume(!running, "start() called but the stopwatch is already running.")
+    running = true
+    startTime = now
+  }
+
+  /**
+   * Stops the stopwatch and returns the duration of the last session in milliseconds.
+   * Throws an exception if the stopwatch is not running.
+   */
+  def stop(): Long = {
+    assume(running, "stop() called but the stopwatch is not running.")
+    val duration = now - startTime
+    add(duration)
+    running = false
+    duration
+  }
+
+  /**
+   * Checks whether the stopwatch is running.
+   */
+  def isRunning: Boolean = running
+
+  /**
+   * Returns total elapsed time in milliseconds, not counting the current session if the stopwatch
+   * is running.
+   */
+  def elapsed(): Long
+
+  /**
+   * Gets the current time in milliseconds.
+   */
+  protected def now: Long = System.currentTimeMillis()
+
+  /**
+   * Adds input duration to total elapsed time.
+   */
+  protected def add(duration: Long): Unit
+}
+
+/**
+ * A local [[Stopwatch]].
+ */
+private[spark] class LocalStopwatch(override val name: String) extends Stopwatch {
+
+  private var elapsedTime: Long = 0L
+
+  override def elapsed(): Long = elapsedTime
+
+  override protected def add(duration: Long): Unit = {
+    elapsedTime += duration
+  }
+}
+
+/**
+ * A distributed [[Stopwatch]] using Spark accumulator.
+ * @param sc SparkContext
+ */
+private[spark] class DistributedStopwatch(
+    sc: SparkContext,
+    override val name: String) extends Stopwatch {
+
+  private val elapsedTime: Accumulator[Long] = sc.accumulator(0L, s"DistributedStopwatch($name)")
+
+  override def elapsed(): Long = elapsedTime.value
+
+  override protected def add(duration: Long): Unit = {
+    elapsedTime += duration
+  }
+}
+
+/**
+ * A multiple stopwatch that contains local and distributed stopwatches.
+ * @param sc SparkContext
+ */
+private[spark] class MultiStopwatch(@transient private val sc: SparkContext) extends Serializable {
+
+  private val stopwatches: mutable.Map[String, Stopwatch] = mutable.Map.empty
+
+  /**
+   * Adds a local stopwatch.
+   * @param name stopwatch name
+   */
+  def addLocal(name: String): this.type = {
+    require(!stopwatches.contains(name), s"Stopwatch with name $name already exists.")
+    stopwatches(name) = new LocalStopwatch(name)
+    this
+  }
+
+  /**
+   * Adds a distributed stopwatch.
+   * @param name stopwatch name
+   */
+  def addDistributed(name: String): this.type = {
+    require(!stopwatches.contains(name), s"Stopwatch with name $name already exists.")
+    stopwatches(name) = new DistributedStopwatch(sc, name)
+    this
+  }
+
+  /**
+   * Gets a stopwatch.
+   * @param name stopwatch name
+   */
+  def apply(name: String): Stopwatch = stopwatches(name)
+
+  override def toString: String = {
+    stopwatches.values.toArray.sortBy(_.name)
+      .map(c => s"  ${c.name}: ${c.elapsed()}ms")
+      .mkString("{\n", ",\n", "\n}")
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/StopwatchSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/util/StopwatchSuite.scala
new file mode 100644
index 0000000000000..8df6617fe0228
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/util/StopwatchSuite.scala
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.util
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+
+class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
+    assert(sw.name === "sw")
+    assert(sw.elapsed() === 0L)
+    assert(!sw.isRunning)
+    intercept[AssertionError] {
+      sw.stop()
+    }
+    sw.start()
+    Thread.sleep(50)
+    val duration = sw.stop()
+    assert(duration >= 50 && duration < 100) // using a loose upper bound
+    val elapsed = sw.elapsed()
+    assert(elapsed === duration)
+    sw.start()
+    Thread.sleep(50)
+    val duration2 = sw.stop()
+    assert(duration2 >= 50 && duration2 < 100)
+    val elapsed2 = sw.elapsed()
+    assert(elapsed2 === duration + duration2)
+    sw.start()
+    assert(sw.isRunning)
+    intercept[AssertionError] {
+      sw.start()
+    }
+  }
+
+  test("LocalStopwatch") {
+    val sw = new LocalStopwatch("sw")
+    testStopwatchOnDriver(sw)
+  }
+
+  test("DistributedStopwatch on driver") {
+    val sw = new DistributedStopwatch(sc, "sw")
+    testStopwatchOnDriver(sw)
+  }
+
+  test("DistributedStopwatch on executors") {
+    val sw = new DistributedStopwatch(sc, "sw")
+    val rdd = sc.parallelize(0 until 4, 4)
+    rdd.foreach { i =>
+      sw.start()
+      Thread.sleep(50)
+      sw.stop()
+    }
+    assert(!sw.isRunning)
+    val elapsed = sw.elapsed()
+    assert(elapsed >= 200 && elapsed < 400) // using a loose upper bound
+  }
+
+  test("MultiStopwatch") {
+    val sw = new MultiStopwatch(sc)
+      .addLocal("local")
+      .addDistributed("spark")
+    assert(sw("local").name === "local")
+    assert(sw("spark").name === "spark")
+    intercept[NoSuchElementException] {
+      sw("some")
+    }
+    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
+    sw("local").start()
+    sw("spark").start()
+    Thread.sleep(50)
+    sw("local").stop()
+    Thread.sleep(50)
+    sw("spark").stop()
+    val localElapsed = sw("local").elapsed()
+    val sparkElapsed = sw("spark").elapsed()
+    assert(localElapsed >= 50 && localElapsed < 100)
+    assert(sparkElapsed >= 100 && sparkElapsed < 200)
+    assert(sw.toString ===
+      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
+    val rdd = sc.parallelize(0 until 4, 4)
+    rdd.foreach { i =>
+      sw("local").start()
+      sw("spark").start()
+      Thread.sleep(50)
+      sw("spark").stop()
+      sw("local").stop()
+    }
+    val localElapsed2 = sw("local").elapsed()
+    assert(localElapsed2 === localElapsed)
+    val sparkElapsed2 = sw("spark").elapsed()
+    assert(sparkElapsed2 >= 300 && sparkElapsed2 < 600)
+  }
+}

From 9c64a75bfc5e2566d1b4cd0d9b4585a818086ca6 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Wed, 15 Jul 2015 21:08:30 -0700
Subject: [PATCH 180/549] [SPARK-9060] [SQL] Revert SPARK-8359, SPARK-8800, and
 SPARK-8677

JIRA: https://issues.apache.org/jira/browse/SPARK-9060

This PR reverts:
* https://github.com/apache/spark/commit/31bd30687bc29c0e457c37308d489ae2b6e5b72a (SPARK-8359)
* https://github.com/apache/spark/commit/24fda7381171738cbbbacb5965393b660763e562 (SPARK-8677)
* https://github.com/apache/spark/commit/4b5cfc988f23988c2334882a255d494fc93d252e (SPARK-8800)

Author: Yin Huai <yhuai@databricks.com>

Closes #7426 from yhuai/SPARK-9060 and squashes the following commits:

651264d [Yin Huai] Revert "[SPARK-8359] [SQL] Fix incorrect decimal precision after multiplication"
cfda7e4 [Yin Huai] Revert "[SPARK-8677] [SQL] Fix non-terminating decimal expansion for decimal divide operation"
2de9afe [Yin Huai] Revert "[SPARK-8800] [SQL] Fix inaccurate precision/scale of Decimal division operation"
---
 .../org/apache/spark/sql/types/Decimal.scala  | 21 ++-----------------
 .../sql/types/decimal/DecimalSuite.scala      | 18 ----------------
 2 files changed, 2 insertions(+), 37 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
index f5bd068d60dc4..a85af9e04aedb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.types
 
-import java.math.{MathContext, RoundingMode}
-
 import org.apache.spark.annotation.DeveloperApi
 
 /**
@@ -138,14 +136,6 @@ final class Decimal extends Ordered[Decimal] with Serializable {
   }
 
   def toBigDecimal: BigDecimal = {
-    if (decimalVal.ne(null)) {
-      decimalVal(MathContext.UNLIMITED)
-    } else {
-      BigDecimal(longVal, _scale)(MathContext.UNLIMITED)
-    }
-  }
-
-  def toLimitedBigDecimal: BigDecimal = {
     if (decimalVal.ne(null)) {
       decimalVal
     } else {
@@ -273,15 +263,8 @@ final class Decimal extends Ordered[Decimal] with Serializable {
 
   def * (that: Decimal): Decimal = Decimal(toBigDecimal * that.toBigDecimal)
 
-  def / (that: Decimal): Decimal = {
-    if (that.isZero) {
-      null
-    } else {
-      // To avoid non-terminating decimal expansion problem, we get scala's BigDecimal with limited
-      // precision and scala.
-      Decimal(toLimitedBigDecimal / that.toLimitedBigDecimal)
-    }
-  }
+  def / (that: Decimal): Decimal =
+    if (that.isZero) null else Decimal(toBigDecimal / that.toBigDecimal)
 
   def % (that: Decimal): Decimal =
     if (that.isZero) null else Decimal(toBigDecimal % that.toBigDecimal)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
index f0c849d1a1564..1d297beb3868d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/decimal/DecimalSuite.scala
@@ -171,22 +171,4 @@ class DecimalSuite extends SparkFunSuite with PrivateMethodTester {
     assert(new Decimal().set(100L, 10, 0).toUnscaledLong === 100L)
     assert(Decimal(Long.MaxValue, 100, 0).toUnscaledLong === Long.MaxValue)
   }
-
-  test("accurate precision after multiplication") {
-    val decimal = (Decimal(Long.MaxValue, 38, 0) * Decimal(Long.MaxValue, 38, 0)).toJavaBigDecimal
-    assert(decimal.unscaledValue.toString === "85070591730234615847396907784232501249")
-  }
-
-  test("fix non-terminating decimal expansion problem") {
-    val decimal = Decimal(1.0, 10, 3) / Decimal(3.0, 10, 3)
-    // The difference between decimal should not be more than 0.001.
-    assert(decimal.toDouble - 0.333 < 0.001)
-  }
-
-  test("fix loss of precision/scale when doing division operation") {
-    val a = Decimal(2) / Decimal(3)
-    assert(a.toDouble < 1.0 && a.toDouble > 0.6)
-    val b = Decimal(1) / Decimal(8)
-    assert(b.toDouble === 0.125)
-  }
 }

From 42dea3acf90ec506a0b79720b55ae1d753cc7544 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Wed, 15 Jul 2015 21:47:21 -0700
Subject: [PATCH 181/549] [SPARK-8245][SQL] FormatNumber/Length Support for
 Expression

- `BinaryType` for `Length`
- `FormatNumber`

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7034 from chenghao-intel/expression and squashes the following commits:

e534b87 [Cheng Hao] python api style issue
601bbf5 [Cheng Hao] add python API support
3ebe288 [Cheng Hao] update as feedback
52274f7 [Cheng Hao] add support for udf_format_number and length for binary
---
 python/pyspark/sql/functions.py               | 25 ++++-
 .../catalyst/analysis/FunctionRegistry.scala  |  5 +-
 .../expressions/stringOperations.scala        | 94 +++++++++++++++++--
 .../expressions/StringFunctionsSuite.scala    | 53 +++++++++--
 .../org/apache/spark/sql/functions.scala      | 32 ++++++-
 .../spark/sql/DataFrameFunctionsSuite.scala   | 93 +++++++++++++++---
 6 files changed, 261 insertions(+), 41 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index dca39fa833435..e0816b3e654bc 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -39,6 +39,8 @@
     'coalesce',
     'countDistinct',
     'explode',
+    'format_number',
+    'length',
     'log2',
     'md5',
     'monotonicallyIncreasingId',
@@ -47,7 +49,6 @@
     'sha1',
     'sha2',
     'sparkPartitionId',
-    'strlen',
     'struct',
     'udf',
     'when']
@@ -506,14 +507,28 @@ def sparkPartitionId():
 
 @ignore_unicode_prefix
 @since(1.5)
-def strlen(col):
-    """Calculates the length of a string expression.
+def length(col):
+    """Calculates the length of a string or binary expression.
 
-    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(strlen('a').alias('length')).collect()
+    >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(length('a').alias('length')).collect()
     [Row(length=3)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.strlen(_to_java_column(col)))
+    return Column(sc._jvm.functions.length(_to_java_column(col)))
+
+
+@ignore_unicode_prefix
+@since(1.5)
+def format_number(col, d):
+    """Formats the number X to a format like '#,###,###.##', rounded to d decimal places,
+       and returns the result as a string.
+    :param col: the column name of the numeric value to be formatted
+    :param d: the N decimal places
+    >>> sqlContext.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect()
+    [Row(v=u'5.0000')]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.format_number(_to_java_column(col), d))
 
 
 @ignore_unicode_prefix
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index d2678ce860701..e0beafe710079 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -152,11 +152,12 @@ object FunctionRegistry {
     expression[Base64]("base64"),
     expression[Encode]("encode"),
     expression[Decode]("decode"),
-    expression[StringInstr]("instr"),
+    expression[FormatNumber]("format_number"),
     expression[Lower]("lcase"),
     expression[Lower]("lower"),
-    expression[StringLength]("length"),
+    expression[Length]("length"),
     expression[Levenshtein]("levenshtein"),
+    expression[StringInstr]("instr"),
     expression[StringLocate]("locate"),
     expression[StringLPad]("lpad"),
     expression[StringTrimLeft]("ltrim"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 03b55ce5fe7cc..c64afe7b3f19a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.text.DecimalFormat
 import java.util.Locale
 import java.util.regex.Pattern
 
-import org.apache.commons.lang3.StringUtils
-
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.codegen._
@@ -553,17 +552,22 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
 }
 
 /**
- * A function that return the length of the given string expression.
+ * A function that return the length of the given string or binary expression.
  */
-case class StringLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+case class Length(child: Expression) extends UnaryExpression with ExpectsInputTypes {
   override def dataType: DataType = IntegerType
-  override def inputTypes: Seq[DataType] = Seq(StringType)
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
 
-  protected override def nullSafeEval(string: Any): Any =
-    string.asInstanceOf[UTF8String].numChars
+  protected override def nullSafeEval(value: Any): Any = child.dataType match {
+    case StringType => value.asInstanceOf[UTF8String].numChars
+    case BinaryType => value.asInstanceOf[Array[Byte]].length
+  }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    defineCodeGen(ctx, ev, c => s"($c).numChars()")
+    child.dataType match {
+      case StringType => defineCodeGen(ctx, ev, c => s"($c).numChars()")
+      case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length")
+    }
   }
 
   override def prettyName: String = "length"
@@ -668,3 +672,77 @@ case class Encode(value: Expression, charset: Expression)
   }
 }
 
+/**
+ * Formats the number X to a format like '#,###,###.##', rounded to D decimal places,
+ * and returns the result as a string. If D is 0, the result has no decimal point or
+ * fractional part.
+ */
+case class FormatNumber(x: Expression, d: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def left: Expression = x
+  override def right: Expression = d
+  override def dataType: DataType = StringType
+  override def inputTypes: Seq[AbstractDataType] = Seq(NumericType, IntegerType)
+
+  // Associated with the pattern, for the last d value, and we will update the
+  // pattern (DecimalFormat) once the new coming d value differ with the last one.
+  @transient
+  private var lastDValue: Int = -100
+
+  // A cached DecimalFormat, for performance concern, we will change it
+  // only if the d value changed.
+  @transient
+  private val pattern: StringBuffer = new StringBuffer()
+
+  @transient
+  private val numberFormat: DecimalFormat = new DecimalFormat("")
+
+  override def eval(input: InternalRow): Any = {
+    val xObject = x.eval(input)
+    if (xObject == null) {
+      return null
+    }
+
+    val dObject = d.eval(input)
+
+    if (dObject == null || dObject.asInstanceOf[Int] < 0) {
+      return null
+    }
+    val dValue = dObject.asInstanceOf[Int]
+
+    if (dValue != lastDValue) {
+      // construct a new DecimalFormat only if a new dValue
+      pattern.delete(0, pattern.length())
+      pattern.append("#,###,###,###,###,###,##0")
+
+      // decimal place
+      if (dValue > 0) {
+        pattern.append(".")
+
+        var i = 0
+        while (i < dValue) {
+          i += 1
+          pattern.append("0")
+        }
+      }
+      val dFormat = new DecimalFormat(pattern.toString())
+      lastDValue = dValue;
+      numberFormat.applyPattern(dFormat.toPattern())
+    }
+
+    x.dataType match {
+      case ByteType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Byte]))
+      case ShortType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Short]))
+      case FloatType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Float]))
+      case IntegerType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Int]))
+      case LongType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Long]))
+      case DoubleType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Double]))
+      case _: DecimalType =>
+        UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Decimal].toJavaBigDecimal))
+    }
+  }
+
+  override def prettyName: String = "format_number"
+}
+
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
index b19f4ee37a109..5d7763bedf6bd 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType}
+import org.apache.spark.sql.types._
 
 
 class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -216,15 +216,6 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     }
   }
 
-  test("length for string") {
-    val a = 'a.string.at(0)
-    checkEvaluation(StringLength(Literal("abc")), 3, create_row("abdef"))
-    checkEvaluation(StringLength(a), 5, create_row("abdef"))
-    checkEvaluation(StringLength(a), 0, create_row(""))
-    checkEvaluation(StringLength(a), null, create_row(null))
-    checkEvaluation(StringLength(Literal.create(null, StringType)), null, create_row("abdef"))
-  }
-
   test("ascii for string") {
     val a = 'a.string.at(0)
     checkEvaluation(Ascii(Literal("efg")), 101, create_row("abdef"))
@@ -426,4 +417,46 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1)
   }
+
+  test("length for string / binary") {
+    val a = 'a.string.at(0)
+    val b = 'b.binary.at(0)
+    val bytes = Array[Byte](1, 2, 3, 1, 2)
+    val string = "abdef"
+
+    // scalastyle:off
+    // non ascii characters are not allowed in the source code, so we disable the scalastyle.
+    checkEvaluation(Length(Literal("a花花c")), 4, create_row(string))
+    // scalastyle:on
+    checkEvaluation(Length(Literal(bytes)), 5, create_row(Array[Byte]()))
+
+    checkEvaluation(Length(a), 5, create_row(string))
+    checkEvaluation(Length(b), 5, create_row(bytes))
+
+    checkEvaluation(Length(a), 0, create_row(""))
+    checkEvaluation(Length(b), 0, create_row(Array[Byte]()))
+
+    checkEvaluation(Length(a), null, create_row(null))
+    checkEvaluation(Length(b), null, create_row(null))
+
+    checkEvaluation(Length(Literal.create(null, StringType)), null, create_row(string))
+    checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes))
+  }
+
+  test("number format") {
+    checkEvaluation(FormatNumber(Literal(4.asInstanceOf[Byte]), Literal(3)), "4.000")
+    checkEvaluation(FormatNumber(Literal(4.asInstanceOf[Short]), Literal(3)), "4.000")
+    checkEvaluation(FormatNumber(Literal(4.0f), Literal(3)), "4.000")
+    checkEvaluation(FormatNumber(Literal(4), Literal(3)), "4.000")
+    checkEvaluation(FormatNumber(Literal(12831273.23481d), Literal(3)), "12,831,273.235")
+    checkEvaluation(FormatNumber(Literal(12831273.83421d), Literal(0)), "12,831,274")
+    checkEvaluation(FormatNumber(Literal(123123324123L), Literal(3)), "123,123,324,123.000")
+    checkEvaluation(FormatNumber(Literal(123123324123L), Literal(-1)), null)
+    checkEvaluation(
+      FormatNumber(
+        Literal(Decimal(123123324123L) * Decimal(123123.21234d)), Literal(4)),
+      "15,159,339,180,002,773.2778")
+    checkEvaluation(FormatNumber(Literal.create(null, IntegerType), Literal(3)), null)
+    checkEvaluation(FormatNumber(Literal.create(null, NullType), Literal(3)), null)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index c7deaca8437a1..d6da284a4c788 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1685,20 +1685,44 @@ object functions {
   //////////////////////////////////////////////////////////////////////////////////////////////
 
   /**
-   * Computes the length of a given string value.
+   * Computes the length of a given string / binary value.
    *
    * @group string_funcs
    * @since 1.5.0
    */
-  def strlen(e: Column): Column = StringLength(e.expr)
+  def length(e: Column): Column = Length(e.expr)
 
   /**
-   * Computes the length of a given string column.
+   * Computes the length of a given string / binary column.
    *
    * @group string_funcs
    * @since 1.5.0
    */
-  def strlen(columnName: String): Column = strlen(Column(columnName))
+  def length(columnName: String): Column = length(Column(columnName))
+
+  /**
+   * Formats the number X to a format like '#,###,###.##', rounded to d decimal places,
+   * and returns the result as a string.
+   * If d is 0, the result has no decimal point or fractional part.
+   * If d < 0, the result will be null.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def format_number(x: Column, d: Int): Column = FormatNumber(x.expr, lit(d).expr)
+
+  /**
+   * Formats the number X to a format like '#,###,###.##', rounded to d decimal places,
+   * and returns the result as a string.
+   * If d is 0, the result has no decimal point or fractional part.
+   * If d < 0, the result will be null.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def format_number(columnXName: String, d: Int): Column = {
+    format_number(Column(columnXName), d)
+  }
 
   /**
    * Computes the Levenshtein distance of the two given strings.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 70bd78737f69c..6dccdd857b453 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -208,17 +208,6 @@ class DataFrameFunctionsSuite extends QueryTest {
       Row(2743272264L, 2180413220L))
   }
 
-  test("string length function") {
-    val df = Seq(("abc", "")).toDF("a", "b")
-    checkAnswer(
-      df.select(strlen($"a"), strlen("b")),
-      Row(3, 0))
-
-    checkAnswer(
-      df.selectExpr("length(a)", "length(b)"),
-      Row(3, 0))
-  }
-
   test("Levenshtein distance") {
     val df = Seq(("kitten", "sitting"), ("frog", "fog")).toDF("l", "r")
     checkAnswer(df.select(levenshtein("l", "r")), Seq(Row(3), Row(1)))
@@ -433,11 +422,91 @@ class DataFrameFunctionsSuite extends QueryTest {
     val doubleData = Seq((7.2, 4.1)).toDF("a", "b")
     checkAnswer(
       doubleData.select(pmod('a, 'b)),
-      Seq(Row(3.1000000000000005))  // same as hive
+      Seq(Row(3.1000000000000005)) // same as hive
     )
     checkAnswer(
       doubleData.select(pmod(lit(2), lit(Int.MaxValue))),
       Seq(Row(2))
     )
   }
+
+  test("string / binary length function") {
+    val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123)).toDF("a", "b", "c")
+    checkAnswer(
+      df.select(length($"a"), length("a"), length($"b"), length("b")),
+      Row(3, 3, 4, 4))
+
+    checkAnswer(
+      df.selectExpr("length(a)", "length(b)"),
+      Row(3, 4))
+
+    intercept[AnalysisException] {
+      checkAnswer(
+        df.selectExpr("length(c)"), // int type of the argument is unacceptable
+        Row("5.0000"))
+    }
+  }
+
+  test("number format function") {
+    val tuple =
+      ("aa", 1.asInstanceOf[Byte], 2.asInstanceOf[Short],
+        3.13223f, 4, 5L, 6.48173d, Decimal(7.128381))
+    val df =
+      Seq(tuple)
+        .toDF(
+          "a", // string "aa"
+          "b", // byte    1
+          "c", // short   2
+          "d", // float   3.13223f
+          "e", // integer 4
+          "f", // long    5L
+          "g", // double  6.48173d
+          "h") // decimal 7.128381
+
+    checkAnswer(
+      df.select(
+        format_number($"f", 4),
+        format_number("f", 4)),
+      Row("5.0000", "5.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(b, e)"), // convert the 1st argument to integer
+      Row("1.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(c, e)"), // convert the 1st argument to integer
+      Row("2.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(d, e)"), // convert the 1st argument to double
+      Row("3.1322"))
+
+    checkAnswer(
+      df.selectExpr("format_number(e, e)"), // not convert anything
+      Row("4.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(f, e)"), // not convert anything
+      Row("5.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(g, e)"), // not convert anything
+      Row("6.4817"))
+
+    checkAnswer(
+      df.selectExpr("format_number(h, e)"), // not convert anything
+      Row("7.1284"))
+
+    intercept[AnalysisException] {
+      checkAnswer(
+        df.selectExpr("format_number(a, e)"), // string type of the 1st argument is unacceptable
+        Row("5.0000"))
+    }
+
+    intercept[AnalysisException] {
+      checkAnswer(
+        df.selectExpr("format_number(e, g)"), // decimal type of the 2nd argument is unacceptable
+        Row("5.0000"))
+    }
+  }
 }

From ba33096846dc8061e97a7bf8f3b46f899d530159 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Wed, 15 Jul 2015 22:27:39 -0700
Subject: [PATCH 182/549] [SPARK-9068][SQL] refactor the implicit type cast
 code

based on https://github.com/apache/spark/pull/7348

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7420 from cloud-fan/type-check and squashes the following commits:

7633fa9 [Wenchen Fan] revert
fe169b0 [Wenchen Fan] improve test
03b70da [Wenchen Fan] enhance implicit type cast
---
 .../catalyst/analysis/HiveTypeCoercion.scala  | 33 +++-----
 .../sql/catalyst/expressions/Expression.scala | 20 +++--
 .../sql/catalyst/expressions/arithmetic.scala |  2 -
 .../sql/catalyst/expressions/bitwise.scala    |  8 +-
 .../catalyst/expressions/conditionals.scala   |  4 +-
 .../spark/sql/types/AbstractDataType.scala    | 45 +++--------
 .../apache/spark/sql/types/ArrayType.scala    |  2 +-
 .../org/apache/spark/sql/types/DataType.scala |  2 +-
 .../apache/spark/sql/types/DecimalType.scala  |  2 +-
 .../org/apache/spark/sql/types/MapType.scala  |  2 +-
 .../apache/spark/sql/types/StructType.scala   |  2 +-
 .../ExpressionTypeCheckingSuite.scala         | 75 +++++++++----------
 .../analysis/HiveTypeCoercionSuite.scala      | 10 +--
 13 files changed, 81 insertions(+), 126 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 25087915b5c35..50db7d21f01ca 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -675,10 +675,10 @@ object HiveTypeCoercion {
       case b @ BinaryOperator(left, right) if left.dataType != right.dataType =>
         findTightestCommonTypeOfTwo(left.dataType, right.dataType).map { commonType =>
           if (b.inputType.acceptsType(commonType)) {
-            // If the expression accepts the tighest common type, cast to that.
+            // If the expression accepts the tightest common type, cast to that.
             val newLeft = if (left.dataType == commonType) left else Cast(left, commonType)
             val newRight = if (right.dataType == commonType) right else Cast(right, commonType)
-            b.makeCopy(Array(newLeft, newRight))
+            b.withNewChildren(Seq(newLeft, newRight))
           } else {
             // Otherwise, don't do anything with the expression.
             b
@@ -697,7 +697,7 @@ object HiveTypeCoercion {
         // general implicit casting.
         val children: Seq[Expression] = e.children.zip(e.inputTypes).map { case (in, expected) =>
           if (in.dataType == NullType && !expected.acceptsType(NullType)) {
-            Cast(in, expected.defaultConcreteType)
+            Literal.create(null, expected.defaultConcreteType)
           } else {
             in
           }
@@ -719,27 +719,22 @@ object HiveTypeCoercion {
       @Nullable val ret: Expression = (inType, expectedType) match {
 
         // If the expected type is already a parent of the input type, no need to cast.
-        case _ if expectedType.isSameType(inType) => e
+        case _ if expectedType.acceptsType(inType) => e
 
         // Cast null type (usually from null literals) into target types
         case (NullType, target) => Cast(e, target.defaultConcreteType)
 
-        // If the function accepts any numeric type (i.e. the ADT `NumericType`) and the input is
-        // already a number, leave it as is.
-        case (_: NumericType, NumericType) => e
-
         // If the function accepts any numeric type and the input is a string, we follow the hive
         // convention and cast that input into a double
         case (StringType, NumericType) => Cast(e, NumericType.defaultConcreteType)
 
-        // Implicit cast among numeric types
+        // Implicit cast among numeric types. When we reach here, input type is not acceptable.
+
         // If input is a numeric type but not decimal, and we expect a decimal type,
         // cast the input to unlimited precision decimal.
-        case (_: NumericType, DecimalType) if !inType.isInstanceOf[DecimalType] =>
-          Cast(e, DecimalType.Unlimited)
+        case (_: NumericType, DecimalType) => Cast(e, DecimalType.Unlimited)
         // For any other numeric types, implicitly cast to each other, e.g. long -> int, int -> long
-        case (_: NumericType, target: NumericType) if e.dataType != target => Cast(e, target)
-        case (_: NumericType, target: NumericType) => e
+        case (_: NumericType, target: NumericType) => Cast(e, target)
 
         // Implicit cast between date time types
         case (DateType, TimestampType) => Cast(e, TimestampType)
@@ -753,15 +748,9 @@ object HiveTypeCoercion {
         case (StringType, BinaryType) => Cast(e, BinaryType)
         case (any, StringType) if any != StringType => Cast(e, StringType)
 
-        // Type collection.
-        // First see if we can find our input type in the type collection. If we can, then just
-        // use the current expression; otherwise, find the first one we can implicitly cast.
-        case (_, TypeCollection(types)) =>
-          if (types.exists(_.isSameType(inType))) {
-            e
-          } else {
-            types.flatMap(implicitCast(e, _)).headOption.orNull
-          }
+        // When we reach here, input type is not acceptable for any types in this type collection,
+        // try to find the first one we can implicitly cast.
+        case (_, TypeCollection(types)) => types.flatMap(implicitCast(e, _)).headOption.orNull
 
         // Else, just return the same input expression
         case _ => null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 87667316aca67..a655cc8e48ae1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -386,17 +386,15 @@ abstract class BinaryOperator extends BinaryExpression with ExpectsInputTypes {
   override def inputTypes: Seq[AbstractDataType] = Seq(inputType, inputType)
 
   override def checkInputDataTypes(): TypeCheckResult = {
-    // First call the checker for ExpectsInputTypes, and then check whether left and right have
-    // the same type.
-    super.checkInputDataTypes() match {
-      case TypeCheckResult.TypeCheckSuccess =>
-        if (left.dataType != right.dataType) {
-          TypeCheckResult.TypeCheckFailure(s"differing types in '$prettyString' " +
-            s"(${left.dataType.simpleString} and ${right.dataType.simpleString}).")
-        } else {
-          TypeCheckResult.TypeCheckSuccess
-        }
-      case TypeCheckResult.TypeCheckFailure(msg) => TypeCheckResult.TypeCheckFailure(msg)
+    // First check whether left and right have the same type, then check if the type is acceptable.
+    if (left.dataType != right.dataType) {
+      TypeCheckResult.TypeCheckFailure(s"differing types in '$prettyString' " +
+        s"(${left.dataType.simpleString} and ${right.dataType.simpleString}).")
+    } else if (!inputType.acceptsType(left.dataType)) {
+      TypeCheckResult.TypeCheckFailure(s"'$prettyString' accepts ${inputType.simpleString} type," +
+        s" not ${left.dataType.simpleString}")
+    } else {
+      TypeCheckResult.TypeCheckSuccess
     }
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 394ef556e04a2..382cbe3b84a07 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -320,7 +320,6 @@ case class MaxOf(left: Expression, right: Expression) extends BinaryArithmetic {
   }
 
   override def symbol: String = "max"
-  override def prettyName: String = symbol
 }
 
 case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
@@ -375,7 +374,6 @@ case class MinOf(left: Expression, right: Expression) extends BinaryArithmetic {
   }
 
   override def symbol: String = "min"
-  override def prettyName: String = symbol
 }
 
 case class Pmod(left: Expression, right: Expression) extends BinaryArithmetic {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
index af1abbcd2239b..a1e48c4210877 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwise.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.types._
  */
 case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithmetic {
 
-  override def inputType: AbstractDataType = TypeCollection.Bitwise
+  override def inputType: AbstractDataType = IntegralType
 
   override def symbol: String = "&"
 
@@ -53,7 +53,7 @@ case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithme
  */
 case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmetic {
 
-  override def inputType: AbstractDataType = TypeCollection.Bitwise
+  override def inputType: AbstractDataType = IntegralType
 
   override def symbol: String = "|"
 
@@ -78,7 +78,7 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet
  */
 case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithmetic {
 
-  override def inputType: AbstractDataType = TypeCollection.Bitwise
+  override def inputType: AbstractDataType = IntegralType
 
   override def symbol: String = "^"
 
@@ -101,7 +101,7 @@ case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithme
  */
 case class BitwiseNot(child: Expression) extends UnaryExpression with ExpectsInputTypes {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection.Bitwise)
+  override def inputTypes: Seq[AbstractDataType] = Seq(IntegralType)
 
   override def dataType: DataType = child.dataType
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index c7f039ede26b3..9162b73fe56eb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -35,8 +35,8 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
       TypeCheckResult.TypeCheckFailure(
         s"type of predicate expression in If should be boolean, not ${predicate.dataType}")
     } else if (trueValue.dataType != falseValue.dataType) {
-      TypeCheckResult.TypeCheckFailure(
-        s"differing types in If (${trueValue.dataType} and ${falseValue.dataType}).")
+      TypeCheckResult.TypeCheckFailure(s"differing types in '$prettyString' " +
+        s"(${trueValue.dataType.simpleString} and ${falseValue.dataType.simpleString}).")
     } else {
       TypeCheckResult.TypeCheckSuccess
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
index f5715f7a829ff..076d7b5a5118d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
@@ -34,32 +34,18 @@ private[sql] abstract class AbstractDataType {
   private[sql] def defaultConcreteType: DataType
 
   /**
-   * Returns true if this data type is the same type as `other`.  This is different that equality
-   * as equality will also consider data type parametrization, such as decimal precision.
+   * Returns true if `other` is an acceptable input type for a function that expects this,
+   * possibly abstract DataType.
    *
    * {{{
    *   // this should return true
-   *   DecimalType.isSameType(DecimalType(10, 2))
-   *
-   *   // this should return false
-   *   NumericType.isSameType(DecimalType(10, 2))
-   * }}}
-   */
-  private[sql] def isSameType(other: DataType): Boolean
-
-  /**
-   * Returns true if `other` is an acceptable input type for a function that expectes this,
-   * possibly abstract, DataType.
-   *
-   * {{{
-   *   // this should return true
-   *   DecimalType.isSameType(DecimalType(10, 2))
+   *   DecimalType.acceptsType(DecimalType(10, 2))
    *
    *   // this should return true as well
    *   NumericType.acceptsType(DecimalType(10, 2))
    * }}}
    */
-  private[sql] def acceptsType(other: DataType): Boolean = isSameType(other)
+  private[sql] def acceptsType(other: DataType): Boolean
 
   /** Readable string representation for the type. */
   private[sql] def simpleString: String
@@ -83,10 +69,8 @@ private[sql] class TypeCollection(private val types: Seq[AbstractDataType])
 
   override private[sql] def defaultConcreteType: DataType = types.head.defaultConcreteType
 
-  override private[sql] def isSameType(other: DataType): Boolean = false
-
   override private[sql] def acceptsType(other: DataType): Boolean =
-    types.exists(_.isSameType(other))
+    types.exists(_.acceptsType(other))
 
   override private[sql] def simpleString: String = {
     types.map(_.simpleString).mkString("(", " or ", ")")
@@ -107,13 +91,6 @@ private[sql] object TypeCollection {
     TimestampType, DateType,
     StringType, BinaryType)
 
-  /**
-   * Types that can be used in bitwise operations.
-   */
-  val Bitwise = TypeCollection(
-    BooleanType,
-    ByteType, ShortType, IntegerType, LongType)
-
   def apply(types: AbstractDataType*): TypeCollection = new TypeCollection(types)
 
   def unapply(typ: AbstractDataType): Option[Seq[AbstractDataType]] = typ match {
@@ -134,8 +111,6 @@ protected[sql] object AnyDataType extends AbstractDataType {
 
   override private[sql] def simpleString: String = "any"
 
-  override private[sql] def isSameType(other: DataType): Boolean = false
-
   override private[sql] def acceptsType(other: DataType): Boolean = true
 }
 
@@ -183,13 +158,11 @@ private[sql] object NumericType extends AbstractDataType {
 
   override private[sql] def simpleString: String = "numeric"
 
-  override private[sql] def isSameType(other: DataType): Boolean = false
-
   override private[sql] def acceptsType(other: DataType): Boolean = other.isInstanceOf[NumericType]
 }
 
 
-private[sql] object IntegralType {
+private[sql] object IntegralType extends AbstractDataType {
   /**
    * Enables matching against IntegralType for expressions:
    * {{{
@@ -198,6 +171,12 @@ private[sql] object IntegralType {
    * }}}
    */
   def unapply(e: Expression): Boolean = e.dataType.isInstanceOf[IntegralType]
+
+  override private[sql] def defaultConcreteType: DataType = IntegerType
+
+  override private[sql] def simpleString: String = "integral"
+
+  override private[sql] def acceptsType(other: DataType): Boolean = other.isInstanceOf[IntegralType]
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
index 76ca7a84c1d1a..5094058164b2f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayType.scala
@@ -28,7 +28,7 @@ object ArrayType extends AbstractDataType {
 
   override private[sql] def defaultConcreteType: DataType = ArrayType(NullType, containsNull = true)
 
-  override private[sql] def isSameType(other: DataType): Boolean = {
+  override private[sql] def acceptsType(other: DataType): Boolean = {
     other.isInstanceOf[ArrayType]
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index da83a7f0ba379..2d133eea19fe0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -79,7 +79,7 @@ abstract class DataType extends AbstractDataType {
 
   override private[sql] def defaultConcreteType: DataType = this
 
-  override private[sql] def isSameType(other: DataType): Boolean = this == other
+  override private[sql] def acceptsType(other: DataType): Boolean = this == other
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
index a1cafeab1704d..377c75f6e85a5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
@@ -86,7 +86,7 @@ object DecimalType extends AbstractDataType {
 
   override private[sql] def defaultConcreteType: DataType = Unlimited
 
-  override private[sql] def isSameType(other: DataType): Boolean = {
+  override private[sql] def acceptsType(other: DataType): Boolean = {
     other.isInstanceOf[DecimalType]
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
index ddead10bc2171..ac34b642827ca 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapType.scala
@@ -71,7 +71,7 @@ object MapType extends AbstractDataType {
 
   override private[sql] def defaultConcreteType: DataType = apply(NullType, NullType)
 
-  override private[sql] def isSameType(other: DataType): Boolean = {
+  override private[sql] def acceptsType(other: DataType): Boolean = {
     other.isInstanceOf[MapType]
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index b8097403ec3cc..2ef97a427c37e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -307,7 +307,7 @@ object StructType extends AbstractDataType {
 
   override private[sql] def defaultConcreteType: DataType = new StructType
 
-  override private[sql] def isSameType(other: DataType): Boolean = {
+  override private[sql] def acceptsType(other: DataType): Boolean = {
     other.isInstanceOf[StructType]
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
index a4ce1825cab28..ed0d20e7de80e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
-import org.apache.spark.sql.types.StringType
+import org.apache.spark.sql.types.{TypeCollection, StringType}
 
 class ExpressionTypeCheckingSuite extends SparkFunSuite {
 
@@ -49,23 +49,16 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
 
   def assertErrorForDifferingTypes(expr: Expression): Unit = {
     assertError(expr,
-      s"differing types in '${expr.prettyString}' (int and boolean)")
-  }
-
-  def assertErrorWithImplicitCast(expr: Expression, errorMessage: String): Unit = {
-    val e = intercept[AnalysisException] {
-      assertSuccess(expr)
-    }
-    assert(e.getMessage.contains(errorMessage))
+      s"differing types in '${expr.prettyString}'")
   }
 
   test("check types for unary arithmetic") {
     assertError(UnaryMinus('stringField), "expected to be of type numeric")
     assertError(Abs('stringField), "expected to be of type numeric")
-    assertError(BitwiseNot('stringField), "type (boolean or tinyint or smallint or int or bigint)")
+    assertError(BitwiseNot('stringField), "expected to be of type integral")
   }
 
-  ignore("check types for binary arithmetic") {
+  test("check types for binary arithmetic") {
     // We will cast String to Double for binary arithmetic
     assertSuccess(Add('intField, 'stringField))
     assertSuccess(Subtract('intField, 'stringField))
@@ -85,21 +78,23 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
     assertErrorForDifferingTypes(MaxOf('intField, 'booleanField))
     assertErrorForDifferingTypes(MinOf('intField, 'booleanField))
 
-    assertError(Add('booleanField, 'booleanField), "operator + accepts numeric type")
-    assertError(Subtract('booleanField, 'booleanField), "operator - accepts numeric type")
-    assertError(Multiply('booleanField, 'booleanField), "operator * accepts numeric type")
-    assertError(Divide('booleanField, 'booleanField), "operator / accepts numeric type")
-    assertError(Remainder('booleanField, 'booleanField), "operator % accepts numeric type")
+    assertError(Add('booleanField, 'booleanField), "accepts numeric type")
+    assertError(Subtract('booleanField, 'booleanField), "accepts numeric type")
+    assertError(Multiply('booleanField, 'booleanField), "accepts numeric type")
+    assertError(Divide('booleanField, 'booleanField), "accepts numeric type")
+    assertError(Remainder('booleanField, 'booleanField), "accepts numeric type")
 
-    assertError(BitwiseAnd('booleanField, 'booleanField), "operator & accepts integral type")
-    assertError(BitwiseOr('booleanField, 'booleanField), "operator | accepts integral type")
-    assertError(BitwiseXor('booleanField, 'booleanField), "operator ^ accepts integral type")
+    assertError(BitwiseAnd('booleanField, 'booleanField), "accepts integral type")
+    assertError(BitwiseOr('booleanField, 'booleanField), "accepts integral type")
+    assertError(BitwiseXor('booleanField, 'booleanField), "accepts integral type")
 
-    assertError(MaxOf('complexField, 'complexField), "function maxOf accepts non-complex type")
-    assertError(MinOf('complexField, 'complexField), "function minOf accepts non-complex type")
+    assertError(MaxOf('complexField, 'complexField),
+      s"accepts ${TypeCollection.Ordered.simpleString} type")
+    assertError(MinOf('complexField, 'complexField),
+      s"accepts ${TypeCollection.Ordered.simpleString} type")
   }
 
-  ignore("check types for predicates") {
+  test("check types for predicates") {
     // We will cast String to Double for binary comparison
     assertSuccess(EqualTo('intField, 'stringField))
     assertSuccess(EqualNullSafe('intField, 'stringField))
@@ -112,25 +107,23 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
     assertSuccess(EqualTo('intField, 'booleanField))
     assertSuccess(EqualNullSafe('intField, 'booleanField))
 
-    assertError(EqualTo('intField, 'complexField), "differing types")
-    assertError(EqualNullSafe('intField, 'complexField), "differing types")
-
+    assertErrorForDifferingTypes(EqualTo('intField, 'complexField))
+    assertErrorForDifferingTypes(EqualNullSafe('intField, 'complexField))
     assertErrorForDifferingTypes(LessThan('intField, 'booleanField))
     assertErrorForDifferingTypes(LessThanOrEqual('intField, 'booleanField))
     assertErrorForDifferingTypes(GreaterThan('intField, 'booleanField))
     assertErrorForDifferingTypes(GreaterThanOrEqual('intField, 'booleanField))
 
-    assertError(
-      LessThan('complexField, 'complexField), "operator < accepts non-complex type")
-    assertError(
-      LessThanOrEqual('complexField, 'complexField), "operator <= accepts non-complex type")
-    assertError(
-      GreaterThan('complexField, 'complexField), "operator > accepts non-complex type")
-    assertError(
-      GreaterThanOrEqual('complexField, 'complexField), "operator >= accepts non-complex type")
+    assertError(LessThan('complexField, 'complexField),
+      s"accepts ${TypeCollection.Ordered.simpleString} type")
+    assertError(LessThanOrEqual('complexField, 'complexField),
+      s"accepts ${TypeCollection.Ordered.simpleString} type")
+    assertError(GreaterThan('complexField, 'complexField),
+      s"accepts ${TypeCollection.Ordered.simpleString} type")
+    assertError(GreaterThanOrEqual('complexField, 'complexField),
+      s"accepts ${TypeCollection.Ordered.simpleString} type")
 
-    assertError(
-      If('intField, 'stringField, 'stringField),
+    assertError(If('intField, 'stringField, 'stringField),
       "type of predicate expression in If should be boolean")
     assertErrorForDifferingTypes(If('booleanField, 'intField, 'booleanField))
 
@@ -180,12 +173,12 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
   }
 
   test("check types for ROUND") {
-    assertErrorWithImplicitCast(Round(Literal(null), 'booleanField),
-      "data type mismatch: argument 2 is expected to be of type int")
-    assertErrorWithImplicitCast(Round(Literal(null), 'complexField),
-      "data type mismatch: argument 2 is expected to be of type int")
     assertSuccess(Round(Literal(null), Literal(null)))
-    assertError(Round('booleanField, 'intField),
-      "data type mismatch: argument 1 is expected to be of type numeric")
+    assertSuccess(Round('intField, Literal(1)))
+
+    assertError(Round('intField, 'intField), "Only foldable Expression is allowed")
+    assertError(Round('intField, 'booleanField), "expected to be of type int")
+    assertError(Round('intField, 'complexField), "expected to be of type int")
+    assertError(Round('booleanField, 'intField), "expected to be of type numeric")
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 8e9b20a3ebe42..d0fd033b981c8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -203,7 +203,7 @@ class HiveTypeCoercionSuite extends PlanTest {
 
     ruleTest(HiveTypeCoercion.ImplicitTypeCasts,
       NumericTypeUnaryExpression(Literal.create(null, NullType)),
-      NumericTypeUnaryExpression(Cast(Literal.create(null, NullType), DoubleType)))
+      NumericTypeUnaryExpression(Literal.create(null, DoubleType)))
   }
 
   test("cast NullType for binary operators") {
@@ -215,9 +215,7 @@ class HiveTypeCoercionSuite extends PlanTest {
 
     ruleTest(HiveTypeCoercion.ImplicitTypeCasts,
       NumericTypeBinaryOperator(Literal.create(null, NullType), Literal.create(null, NullType)),
-      NumericTypeBinaryOperator(
-        Cast(Literal.create(null, NullType), DoubleType),
-        Cast(Literal.create(null, NullType), DoubleType)))
+      NumericTypeBinaryOperator(Literal.create(null, DoubleType), Literal.create(null, DoubleType)))
   }
 
   test("coalesce casts") {
@@ -345,14 +343,14 @@ object HiveTypeCoercionSuite {
   }
 
   case class AnyTypeBinaryOperator(left: Expression, right: Expression)
-    extends BinaryOperator with ExpectsInputTypes {
+    extends BinaryOperator {
     override def dataType: DataType = NullType
     override def inputType: AbstractDataType = AnyDataType
     override def symbol: String = "anytype"
   }
 
   case class NumericTypeBinaryOperator(left: Expression, right: Expression)
-    extends BinaryOperator with ExpectsInputTypes {
+    extends BinaryOperator {
     override def dataType: DataType = NullType
     override def inputType: AbstractDataType = NumericType
     override def symbol: String = "numerictype"

From e27212317c7341852c52d9a85137b8f94cb0d935 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Wed, 15 Jul 2015 23:35:27 -0700
Subject: [PATCH 183/549] [SPARK-8972] [SQL] Incorrect result for rollup

We don't support the complex expression keys in the rollup/cube, and we even will not report it if we have the complex group by keys, that will cause very confusing/incorrect result.

e.g. `SELECT key%100 FROM src GROUP BY key %100 with ROLLUP`

This PR adds an additional project during the analyzing for the complex GROUP BY keys, and that projection will be the child of `Expand`, so to `Expand`, the GROUP BY KEY are always the simple key(attribute names).

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7343 from chenghao-intel/expand and squashes the following commits:

1ebbb59 [Cheng Hao] update the comment
827873f [Cheng Hao] update as feedback
34def69 [Cheng Hao] Add more unit test and comments
c695760 [Cheng Hao] fix bug of incorrect result for rollup
---
 .../sql/catalyst/analysis/Analyzer.scala      | 42 +++++++++++++--
 ...CUBE #1-0-63b61fb3f0e74226001ad279be440864 |  6 +++
 ...CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 | 10 ++++
 ...pingSet-0-8c14c24670a4b06c440346277ce9cf1c | 10 ++++
 ...llup #1-0-a78e3dbf242f240249e36b3d3fd0926a |  6 +++
 ...llup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 | 10 ++++
 ...llup #3-0-9257085d123728730be96b6d9fbb84ce | 10 ++++
 .../sql/hive/execution/HiveQuerySuite.scala   | 54 +++++++++++++++++++
 8 files changed, 145 insertions(+), 3 deletions(-)
 create mode 100644 sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864
 create mode 100644 sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896
 create mode 100644 sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c
 create mode 100644 sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a
 create mode 100644 sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89
 create mode 100644 sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 891408e310049..df8e7f2381fbd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -194,16 +194,52 @@ class Analyzer(
     }
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+      case a if !a.childrenResolved => a // be sure all of the children are resolved.
       case a: Cube =>
         GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
       case a: Rollup =>
         GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
       case x: GroupingSets =>
         val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
+        // We will insert another Projection if the GROUP BY keys contains the
+        // non-attribute expressions. And the top operators can references those
+        // expressions by its alias.
+        // e.g. SELECT key%5 as c1 FROM src GROUP BY key%5 ==>
+        //      SELECT a as c1 FROM (SELECT key%5 AS a FROM src) GROUP BY a
+
+        // find all of the non-attribute expressions in the GROUP BY keys
+        val nonAttributeGroupByExpressions = new ArrayBuffer[Alias]()
+
+        // The pair of (the original GROUP BY key, associated attribute)
+        val groupByExprPairs = x.groupByExprs.map(_ match {
+          case e: NamedExpression => (e, e.toAttribute)
+          case other => {
+            val alias = Alias(other, other.toString)()
+            nonAttributeGroupByExpressions += alias // add the non-attributes expression alias
+            (other, alias.toAttribute)
+          }
+        })
+
+        // substitute the non-attribute expressions for aggregations.
+        val aggregation = x.aggregations.map(expr => expr.transformDown {
+          case e => groupByExprPairs.find(_._1.semanticEquals(e)).map(_._2).getOrElse(e)
+        }.asInstanceOf[NamedExpression])
+
+        // substitute the group by expressions.
+        val newGroupByExprs = groupByExprPairs.map(_._2)
+
+        val child = if (nonAttributeGroupByExpressions.length > 0) {
+          // insert additional projection if contains the
+          // non-attribute expressions in the GROUP BY keys
+          Project(x.child.output ++ nonAttributeGroupByExpressions, x.child)
+        } else {
+          x.child
+        }
+
         Aggregate(
-          x.groupByExprs :+ VirtualColumn.groupingIdAttribute,
-          x.aggregations,
-          Expand(x.bitmasks, x.groupByExprs, gid, x.child))
+          newGroupByExprs :+ VirtualColumn.groupingIdAttribute,
+          aggregation,
+          Expand(x.bitmasks, newGroupByExprs, gid, child))
     }
   }
 
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864 b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864
new file mode 100644
index 0000000000000..dac1b84b916d7
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864	
@@ -0,0 +1,6 @@
+500	NULL	0
+91	0	1
+84	1	1
+105	2	1
+113	3	1
+107	4	1
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896
new file mode 100644
index 0000000000000..c7cb747c0a659
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896	
@@ -0,0 +1,10 @@
+1	NULL	-3	2
+1	NULL	-1	2
+1	NULL	3	2
+1	NULL	4	2
+1	NULL	5	2
+1	NULL	6	2
+1	NULL	12	2
+1	NULL	14	2
+1	NULL	15	2
+1	NULL	22	2
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c
new file mode 100644
index 0000000000000..c7cb747c0a659
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c	
@@ -0,0 +1,10 @@
+1	NULL	-3	2
+1	NULL	-1	2
+1	NULL	3	2
+1	NULL	4	2
+1	NULL	5	2
+1	NULL	6	2
+1	NULL	12	2
+1	NULL	14	2
+1	NULL	15	2
+1	NULL	22	2
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a
new file mode 100644
index 0000000000000..dac1b84b916d7
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a	
@@ -0,0 +1,6 @@
+500	NULL	0
+91	0	1
+84	1	1
+105	2	1
+113	3	1
+107	4	1
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89
new file mode 100644
index 0000000000000..1eea4a9b23687
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89	
@@ -0,0 +1,10 @@
+1	0	5	3
+1	0	15	3
+1	0	25	3
+1	0	60	3
+1	0	75	3
+1	0	80	3
+1	0	100	3
+1	0	140	3
+1	0	145	3
+1	0	150	3
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce
new file mode 100644
index 0000000000000..1eea4a9b23687
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce	
@@ -0,0 +1,10 @@
+1	0	5	3
+1	0	15	3
+1	0	25	3
+1	0	60	3
+1	0	75	3
+1	0	80	3
+1	0	100	3
+1	0	140	3
+1	0	145	3
+1	0	150	3
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 991da2f829ae5..11a843becce69 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -85,6 +85,60 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
     }
   }
 
+  createQueryTest("SPARK-8976 Wrong Result for Rollup #1",
+    """
+      SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH ROLLUP
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for Rollup #2",
+    """
+      SELECT
+        count(*) AS cnt,
+        key % 5 as k1,
+        key-5 as k2,
+        GROUPING__ID as k3
+      FROM src group by key%5, key-5
+      WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for Rollup #3",
+    """
+      SELECT
+        count(*) AS cnt,
+        key % 5 as k1,
+        key-5 as k2,
+        GROUPING__ID as k3
+      FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
+      WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for CUBE #1",
+    """
+      SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH CUBE
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for CUBE #2",
+    """
+      SELECT
+        count(*) AS cnt,
+        key % 5 as k1,
+        key-5 as k2,
+        GROUPING__ID as k3
+      FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
+      WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10
+    """.stripMargin)
+
+  createQueryTest("SPARK-8976 Wrong Result for GroupingSet",
+    """
+      SELECT
+        count(*) AS cnt,
+        key % 5 as k1,
+        key-5 as k2,
+        GROUPING__ID as k3
+      FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
+      GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10
+    """.stripMargin)
+
   createQueryTest("insert table with generator with column name",
     """
       |  CREATE TABLE gen_tmp (key Int);

From 0a795336df20c7ec969366e613286f0c060a4eeb Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Wed, 15 Jul 2015 23:36:57 -0700
Subject: [PATCH 184/549] [SPARK-8807] [SPARKR] Add between operator in SparkR

JIRA: https://issues.apache.org/jira/browse/SPARK-8807

Add between operator in SparkR.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7356 from viirya/add_r_between and squashes the following commits:

7f51b44 [Liang-Chi Hsieh] Add test for non-numeric column.
c6a25c5 [Liang-Chi Hsieh] Add between function.
---
 R/pkg/NAMESPACE                  |  1 +
 R/pkg/R/column.R                 | 17 +++++++++++++++++
 R/pkg/R/generics.R               |  4 ++++
 R/pkg/inst/tests/test_sparkSQL.R | 12 ++++++++++++
 4 files changed, 34 insertions(+)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 7f857222452d4..331307c2077a5 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -77,6 +77,7 @@ exportMethods("abs",
               "atan",
               "atan2",
               "avg",
+              "between",
               "cast",
               "cbrt",
               "ceiling",
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 8e4b0f5bf1c4d..2892e1416cc65 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -187,6 +187,23 @@ setMethod("substr", signature(x = "Column"),
             column(jc)
           })
 
+#' between
+#'
+#' Test if the column is between the lower bound and upper bound, inclusive.
+#'
+#' @rdname column
+#'
+#' @param bounds lower and upper bounds
+setMethod("between", signature(x = "Column"),
+          function(x, bounds) {
+            if (is.vector(bounds) && length(bounds) == 2) {
+              jc <- callJMethod(x@jc, "between", bounds[1], bounds[2])
+              column(jc)
+            } else {
+              stop("bounds should be a vector of lower and upper bounds")
+            }
+          })
+
 #' Casts the column to a different data type.
 #'
 #' @rdname column
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index fad9d71158c51..ebe6fbd97ce86 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -567,6 +567,10 @@ setGeneric("asc", function(x) { standardGeneric("asc") })
 #' @export
 setGeneric("avg", function(x, ...) { standardGeneric("avg") })
 
+#' @rdname column
+#' @export
+setGeneric("between", function(x, bounds) { standardGeneric("between") })
+
 #' @rdname column
 #' @export
 setGeneric("cast", function(x, dataType) { standardGeneric("cast") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 76f74f80834a9..cdfe6481f60ea 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -638,6 +638,18 @@ test_that("column functions", {
   c7 <- floor(c) + log(c) + log10(c) + log1p(c) + rint(c)
   c8 <- sign(c) + sin(c) + sinh(c) + tan(c) + tanh(c)
   c9 <- toDegrees(c) + toRadians(c)
+
+  df <- jsonFile(sqlContext, jsonPath)
+  df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))
+  expect_equal(collect(df2)[[2, 1]], TRUE)
+  expect_equal(collect(df2)[[2, 2]], FALSE)
+  expect_equal(collect(df2)[[3, 1]], FALSE)
+  expect_equal(collect(df2)[[3, 2]], TRUE)
+
+  df3 <- select(df, between(df$name, c("Apache", "Spark")))
+  expect_equal(collect(df3)[[1, 1]], TRUE)
+  expect_equal(collect(df3)[[2, 1]], FALSE)
+  expect_equal(collect(df3)[[3, 1]], TRUE)
 })
 
 test_that("column binary mathfunctions", {

From 011551620faa87107a787530f074af3d9be7e695 Mon Sep 17 00:00:00 2001
From: Daniel Darabos <darabos.daniel@gmail.com>
Date: Thu, 16 Jul 2015 08:16:54 +0100
Subject: [PATCH 185/549] [SPARK-8893] Add runtime checks against non-positive
 number of partitions

https://issues.apache.org/jira/browse/SPARK-8893

> What does `sc.parallelize(1 to 3).repartition(p).collect` return? I would expect `Array(1, 2, 3)` regardless of `p`. But if `p` < 1, it returns `Array()`. I think instead it should throw an `IllegalArgumentException`.

> I think the case is pretty clear for `p` < 0. But the behavior for `p` = 0 is also error prone. In fact that's how I found this strange behavior. I used `rdd.repartition(a/b)` with positive `a` and `b`, but `a/b` was rounded down to zero and the results surprised me. I'd prefer an exception instead of unexpected (corrupt) results.

Author: Daniel Darabos <darabos.daniel@gmail.com>

Closes #7285 from darabos/patch-1 and squashes the following commits:

decba82 [Daniel Darabos] Allow repartitioning empty RDDs to zero partitions.
97de852 [Daniel Darabos] Allow zero partition count in HashPartitioner
f6ba5fb [Daniel Darabos] Use require() for simpler syntax.
d5e3df8 [Daniel Darabos] Require positive number of partitions in HashPartitioner
897c628 [Daniel Darabos] Require positive maxPartitions in CoalescedRDD
---
 core/src/main/scala/org/apache/spark/Partitioner.scala      | 2 ++
 core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala
index 82889bcd30988..ad68512dccb79 100644
--- a/core/src/main/scala/org/apache/spark/Partitioner.scala
+++ b/core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -76,6 +76,8 @@ object Partitioner {
  * produce an unexpected or incorrect result.
  */
 class HashPartitioner(partitions: Int) extends Partitioner {
+  require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")
+
   def numPartitions: Int = partitions
 
   def getPartition(key: Any): Int = key match {
diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
index 663eebb8e4191..90d9735cb3f69 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
@@ -69,7 +69,7 @@ private[spark] case class CoalescedRDDPartition(
  * the preferred location of each new partition overlaps with as many preferred locations of its
  * parent partitions
  * @param prev RDD to be coalesced
- * @param maxPartitions number of desired partitions in the coalesced RDD
+ * @param maxPartitions number of desired partitions in the coalesced RDD (must be positive)
  * @param balanceSlack used to trade-off balance and locality. 1.0 is all locality, 0 is all balance
  */
 private[spark] class CoalescedRDD[T: ClassTag](
@@ -78,6 +78,9 @@ private[spark] class CoalescedRDD[T: ClassTag](
     balanceSlack: Double = 0.10)
   extends RDD[T](prev.context, Nil) {  // Nil since we implement getDependencies
 
+  require(maxPartitions > 0 || maxPartitions == prev.partitions.length,
+    s"Number of partitions ($maxPartitions) must be positive.")
+
   override def getPartitions: Array[Partition] = {
     val pc = new PartitionCoalescer(maxPartitions, prev, balanceSlack)
 

From 4ea6480a3ba4ca7e09089c9b99d4a855894b9015 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Thu, 16 Jul 2015 08:26:39 -0700
Subject: [PATCH 186/549] [SPARK-8995] [SQL] cast date strings like '2015-01-01
 12:15:31' to date

Jira https://issues.apache.org/jira/browse/SPARK-8995

In PR #6981we noticed that we cannot cast date strings that contains a time, like '2015-03-18 12:39:40' to date. Besides it's not possible to cast a string like '18:03:20' to a timestamp.

If a time is passed without a date, today is inferred as date.

Author: Tarek Auel <tarek.auel@googlemail.com>
Author: Tarek Auel <tarek.auel@gmail.com>

Closes #7353 from tarekauel/SPARK-8995 and squashes the following commits:

14f333b [Tarek Auel] [SPARK-8995] added tests for daylight saving time
ca1ae69 [Tarek Auel] [SPARK-8995] style fix
d20b8b4 [Tarek Auel] [SPARK-8995] bug fix: distinguish between 0 and null
ef05753 [Tarek Auel] [SPARK-8995] added check for year >= 1000
01c9ff3 [Tarek Auel] [SPARK-8995] support for time strings
34ec573 [Tarek Auel] fixed style
71622c0 [Tarek Auel] improved timestamp and date parsing
0e30c0a [Tarek Auel] Hive compatibility
cfbaed7 [Tarek Auel] fixed wrong checks
71f89c1 [Tarek Auel] [SPARK-8995] minor style fix
f7452fa [Tarek Auel] [SPARK-8995] removed old timestamp parsing
30e5aec [Tarek Auel] [SPARK-8995] date and timestamp cast
c1083fb [Tarek Auel] [SPARK-8995] cast date strings like '2015-01-01 12:15:31' to date or timestamp
---
 .../spark/sql/catalyst/expressions/Cast.scala |  17 +-
 .../sql/catalyst/util/DateTimeUtils.scala     | 198 ++++++++++++++++
 .../sql/catalyst/expressions/CastSuite.scala  | 144 ++++++++++++
 .../catalyst/util/DateTimeUtilsSuite.scala    | 218 ++++++++++++++++++
 4 files changed, 562 insertions(+), 15 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index ab02addfb4d25..83d5b3b76b0a3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -167,17 +167,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
   // TimestampConverter
   private[this] def castToTimestamp(from: DataType): Any => Any = from match {
     case StringType =>
-      buildCast[UTF8String](_, utfs => {
-        // Throw away extra if more than 9 decimal places
-        val s = utfs.toString
-        val periodIdx = s.indexOf(".")
-        var n = s
-        if (periodIdx != -1 && n.length() - periodIdx > 9) {
-          n = n.substring(0, periodIdx + 10)
-        }
-        try DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(n))
-        catch { case _: java.lang.IllegalArgumentException => null }
-      })
+      buildCast[UTF8String](_, utfs => DateTimeUtils.stringToTimestamp(utfs).orNull)
     case BooleanType =>
       buildCast[Boolean](_, b => if (b) 1L else 0)
     case LongType =>
@@ -220,10 +210,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
   // DateConverter
   private[this] def castToDate(from: DataType): Any => Any = from match {
     case StringType =>
-      buildCast[UTF8String](_, s =>
-        try DateTimeUtils.fromJavaDate(Date.valueOf(s.toString))
-        catch { case _: java.lang.IllegalArgumentException => null }
-      )
+      buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s).orNull)
     case TimestampType =>
       // throw valid precision more than seconds, according to Hive.
       // Timestamp.nanos is in 0 to 999,999,999, no more than a second.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index c1ddee3ef0230..53c32a0a9802b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -21,6 +21,8 @@ import java.sql.{Date, Timestamp}
 import java.text.{DateFormat, SimpleDateFormat}
 import java.util.{Calendar, TimeZone}
 
+import org.apache.spark.unsafe.types.UTF8String
+
 /**
  * Helper functions for converting between internal and external date and time representations.
  * Dates are exposed externally as java.sql.Date and are represented internally as the number of
@@ -180,4 +182,200 @@ object DateTimeUtils {
     val nanos = (us % MICROS_PER_SECOND) * 1000L
     (day.toInt, secondsInDay * NANOS_PER_SECOND + nanos)
   }
+
+  /**
+   * Parses a given UTF8 date string to the corresponding a corresponding [[Long]] value.
+   * The return type is [[Option]] in order to distinguish between 0L and null. The following
+   * formats are allowed:
+   *
+   * `yyyy`
+   * `yyyy-[m]m`
+   * `yyyy-[m]m-[d]d`
+   * `yyyy-[m]m-[d]d `
+   * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
+   * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
+   * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
+   * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
+   * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
+   * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
+   * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
+   * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
+   * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
+   * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
+   * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
+   * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
+   * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
+   * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
+   * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
+   * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
+   */
+  def stringToTimestamp(s: UTF8String): Option[Long] = {
+    if (s == null) {
+      return None
+    }
+    var timeZone: Option[Byte] = None
+    val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
+    var i = 0
+    var currentSegmentValue = 0
+    val bytes = s.getBytes
+    var j = 0
+    var digitsMilli = 0
+    var justTime = false
+    while (j < bytes.length) {
+      val b = bytes(j)
+      val parsedValue = b - '0'.toByte
+      if (parsedValue < 0 || parsedValue > 9) {
+        if (j == 0 && b == 'T') {
+          justTime = true
+          i += 3
+        } else if (i < 2) {
+          if (b == '-') {
+            segments(i) = currentSegmentValue
+            currentSegmentValue = 0
+            i += 1
+          } else if (i == 0 && b == ':') {
+            justTime = true
+            segments(3) = currentSegmentValue
+            currentSegmentValue = 0
+            i = 4
+          } else {
+            return None
+          }
+        } else if (i == 2) {
+          if (b == ' ' || b == 'T') {
+            segments(i) = currentSegmentValue
+            currentSegmentValue = 0
+            i += 1
+          } else {
+            return None
+          }
+        } else if (i == 3 || i == 4) {
+          if (b == ':') {
+            segments(i) = currentSegmentValue
+            currentSegmentValue = 0
+            i += 1
+          } else {
+            return None
+          }
+        } else if (i == 5 || i == 6) {
+          if (b == 'Z') {
+            segments(i) = currentSegmentValue
+            currentSegmentValue = 0
+            i += 1
+            timeZone = Some(43)
+          } else if (b == '-' || b == '+') {
+            segments(i) = currentSegmentValue
+            currentSegmentValue = 0
+            i += 1
+            timeZone = Some(b)
+          } else if (b == '.' && i == 5) {
+            segments(i) = currentSegmentValue
+            currentSegmentValue = 0
+            i += 1
+          } else {
+            return None
+          }
+          if (i == 6  && b != '.') {
+            i += 1
+          }
+        } else {
+          if (b == ':' || b == ' ') {
+            segments(i) = currentSegmentValue
+            currentSegmentValue = 0
+            i += 1
+          } else {
+            return None
+          }
+        }
+      } else {
+        if (i == 6) {
+          digitsMilli += 1
+        }
+        currentSegmentValue = currentSegmentValue * 10 + parsedValue
+      }
+      j += 1
+    }
+
+    segments(i) = currentSegmentValue
+
+    while (digitsMilli < 6) {
+      segments(6) *= 10
+      digitsMilli += 1
+    }
+
+    if (!justTime && (segments(0) < 1000 || segments(0) > 9999 || segments(1) < 1 ||
+        segments(1) > 12 || segments(2) < 1 || segments(2) > 31)) {
+      return None
+    }
+
+    if (segments(3) < 0 || segments(3) > 23 || segments(4) < 0 || segments(4) > 59 ||
+        segments(5) < 0 || segments(5) > 59 || segments(6) < 0 || segments(6) > 999999 ||
+        segments(7) < 0 || segments(7) > 23 || segments(8) < 0 || segments(8) > 59) {
+      return None
+    }
+
+    val c = if (timeZone.isEmpty) {
+      Calendar.getInstance()
+    } else {
+      Calendar.getInstance(
+        TimeZone.getTimeZone(f"GMT${timeZone.get.toChar}${segments(7)}%02d:${segments(8)}%02d"))
+    }
+
+    if (justTime) {
+      c.set(Calendar.HOUR, segments(3))
+      c.set(Calendar.MINUTE, segments(4))
+      c.set(Calendar.SECOND, segments(5))
+    } else {
+      c.set(segments(0), segments(1) - 1, segments(2), segments(3), segments(4), segments(5))
+    }
+
+    Some(c.getTimeInMillis / 1000 * 1000000 + segments(6))
+  }
+
+  /**
+   * Parses a given UTF8 date string to the corresponding a corresponding [[Int]] value.
+   * The return type is [[Option]] in order to distinguish between 0 and null. The following
+   * formats are allowed:
+   *
+   * `yyyy`,
+   * `yyyy-[m]m`
+   * `yyyy-[m]m-[d]d`
+   * `yyyy-[m]m-[d]d `
+   * `yyyy-[m]m-[d]d *`
+   * `yyyy-[m]m-[d]dT*`
+   */
+  def stringToDate(s: UTF8String): Option[Int] = {
+    if (s == null) {
+      return None
+    }
+    val segments: Array[Int] = Array[Int](1, 1, 1)
+    var i = 0
+    var currentSegmentValue = 0
+    val bytes = s.getBytes
+    var j = 0
+    while (j < bytes.length && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 'T'))) {
+      val b = bytes(j)
+      if (i < 2 && b == '-') {
+        segments(i) = currentSegmentValue
+        currentSegmentValue = 0
+        i += 1
+      } else {
+        val parsedValue = b - '0'.toByte
+        if (parsedValue < 0 || parsedValue > 9) {
+          return None
+        } else {
+          currentSegmentValue = currentSegmentValue * 10 + parsedValue
+        }
+      }
+      j += 1
+    }
+    segments(i) = currentSegmentValue
+    if (segments(0) < 1000 || segments(0) > 9999 || segments(1) < 1 || segments(1) > 12 ||
+        segments(2) < 1 || segments(2) > 31) {
+      return None
+    }
+    val c = Calendar.getInstance()
+    c.set(segments(0), segments(1) - 1, segments(2), 0, 0, 0)
+    Some((c.getTimeInMillis / 1000 / 3600 / 24).toInt)
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 1de161c367a1d..ef8bcd41f7280 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import java.sql.{Timestamp, Date}
+import java.util.{TimeZone, Calendar}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
@@ -41,6 +42,137 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(v, Literal(expected).dataType), expected)
   }
 
+  test("cast string to date") {
+    var c = Calendar.getInstance()
+    c.set(2015, 0, 1, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015"), DateType), new Date(c.getTimeInMillis))
+    c = Calendar.getInstance()
+    c.set(2015, 2, 1, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015-03"), DateType), new Date(c.getTimeInMillis))
+    c = Calendar.getInstance()
+    c.set(2015, 2, 18, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015-03-18"), DateType), new Date(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18 "), DateType), new Date(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18 123142"), DateType), new Date(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18T123123"), DateType), new Date(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18T"), DateType), new Date(c.getTimeInMillis))
+
+    checkEvaluation(Cast(Literal("2015-03-18X"), DateType), null)
+    checkEvaluation(Cast(Literal("2015/03/18"), DateType), null)
+    checkEvaluation(Cast(Literal("2015.03.18"), DateType), null)
+    checkEvaluation(Cast(Literal("20150318"), DateType), null)
+    checkEvaluation(Cast(Literal("2015-031-8"), DateType), null)
+  }
+
+  test("cast string to timestamp") {
+    checkEvaluation(Cast(Literal("123"), TimestampType),
+      null)
+
+    var c = Calendar.getInstance()
+    c.set(2015, 0, 1, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    c = Calendar.getInstance()
+    c.set(2015, 2, 1, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015-03"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    c = Calendar.getInstance()
+    c.set(2015, 2, 18, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015-03-18"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18 "), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18T"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance()
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015-03-18 12:03:17"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17Z"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18 12:03:17Z"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT-01:00"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17-1:0"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17-01:00"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17+07:30"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:03"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17+7:3"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance()
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    checkEvaluation(Cast(Literal("2015-03-18 12:03:17.123"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17.123"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 456)
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17.456Z"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18 12:03:17.456Z"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT-01:00"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17.123-1:0"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17.123-01:00"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17.123+07:30"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:03"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17.123+7:3"), TimestampType),
+      new Timestamp(c.getTimeInMillis))
+
+    checkEvaluation(Cast(Literal("2015-03-18 123142"), TimestampType), null)
+    checkEvaluation(Cast(Literal("2015-03-18T123123"), TimestampType), null)
+    checkEvaluation(Cast(Literal("2015-03-18X"), TimestampType), null)
+    checkEvaluation(Cast(Literal("2015/03/18"), TimestampType), null)
+    checkEvaluation(Cast(Literal("2015.03.18"), TimestampType), null)
+    checkEvaluation(Cast(Literal("20150318"), TimestampType), null)
+    checkEvaluation(Cast(Literal("2015-031-8"), TimestampType), null)
+    checkEvaluation(Cast(Literal("2015-03-18T12:03:17-0:70"), TimestampType), null)
+  }
+
   test("cast from int") {
     checkCast(0, false)
     checkCast(1, true)
@@ -149,6 +281,18 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     val nts = sts + ".1"
     val ts = Timestamp.valueOf(nts)
 
+    val defaultTimeZone = TimeZone.getDefault
+    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))
+    var c = Calendar.getInstance()
+    c.set(2015, 2, 8, 2, 30, 0)
+    checkEvaluation(cast(cast(new Timestamp(c.getTimeInMillis), StringType), TimestampType),
+      c.getTimeInMillis * 1000)
+    c = Calendar.getInstance()
+    c.set(2015, 10, 1, 2, 30, 0)
+    checkEvaluation(cast(cast(new Timestamp(c.getTimeInMillis), StringType), TimestampType),
+      c.getTimeInMillis * 1000)
+    TimeZone.setDefault(defaultTimeZone)
+
     checkEvaluation(cast("abdef", StringType), "abdef")
     checkEvaluation(cast("abdef", DecimalType.Unlimited), null)
     checkEvaluation(cast("abdef", TimestampType), null)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index f63ac191e7366..c65fcbc4d1bc1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -19,8 +19,10 @@ package org.apache.spark.sql.catalyst.util
 
 import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
+import java.util.{TimeZone, Calendar}
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.unsafe.types.UTF8String
 
 class DateTimeUtilsSuite extends SparkFunSuite {
 
@@ -86,4 +88,220 @@ class DateTimeUtilsSuite extends SparkFunSuite {
     checkFromToJavaDate(new Date(df1.parse("1776-07-04 10:30:00").getTime))
     checkFromToJavaDate(new Date(df2.parse("1776-07-04 18:30:00 UTC").getTime))
   }
+
+  test("string to date") {
+    val millisPerDay = 1000L * 3600L * 24L
+    var c = Calendar.getInstance()
+    c.set(2015, 0, 28, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-01-28")).get ===
+      c.getTimeInMillis / millisPerDay)
+    c.set(2015, 0, 1, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015")).get ===
+      c.getTimeInMillis / millisPerDay)
+    c = Calendar.getInstance()
+    c.set(2015, 2, 1, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03")).get ===
+      c.getTimeInMillis / millisPerDay)
+    c = Calendar.getInstance()
+    c.set(2015, 2, 18, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18")).get ===
+      c.getTimeInMillis / millisPerDay)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18 ")).get ===
+      c.getTimeInMillis / millisPerDay)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18 123142")).get ===
+      c.getTimeInMillis / millisPerDay)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18T123123")).get ===
+      c.getTimeInMillis / millisPerDay)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18T")).get ===
+      c.getTimeInMillis / millisPerDay)
+
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18X")).isEmpty)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015/03/18")).isEmpty)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015.03.18")).isEmpty)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("20150318")).isEmpty)
+    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-031-8")).isEmpty)
+  }
+
+  test("string to timestamp") {
+    var c = Calendar.getInstance()
+    c.set(1969, 11, 31, 16, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("1969-12-31 16:00:00")).get ===
+      c.getTimeInMillis * 1000)
+    c.set(2015, 0, 1, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015")).get ===
+      c.getTimeInMillis * 1000)
+    c = Calendar.getInstance()
+    c.set(2015, 2, 1, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03")).get ===
+      c.getTimeInMillis * 1000)
+    c = Calendar.getInstance()
+    c.set(2015, 2, 18, 0, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18")).get ===
+      c.getTimeInMillis * 1000)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18 ")).get ===
+      c.getTimeInMillis * 1000)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T")).get ===
+      c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance()
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18 12:03:17")).get ===
+      c.getTimeInMillis * 1000)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T12:03:17")).get ===
+      c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT-13:53"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17-13:53")).get === c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T12:03:17Z")).get ===
+      c.getTimeInMillis * 1000)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18 12:03:17Z")).get ===
+      c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT-01:00"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T12:03:17-1:0")).get ===
+      c.getTimeInMillis * 1000)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17-01:00")).get === c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17+07:30")).get === c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:03"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17+07:03")).get === c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance()
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18 12:03:17.123")).get === c.getTimeInMillis * 1000)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17.123")).get === c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 456)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17.456Z")).get  === c.getTimeInMillis * 1000)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18 12:03:17.456Z")).get  === c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT-01:00"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17.123-1:0")).get  === c.getTimeInMillis * 1000)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17.123-01:00")).get ===  c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17.123+07:30")).get ===  c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17.123+07:30")).get === c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17.123121+7:30")).get ===
+        c.getTimeInMillis * 1000 + 121)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
+    c.set(2015, 2, 18, 12, 3, 17)
+    c.set(Calendar.MILLISECOND, 123)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03:17.12312+7:30")).get ===
+        c.getTimeInMillis * 1000 + 120)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
+    c.set(Calendar.HOUR, 18)
+    c.set(Calendar.MINUTE, 12)
+    c.set(Calendar.SECOND, 15)
+    c.set(Calendar.MILLISECOND, 123)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("T18:12:15.12312+7:30")).get ===
+      c.getTimeInMillis * 1000 + 120)
+
+    c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
+    c.set(Calendar.HOUR, 18)
+    c.set(Calendar.MINUTE, 12)
+    c.set(Calendar.SECOND, 15)
+    c.set(Calendar.MILLISECOND, 123)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("18:12:15.12312+7:30")).get ===
+      c.getTimeInMillis * 1000 + 120)
+
+    c = Calendar.getInstance()
+    c.set(2011, 4, 6, 7, 8, 9)
+    c.set(Calendar.MILLISECOND, 100)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2011-05-06 07:08:09.1000")).get === c.getTimeInMillis * 1000)
+
+    val defaultTimeZone = TimeZone.getDefault
+    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))
+
+    c = Calendar.getInstance()
+    c.set(2015, 2, 8, 2, 0, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-3-8 2:0:0")).get === c.getTimeInMillis * 1000)
+    c.add(Calendar.MINUTE, 30)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-3-8 3:30:0")).get === c.getTimeInMillis * 1000)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-3-8 2:30:0")).get === c.getTimeInMillis * 1000)
+
+    c = Calendar.getInstance()
+    c.set(2015, 10, 1, 1, 59, 0)
+    c.set(Calendar.MILLISECOND, 0)
+    c.add(Calendar.MINUTE, 31)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-11-1 2:30:0")).get === c.getTimeInMillis * 1000)
+    TimeZone.setDefault(defaultTimeZone)
+
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("238")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18 123142")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T123123")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18X")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015/03/18")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015.03.18")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("20150318")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-031-8")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03.17-20:0")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03.17-0:70")).isEmpty)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("2015-03-18T12:03.17-1:0:0")).isEmpty)
+  }
 }

From b536d5dc6c2c712270b8130ddd9945dff19a27d9 Mon Sep 17 00:00:00 2001
From: Jan Prach <jendap@gmail.com>
Date: Thu, 16 Jul 2015 18:42:41 +0100
Subject: [PATCH 187/549] [SPARK-9015] [BUILD] Clean project import in scala
 ide

Cleanup maven for a clean import in scala-ide / eclipse.

* remove groovy plugin which is really not needed at all
* add-source from build-helper-maven-plugin is not needed as recent version of scala-maven-plugin do it automatically
* add lifecycle-mapping plugin to hide a few useless warnings from ide

Author: Jan Prach <jendap@gmail.com>

Closes #7375 from jendap/clean-project-import-in-scala-ide and squashes the following commits:

c4b4c0f [Jan Prach] fix whitespaces
5a83e07 [Jan Prach] Revert "remove java compiler warnings from java tests"
312007e [Jan Prach] scala-maven-plugin itself add scala sources by default
f47d856 [Jan Prach] remove spark-1.4-staging repository
c8a54db [Jan Prach] remove java compiler warnings from java tests
999a068 [Jan Prach] remove some maven warnings in scala ide
80fbdc5 [Jan Prach] remove groovy and gmavenplus plugin
---
 pom.xml          | 130 +++++++++++++++++++----------------------------
 repl/pom.xml     |   2 -
 sql/core/pom.xml |   1 -
 sql/hive/pom.xml |   1 -
 tools/pom.xml    |   4 --
 5 files changed, 53 insertions(+), 85 deletions(-)

diff --git a/pom.xml b/pom.xml
index aa49e2ab7294b..c5c655834bdeb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -152,7 +152,6 @@
     <aws.kinesis.client.version>1.2.1</aws.kinesis.client.version>
     <commons.httpclient.version>4.3.2</commons.httpclient.version>
     <commons.math3.version>3.4.1</commons.math3.version>
-    <test_classpath_file>${project.build.directory}/spark-test-classpath.txt</test_classpath_file>
     <scala.version>2.10.4</scala.version>
     <scala.binary.version>2.10</scala.binary.version>
     <jline.version>${scala.version}</jline.version>
@@ -283,18 +282,6 @@
         <enabled>false</enabled>
       </snapshots>
     </repository>
-    <!-- TODO: This can be deleted after Spark 1.4 is posted -->
-    <repository>
-      <id>spark-1.4-staging</id>
-      <name>Spark 1.4 RC4 Staging Repository</name>
-      <url>https://repository.apache.org/content/repositories/orgapachespark-1112</url>
-      <releases>
-        <enabled>true</enabled>
-      </releases>
-      <snapshots>
-        <enabled>false</enabled>
-      </snapshots>
-    </repository>
   </repositories>
   <pluginRepositories>
     <pluginRepository>
@@ -318,17 +305,6 @@
       <artifactId>unused</artifactId>
       <version>1.0.0</version>
     </dependency>
-    <!--
-      This depndency has been added to provided scope as it is needed for executing build
-      specific groovy scripts using gmaven+ and not required for downstream project building
-      with spark.
-    -->
-    <dependency>
-      <groupId>org.codehaus.groovy</groupId>
-      <artifactId>groovy-all</artifactId>
-      <version>2.3.7</version>
-      <scope>provided</scope>
-    </dependency>
     <!--
          This is needed by the scalatest plugin, and so is declared here to be available in
          all child modules, just as scalatest is run in all children
@@ -1412,6 +1388,58 @@
           <artifactId>maven-deploy-plugin</artifactId>
           <version>2.8.2</version>
         </plugin>
+        <!-- This plugin's configuration is used to store Eclipse m2e settings only. -->
+        <!-- It has no influence on the Maven build itself. -->
+        <plugin>
+          <groupId>org.eclipse.m2e</groupId>
+          <artifactId>lifecycle-mapping</artifactId>
+          <version>1.0.0</version>
+          <configuration>
+            <lifecycleMappingMetadata>
+              <pluginExecutions>
+                <pluginExecution>
+                  <pluginExecutionFilter>
+                    <groupId>org.apache.maven.plugins</groupId>
+                    <artifactId>maven-dependency-plugin</artifactId>
+                    <versionRange>[2.8,)</versionRange>
+                    <goals>
+                      <goal>build-classpath</goal>
+                    </goals>
+                  </pluginExecutionFilter>
+                  <action>
+                    <ignore></ignore>
+                  </action>
+                </pluginExecution>
+                <pluginExecution>
+                  <pluginExecutionFilter>
+                    <groupId>org.apache.maven.plugins</groupId>
+                    <artifactId>maven-jar-plugin</artifactId>
+                    <versionRange>[2.6,)</versionRange>
+                    <goals>
+                      <goal>test-jar</goal>
+                    </goals>
+                  </pluginExecutionFilter>
+                  <action>
+                    <ignore></ignore>
+                  </action>
+                </pluginExecution>
+                <pluginExecution>
+                  <pluginExecutionFilter>
+                    <groupId>org.apache.maven.plugins</groupId>
+                    <artifactId>maven-antrun-plugin</artifactId>
+                    <versionRange>[1.8,)</versionRange>
+                    <goals>
+                      <goal>run</goal>
+                    </goals>
+                  </pluginExecutionFilter>
+                  <action>
+                    <ignore></ignore>
+                  </action>
+                </pluginExecution>
+              </pluginExecutions>
+            </lifecycleMappingMetadata>
+          </configuration>
+        </plugin>
       </plugins>
     </pluginManagement>
 
@@ -1429,34 +1457,12 @@
             </goals>
             <configuration>
               <includeScope>test</includeScope>
-              <outputFile>${test_classpath_file}</outputFile>
+              <outputProperty>test_classpath</outputProperty>
             </configuration>
           </execution>
         </executions>
       </plugin>
 
-      <!-- This plugin reads a file into maven property. And it lets us write groovy !! -->
-      <plugin>
-        <groupId>org.codehaus.gmavenplus</groupId>
-        <artifactId>gmavenplus-plugin</artifactId>
-        <version>1.5</version>
-        <executions>
-          <execution>
-            <phase>process-test-classes</phase>
-            <goals>
-              <goal>execute</goal>
-            </goals>
-            <configuration>
-              <scripts>
-                <script><![CDATA[
-                def file = new File(project.properties.test_classpath_file)
-                project.properties.test_classpath = file.getText().split().join(":")
-                ]]></script>
-              </scripts>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
       <!--
         The shade plug-in is used here to create effective pom's (see SPARK-3812), and also
         remove references from the shaded libraries from artifacts published by Spark.
@@ -1523,36 +1529,6 @@
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-enforcer-plugin</artifactId>
       </plugin>
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>build-helper-maven-plugin</artifactId>
-        <executions>
-          <execution>
-            <id>add-scala-sources</id>
-            <phase>generate-sources</phase>
-            <goals>
-              <goal>add-source</goal>
-            </goals>
-            <configuration>
-              <sources>
-                <source>src/main/scala</source>
-              </sources>
-            </configuration>
-          </execution>
-          <execution>
-            <id>add-scala-test-sources</id>
-            <phase>generate-test-sources</phase>
-            <goals>
-              <goal>add-test-source</goal>
-            </goals>
-            <configuration>
-              <sources>
-                <source>src/test/scala</source>
-              </sources>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
       <plugin>
         <groupId>net.alchim31.maven</groupId>
         <artifactId>scala-maven-plugin</artifactId>
diff --git a/repl/pom.xml b/repl/pom.xml
index 70c9bd7c01296..a5a0f1fc2c857 100644
--- a/repl/pom.xml
+++ b/repl/pom.xml
@@ -133,7 +133,6 @@
             </goals>
             <configuration>
               <sources>
-                <source>src/main/scala</source>
                 <source>${extra.source.dir}</source>
               </sources>
             </configuration>
@@ -146,7 +145,6 @@
             </goals>
             <configuration>
               <sources>
-                <source>src/test/scala</source>
                 <source>${extra.testsource.dir}</source>
               </sources>
             </configuration>
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 2c03fee9dbd71..be0966641b5c4 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -123,7 +123,6 @@
             </goals>
             <configuration>
               <sources>
-                <source>src/test/scala</source>
                 <source>src/test/gen-java</source>
               </sources>
             </configuration>
diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml
index a17546d706248..b00f320318be0 100644
--- a/sql/hive/pom.xml
+++ b/sql/hive/pom.xml
@@ -133,7 +133,6 @@
                 </goals>
                 <configuration>
                   <sources>
-                    <source>src/test/scala</source>
                     <source>compatibility/src/test/scala</source>
                   </sources>
                 </configuration>
diff --git a/tools/pom.xml b/tools/pom.xml
index feffde4c857eb..298ee2348b58e 100644
--- a/tools/pom.xml
+++ b/tools/pom.xml
@@ -76,10 +76,6 @@
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-source-plugin</artifactId>
       </plugin>
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>build-helper-maven-plugin</artifactId>
-      </plugin>
     </plugins>
   </build>
 </project>

From 43dac2c880d6f310a958531aee0bb4ac1d9b7025 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Thu, 16 Jul 2015 10:52:09 -0700
Subject: [PATCH 188/549] [SPARK-6941] [SQL] Provide a better error message to
 when inserting into RDD based table

JIRA: https://issues.apache.org/jira/browse/SPARK-6941

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7342 from yijieshen/SPARK-6941 and squashes the following commits:

f82cbe7 [Yijie Shen] reorder import
dd67e40 [Yijie Shen] resolve comments
09518af [Yijie Shen] fix import order in DataframeSuite
0c635d4 [Yijie Shen] make match more specific
9df388d [Yijie Shen] move check into PreWriteCheck
847ab20 [Yijie Shen] Detect insertion error in DataSourceStrategy
---
 .../org/apache/spark/sql/sources/rules.scala  |  9 ++-
 .../org/apache/spark/sql/DataFrameSuite.scala | 55 ++++++++++++++++++-
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
index a3fd7f13b3db7..40ee048e2653e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
@@ -21,7 +21,7 @@ import org.apache.spark.sql.{SaveMode, AnalysisException}
 import org.apache.spark.sql.catalyst.analysis.{EliminateSubQueries, Catalog}
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Alias}
 import org.apache.spark.sql.catalyst.plans.logical
-import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.types.DataType
 
@@ -119,6 +119,13 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan =>
         // The relation in l is not an InsertableRelation.
         failAnalysis(s"$l does not allow insertion.")
 
+      case logical.InsertIntoTable(t, _, _, _, _) =>
+        if (!t.isInstanceOf[LeafNode] || t == OneRowRelation || t.isInstanceOf[LocalRelation]) {
+          failAnalysis(s"Inserting into an RDD-based table is not allowed.")
+        } else {
+          // OK
+        }
+
       case CreateTableUsingAsSelect(tableName, _, _, _, SaveMode.Overwrite, _, query) =>
         // When the SaveMode is Overwrite, we need to check if the table is an input table of
         // the query. If so, we will throw an AnalysisException to let users know it is not allowed.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index f592a9934d0e6..23244fd310d0f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -17,19 +17,23 @@
 
 package org.apache.spark.sql
 
+import java.io.File
+
 import scala.language.postfixOps
 
+import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.test.{ExamplePointUDT, ExamplePoint}
-
+import org.apache.spark.sql.test.{ExamplePointUDT, ExamplePoint, SQLTestUtils}
 
-class DataFrameSuite extends QueryTest {
+class DataFrameSuite extends QueryTest with SQLTestUtils {
   import org.apache.spark.sql.TestData._
 
   lazy val ctx = org.apache.spark.sql.test.TestSQLContext
   import ctx.implicits._
 
+  def sqlContext: SQLContext = ctx
+
   test("analysis error should be eagerly reported") {
     val oldSetting = ctx.conf.dataFrameEagerAnalysis
     // Eager analysis.
@@ -761,4 +765,49 @@ class DataFrameSuite extends QueryTest {
     assert(f.getMessage.contains("column3"))
     assert(!f.getMessage.contains("column2"))
   }
+
+  test("SPARK-6941: Better error message for inserting into RDD-based Table") {
+    withTempDir { dir =>
+
+      val tempParquetFile = new File(dir, "tmp_parquet")
+      val tempJsonFile = new File(dir, "tmp_json")
+
+      val df = Seq(Tuple1(1)).toDF()
+      val insertion = Seq(Tuple1(2)).toDF("col")
+
+      // pass case: parquet table (HadoopFsRelation)
+      df.write.mode(SaveMode.Overwrite).parquet(tempParquetFile.getCanonicalPath)
+      val pdf = ctx.read.parquet(tempParquetFile.getCanonicalPath)
+      pdf.registerTempTable("parquet_base")
+      insertion.write.insertInto("parquet_base")
+
+      // pass case: json table (InsertableRelation)
+      df.write.mode(SaveMode.Overwrite).json(tempJsonFile.getCanonicalPath)
+      val jdf = ctx.read.json(tempJsonFile.getCanonicalPath)
+      jdf.registerTempTable("json_base")
+      insertion.write.mode(SaveMode.Overwrite).insertInto("json_base")
+
+      // error cases: insert into an RDD
+      df.registerTempTable("rdd_base")
+      val e1 = intercept[AnalysisException] {
+        insertion.write.insertInto("rdd_base")
+      }
+      assert(e1.getMessage.contains("Inserting into an RDD-based table is not allowed."))
+
+      // error case: insert into a logical plan that is not a LeafNode
+      val indirectDS = pdf.select("_1").filter($"_1" > 5)
+      indirectDS.registerTempTable("indirect_ds")
+      val e2 = intercept[AnalysisException] {
+        insertion.write.insertInto("indirect_ds")
+      }
+      assert(e2.getMessage.contains("Inserting into an RDD-based table is not allowed."))
+
+      // error case: insert into an OneRowRelation
+      new DataFrame(ctx, OneRowRelation).registerTempTable("one_row")
+      val e3 = intercept[AnalysisException] {
+        insertion.write.insertInto("one_row")
+      }
+      assert(e3.getMessage.contains("Inserting into an RDD-based table is not allowed."))
+    }
+  }
 }

From fec10f0c63171bd5aff7a762a6c94df035f5fb52 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 16 Jul 2015 13:58:39 -0700
Subject: [PATCH 189/549] [SPARK-9085][SQL] Remove LeafNode, UnaryNode,
 BinaryNode from TreeNode.

This builds on #7433 but also removes LeafNode/UnaryNode. These are slightly more complicated to remove. I had to change some abstract classes to traits in order for it to work.

The problem with LeafNode/UnaryNode is that they are often mixed in at the end of an Expression, and then the toString function actually gets resolved to the ones defined in TreeNode, rather than in Expression.

Author: Reynold Xin <rxin@databricks.com>

Closes #7434 from rxin/remove-binary-unary-leaf-node and squashes the following commits:

9e8a4de [Reynold Xin] Generator should not be foldable.
3135a8b [Reynold Xin] SortOrder should not be foldable.
9c589cf [Reynold Xin] Fixed one more test case...
2225331 [Reynold Xin] Aggregate expressions should not be foldable.
16b5c90 [Reynold Xin] [SPARK-9085][SQL] Remove LeafNode, UnaryNode, BinaryNode from TreeNode.
---
 .../sql/catalyst/analysis/unresolved.scala    | 10 +++---
 .../catalyst/expressions/BoundAttribute.scala |  2 +-
 .../sql/catalyst/expressions/Expression.scala |  1 -
 .../sql/catalyst/expressions/SortOrder.scala  |  6 ++--
 .../sql/catalyst/expressions/aggregates.scala | 35 ++++++++++---------
 .../sql/catalyst/expressions/generators.scala | 12 +++----
 .../expressions/namedExpressions.scala        | 19 ++++++----
 .../sql/catalyst/optimizer/Optimizer.scala    |  4 +--
 .../catalyst/plans/logical/LogicalPlan.scala  | 11 ++++--
 .../spark/sql/catalyst/trees/TreeNode.scala   | 16 ---------
 .../spark/sql/execution/SparkPlan.scala       | 19 ++++++----
 11 files changed, 69 insertions(+), 66 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index f2e579afe833a..7089f079b6dde 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -50,8 +50,7 @@ case class UnresolvedRelation(
 /**
  * Holds the name of an attribute that has yet to be resolved.
  */
-case class UnresolvedAttribute(nameParts: Seq[String])
-  extends Attribute with trees.LeafNode[Expression] {
+case class UnresolvedAttribute(nameParts: Seq[String]) extends Attribute {
 
   def name: String =
     nameParts.map(n => if (n.contains(".")) s"`$n`" else n).mkString(".")
@@ -96,7 +95,7 @@ case class UnresolvedFunction(name: String, children: Seq[Expression]) extends E
  * Represents all of the input attributes to a given relational operator, for example in
  * "SELECT * FROM ...". A [[Star]] gets automatically expanded during analysis.
  */
-trait Star extends NamedExpression with trees.LeafNode[Expression] {
+abstract class Star extends LeafExpression with NamedExpression {
   self: Product =>
 
   override def name: String = throw new UnresolvedException(this, "name")
@@ -151,7 +150,7 @@ case class UnresolvedStar(table: Option[String]) extends Star {
  * @param names the names to be associated with each output of computing [[child]].
  */
 case class MultiAlias(child: Expression, names: Seq[String])
-  extends NamedExpression with trees.UnaryNode[Expression] {
+  extends UnaryExpression with NamedExpression {
 
   override def name: String = throw new UnresolvedException(this, "name")
 
@@ -210,8 +209,7 @@ case class UnresolvedExtractValue(child: Expression, extraction: Expression)
 /**
  * Holds the expression that has yet to be aliased.
  */
-case class UnresolvedAlias(child: Expression) extends NamedExpression
-  with trees.UnaryNode[Expression] {
+case class UnresolvedAlias(child: Expression) extends UnaryExpression with NamedExpression {
 
   override def toAttribute: Attribute = throw new UnresolvedException(this, "toAttribute")
   override def qualifiers: Seq[String] = throw new UnresolvedException(this, "qualifiers")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 3f0d7b803125f..b09aea03318da 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -30,7 +30,7 @@ import org.apache.spark.sql.types._
  * the layout of intermediate tuples, BindReferences should be run after all such transformations.
  */
 case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
-  extends NamedExpression with trees.LeafNode[Expression] {
+  extends LeafExpression with NamedExpression {
 
   override def toString: String = s"input[$ordinal]"
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index a655cc8e48ae1..f396bd08a8238 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedAttribute}
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
-import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types._
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
index 8ab4ef060b68c..b8f7068c9e5e5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
@@ -30,8 +30,10 @@ case object Descending extends SortDirection
  * An expression that can be used to sort a tuple.  This class extends expression primarily so that
  * transformations over expression will descend into its child.
  */
-case class SortOrder(child: Expression, direction: SortDirection) extends Expression
-    with trees.UnaryNode[Expression] {
+case class SortOrder(child: Expression, direction: SortDirection) extends UnaryExpression {
+
+  /** Sort order is not foldable because we don't have an eval for it. */
+  override def foldable: Boolean = false
 
   override def dataType: DataType = child.dataType
   override def nullable: Boolean = child.nullable
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index c0e17f97e9b3c..71c943dc79e9e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -20,16 +20,20 @@ package org.apache.spark.sql.catalyst.expressions
 import com.clearspring.analytics.stream.cardinality.HyperLogLog
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
 
-abstract class AggregateExpression extends Expression {
+trait AggregateExpression extends Expression {
   self: Product =>
 
+  /**
+   * Aggregate expressions should not be foldable.
+   */
+  override def foldable: Boolean = false
+
   /**
    * Creates a new instance that can be used to compute this aggregate expression for a group
    * of input rows/
@@ -60,7 +64,7 @@ case class SplitEvaluation(
  * An [[AggregateExpression]] that can be partially computed without seeing all relevant tuples.
  * These partial evaluations can then be combined to compute the actual answer.
  */
-abstract class PartialAggregate extends AggregateExpression {
+trait PartialAggregate extends AggregateExpression {
   self: Product =>
 
   /**
@@ -74,7 +78,7 @@ abstract class PartialAggregate extends AggregateExpression {
  * [[AggregateExpression]] with an algorithm that will be used to compute one specific result.
  */
 abstract class AggregateFunction
-  extends AggregateExpression with Serializable with trees.LeafNode[Expression] {
+  extends LeafExpression with AggregateExpression with Serializable {
   self: Product =>
 
   /** Base should return the generic aggregate expression that this function is computing */
@@ -91,7 +95,7 @@ abstract class AggregateFunction
   }
 }
 
-case class Min(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
+case class Min(child: Expression) extends UnaryExpression with PartialAggregate {
 
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
@@ -124,7 +128,7 @@ case class MinFunction(expr: Expression, base: AggregateExpression) extends Aggr
   override def eval(input: InternalRow): Any = currentMin.value
 }
 
-case class Max(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
+case class Max(child: Expression) extends UnaryExpression with PartialAggregate {
 
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
@@ -157,7 +161,7 @@ case class MaxFunction(expr: Expression, base: AggregateExpression) extends Aggr
   override def eval(input: InternalRow): Any = currentMax.value
 }
 
-case class Count(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
+case class Count(child: Expression) extends UnaryExpression with PartialAggregate {
 
   override def nullable: Boolean = false
   override def dataType: LongType.type = LongType
@@ -310,7 +314,7 @@ private[sql] case object HyperLogLogUDT extends UserDefinedType[HyperLogLog] {
 }
 
 case class ApproxCountDistinctPartition(child: Expression, relativeSD: Double)
-  extends AggregateExpression with trees.UnaryNode[Expression] {
+  extends UnaryExpression with AggregateExpression {
 
   override def nullable: Boolean = false
   override def dataType: DataType = HyperLogLogUDT
@@ -340,7 +344,7 @@ case class ApproxCountDistinctPartitionFunction(
 }
 
 case class ApproxCountDistinctMerge(child: Expression, relativeSD: Double)
-  extends AggregateExpression with trees.UnaryNode[Expression] {
+  extends UnaryExpression with AggregateExpression {
 
   override def nullable: Boolean = false
   override def dataType: LongType.type = LongType
@@ -368,7 +372,7 @@ case class ApproxCountDistinctMergeFunction(
 }
 
 case class ApproxCountDistinct(child: Expression, relativeSD: Double = 0.05)
-  extends PartialAggregate with trees.UnaryNode[Expression] {
+  extends UnaryExpression with PartialAggregate {
 
   override def nullable: Boolean = false
   override def dataType: LongType.type = LongType
@@ -386,7 +390,7 @@ case class ApproxCountDistinct(child: Expression, relativeSD: Double = 0.05)
   override def newInstance(): CountDistinctFunction = new CountDistinctFunction(child :: Nil, this)
 }
 
-case class Average(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
+case class Average(child: Expression) extends UnaryExpression with PartialAggregate {
 
   override def prettyName: String = "avg"
 
@@ -479,7 +483,7 @@ case class AverageFunction(expr: Expression, base: AggregateExpression)
   }
 }
 
-case class Sum(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
+case class Sum(child: Expression) extends UnaryExpression with PartialAggregate {
 
   override def nullable: Boolean = true
 
@@ -606,8 +610,7 @@ case class CombineSumFunction(expr: Expression, base: AggregateExpression)
   }
 }
 
-case class SumDistinct(child: Expression)
-  extends PartialAggregate with trees.UnaryNode[Expression] {
+case class SumDistinct(child: Expression) extends UnaryExpression with PartialAggregate {
 
   def this() = this(null)
   override def nullable: Boolean = true
@@ -701,7 +704,7 @@ case class CombineSetsAndSumFunction(
   }
 }
 
-case class First(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
+case class First(child: Expression) extends UnaryExpression with PartialAggregate {
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
   override def toString: String = s"FIRST($child)"
@@ -729,7 +732,7 @@ case class FirstFunction(expr: Expression, base: AggregateExpression) extends Ag
   override def eval(input: InternalRow): Any = result
 }
 
-case class Last(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
+case class Last(child: Expression) extends UnaryExpression with PartialAggregate {
   override def references: AttributeSet = child.references
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index b68d30a26abd8..51dc77ee3fc5f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -40,13 +40,14 @@ import org.apache.spark.sql.types._
  * requested.  The attributes produced by this function will be automatically copied anytime rules
  * result in changes to the Generator or its children.
  */
-abstract class Generator extends Expression {
-  self: Product =>
+trait Generator extends Expression { self: Product =>
 
   // TODO ideally we should return the type of ArrayType(StructType),
   // however, we don't keep the output field names in the Generator.
   override def dataType: DataType = throw new UnsupportedOperationException
 
+  override def foldable: Boolean = false
+
   override def nullable: Boolean = false
 
   /**
@@ -99,8 +100,9 @@ case class UserDefinedGenerator(
 /**
  * Given an input array produces a sequence of rows for each value in the array.
  */
-case class Explode(child: Expression)
-  extends Generator with trees.UnaryNode[Expression] {
+case class Explode(child: Expression) extends UnaryExpression with Generator {
+
+  override def children: Seq[Expression] = child :: Nil
 
   override def checkInputDataTypes(): TypeCheckResult = {
     if (child.dataType.isInstanceOf[ArrayType] || child.dataType.isInstanceOf[MapType]) {
@@ -127,6 +129,4 @@ case class Explode(child: Expression)
         else inputMap.map { case (k, v) => InternalRow(k, v) }
     }
   }
-
-  override def toString: String = s"explode($child)"
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 6181c60c0e453..8bf7a7ce4e647 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -37,8 +37,13 @@ object NamedExpression {
  */
 case class ExprId(id: Long)
 
-abstract class NamedExpression extends Expression {
-  self: Product =>
+/**
+ * An [[Expression]] that is named.
+ */
+trait NamedExpression extends Expression { self: Product =>
+
+  /** We should never fold named expressions in order to not remove the alias. */
+  override def foldable: Boolean = false
 
   def name: String
   def exprId: ExprId
@@ -78,8 +83,7 @@ abstract class NamedExpression extends Expression {
     }
 }
 
-abstract class Attribute extends NamedExpression {
-  self: Product =>
+abstract class Attribute extends LeafExpression with NamedExpression { self: Product =>
 
   override def references: AttributeSet = AttributeSet(this)
 
@@ -110,7 +114,7 @@ case class Alias(child: Expression, name: String)(
     val exprId: ExprId = NamedExpression.newExprId,
     val qualifiers: Seq[String] = Nil,
     val explicitMetadata: Option[Metadata] = None)
-  extends NamedExpression with trees.UnaryNode[Expression] {
+  extends UnaryExpression with NamedExpression {
 
   // Alias(Generator, xx) need to be transformed into Generate(generator, ...)
   override lazy val resolved =
@@ -172,7 +176,8 @@ case class AttributeReference(
     nullable: Boolean = true,
     override val metadata: Metadata = Metadata.empty)(
     val exprId: ExprId = NamedExpression.newExprId,
-    val qualifiers: Seq[String] = Nil) extends Attribute with trees.LeafNode[Expression] {
+    val qualifiers: Seq[String] = Nil)
+  extends Attribute {
 
   /**
    * Returns true iff the expression id is the same for both attributes.
@@ -242,7 +247,7 @@ case class AttributeReference(
  * A place holder used when printing expressions without debugging information such as the
  * expression id or the unresolved indicator.
  */
-case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[Expression] {
+case class PrettyAttribute(name: String) extends Attribute {
 
   override def toString: String = name
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 5d80214abf141..2f94b457f4cdc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -342,7 +342,7 @@ object ConstantFolding extends Rule[LogicalPlan] {
       case l: Literal => l
 
       // Fold expressions that are foldable.
-      case e if e.foldable => Literal.create(e.eval(null), e.dataType)
+      case e if e.foldable => Literal.create(e.eval(EmptyRow), e.dataType)
 
       // Fold "literal in (item1, item2, ..., literal, ...)" into true directly.
       case In(Literal(v, _), list) if list.exists {
@@ -361,7 +361,7 @@ object OptimizeIn extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     case q: LogicalPlan => q transformExpressionsDown {
       case In(v, list) if !list.exists(!_.isInstanceOf[Literal]) =>
-        val hSet = list.map(e => e.eval(null))
+        val hSet = list.map(e => e.eval(EmptyRow))
         InSet(v, HashSet() ++ hSet)
     }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index d7077a0ec907a..adac37231cc4a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -23,7 +23,6 @@ import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.trees.TreeNode
-import org.apache.spark.sql.catalyst.trees
 
 
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
@@ -277,15 +276,21 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
 /**
  * A logical plan node with no children.
  */
-abstract class LeafNode extends LogicalPlan with trees.LeafNode[LogicalPlan] {
+abstract class LeafNode extends LogicalPlan {
   self: Product =>
+
+  override def children: Seq[LogicalPlan] = Nil
 }
 
 /**
  * A logical plan node with single child.
  */
-abstract class UnaryNode extends LogicalPlan with trees.UnaryNode[LogicalPlan] {
+abstract class UnaryNode extends LogicalPlan {
   self: Product =>
+
+  def child: LogicalPlan
+
+  override def children: Seq[LogicalPlan] = child :: Nil
 }
 
 /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 16844b2f4b680..0f95ca688a7a8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -452,19 +452,3 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
     s"$nodeName(${args.mkString(",")})"
   }
 }
-
-
-/**
- * A [[TreeNode]] with no children.
- */
-trait LeafNode[BaseType <: TreeNode[BaseType]] {
-  def children: Seq[BaseType] = Nil
-}
-
-/**
- * A [[TreeNode]] with a single [[child]].
- */
-trait UnaryNode[BaseType <: TreeNode[BaseType]] {
-  def child: BaseType
-  def children: Seq[BaseType] = child :: Nil
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index 9dc7879fa4a1a..632f633d82a2e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -17,20 +17,20 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.annotation.DeveloperApi
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.spark.Logging
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.{RDD, RDDOperationScope}
 import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.{CatalystTypeConverters, trees}
+import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.physical._
 
-import scala.collection.mutable.ArrayBuffer
-
 object SparkPlan {
   protected[sql] val currentContext = new ThreadLocal[SQLContext]()
 }
@@ -238,12 +238,19 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
   }
 }
 
-private[sql] trait LeafNode extends SparkPlan with trees.LeafNode[SparkPlan] {
+private[sql] trait LeafNode extends SparkPlan {
   self: Product =>
+
+  override def children: Seq[SparkPlan] = Nil
 }
 
-private[sql] trait UnaryNode extends SparkPlan with trees.UnaryNode[SparkPlan] {
+private[sql] trait UnaryNode extends SparkPlan {
   self: Product =>
+
+  def child: SparkPlan
+
+  override def children: Seq[SparkPlan] = child :: Nil
+
   override def outputPartitioning: Partitioning = child.outputPartitioning
 }
 

From 031d7d41430ec1f3c3353e33eab4821a9bcd58a5 Mon Sep 17 00:00:00 2001
From: jerryshao <saisai.shao@intel.com>
Date: Thu, 16 Jul 2015 16:55:46 -0700
Subject: [PATCH 190/549] [SPARK-6304] [STREAMING] Fix checkpointing doesn't
 retain driver port issue.

Author: jerryshao <saisai.shao@intel.com>
Author: Saisai Shao <saisai.shao@intel.com>

Closes #5060 from jerryshao/SPARK-6304 and squashes the following commits:

89b01f5 [jerryshao] Update the unit test to add more cases
275d252 [jerryshao] Address the comments
7cc146d [jerryshao] Address the comments
2624723 [jerryshao] Fix rebase conflict
45befaa [Saisai Shao] Update the unit test
bbc1c9c [Saisai Shao] Fix checkpointing doesn't retain driver port issue
---
 .../apache/spark/streaming/Checkpoint.scala   |  2 +
 .../spark/streaming/CheckpointSuite.scala     | 45 ++++++++++++++++++-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
index 5279331c9e122..65d4e933bf8e9 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
@@ -48,6 +48,8 @@ class Checkpoint(@transient ssc: StreamingContext, val checkpointTime: Time)
     // Reload properties for the checkpoint application since user wants to set a reload property
     // or spark had changed its value and user wants to set it back.
     val propertiesToReload = List(
+      "spark.driver.host",
+      "spark.driver.port",
       "spark.master",
       "spark.yarn.keytab",
       "spark.yarn.principal")
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala
index 6a94928076236..d308ac05a54fe 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala
@@ -191,8 +191,51 @@ class CheckpointSuite extends TestSuiteBase {
     }
   }
 
+  // This tests if "spark.driver.host" and "spark.driver.port" is set by user, can be recovered
+  // with correct value.
+  test("get correct spark.driver.[host|port] from checkpoint") {
+    val conf = Map("spark.driver.host" -> "localhost", "spark.driver.port" -> "9999")
+    conf.foreach(kv => System.setProperty(kv._1, kv._2))
+    ssc = new StreamingContext(master, framework, batchDuration)
+    val originalConf = ssc.conf
+    assert(originalConf.get("spark.driver.host") === "localhost")
+    assert(originalConf.get("spark.driver.port") === "9999")
+
+    val cp = new Checkpoint(ssc, Time(1000))
+    ssc.stop()
+
+    // Serialize/deserialize to simulate write to storage and reading it back
+    val newCp = Utils.deserialize[Checkpoint](Utils.serialize(cp))
+
+    val newCpConf = newCp.createSparkConf()
+    assert(newCpConf.contains("spark.driver.host"))
+    assert(newCpConf.contains("spark.driver.port"))
+    assert(newCpConf.get("spark.driver.host") === "localhost")
+    assert(newCpConf.get("spark.driver.port") === "9999")
+
+    // Check if all the parameters have been restored
+    ssc = new StreamingContext(null, newCp, null)
+    val restoredConf = ssc.conf
+    assert(restoredConf.get("spark.driver.host") === "localhost")
+    assert(restoredConf.get("spark.driver.port") === "9999")
+    ssc.stop()
+
+    // If spark.driver.host and spark.driver.host is not set in system property, these two
+    // parameters should not be presented in the newly recovered conf.
+    conf.foreach(kv => System.clearProperty(kv._1))
+    val newCpConf1 = newCp.createSparkConf()
+    assert(!newCpConf1.contains("spark.driver.host"))
+    assert(!newCpConf1.contains("spark.driver.port"))
+
+    // Spark itself will dispatch a random, not-used port for spark.driver.port if it is not set
+    // explicitly.
+    ssc = new StreamingContext(null, newCp, null)
+    val restoredConf1 = ssc.conf
+    assert(restoredConf1.get("spark.driver.host") === "localhost")
+    assert(restoredConf1.get("spark.driver.port") !== "9999")
+  }
 
-  // This tests whether the systm can recover from a master failure with simple
+  // This tests whether the system can recover from a master failure with simple
   // non-stateful operations. This assumes as reliable, replayable input
   // source - TestInputDStream.
   test("recovery with map and reduceByKey operations") {

From 57e9b13bf99ab82896114d47c62fbee648c9be7d Mon Sep 17 00:00:00 2001
From: Aaron Davidson <aaron@databricks.com>
Date: Thu, 16 Jul 2015 18:14:45 -0700
Subject: [PATCH 191/549] [SPARK-8644] Include call site in SparkException
 stack traces thrown by job failures

Example exception (new part at bottom, clearly demarcated):

```
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.RuntimeException: uh-oh!
	at org.apache.spark.scheduler.DAGSchedulerSuite$$anonfun$37$$anonfun$38$$anonfun$apply$mcJ$sp$2.apply(DAGSchedulerSuite.scala:880)
	at org.apache.spark.scheduler.DAGSchedulerSuite$$anonfun$37$$anonfun$38$$anonfun$apply$mcJ$sp$2.apply(DAGSchedulerSuite.scala:880)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
	at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1640)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1099)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1099)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1777)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1777)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
	at org.apache.spark.scheduler.Task.run(Task.scala:70)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:744)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1298)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1289)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1288)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1288)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:755)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:755)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:755)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1509)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1470)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1459)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:560)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1744)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1762)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1777)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1791)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1099)
	at org.apache.spark.scheduler.DAGSchedulerSuite$$anonfun$37$$anonfun$38.apply$mcJ$sp(DAGSchedulerSuite.scala:880)
	at org.apache.spark.scheduler.DAGSchedulerSuite$$anonfun$37$$anonfun$38.apply(DAGSchedulerSuite.scala:880)
	at org.apache.spark.scheduler.DAGSchedulerSuite$$anonfun$37$$anonfun$38.apply(DAGSchedulerSuite.scala:880)
	at org.scalatest.Assertions$class.intercept(Assertions.scala:997)
	at org.scalatest.FunSuite.intercept(FunSuite.scala:1555)
	at org.apache.spark.scheduler.DAGSchedulerSuite$$anonfun$37.apply$mcV$sp(DAGSchedulerSuite.scala:879)
	at org.apache.spark.scheduler.DAGSchedulerSuite$$anonfun$37.apply(DAGSchedulerSuite.scala:878)
	at org.apache.spark.scheduler.DAGSchedulerSuite$$anonfun$37.apply(DAGSchedulerSuite.scala:878)
	at org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22)
	at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85)
	at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
	at org.scalatest.Transformer.apply(Transformer.scala:22)
	at org.scalatest.Transformer.apply(Transformer.scala:20)
	at org.scalatest.FunSuiteLike$$anon$1.apply(FunSuiteLike.scala:166)
	at org.apache.spark.SparkFunSuite.withFixture(SparkFunSuite.scala:42)
	at org.scalatest.FunSuiteLike$class.invokeWithFixture$1(FunSuiteLike.scala:163)
	at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175)
	at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:175)
	at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306)
	at org.scalatest.FunSuiteLike$class.runTest(FunSuiteLike.scala:175)
	at org.apache.spark.scheduler.DAGSchedulerSuite.org$scalatest$BeforeAndAfter$$super$runTest(DAGSchedulerSuite.scala:70)
	at org.scalatest.BeforeAndAfter$class.runTest(BeforeAndAfter.scala:200)
	at org.apache.spark.scheduler.DAGSchedulerSuite.org$scalatest$BeforeAndAfterEach$$super$runTest(DAGSchedulerSuite.scala:70)
	at org.scalatest.BeforeAndAfterEach$class.runTest(BeforeAndAfterEach.scala:255)
	at org.apache.spark.scheduler.DAGSchedulerSuite.runTest(DAGSchedulerSuite.scala:70)
	at org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208)
	at org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:208)
	at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:413)
	at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:401)
	at scala.collection.immutable.List.foreach(List.scala:318)
	at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
	at org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:396)
	at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:483)
	at org.scalatest.FunSuiteLike$class.runTests(FunSuiteLike.scala:208)
	at org.scalatest.FunSuite.runTests(FunSuite.scala:1555)
	at org.scalatest.Suite$class.run(Suite.scala:1424)
	at org.scalatest.FunSuite.org$scalatest$FunSuiteLike$$super$run(FunSuite.scala:1555)
	at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212)
	at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:212)
	at org.scalatest.SuperEngine.runImpl(Engine.scala:545)
	at org.scalatest.FunSuiteLike$class.run(FunSuiteLike.scala:212)
	at org.apache.spark.scheduler.DAGSchedulerSuite.org$scalatest$BeforeAndAfter$$super$run(DAGSchedulerSuite.scala:70)
	at org.scalatest.BeforeAndAfter$class.run(BeforeAndAfter.scala:241)
	at org.apache.spark.scheduler.DAGSchedulerSuite.org$scalatest$BeforeAndAfterAll$$super$run(DAGSchedulerSuite.scala:70)
	at org.scalatest.BeforeAndAfterAll$class.liftedTree1$1(BeforeAndAfterAll.scala:257)
	at org.scalatest.BeforeAndAfterAll$class.run(BeforeAndAfterAll.scala:256)
	at org.apache.spark.scheduler.DAGSchedulerSuite.run(DAGSchedulerSuite.scala:70)
	at org.scalatest.tools.Framework.org$scalatest$tools$Framework$$runSuite(Framework.scala:462)
	at org.scalatest.tools.Framework$ScalaTestTask.execute(Framework.scala:671)
	at sbt.ForkMain$Run$2.call(ForkMain.java:294)
	at sbt.ForkMain$Run$2.call(ForkMain.java:284)
	at java.util.concurrent.FutureTask.run(FutureTask.java:262)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:744)
```

Author: Aaron Davidson <aaron@databricks.com>

Closes #7028 from aarondav/stack-trace and squashes the following commits:

4714664 [Aaron Davidson] [SPARK-8644] Include call site in SparkException stack traces thrown by job failures
---
 .../org/apache/spark/scheduler/DAGScheduler.scala |  3 +++
 .../spark/scheduler/DAGSchedulerSuite.scala       | 15 +++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index f3d87ee5c4fd1..f8ba3d2d1a9b2 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -556,6 +556,9 @@ class DAGScheduler(
       case JobFailed(exception: Exception) =>
         logInfo("Job %d failed: %s, took %f s".format
           (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
+        // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
+        val callerStackTrace = Thread.currentThread().getStackTrace.tail
+        exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
         throw exception
     }
   }
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index 6bc45f249f975..4f2b0fa162b72 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -875,6 +875,21 @@ class DAGSchedulerSuite
     assertDataStructuresEmpty
   }
 
+  test("Spark exceptions should include call site in stack trace") {
+    val e = intercept[SparkException] {
+      sc.parallelize(1 to 10, 2).map { _ => throw new RuntimeException("uh-oh!") }.count()
+    }
+
+    // Does not include message, ONLY stack trace.
+    val stackTraceString = e.getStackTraceString
+
+    // should actually include the RDD operation that invoked the method:
+    assert(stackTraceString.contains("org.apache.spark.rdd.RDD.count"))
+
+    // should include the FunSuite setup:
+    assert(stackTraceString.contains("org.scalatest.FunSuite"))
+  }
+
   /**
    * Assert that the supplied TaskSet has exactly the given hosts as its preferred locations.
    * Note that this checks only the host and not the executor ID.

From 49351c7f597c67950cc65e5014a89fad31b9a6f7 Mon Sep 17 00:00:00 2001
From: Lianhui Wang <lianhuiwang09@gmail.com>
Date: Thu, 16 Jul 2015 19:31:14 -0700
Subject: [PATCH 192/549] [SPARK-8646] PySpark does not run on YARN if master
 not provided in command line

andrewor14 davies vanzin can you take a look at this? thanks

Author: Lianhui Wang <lianhuiwang09@gmail.com>

Closes #7438 from lianhuiwang/SPARK-8646 and squashes the following commits:

cb3f12d [Lianhui Wang] add whitespace
6d874a6 [Lianhui Wang] support pyspark for yarn-client
---
 python/pyspark/context.py                                    | 5 +++++
 .../src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index d7466729b8f36..43bde5ae41e23 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -152,6 +152,11 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize,
         self.master = self._conf.get("spark.master")
         self.appName = self._conf.get("spark.app.name")
         self.sparkHome = self._conf.get("spark.home", None)
+
+        # Let YARN know it's a pyspark app, so it distributes needed libraries.
+        if self.master == "yarn-client":
+            self._conf.set("spark.yarn.isPython", "true")
+
         for (k, v) in self._conf.getAll():
             if k.startswith("spark.executorEnv."):
                 varName = k[len("spark.executorEnv."):]
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index f86b6d1e5d7bc..b74ea9a10afb2 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -616,7 +616,7 @@ private[spark] class Client(
     val appId = newAppResponse.getApplicationId
     val appStagingDir = getAppStagingDir(appId)
     val pySparkArchives =
-      if (sys.props.getOrElse("spark.yarn.isPython", "false").toBoolean) {
+      if (sparkConf.getBoolean("spark.yarn.isPython", false)) {
         findPySparkArchives()
       } else {
         Nil

From d86bbb4e286f16f77ba125452b07827684eafeed Mon Sep 17 00:00:00 2001
From: Timothy Chen <tnachen@gmail.com>
Date: Thu, 16 Jul 2015 19:36:45 -0700
Subject: [PATCH 193/549] [SPARK-6284] [MESOS] Add mesos role, principal and
 secret

Mesos supports framework authentication and role to be set per framework, which the role is used to identify the framework's role which impacts the sharing weight of resource allocation and optional authentication information to allow the framework to be connected to the master.

Author: Timothy Chen <tnachen@gmail.com>

Closes #4960 from tnachen/mesos_fw_auth and squashes the following commits:

0f9f03e [Timothy Chen] Fix review comments.
8f9488a [Timothy Chen] Fix rebase
f7fc2a9 [Timothy Chen] Add mesos role, auth and secret.
---
 .../mesos/CoarseMesosSchedulerBackend.scala   |  35 ++---
 .../cluster/mesos/MesosClusterScheduler.scala |  28 ++--
 .../cluster/mesos/MesosSchedulerBackend.scala | 118 ++++++++++------
 .../cluster/mesos/MesosSchedulerUtils.scala   | 126 +++++++++++++++---
 .../CoarseMesosSchedulerBackendSuite.scala    |  19 ++-
 .../mesos/MesosSchedulerBackendSuite.scala    | 106 ++++++++++++++-
 docs/running-on-mesos.md                      |  22 +++
 7 files changed, 358 insertions(+), 96 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
index cbade131494bc..b7fde0d9b3265 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala
@@ -18,8 +18,8 @@
 package org.apache.spark.scheduler.cluster.mesos
 
 import java.io.File
-import java.util.{List => JList, Collections}
 import java.util.concurrent.locks.ReentrantLock
+import java.util.{Collections, List => JList}
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable.{HashMap, HashSet}
@@ -27,12 +27,11 @@ import scala.collection.mutable.{HashMap, HashSet}
 import com.google.common.collect.HashBiMap
 import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, _}
 import org.apache.mesos.{Scheduler => MScheduler, _}
-import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, _}
-import org.apache.spark.{SparkContext, SparkEnv, SparkException, TaskState}
 import org.apache.spark.rpc.RpcAddress
 import org.apache.spark.scheduler.TaskSchedulerImpl
 import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
 import org.apache.spark.util.Utils
+import org.apache.spark.{SparkContext, SparkEnv, SparkException, TaskState}
 
 /**
  * A SchedulerBackend that runs tasks on Mesos, but uses "coarse-grained" tasks, where it holds
@@ -69,7 +68,7 @@ private[spark] class CoarseMesosSchedulerBackend(
 
   /**
    * The total number of executors we aim to have. Undefined when not using dynamic allocation
-   * and before the ExecutorAllocatorManager calls [[doRequesTotalExecutors]].
+   * and before the ExecutorAllocatorManager calls [[doRequestTotalExecutors]].
    */
   private var executorLimitOption: Option[Int] = None
 
@@ -103,8 +102,9 @@ private[spark] class CoarseMesosSchedulerBackend(
 
   override def start() {
     super.start()
-    val fwInfo = FrameworkInfo.newBuilder().setUser(sc.sparkUser).setName(sc.appName).build()
-    startScheduler(master, CoarseMesosSchedulerBackend.this, fwInfo)
+    val driver = createSchedulerDriver(
+      master, CoarseMesosSchedulerBackend.this, sc.sparkUser, sc.appName, sc.conf)
+    startScheduler(driver)
   }
 
   def createCommand(offer: Offer, numCores: Int, taskId: Int): CommandInfo = {
@@ -224,24 +224,29 @@ private[spark] class CoarseMesosSchedulerBackend(
           taskIdToSlaveId(taskId) = slaveId
           slaveIdsWithExecutors += slaveId
           coresByTaskId(taskId) = cpusToUse
-          val task = MesosTaskInfo.newBuilder()
+          // Gather cpu resources from the available resources and use them in the task.
+          val (remainingResources, cpuResourcesToUse) =
+            partitionResources(offer.getResourcesList, "cpus", cpusToUse)
+          val (_, memResourcesToUse) =
+            partitionResources(remainingResources, "mem", calculateTotalMemory(sc))
+          val taskBuilder = MesosTaskInfo.newBuilder()
             .setTaskId(TaskID.newBuilder().setValue(taskId.toString).build())
             .setSlaveId(offer.getSlaveId)
             .setCommand(createCommand(offer, cpusToUse + extraCoresPerSlave, taskId))
             .setName("Task " + taskId)
-            .addResources(createResource("cpus", cpusToUse))
-            .addResources(createResource("mem", calculateTotalMemory(sc)))
+            .addAllResources(cpuResourcesToUse)
+            .addAllResources(memResourcesToUse)
 
           sc.conf.getOption("spark.mesos.executor.docker.image").foreach { image =>
             MesosSchedulerBackendUtil
-              .setupContainerBuilderDockerInfo(image, sc.conf, task.getContainerBuilder)
+              .setupContainerBuilderDockerInfo(image, sc.conf, taskBuilder.getContainerBuilder())
           }
 
           // accept the offer and launch the task
           logDebug(s"Accepting offer: $id with attributes: $offerAttributes mem: $mem cpu: $cpus")
           d.launchTasks(
             Collections.singleton(offer.getId),
-            Collections.singleton(task.build()), filters)
+            Collections.singleton(taskBuilder.build()), filters)
         } else {
           // Decline the offer
           logDebug(s"Declining offer: $id with attributes: $offerAttributes mem: $mem cpu: $cpus")
@@ -255,7 +260,7 @@ private[spark] class CoarseMesosSchedulerBackend(
   override def statusUpdate(d: SchedulerDriver, status: TaskStatus) {
     val taskId = status.getTaskId.getValue.toInt
     val state = status.getState
-    logInfo("Mesos task " + taskId + " is now " + state)
+    logInfo(s"Mesos task $taskId is now $state")
     stateLock.synchronized {
       if (TaskState.isFinished(TaskState.fromMesos(state))) {
         val slaveId = taskIdToSlaveId(taskId)
@@ -270,7 +275,7 @@ private[spark] class CoarseMesosSchedulerBackend(
         if (TaskState.isFailed(TaskState.fromMesos(state))) {
           failuresBySlaveId(slaveId) = failuresBySlaveId.getOrElse(slaveId, 0) + 1
           if (failuresBySlaveId(slaveId) >= MAX_SLAVE_FAILURES) {
-            logInfo("Blacklisting Mesos slave " + slaveId + " due to too many failures; " +
+            logInfo(s"Blacklisting Mesos slave $slaveId due to too many failures; " +
                 "is Spark installed on it?")
           }
         }
@@ -282,7 +287,7 @@ private[spark] class CoarseMesosSchedulerBackend(
   }
 
   override def error(d: SchedulerDriver, message: String) {
-    logError("Mesos error: " + message)
+    logError(s"Mesos error: $message")
     scheduler.error(message)
   }
 
@@ -323,7 +328,7 @@ private[spark] class CoarseMesosSchedulerBackend(
   }
 
   override def slaveLost(d: SchedulerDriver, slaveId: SlaveID): Unit = {
-    logInfo("Mesos slave lost: " + slaveId.getValue)
+    logInfo(s"Mesos slave lost: ${slaveId.getValue}")
     executorTerminated(d, slaveId.getValue, "Mesos slave lost: " + slaveId.getValue)
   }
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
index d3a20f822176e..f078547e71352 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
@@ -295,20 +295,24 @@ private[spark] class MesosClusterScheduler(
   def start(): Unit = {
     // TODO: Implement leader election to make sure only one framework running in the cluster.
     val fwId = schedulerState.fetch[String]("frameworkId")
-    val builder = FrameworkInfo.newBuilder()
-      .setUser(Utils.getCurrentUserName())
-      .setName(appName)
-      .setWebuiUrl(frameworkUrl)
-      .setCheckpoint(true)
-      .setFailoverTimeout(Integer.MAX_VALUE) // Setting to max so tasks keep running on crash
     fwId.foreach { id =>
-      builder.setId(FrameworkID.newBuilder().setValue(id).build())
       frameworkId = id
     }
     recoverState()
     metricsSystem.registerSource(new MesosClusterSchedulerSource(this))
     metricsSystem.start()
-    startScheduler(master, MesosClusterScheduler.this, builder.build())
+    val driver = createSchedulerDriver(
+      master,
+      MesosClusterScheduler.this,
+      Utils.getCurrentUserName(),
+      appName,
+      conf,
+      Some(frameworkUrl),
+      Some(true),
+      Some(Integer.MAX_VALUE),
+      fwId)
+
+    startScheduler(driver)
     ready = true
   }
 
@@ -449,12 +453,8 @@ private[spark] class MesosClusterScheduler(
         offer.cpu -= driverCpu
         offer.mem -= driverMem
         val taskId = TaskID.newBuilder().setValue(submission.submissionId).build()
-        val cpuResource = Resource.newBuilder()
-          .setName("cpus").setType(Value.Type.SCALAR)
-          .setScalar(Value.Scalar.newBuilder().setValue(driverCpu)).build()
-        val memResource = Resource.newBuilder()
-          .setName("mem").setType(Value.Type.SCALAR)
-          .setScalar(Value.Scalar.newBuilder().setValue(driverMem)).build()
+        val cpuResource = createResource("cpus", driverCpu)
+        val memResource = createResource("mem", driverMem)
         val commandInfo = buildDriverCommand(submission)
         val appName = submission.schedulerProperties("spark.app.name")
         val taskInfo = TaskInfo.newBuilder()
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
index d72e2af456e15..3f63ec1c5832f 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala
@@ -32,6 +32,7 @@ import org.apache.spark.scheduler._
 import org.apache.spark.scheduler.cluster.ExecutorInfo
 import org.apache.spark.util.Utils
 
+
 /**
  * A SchedulerBackend for running fine-grained tasks on Mesos. Each Spark task is mapped to a
  * separate Mesos task, allowing multiple applications to share cluster nodes both in space (tasks
@@ -45,8 +46,8 @@ private[spark] class MesosSchedulerBackend(
   with MScheduler
   with MesosSchedulerUtils {
 
-  // Which slave IDs we have executors on
-  val slaveIdsWithExecutors = new HashSet[String]
+  // Stores the slave ids that has launched a Mesos executor.
+  val slaveIdToExecutorInfo = new HashMap[String, MesosExecutorInfo]
   val taskIdToSlaveId = new HashMap[Long, String]
 
   // An ExecutorInfo for our tasks
@@ -66,12 +67,21 @@ private[spark] class MesosSchedulerBackend(
   @volatile var appId: String = _
 
   override def start() {
-    val fwInfo = FrameworkInfo.newBuilder().setUser(sc.sparkUser).setName(sc.appName).build()
     classLoader = Thread.currentThread.getContextClassLoader
-    startScheduler(master, MesosSchedulerBackend.this, fwInfo)
+    val driver = createSchedulerDriver(
+      master, MesosSchedulerBackend.this, sc.sparkUser, sc.appName, sc.conf)
+    startScheduler(driver)
   }
 
-  def createExecutorInfo(execId: String): MesosExecutorInfo = {
+  /**
+   * Creates a MesosExecutorInfo that is used to launch a Mesos executor.
+   * @param availableResources Available resources that is offered by Mesos
+   * @param execId The executor id to assign to this new executor.
+   * @return A tuple of the new mesos executor info and the remaining available resources.
+   */
+  def createExecutorInfo(
+      availableResources: JList[Resource],
+      execId: String): (MesosExecutorInfo, JList[Resource]) = {
     val executorSparkHome = sc.conf.getOption("spark.mesos.executor.home")
       .orElse(sc.getSparkHome()) // Fall back to driver Spark home for backward compatibility
       .getOrElse {
@@ -115,32 +125,25 @@ private[spark] class MesosSchedulerBackend(
       command.setValue(s"cd ${basename}*; $prefixEnv ./bin/spark-class $executorBackendName")
       command.addUris(CommandInfo.URI.newBuilder().setValue(uri.get))
     }
-    val cpus = Resource.newBuilder()
-      .setName("cpus")
-      .setType(Value.Type.SCALAR)
-      .setScalar(Value.Scalar.newBuilder()
-      .setValue(mesosExecutorCores).build())
-      .build()
-    val memory = Resource.newBuilder()
-      .setName("mem")
-      .setType(Value.Type.SCALAR)
-      .setScalar(
-        Value.Scalar.newBuilder()
-          .setValue(calculateTotalMemory(sc)).build())
-      .build()
-    val executorInfo = MesosExecutorInfo.newBuilder()
+    val builder = MesosExecutorInfo.newBuilder()
+    val (resourcesAfterCpu, usedCpuResources) =
+      partitionResources(availableResources, "cpus", scheduler.CPUS_PER_TASK)
+    val (resourcesAfterMem, usedMemResources) =
+      partitionResources(resourcesAfterCpu, "mem", calculateTotalMemory(sc))
+
+    builder.addAllResources(usedCpuResources)
+    builder.addAllResources(usedMemResources)
+    val executorInfo = builder
       .setExecutorId(ExecutorID.newBuilder().setValue(execId).build())
       .setCommand(command)
       .setData(ByteString.copyFrom(createExecArg()))
-      .addResources(cpus)
-      .addResources(memory)
 
     sc.conf.getOption("spark.mesos.executor.docker.image").foreach { image =>
       MesosSchedulerBackendUtil
         .setupContainerBuilderDockerInfo(image, sc.conf, executorInfo.getContainerBuilder())
     }
 
-    executorInfo.build()
+    (executorInfo.build(), resourcesAfterMem)
   }
 
   /**
@@ -183,6 +186,18 @@ private[spark] class MesosSchedulerBackend(
 
   override def reregistered(d: SchedulerDriver, masterInfo: MasterInfo) {}
 
+  private def getTasksSummary(tasks: JArrayList[MesosTaskInfo]): String = {
+    val builder = new StringBuilder
+    tasks.foreach { t =>
+      builder.append("Task id: ").append(t.getTaskId.getValue).append("\n")
+        .append("Slave id: ").append(t.getSlaveId.getValue).append("\n")
+        .append("Task resources: ").append(t.getResourcesList).append("\n")
+        .append("Executor resources: ").append(t.getExecutor.getResourcesList)
+        .append("---------------------------------------------\n")
+    }
+    builder.toString()
+  }
+
   /**
    * Method called by Mesos to offer resources on slaves. We respond by asking our active task sets
    * for tasks in order of priority. We fill each node with tasks in a round-robin manner so that
@@ -207,7 +222,7 @@ private[spark] class MesosSchedulerBackend(
 
         val meetsRequirements =
           (meetsConstraints && meetsMemoryRequirements && meetsCPURequirements) ||
-          (slaveIdsWithExecutors.contains(slaveId) && cpus >= scheduler.CPUS_PER_TASK)
+          (slaveIdToExecutorInfo.contains(slaveId) && cpus >= scheduler.CPUS_PER_TASK)
 
         // add some debug messaging
         val debugstr = if (meetsRequirements) "Accepting" else "Declining"
@@ -221,7 +236,7 @@ private[spark] class MesosSchedulerBackend(
       unUsableOffers.foreach(o => d.declineOffer(o.getId))
 
       val workerOffers = usableOffers.map { o =>
-        val cpus = if (slaveIdsWithExecutors.contains(o.getSlaveId.getValue)) {
+        val cpus = if (slaveIdToExecutorInfo.contains(o.getSlaveId.getValue)) {
           getResource(o.getResourcesList, "cpus").toInt
         } else {
           // If the Mesos executor has not been started on this slave yet, set aside a few
@@ -236,6 +251,10 @@ private[spark] class MesosSchedulerBackend(
 
       val slaveIdToOffer = usableOffers.map(o => o.getSlaveId.getValue -> o).toMap
       val slaveIdToWorkerOffer = workerOffers.map(o => o.executorId -> o).toMap
+      val slaveIdToResources = new HashMap[String, JList[Resource]]()
+      usableOffers.foreach { o =>
+        slaveIdToResources(o.getSlaveId.getValue) = o.getResourcesList
+      }
 
       val mesosTasks = new HashMap[String, JArrayList[MesosTaskInfo]]
 
@@ -245,15 +264,19 @@ private[spark] class MesosSchedulerBackend(
       val acceptedOffers = scheduler.resourceOffers(workerOffers).filter(!_.isEmpty)
       acceptedOffers
         .foreach { offer =>
-        offer.foreach { taskDesc =>
-          val slaveId = taskDesc.executorId
-          slaveIdsWithExecutors += slaveId
-          slavesIdsOfAcceptedOffers += slaveId
-          taskIdToSlaveId(taskDesc.taskId) = slaveId
-          mesosTasks.getOrElseUpdate(slaveId, new JArrayList[MesosTaskInfo])
-            .add(createMesosTask(taskDesc, slaveId))
+          offer.foreach { taskDesc =>
+            val slaveId = taskDesc.executorId
+            slavesIdsOfAcceptedOffers += slaveId
+            taskIdToSlaveId(taskDesc.taskId) = slaveId
+            val (mesosTask, remainingResources) = createMesosTask(
+              taskDesc,
+              slaveIdToResources(slaveId),
+              slaveId)
+            mesosTasks.getOrElseUpdate(slaveId, new JArrayList[MesosTaskInfo])
+              .add(mesosTask)
+            slaveIdToResources(slaveId) = remainingResources
+          }
         }
-      }
 
       // Reply to the offers
       val filters = Filters.newBuilder().setRefuseSeconds(1).build() // TODO: lower timeout?
@@ -264,6 +287,7 @@ private[spark] class MesosSchedulerBackend(
             // TODO: Add support for log urls for Mesos
             new ExecutorInfo(o.host, o.cores, Map.empty)))
         )
+        logTrace(s"Launching Mesos tasks on slave '$slaveId', tasks:\n${getTasksSummary(tasks)}")
         d.launchTasks(Collections.singleton(slaveIdToOffer(slaveId).getId), tasks, filters)
       }
 
@@ -272,26 +296,32 @@ private[spark] class MesosSchedulerBackend(
       for (o <- usableOffers if !slavesIdsOfAcceptedOffers.contains(o.getSlaveId.getValue)) {
         d.declineOffer(o.getId)
       }
-
     }
   }
 
-  /** Turn a Spark TaskDescription into a Mesos task */
-  def createMesosTask(task: TaskDescription, slaveId: String): MesosTaskInfo = {
+  /** Turn a Spark TaskDescription into a Mesos task and also resources unused by the task */
+  def createMesosTask(
+      task: TaskDescription,
+      resources: JList[Resource],
+      slaveId: String): (MesosTaskInfo, JList[Resource]) = {
     val taskId = TaskID.newBuilder().setValue(task.taskId.toString).build()
-    val cpuResource = Resource.newBuilder()
-      .setName("cpus")
-      .setType(Value.Type.SCALAR)
-      .setScalar(Value.Scalar.newBuilder().setValue(scheduler.CPUS_PER_TASK).build())
-      .build()
-    MesosTaskInfo.newBuilder()
+    val (executorInfo, remainingResources) = if (slaveIdToExecutorInfo.contains(slaveId)) {
+      (slaveIdToExecutorInfo(slaveId), resources)
+    } else {
+      createExecutorInfo(resources, slaveId)
+    }
+    slaveIdToExecutorInfo(slaveId) = executorInfo
+    val (finalResources, cpuResources) =
+      partitionResources(remainingResources, "cpus", scheduler.CPUS_PER_TASK)
+    val taskInfo = MesosTaskInfo.newBuilder()
       .setTaskId(taskId)
       .setSlaveId(SlaveID.newBuilder().setValue(slaveId).build())
-      .setExecutor(createExecutorInfo(slaveId))
+      .setExecutor(executorInfo)
       .setName(task.name)
-      .addResources(cpuResource)
+      .addAllResources(cpuResources)
       .setData(MesosTaskLaunchData(task.serializedTask, task.attemptNumber).toByteString)
       .build()
+    (taskInfo, finalResources)
   }
 
   override def statusUpdate(d: SchedulerDriver, status: TaskStatus) {
@@ -337,7 +367,7 @@ private[spark] class MesosSchedulerBackend(
   private def removeExecutor(slaveId: String, reason: String) = {
     synchronized {
       listenerBus.post(SparkListenerExecutorRemoved(System.currentTimeMillis(), slaveId, reason))
-      slaveIdsWithExecutors -= slaveId
+      slaveIdToExecutorInfo -= slaveId
     }
   }
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
index 925702e63afd3..c04920e4f5873 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
@@ -21,15 +21,17 @@ import java.util.{List => JList}
 import java.util.concurrent.CountDownLatch
 
 import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
 import scala.util.control.NonFatal
 
 import com.google.common.base.Splitter
 import org.apache.mesos.{MesosSchedulerDriver, SchedulerDriver, Scheduler, Protos}
 import org.apache.mesos.Protos._
-import org.apache.mesos.protobuf.GeneratedMessage
-import org.apache.spark.{Logging, SparkContext}
+import org.apache.mesos.protobuf.{ByteString, GeneratedMessage}
+import org.apache.spark.{SparkException, SparkConf, Logging, SparkContext}
 import org.apache.spark.util.Utils
 
+
 /**
  * Shared trait for implementing a Mesos Scheduler. This holds common state and helper
  * methods and Mesos scheduler will use.
@@ -42,13 +44,63 @@ private[mesos] trait MesosSchedulerUtils extends Logging {
   protected var mesosDriver: SchedulerDriver = null
 
   /**
-   * Starts the MesosSchedulerDriver with the provided information. This method returns
-   * only after the scheduler has registered with Mesos.
-   * @param masterUrl Mesos master connection URL
-   * @param scheduler Scheduler object
-   * @param fwInfo FrameworkInfo to pass to the Mesos master
+   * Creates a new MesosSchedulerDriver that communicates to the Mesos master.
+   * @param masterUrl The url to connect to Mesos master
+   * @param scheduler the scheduler class to receive scheduler callbacks
+   * @param sparkUser User to impersonate with when running tasks
+   * @param appName The framework name to display on the Mesos UI
+   * @param conf Spark configuration
+   * @param webuiUrl The WebUI url to link from Mesos UI
+   * @param checkpoint Option to checkpoint tasks for failover
+   * @param failoverTimeout Duration Mesos master expect scheduler to reconnect on disconnect
+   * @param frameworkId The id of the new framework
    */
-  def startScheduler(masterUrl: String, scheduler: Scheduler, fwInfo: FrameworkInfo): Unit = {
+  protected def createSchedulerDriver(
+      masterUrl: String,
+      scheduler: Scheduler,
+      sparkUser: String,
+      appName: String,
+      conf: SparkConf,
+      webuiUrl: Option[String] = None,
+      checkpoint: Option[Boolean] = None,
+      failoverTimeout: Option[Double] = None,
+      frameworkId: Option[String] = None): SchedulerDriver = {
+    val fwInfoBuilder = FrameworkInfo.newBuilder().setUser(sparkUser).setName(appName)
+    val credBuilder = Credential.newBuilder()
+    webuiUrl.foreach { url => fwInfoBuilder.setWebuiUrl(url) }
+    checkpoint.foreach { checkpoint => fwInfoBuilder.setCheckpoint(checkpoint) }
+    failoverTimeout.foreach { timeout => fwInfoBuilder.setFailoverTimeout(timeout) }
+    frameworkId.foreach { id =>
+      fwInfoBuilder.setId(FrameworkID.newBuilder().setValue(id).build())
+    }
+    conf.getOption("spark.mesos.principal").foreach { principal =>
+      fwInfoBuilder.setPrincipal(principal)
+      credBuilder.setPrincipal(principal)
+    }
+    conf.getOption("spark.mesos.secret").foreach { secret =>
+      credBuilder.setSecret(ByteString.copyFromUtf8(secret))
+    }
+    if (credBuilder.hasSecret && !fwInfoBuilder.hasPrincipal) {
+      throw new SparkException(
+        "spark.mesos.principal must be configured when spark.mesos.secret is set")
+    }
+    conf.getOption("spark.mesos.role").foreach { role =>
+      fwInfoBuilder.setRole(role)
+    }
+    if (credBuilder.hasPrincipal) {
+      new MesosSchedulerDriver(
+        scheduler, fwInfoBuilder.build(), masterUrl, credBuilder.build())
+    } else {
+      new MesosSchedulerDriver(scheduler, fwInfoBuilder.build(), masterUrl)
+    }
+  }
+
+  /**
+   * Starts the MesosSchedulerDriver and stores the current running driver to this new instance.
+   * This driver is expected to not be running.
+   * This method returns only after the scheduler has registered with Mesos.
+   */
+  def startScheduler(newDriver: SchedulerDriver): Unit = {
     synchronized {
       if (mesosDriver != null) {
         registerLatch.await()
@@ -59,11 +111,11 @@ private[mesos] trait MesosSchedulerUtils extends Logging {
         setDaemon(true)
 
         override def run() {
-          mesosDriver = new MesosSchedulerDriver(scheduler, fwInfo, masterUrl)
+          mesosDriver = newDriver
           try {
             val ret = mesosDriver.run()
             logInfo("driver.run() returned with code " + ret)
-            if (ret.equals(Status.DRIVER_ABORTED)) {
+            if (ret != null && ret.equals(Status.DRIVER_ABORTED)) {
               System.exit(1)
             }
           } catch {
@@ -82,18 +134,62 @@ private[mesos] trait MesosSchedulerUtils extends Logging {
   /**
    * Signal that the scheduler has registered with Mesos.
    */
+  protected def getResource(res: JList[Resource], name: String): Double = {
+    // A resource can have multiple values in the offer since it can either be from
+    // a specific role or wildcard.
+    res.filter(_.getName == name).map(_.getScalar.getValue).sum
+  }
+
   protected def markRegistered(): Unit = {
     registerLatch.countDown()
   }
 
+  def createResource(name: String, amount: Double, role: Option[String] = None): Resource = {
+    val builder = Resource.newBuilder()
+      .setName(name)
+      .setType(Value.Type.SCALAR)
+      .setScalar(Value.Scalar.newBuilder().setValue(amount).build())
+
+    role.foreach { r => builder.setRole(r) }
+
+    builder.build()
+  }
+
   /**
-   * Get the amount of resources for the specified type from the resource list
+   * Partition the existing set of resources into two groups, those remaining to be
+   * scheduled and those requested to be used for a new task.
+   * @param resources The full list of available resources
+   * @param resourceName The name of the resource to take from the available resources
+   * @param amountToUse The amount of resources to take from the available resources
+   * @return The remaining resources list and the used resources list.
    */
-  protected def getResource(res: JList[Resource], name: String): Double = {
-    for (r <- res if r.getName == name) {
-      return r.getScalar.getValue
+  def partitionResources(
+      resources: JList[Resource],
+      resourceName: String,
+      amountToUse: Double): (List[Resource], List[Resource]) = {
+    var remain = amountToUse
+    var requestedResources = new ArrayBuffer[Resource]
+    val remainingResources = resources.map {
+      case r => {
+        if (remain > 0 &&
+          r.getType == Value.Type.SCALAR &&
+          r.getScalar.getValue > 0.0 &&
+          r.getName == resourceName) {
+          val usage = Math.min(remain, r.getScalar.getValue)
+          requestedResources += createResource(resourceName, usage, Some(r.getRole))
+          remain -= usage
+          createResource(resourceName, r.getScalar.getValue - usage, Some(r.getRole))
+        } else {
+          r
+        }
+      }
     }
-    0.0
+
+    // Filter any resource that has depleted.
+    val filteredResources =
+      remainingResources.filter(r => r.getType != Value.Type.SCALAR || r.getScalar.getValue > 0.0)
+
+    (filteredResources.toList, requestedResources.toList)
   }
 
   /** Helper method to get the key,value-set pair for a Mesos Attribute protobuf */
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackendSuite.scala
index 3f1692917a357..4b504df7b8851 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackendSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackendSuite.scala
@@ -22,7 +22,7 @@ import java.util.Collections
 
 import org.apache.mesos.Protos.Value.Scalar
 import org.apache.mesos.Protos._
-import org.apache.mesos.SchedulerDriver
+import org.apache.mesos.{Protos, Scheduler, SchedulerDriver}
 import org.mockito.Matchers._
 import org.mockito.Mockito._
 import org.mockito.Matchers
@@ -60,7 +60,16 @@ class CoarseMesosSchedulerBackendSuite extends SparkFunSuite
       taskScheduler: TaskSchedulerImpl,
       driver: SchedulerDriver): CoarseMesosSchedulerBackend = {
     val backend = new CoarseMesosSchedulerBackend(taskScheduler, sc, "master") {
-      mesosDriver = driver
+      override protected def createSchedulerDriver(
+        masterUrl: String,
+        scheduler: Scheduler,
+        sparkUser: String,
+        appName: String,
+        conf: SparkConf,
+        webuiUrl: Option[String] = None,
+        checkpoint: Option[Boolean] = None,
+        failoverTimeout: Option[Double] = None,
+        frameworkId: Option[String] = None): SchedulerDriver = driver
       markRegistered()
     }
     backend.start()
@@ -80,6 +89,7 @@ class CoarseMesosSchedulerBackendSuite extends SparkFunSuite
 
   test("mesos supports killing and limiting executors") {
     val driver = mock[SchedulerDriver]
+    when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING)
     val taskScheduler = mock[TaskSchedulerImpl]
     when(taskScheduler.sc).thenReturn(sc)
 
@@ -87,7 +97,7 @@ class CoarseMesosSchedulerBackendSuite extends SparkFunSuite
     sparkConf.set("spark.driver.port", "1234")
 
     val backend = createSchedulerBackend(taskScheduler, driver)
-    val minMem = backend.calculateTotalMemory(sc).toInt
+    val minMem = backend.calculateTotalMemory(sc)
     val minCpu = 4
 
     val mesosOffers = new java.util.ArrayList[Offer]
@@ -130,11 +140,12 @@ class CoarseMesosSchedulerBackendSuite extends SparkFunSuite
 
   test("mesos supports killing and relaunching tasks with executors") {
     val driver = mock[SchedulerDriver]
+    when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING)
     val taskScheduler = mock[TaskSchedulerImpl]
     when(taskScheduler.sc).thenReturn(sc)
 
     val backend = createSchedulerBackend(taskScheduler, driver)
-    val minMem = backend.calculateTotalMemory(sc).toInt + 1024
+    val minMem = backend.calculateTotalMemory(sc) + 1024
     val minCpu = 4
 
     val mesosOffers = new java.util.ArrayList[Offer]
diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
index d01837fe78957..5ed30f64d705f 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendSuite.scala
@@ -21,6 +21,7 @@ import java.nio.ByteBuffer
 import java.util
 import java.util.Collections
 
+import scala.collection.JavaConversions._
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
@@ -60,14 +61,17 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi
 
     val mesosSchedulerBackend = new MesosSchedulerBackend(taskScheduler, sc, "master")
 
+    val resources = List(
+      mesosSchedulerBackend.createResource("cpus", 4),
+      mesosSchedulerBackend.createResource("mem", 1024))
     // uri is null.
-    val executorInfo = mesosSchedulerBackend.createExecutorInfo("test-id")
+    val (executorInfo, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id")
     assert(executorInfo.getCommand.getValue ===
       s" /mesos-home/bin/spark-class ${classOf[MesosExecutorBackend].getName}")
 
     // uri exists.
     conf.set("spark.executor.uri", "hdfs:///test-app-1.0.0.tgz")
-    val executorInfo1 = mesosSchedulerBackend.createExecutorInfo("test-id")
+    val (executorInfo1, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id")
     assert(executorInfo1.getCommand.getValue ===
       s"cd test-app-1*;  ./bin/spark-class ${classOf[MesosExecutorBackend].getName}")
   }
@@ -93,7 +97,8 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi
 
     val backend = new MesosSchedulerBackend(taskScheduler, sc, "master")
 
-    val execInfo = backend.createExecutorInfo("mockExecutor")
+    val (execInfo, _) = backend.createExecutorInfo(
+      List(backend.createResource("cpus", 4)), "mockExecutor")
     assert(execInfo.getContainer.getDocker.getImage.equals("spark/mock"))
     val portmaps = execInfo.getContainer.getDocker.getPortMappingsList
     assert(portmaps.get(0).getHostPort.equals(80))
@@ -194,7 +199,7 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi
     )
     verify(driver, times(1)).declineOffer(mesosOffers.get(1).getId)
     verify(driver, times(1)).declineOffer(mesosOffers.get(2).getId)
-    assert(capture.getValue.size() == 1)
+    assert(capture.getValue.size() === 1)
     val taskInfo = capture.getValue.iterator().next()
     assert(taskInfo.getName.equals("n1"))
     val cpus = taskInfo.getResourcesList.get(0)
@@ -214,4 +219,97 @@ class MesosSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext wi
     backend.resourceOffers(driver, mesosOffers2)
     verify(driver, times(1)).declineOffer(mesosOffers2.get(0).getId)
   }
+
+  test("can handle multiple roles") {
+    val driver = mock[SchedulerDriver]
+    val taskScheduler = mock[TaskSchedulerImpl]
+
+    val listenerBus = mock[LiveListenerBus]
+    listenerBus.post(
+      SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
+
+    val sc = mock[SparkContext]
+    when(sc.executorMemory).thenReturn(100)
+    when(sc.getSparkHome()).thenReturn(Option("/path"))
+    when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
+    when(sc.conf).thenReturn(new SparkConf)
+    when(sc.listenerBus).thenReturn(listenerBus)
+
+    val id = 1
+    val builder = Offer.newBuilder()
+    builder.addResourcesBuilder()
+      .setName("mem")
+      .setType(Value.Type.SCALAR)
+      .setRole("prod")
+      .setScalar(Scalar.newBuilder().setValue(500))
+    builder.addResourcesBuilder()
+      .setName("cpus")
+      .setRole("prod")
+      .setType(Value.Type.SCALAR)
+      .setScalar(Scalar.newBuilder().setValue(1))
+    builder.addResourcesBuilder()
+      .setName("mem")
+      .setRole("dev")
+      .setType(Value.Type.SCALAR)
+      .setScalar(Scalar.newBuilder().setValue(600))
+    builder.addResourcesBuilder()
+      .setName("cpus")
+      .setRole("dev")
+      .setType(Value.Type.SCALAR)
+      .setScalar(Scalar.newBuilder().setValue(2))
+    val offer = builder.setId(OfferID.newBuilder().setValue(s"o${id.toString}").build())
+      .setFrameworkId(FrameworkID.newBuilder().setValue("f1"))
+      .setSlaveId(SlaveID.newBuilder().setValue(s"s${id.toString}"))
+      .setHostname(s"host${id.toString}").build()
+
+
+    val mesosOffers = new java.util.ArrayList[Offer]
+    mesosOffers.add(offer)
+
+    val backend = new MesosSchedulerBackend(taskScheduler, sc, "master")
+
+    val expectedWorkerOffers = new ArrayBuffer[WorkerOffer](1)
+    expectedWorkerOffers.append(new WorkerOffer(
+      mesosOffers.get(0).getSlaveId.getValue,
+      mesosOffers.get(0).getHostname,
+      2 // Deducting 1 for executor
+    ))
+
+    val taskDesc = new TaskDescription(1L, 0, "s1", "n1", 0, ByteBuffer.wrap(new Array[Byte](0)))
+    when(taskScheduler.resourceOffers(expectedWorkerOffers)).thenReturn(Seq(Seq(taskDesc)))
+    when(taskScheduler.CPUS_PER_TASK).thenReturn(1)
+
+    val capture = ArgumentCaptor.forClass(classOf[util.Collection[TaskInfo]])
+    when(
+      driver.launchTasks(
+        Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
+        capture.capture(),
+        any(classOf[Filters])
+      )
+    ).thenReturn(Status.valueOf(1))
+
+    backend.resourceOffers(driver, mesosOffers)
+
+    verify(driver, times(1)).launchTasks(
+      Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
+      capture.capture(),
+      any(classOf[Filters])
+    )
+
+    assert(capture.getValue.size() === 1)
+    val taskInfo = capture.getValue.iterator().next()
+    assert(taskInfo.getName.equals("n1"))
+    assert(taskInfo.getResourcesCount === 1)
+    val cpusDev = taskInfo.getResourcesList.get(0)
+    assert(cpusDev.getName.equals("cpus"))
+    assert(cpusDev.getScalar.getValue.equals(1.0))
+    assert(cpusDev.getRole.equals("dev"))
+    val executorResources = taskInfo.getExecutor.getResourcesList
+    assert(executorResources.exists { r =>
+      r.getName.equals("mem") && r.getScalar.getValue.equals(484.0) && r.getRole.equals("prod")
+    })
+    assert(executorResources.exists { r =>
+      r.getName.equals("cpus") && r.getScalar.getValue.equals(1.0) && r.getRole.equals("prod")
+    })
+  }
 }
diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md
index 1f915d8ea1d73..debdd2adf22d6 100644
--- a/docs/running-on-mesos.md
+++ b/docs/running-on-mesos.md
@@ -306,6 +306,28 @@ See the [configuration page](configuration.html) for information on Spark config
     the final overhead will be this value.
   </td>
 </tr>
+<tr>
+  <td><code>spark.mesos.principal</code></td>
+  <td>Framework principal to authenticate to Mesos</td>
+  <td>
+    Set the principal with which Spark framework will use to authenticate with Mesos.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.mesos.secret</code></td>
+  <td>Framework secret to authenticate to Mesos</td>
+  <td>
+    Set the secret with which Spark framework will use to authenticate with Mesos.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.mesos.role</code></td>
+  <td>Role for the Spark framework</td>
+  <td>
+    Set the role of this Spark framework for Mesos. Roles are used in Mesos for reservations
+    and resource weight sharing.
+  </td>
+</tr>
 <tr>
   <td><code>spark.mesos.constraints</code></td>
   <td>Attribute based constraints to be matched against when accepting resource offers.</td>

From 96aa3340f41d8de4560caec97e8f3de23252c792 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Thu, 16 Jul 2015 19:39:54 -0700
Subject: [PATCH 194/549] [SPARK-8119] HeartbeatReceiver should replace
 executors, not kill

**Symptom.** If an executor in an application times out, `HeartbeatReceiver` attempts to kill it. After this happens, however, the application never gets an executor back even when there are cluster resources available.

**Cause.** The issue is that `sc.killExecutor` automatically assumes that the application wishes to adjust its resource requirements permanently downwards. This is not the intention in `HeartbeatReceiver`, however, which simply wants a replacement for the expired executor.

**Fix.** Differentiate between the intention to kill and the intention to replace an executor with a fresh one. More details can be found in the commit message.

Author: Andrew Or <andrew@databricks.com>

Closes #7107 from andrewor14/heartbeat-no-kill and squashes the following commits:

1cd2cd7 [Andrew Or] Add regression test for SPARK-8119
25a347d [Andrew Or] Reuse more code in scheduler backend
31ebd40 [Andrew Or] Differentiate between kill and replace
---
 .../org/apache/spark/HeartbeatReceiver.scala  |   4 +-
 .../scala/org/apache/spark/SparkContext.scala |  40 ++++-
 .../CoarseGrainedSchedulerBackend.scala       |  40 +++--
 .../apache/spark/HeartbeatReceiverSuite.scala | 147 +++++++++++++++---
 4 files changed, 194 insertions(+), 37 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
index 221b1dab43278..43dd4a170731d 100644
--- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
+++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
@@ -181,7 +181,9 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock)
           // Asynchronously kill the executor to avoid blocking the current thread
           killExecutorThread.submit(new Runnable {
             override def run(): Unit = Utils.tryLogNonFatalError {
-              sc.killExecutor(executorId)
+              // Note: we want to get an executor back after expiring this one,
+              // so do not simply call `sc.killExecutor` here (SPARK-8119)
+              sc.killAndReplaceExecutor(executorId)
             }
           })
         }
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index bd1cc332a63e7..d00c012d80560 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1419,6 +1419,12 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   /**
    * :: DeveloperApi ::
    * Request that the cluster manager kill the specified executors.
+   *
+   * Note: This is an indication to the cluster manager that the application wishes to adjust
+   * its resource usage downwards. If the application wishes to replace the executors it kills
+   * through this method with new ones, it should follow up explicitly with a call to
+   * {{SparkContext#requestExecutors}}.
+   *
    * This is currently only supported in YARN mode. Return whether the request is received.
    */
   @DeveloperApi
@@ -1436,12 +1442,42 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
 
   /**
    * :: DeveloperApi ::
-   * Request that cluster manager the kill the specified executor.
-   * This is currently only supported in Yarn mode. Return whether the request is received.
+   * Request that the cluster manager kill the specified executor.
+   *
+   * Note: This is an indication to the cluster manager that the application wishes to adjust
+   * its resource usage downwards. If the application wishes to replace the executor it kills
+   * through this method with a new one, it should follow up explicitly with a call to
+   * {{SparkContext#requestExecutors}}.
+   *
+   * This is currently only supported in YARN mode. Return whether the request is received.
    */
   @DeveloperApi
   override def killExecutor(executorId: String): Boolean = super.killExecutor(executorId)
 
+  /**
+   * Request that the cluster manager kill the specified executor without adjusting the
+   * application resource requirements.
+   *
+   * The effect is that a new executor will be launched in place of the one killed by
+   * this request. This assumes the cluster manager will automatically and eventually
+   * fulfill all missing application resource requests.
+   *
+   * Note: The replace is by no means guaranteed; another application on the same cluster
+   * can steal the window of opportunity and acquire this application's resources in the
+   * mean time.
+   *
+   * This is currently only supported in YARN mode. Return whether the request is received.
+   */
+  private[spark] def killAndReplaceExecutor(executorId: String): Boolean = {
+    schedulerBackend match {
+      case b: CoarseGrainedSchedulerBackend =>
+        b.killExecutors(Seq(executorId), replace = true)
+      case _ =>
+        logWarning("Killing executors is only supported in coarse-grained mode")
+        false
+    }
+  }
+
   /** The version of Spark on which this application is running. */
   def version: String = SPARK_VERSION
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index 7c7f70d8a193b..0e3215d6e9ec8 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -371,26 +371,36 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
 
   /**
    * Request that the cluster manager kill the specified executors.
-   * Return whether the kill request is acknowledged.
+   * @return whether the kill request is acknowledged.
    */
   final override def killExecutors(executorIds: Seq[String]): Boolean = synchronized {
+    killExecutors(executorIds, replace = false)
+  }
+
+  /**
+   * Request that the cluster manager kill the specified executors.
+   *
+   * @param executorIds identifiers of executors to kill
+   * @param replace whether to replace the killed executors with new ones
+   * @return whether the kill request is acknowledged.
+   */
+  final def killExecutors(executorIds: Seq[String], replace: Boolean): Boolean = synchronized {
     logInfo(s"Requesting to kill executor(s) ${executorIds.mkString(", ")}")
-    val filteredExecutorIds = new ArrayBuffer[String]
-    executorIds.foreach { id =>
-      if (executorDataMap.contains(id)) {
-        filteredExecutorIds += id
-      } else {
-        logWarning(s"Executor to kill $id does not exist!")
-      }
+    val (knownExecutors, unknownExecutors) = executorIds.partition(executorDataMap.contains)
+    unknownExecutors.foreach { id =>
+      logWarning(s"Executor to kill $id does not exist!")
+    }
+
+    // If we do not wish to replace the executors we kill, sync the target number of executors
+    // with the cluster manager to avoid allocating new ones. When computing the new target,
+    // take into account executors that are pending to be added or removed.
+    if (!replace) {
+      doRequestTotalExecutors(numExistingExecutors + numPendingExecutors
+        - executorsPendingToRemove.size - knownExecutors.size)
     }
-    // Killing executors means effectively that we want less executors than before, so also update
-    // the target number of executors to avoid having the backend allocate new ones.
-    val newTotal = (numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size
-      - filteredExecutorIds.size)
-    doRequestTotalExecutors(newTotal)
 
-    executorsPendingToRemove ++= filteredExecutorIds
-    doKillExecutors(filteredExecutorIds)
+    executorsPendingToRemove ++= knownExecutors
+    doKillExecutors(knownExecutors)
   }
 
   /**
diff --git a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
index b31b09196608f..5a2670e4d1cf0 100644
--- a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
+++ b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
@@ -17,6 +17,9 @@
 
 package org.apache.spark
 
+import java.util.concurrent.{ExecutorService, TimeUnit}
+
+import scala.collection.mutable
 import scala.language.postfixOps
 
 import org.scalatest.{BeforeAndAfterEach, PrivateMethodTester}
@@ -25,11 +28,16 @@ import org.mockito.Matchers
 import org.mockito.Matchers._
 
 import org.apache.spark.executor.TaskMetrics
-import org.apache.spark.rpc.RpcEndpointRef
+import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEnv, RpcEndpointRef}
 import org.apache.spark.scheduler._
+import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
+import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
 import org.apache.spark.storage.BlockManagerId
 import org.apache.spark.util.ManualClock
 
+/**
+ * A test suite for the heartbeating behavior between the driver and the executors.
+ */
 class HeartbeatReceiverSuite
   extends SparkFunSuite
   with BeforeAndAfterEach
@@ -40,23 +48,40 @@ class HeartbeatReceiverSuite
   private val executorId2 = "executor-2"
 
   // Shared state that must be reset before and after each test
-  private var scheduler: TaskScheduler = null
+  private var scheduler: TaskSchedulerImpl = null
   private var heartbeatReceiver: HeartbeatReceiver = null
   private var heartbeatReceiverRef: RpcEndpointRef = null
   private var heartbeatReceiverClock: ManualClock = null
 
+  // Helper private method accessors for HeartbeatReceiver
+  private val _executorLastSeen = PrivateMethod[collection.Map[String, Long]]('executorLastSeen)
+  private val _executorTimeoutMs = PrivateMethod[Long]('executorTimeoutMs)
+  private val _killExecutorThread = PrivateMethod[ExecutorService]('killExecutorThread)
+
+  /**
+   * Before each test, set up the SparkContext and a custom [[HeartbeatReceiver]]
+   * that uses a manual clock.
+   */
   override def beforeEach(): Unit = {
-    sc = spy(new SparkContext("local[2]", "test"))
-    scheduler = mock(classOf[TaskScheduler])
+    val conf = new SparkConf()
+      .setMaster("local[2]")
+      .setAppName("test")
+      .set("spark.dynamicAllocation.testing", "true")
+    sc = spy(new SparkContext(conf))
+    scheduler = mock(classOf[TaskSchedulerImpl])
     when(sc.taskScheduler).thenReturn(scheduler)
+    when(scheduler.sc).thenReturn(sc)
     heartbeatReceiverClock = new ManualClock
     heartbeatReceiver = new HeartbeatReceiver(sc, heartbeatReceiverClock)
     heartbeatReceiverRef = sc.env.rpcEnv.setupEndpoint("heartbeat", heartbeatReceiver)
     when(scheduler.executorHeartbeatReceived(any(), any(), any())).thenReturn(true)
   }
 
+  /**
+   * After each test, clean up all state and stop the [[SparkContext]].
+   */
   override def afterEach(): Unit = {
-    resetSparkContext()
+    super.afterEach()
     scheduler = null
     heartbeatReceiver = null
     heartbeatReceiverRef = null
@@ -75,7 +100,7 @@ class HeartbeatReceiverSuite
     heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId2, null))
     triggerHeartbeat(executorId1, executorShouldReregister = false)
     triggerHeartbeat(executorId2, executorShouldReregister = false)
-    val trackedExecutors = executorLastSeen(heartbeatReceiver)
+    val trackedExecutors = heartbeatReceiver.invokePrivate(_executorLastSeen())
     assert(trackedExecutors.size === 2)
     assert(trackedExecutors.contains(executorId1))
     assert(trackedExecutors.contains(executorId2))
@@ -83,15 +108,15 @@ class HeartbeatReceiverSuite
 
   test("reregister if scheduler is not ready yet") {
     heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId1, null))
-    // Task scheduler not set in HeartbeatReceiver
+    // Task scheduler is not set yet in HeartbeatReceiver, so executors should reregister
     triggerHeartbeat(executorId1, executorShouldReregister = true)
   }
 
   test("reregister if heartbeat from unregistered executor") {
     heartbeatReceiverRef.askWithRetry[Boolean](TaskSchedulerIsSet)
-    // Received heartbeat from unknown receiver, so we ask it to re-register
+    // Received heartbeat from unknown executor, so we ask it to re-register
     triggerHeartbeat(executorId1, executorShouldReregister = true)
-    assert(executorLastSeen(heartbeatReceiver).isEmpty)
+    assert(heartbeatReceiver.invokePrivate(_executorLastSeen()).isEmpty)
   }
 
   test("reregister if heartbeat from removed executor") {
@@ -104,14 +129,14 @@ class HeartbeatReceiverSuite
     // A heartbeat from the second executor should require reregistering
     triggerHeartbeat(executorId1, executorShouldReregister = false)
     triggerHeartbeat(executorId2, executorShouldReregister = true)
-    val trackedExecutors = executorLastSeen(heartbeatReceiver)
+    val trackedExecutors = heartbeatReceiver.invokePrivate(_executorLastSeen())
     assert(trackedExecutors.size === 1)
     assert(trackedExecutors.contains(executorId1))
     assert(!trackedExecutors.contains(executorId2))
   }
 
   test("expire dead hosts") {
-    val executorTimeout = executorTimeoutMs(heartbeatReceiver)
+    val executorTimeout = heartbeatReceiver.invokePrivate(_executorTimeoutMs())
     heartbeatReceiverRef.askWithRetry[Boolean](TaskSchedulerIsSet)
     heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId1, null))
     heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId2, null))
@@ -124,12 +149,61 @@ class HeartbeatReceiverSuite
     heartbeatReceiverRef.askWithRetry[Boolean](ExpireDeadHosts)
     // Only the second executor should be expired as a dead host
     verify(scheduler).executorLost(Matchers.eq(executorId2), any())
-    val trackedExecutors = executorLastSeen(heartbeatReceiver)
+    val trackedExecutors = heartbeatReceiver.invokePrivate(_executorLastSeen())
     assert(trackedExecutors.size === 1)
     assert(trackedExecutors.contains(executorId1))
     assert(!trackedExecutors.contains(executorId2))
   }
 
+  test("expire dead hosts should kill executors with replacement (SPARK-8119)") {
+    // Set up a fake backend and cluster manager to simulate killing executors
+    val rpcEnv = sc.env.rpcEnv
+    val fakeClusterManager = new FakeClusterManager(rpcEnv)
+    val fakeClusterManagerRef = rpcEnv.setupEndpoint("fake-cm", fakeClusterManager)
+    val fakeSchedulerBackend = new FakeSchedulerBackend(scheduler, rpcEnv, fakeClusterManagerRef)
+    when(sc.schedulerBackend).thenReturn(fakeSchedulerBackend)
+
+    // Register fake executors with our fake scheduler backend
+    // This is necessary because the backend refuses to kill executors it does not know about
+    fakeSchedulerBackend.start()
+    val dummyExecutorEndpoint1 = new FakeExecutorEndpoint(rpcEnv)
+    val dummyExecutorEndpoint2 = new FakeExecutorEndpoint(rpcEnv)
+    val dummyExecutorEndpointRef1 = rpcEnv.setupEndpoint("fake-executor-1", dummyExecutorEndpoint1)
+    val dummyExecutorEndpointRef2 = rpcEnv.setupEndpoint("fake-executor-2", dummyExecutorEndpoint2)
+    fakeSchedulerBackend.driverEndpoint.askWithRetry[RegisteredExecutor.type](
+      RegisterExecutor(executorId1, dummyExecutorEndpointRef1, "dummy:4040", 0, Map.empty))
+    fakeSchedulerBackend.driverEndpoint.askWithRetry[RegisteredExecutor.type](
+      RegisterExecutor(executorId2, dummyExecutorEndpointRef2, "dummy:4040", 0, Map.empty))
+    heartbeatReceiverRef.askWithRetry[Boolean](TaskSchedulerIsSet)
+    heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId1, null))
+    heartbeatReceiver.onExecutorAdded(SparkListenerExecutorAdded(0, executorId2, null))
+    triggerHeartbeat(executorId1, executorShouldReregister = false)
+    triggerHeartbeat(executorId2, executorShouldReregister = false)
+
+    // Adjust the target number of executors on the cluster manager side
+    assert(fakeClusterManager.getTargetNumExecutors === 0)
+    sc.requestTotalExecutors(2)
+    assert(fakeClusterManager.getTargetNumExecutors === 2)
+    assert(fakeClusterManager.getExecutorIdsToKill.isEmpty)
+
+    // Expire the executors. This should trigger our fake backend to kill the executors.
+    // Since the kill request is sent to the cluster manager asynchronously, we need to block
+    // on the kill thread to ensure that the cluster manager actually received our requests.
+    // Here we use a timeout of O(seconds), but in practice this whole test takes O(10ms).
+    val executorTimeout = heartbeatReceiver.invokePrivate(_executorTimeoutMs())
+    heartbeatReceiverClock.advance(executorTimeout * 2)
+    heartbeatReceiverRef.askWithRetry[Boolean](ExpireDeadHosts)
+    val killThread = heartbeatReceiver.invokePrivate(_killExecutorThread())
+    killThread.shutdown() // needed for awaitTermination
+    killThread.awaitTermination(10L, TimeUnit.SECONDS)
+
+    // The target number of executors should not change! Otherwise, having an expired
+    // executor means we permanently adjust the target number downwards until we
+    // explicitly request new executors. For more detail, see SPARK-8119.
+    assert(fakeClusterManager.getTargetNumExecutors === 2)
+    assert(fakeClusterManager.getExecutorIdsToKill === Set(executorId1, executorId2))
+  }
+
   /** Manually send a heartbeat and return the response. */
   private def triggerHeartbeat(
       executorId: String,
@@ -148,14 +222,49 @@ class HeartbeatReceiverSuite
     }
   }
 
-  // Helper methods to access private fields in HeartbeatReceiver
-  private val _executorLastSeen = PrivateMethod[collection.Map[String, Long]]('executorLastSeen)
-  private val _executorTimeoutMs = PrivateMethod[Long]('executorTimeoutMs)
-  private def executorLastSeen(receiver: HeartbeatReceiver): collection.Map[String, Long] = {
-    receiver invokePrivate _executorLastSeen()
+}
+
+// TODO: use these classes to add end-to-end tests for dynamic allocation!
+
+/**
+ * Dummy RPC endpoint to simulate executors.
+ */
+private class FakeExecutorEndpoint(override val rpcEnv: RpcEnv) extends RpcEndpoint
+
+/**
+ * Dummy scheduler backend to simulate executor allocation requests to the cluster manager.
+ */
+private class FakeSchedulerBackend(
+    scheduler: TaskSchedulerImpl,
+    rpcEnv: RpcEnv,
+    clusterManagerEndpoint: RpcEndpointRef)
+  extends CoarseGrainedSchedulerBackend(scheduler, rpcEnv) {
+
+  protected override def doRequestTotalExecutors(requestedTotal: Int): Boolean = {
+    clusterManagerEndpoint.askWithRetry[Boolean](RequestExecutors(requestedTotal))
   }
-  private def executorTimeoutMs(receiver: HeartbeatReceiver): Long = {
-    receiver invokePrivate _executorTimeoutMs()
+
+  protected override def doKillExecutors(executorIds: Seq[String]): Boolean = {
+    clusterManagerEndpoint.askWithRetry[Boolean](KillExecutors(executorIds))
   }
+}
 
+/**
+ * Dummy cluster manager to simulate responses to executor allocation requests.
+ */
+private class FakeClusterManager(override val rpcEnv: RpcEnv) extends RpcEndpoint {
+  private var targetNumExecutors = 0
+  private val executorIdsToKill = new mutable.HashSet[String]
+
+  def getTargetNumExecutors: Int = targetNumExecutors
+  def getExecutorIdsToKill: Set[String] = executorIdsToKill.toSet
+
+  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
+    case RequestExecutors(requestedTotal) =>
+      targetNumExecutors = requestedTotal
+      context.reply(true)
+    case KillExecutors(executorIds) =>
+      executorIdsToKill ++= executorIds
+      context.reply(true)
+  }
 }

From 812b63bbee8d0b30884f7a96b207e8834b774957 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Thu, 16 Jul 2015 21:09:09 -0700
Subject: [PATCH 195/549] [SPARK-8857][SPARK-8859][Core]Add an internal flag to
 Accumulable and send internal accumulator updates to the driver via
 heartbeats

This PR includes the following changes:

1. Remove the thread local `Accumulators.localAccums`. Instead, all Accumulators in the executors will register with its TaskContext.
2. Add an internal flag to Accumulable. For internal Accumulators, their updates will be sent to the driver via heartbeats.

Author: zsxwing <zsxwing@gmail.com>

Closes #7448 from zsxwing/accumulators and squashes the following commits:

c24bc5b [zsxwing] Add comments
bd7dcf1 [zsxwing] Add an internal flag to Accumulable and send internal accumulator updates to the driver via heartbeats
---
 .../scala/org/apache/spark/Accumulators.scala | 68 ++++++++-----------
 .../scala/org/apache/spark/TaskContext.scala  | 18 +++++
 .../org/apache/spark/TaskContextImpl.scala    | 19 +++++-
 .../org/apache/spark/executor/Executor.scala  |  6 +-
 .../apache/spark/executor/TaskMetrics.scala   | 16 +++++
 .../apache/spark/scheduler/DAGScheduler.scala |  3 +-
 .../spark/scheduler/DAGSchedulerEvent.scala   |  2 +-
 .../org/apache/spark/scheduler/Task.scala     | 13 +++-
 .../apache/spark/scheduler/TaskResult.scala   |  8 ++-
 .../spark/scheduler/TaskSetManagerSuite.scala |  7 +-
 10 files changed, 104 insertions(+), 56 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala
index 5a8d17bd99933..2f4fcac890eef 100644
--- a/core/src/main/scala/org/apache/spark/Accumulators.scala
+++ b/core/src/main/scala/org/apache/spark/Accumulators.scala
@@ -20,7 +20,8 @@ package org.apache.spark
 import java.io.{ObjectInputStream, Serializable}
 
 import scala.collection.generic.Growable
-import scala.collection.mutable.Map
+import scala.collection.Map
+import scala.collection.mutable
 import scala.ref.WeakReference
 import scala.reflect.ClassTag
 
@@ -39,25 +40,44 @@ import org.apache.spark.util.Utils
  * @param initialValue initial value of accumulator
  * @param param helper object defining how to add elements of type `R` and `T`
  * @param name human-readable name for use in Spark's web UI
+ * @param internal if this [[Accumulable]] is internal. Internal [[Accumulable]]s will be reported
+ *                 to the driver via heartbeats. For internal [[Accumulable]]s, `R` must be
+ *                 thread safe so that they can be reported correctly.
  * @tparam R the full accumulated data (result type)
  * @tparam T partial data that can be added in
  */
-class Accumulable[R, T] (
+class Accumulable[R, T] private[spark] (
     @transient initialValue: R,
     param: AccumulableParam[R, T],
-    val name: Option[String])
+    val name: Option[String],
+    internal: Boolean)
   extends Serializable {
 
+  private[spark] def this(
+      @transient initialValue: R, param: AccumulableParam[R, T], internal: Boolean) = {
+    this(initialValue, param, None, internal)
+  }
+
+  def this(@transient initialValue: R, param: AccumulableParam[R, T], name: Option[String]) =
+    this(initialValue, param, name, false)
+
   def this(@transient initialValue: R, param: AccumulableParam[R, T]) =
     this(initialValue, param, None)
 
   val id: Long = Accumulators.newId
 
-  @transient private var value_ = initialValue // Current value on master
+  @volatile @transient private var value_ : R = initialValue // Current value on master
   val zero = param.zero(initialValue)  // Zero value to be passed to workers
   private var deserialized = false
 
-  Accumulators.register(this, true)
+  Accumulators.register(this)
+
+  /**
+   * If this [[Accumulable]] is internal. Internal [[Accumulable]]s will be reported to the driver
+   * via heartbeats. For internal [[Accumulable]]s, `R` must be thread safe so that they can be
+   * reported correctly.
+   */
+  private[spark] def isInternal: Boolean = internal
 
   /**
    * Add more data to this accumulator / accumulable
@@ -132,7 +152,8 @@ class Accumulable[R, T] (
     in.defaultReadObject()
     value_ = zero
     deserialized = true
-    Accumulators.register(this, false)
+    val taskContext = TaskContext.get()
+    taskContext.registerAccumulator(this)
   }
 
   override def toString: String = if (value_ == null) "null" else value_.toString
@@ -284,16 +305,7 @@ private[spark] object Accumulators extends Logging {
    * It keeps weak references to these objects so that accumulators can be garbage-collected
    * once the RDDs and user-code that reference them are cleaned up.
    */
-  val originals = Map[Long, WeakReference[Accumulable[_, _]]]()
-
-  /**
-   * This thread-local map holds per-task copies of accumulators; it is used to collect the set
-   * of accumulator updates to send back to the driver when tasks complete. After tasks complete,
-   * this map is cleared by `Accumulators.clear()` (see Executor.scala).
-   */
-  private val localAccums = new ThreadLocal[Map[Long, Accumulable[_, _]]]() {
-    override protected def initialValue() = Map[Long, Accumulable[_, _]]()
-  }
+  val originals = mutable.Map[Long, WeakReference[Accumulable[_, _]]]()
 
   private var lastId: Long = 0
 
@@ -302,19 +314,8 @@ private[spark] object Accumulators extends Logging {
     lastId
   }
 
-  def register(a: Accumulable[_, _], original: Boolean): Unit = synchronized {
-    if (original) {
-      originals(a.id) = new WeakReference[Accumulable[_, _]](a)
-    } else {
-      localAccums.get()(a.id) = a
-    }
-  }
-
-  // Clear the local (non-original) accumulators for the current thread
-  def clear() {
-    synchronized {
-      localAccums.get.clear()
-    }
+  def register(a: Accumulable[_, _]): Unit = synchronized {
+    originals(a.id) = new WeakReference[Accumulable[_, _]](a)
   }
 
   def remove(accId: Long) {
@@ -323,15 +324,6 @@ private[spark] object Accumulators extends Logging {
     }
   }
 
-  // Get the values of the local accumulators for the current thread (by ID)
-  def values: Map[Long, Any] = synchronized {
-    val ret = Map[Long, Any]()
-    for ((id, accum) <- localAccums.get) {
-      ret(id) = accum.localValue
-    }
-    return ret
-  }
-
   // Add values to the original accumulators with some given IDs
   def add(values: Map[Long, Any]): Unit = synchronized {
     for ((id, value) <- values) {
diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala
index 248339148d9b7..345bb500a7dec 100644
--- a/core/src/main/scala/org/apache/spark/TaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContext.scala
@@ -152,4 +152,22 @@ abstract class TaskContext extends Serializable {
    * Returns the manager for this task's managed memory.
    */
   private[spark] def taskMemoryManager(): TaskMemoryManager
+
+  /**
+   * Register an accumulator that belongs to this task. Accumulators must call this method when
+   * deserializing in executors.
+   */
+  private[spark] def registerAccumulator(a: Accumulable[_, _]): Unit
+
+  /**
+   * Return the local values of internal accumulators that belong to this task. The key of the Map
+   * is the accumulator id and the value of the Map is the latest accumulator local value.
+   */
+  private[spark] def collectInternalAccumulators(): Map[Long, Any]
+
+  /**
+   * Return the local values of accumulators that belong to this task. The key of the Map is the
+   * accumulator id and the value of the Map is the latest accumulator local value.
+   */
+  private[spark] def collectAccumulators(): Map[Long, Any]
 }
diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
index b4d572cb52313..6e394f1b12445 100644
--- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
@@ -17,12 +17,12 @@
 
 package org.apache.spark
 
+import scala.collection.mutable.{ArrayBuffer, HashMap}
+
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.unsafe.memory.TaskMemoryManager
 import org.apache.spark.util.{TaskCompletionListener, TaskCompletionListenerException}
 
-import scala.collection.mutable.ArrayBuffer
-
 private[spark] class TaskContextImpl(
     val stageId: Int,
     val partitionId: Int,
@@ -94,5 +94,18 @@ private[spark] class TaskContextImpl(
   override def isRunningLocally(): Boolean = runningLocally
 
   override def isInterrupted(): Boolean = interrupted
-}
 
+  @transient private val accumulators = new HashMap[Long, Accumulable[_, _]]
+
+  private[spark] override def registerAccumulator(a: Accumulable[_, _]): Unit = synchronized {
+    accumulators(a.id) = a
+  }
+
+  private[spark] override def collectInternalAccumulators(): Map[Long, Any] = synchronized {
+    accumulators.filter(_._2.isInternal).mapValues(_.localValue).toMap
+  }
+
+  private[spark] override def collectAccumulators(): Map[Long, Any] = synchronized {
+    accumulators.mapValues(_.localValue).toMap
+  }
+}
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 1a02051c87f19..9087debde8c41 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -209,7 +209,7 @@ private[spark] class Executor(
 
         // Run the actual task and measure its runtime.
         taskStart = System.currentTimeMillis()
-        val value = try {
+        val (value, accumUpdates) = try {
           task.run(taskAttemptId = taskId, attemptNumber = attemptNumber)
         } finally {
           // Note: this memory freeing logic is duplicated in DAGScheduler.runLocallyWithinThread;
@@ -247,7 +247,6 @@ private[spark] class Executor(
           m.setResultSerializationTime(afterSerialization - beforeSerialization)
         }
 
-        val accumUpdates = Accumulators.values
         val directResult = new DirectTaskResult(valueBytes, accumUpdates, task.metrics.orNull)
         val serializedDirectResult = ser.serialize(directResult)
         val resultSize = serializedDirectResult.limit
@@ -314,8 +313,6 @@ private[spark] class Executor(
         env.shuffleMemoryManager.releaseMemoryForThisThread()
         // Release memory used by this thread for unrolling blocks
         env.blockManager.memoryStore.releaseUnrollMemoryForThisThread()
-        // Release memory used by this thread for accumulators
-        Accumulators.clear()
         runningTasks.remove(taskId)
       }
     }
@@ -424,6 +421,7 @@ private[spark] class Executor(
           metrics.updateShuffleReadMetrics()
           metrics.updateInputMetrics()
           metrics.setJvmGCTime(curGCTime - taskRunner.startGCTime)
+          metrics.updateAccumulators()
 
           if (isLocal) {
             // JobProgressListener will hold an reference of it during
diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
index e80feeeab4142..42207a9553592 100644
--- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
+++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
@@ -223,6 +223,22 @@ class TaskMetrics extends Serializable {
     // overhead.
     _hostname = TaskMetrics.getCachedHostName(_hostname)
   }
+
+  private var _accumulatorUpdates: Map[Long, Any] = Map.empty
+  @transient private var _accumulatorsUpdater: () => Map[Long, Any] = null
+
+  private[spark] def updateAccumulators(): Unit = synchronized {
+    _accumulatorUpdates = _accumulatorsUpdater()
+  }
+
+  /**
+   * Return the latest updates of accumulators in this task.
+   */
+  def accumulatorUpdates(): Map[Long, Any] = _accumulatorUpdates
+
+  private[spark] def setAccumulatorsUpdater(accumulatorsUpdater: () => Map[Long, Any]): Unit = {
+    _accumulatorsUpdater = accumulatorsUpdater
+  }
 }
 
 private[spark] object TaskMetrics {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index f8ba3d2d1a9b2..dd55cd8054332 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -22,7 +22,8 @@ import java.util.Properties
 import java.util.concurrent.TimeUnit
 import java.util.concurrent.atomic.AtomicInteger
 
-import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map, Stack}
+import scala.collection.Map
+import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Stack}
 import scala.concurrent.duration._
 import scala.language.existentials
 import scala.language.postfixOps
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala
index 2b6f7e4205c32..a927eae2b04be 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala
@@ -19,7 +19,7 @@ package org.apache.spark.scheduler
 
 import java.util.Properties
 
-import scala.collection.mutable.Map
+import scala.collection.Map
 import scala.language.existentials
 
 import org.apache.spark._
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
index 15101c64f0503..6a86f9d4b8530 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -45,14 +45,20 @@ import org.apache.spark.util.Utils
  */
 private[spark] abstract class Task[T](val stageId: Int, var partitionId: Int) extends Serializable {
 
+  /**
+   * The key of the Map is the accumulator id and the value of the Map is the latest accumulator
+   * local value.
+   */
+  type AccumulatorUpdates = Map[Long, Any]
+
   /**
    * Called by [[Executor]] to run this task.
    *
    * @param taskAttemptId an identifier for this task attempt that is unique within a SparkContext.
    * @param attemptNumber how many times this task has been attempted (0 for the first attempt)
-   * @return the result of the task
+   * @return the result of the task along with updates of Accumulators.
    */
-  final def run(taskAttemptId: Long, attemptNumber: Int): T = {
+  final def run(taskAttemptId: Long, attemptNumber: Int): (T, AccumulatorUpdates) = {
     context = new TaskContextImpl(
       stageId = stageId,
       partitionId = partitionId,
@@ -62,12 +68,13 @@ private[spark] abstract class Task[T](val stageId: Int, var partitionId: Int) ex
       runningLocally = false)
     TaskContext.setTaskContext(context)
     context.taskMetrics.setHostname(Utils.localHostName())
+    context.taskMetrics.setAccumulatorsUpdater(context.collectInternalAccumulators)
     taskThread = Thread.currentThread()
     if (_killed) {
       kill(interruptThread = false)
     }
     try {
-      runTask(context)
+      (runTask(context), context.collectAccumulators())
     } finally {
       context.markTaskCompleted()
       TaskContext.unset()
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
index 8b2a742b96988..b82c7f3fa54f8 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala
@@ -20,7 +20,8 @@ package org.apache.spark.scheduler
 import java.io._
 import java.nio.ByteBuffer
 
-import scala.collection.mutable.Map
+import scala.collection.Map
+import scala.collection.mutable
 
 import org.apache.spark.SparkEnv
 import org.apache.spark.executor.TaskMetrics
@@ -69,10 +70,11 @@ class DirectTaskResult[T](var valueBytes: ByteBuffer, var accumUpdates: Map[Long
     if (numUpdates == 0) {
       accumUpdates = null
     } else {
-      accumUpdates = Map()
+      val _accumUpdates = mutable.Map[Long, Any]()
       for (i <- 0 until numUpdates) {
-        accumUpdates(in.readLong()) = in.readObject()
+        _accumUpdates(in.readLong()) = in.readObject()
       }
+      accumUpdates = _accumUpdates
     }
     metrics = in.readObject().asInstanceOf[TaskMetrics]
     valueObjectDeserialized = false
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
index 0060f3396dcde..cdae0d83d01dc 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
@@ -19,12 +19,13 @@ package org.apache.spark.scheduler
 
 import java.util.Random
 
-import scala.collection.mutable.ArrayBuffer
+import scala.collection.Map
 import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark._
 import org.apache.spark.executor.TaskMetrics
-import org.apache.spark.util.{ManualClock, Utils}
+import org.apache.spark.util.ManualClock
 
 class FakeDAGScheduler(sc: SparkContext, taskScheduler: FakeTaskScheduler)
   extends DAGScheduler(sc) {
@@ -37,7 +38,7 @@ class FakeDAGScheduler(sc: SparkContext, taskScheduler: FakeTaskScheduler)
       task: Task[_],
       reason: TaskEndReason,
       result: Any,
-      accumUpdates: mutable.Map[Long, Any],
+      accumUpdates: Map[Long, Any],
       taskInfo: TaskInfo,
       taskMetrics: TaskMetrics) {
     taskScheduler.endedTasks(taskInfo.index) = reason

From f893955b9cc6ea456fc5845890893c08d8878481 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Thu, 16 Jul 2015 21:41:36 -0700
Subject: [PATCH 196/549] [SPARK-8899] [SQL] remove duplicated equals method
 for Row

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7291 from cloud-fan/row and squashes the following commits:

a11addf [Wenchen Fan] move hashCode back to internal row
2de6180 [Wenchen Fan] making apply() call to get()
fbe1b24 [Wenchen Fan] add null check
ebdf148 [Wenchen Fan] address comments
25ef087 [Wenchen Fan] remove duplicated equals method for Row
---
 .../sql/catalyst/expressions/UnsafeRow.java   |  5 ---
 .../main/scala/org/apache/spark/sql/Row.scala | 44 +++++++++++++++++--
 .../spark/sql/catalyst/InternalRow.scala      | 37 +---------------
 .../spark/sql/catalyst/expressions/Cast.scala |  1 -
 .../sql/catalyst/expressions/Projection.scala | 12 ++---
 .../expressions/SpecificMutableRow.scala      |  2 +-
 .../spark/sql/catalyst/expressions/rows.scala | 23 +---------
 7 files changed, 50 insertions(+), 74 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 4b99030d1046f..87294a0e21441 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -246,11 +246,6 @@ public void setFloat(int ordinal, float value) {
     PlatformDependent.UNSAFE.putFloat(baseObject, getFieldOffset(ordinal), value);
   }
 
-  @Override
-  public int size() {
-    return numFields;
-  }
-
   /**
    * Returns the object for column `i`, which should not be primitive type.
    */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 0f2fd6a86d177..5f0592dc1d77b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.GenericRow
 import org.apache.spark.sql.types.StructType
 
@@ -151,7 +152,7 @@ trait Row extends Serializable {
    *   StructType -> org.apache.spark.sql.Row
    * }}}
    */
-  def apply(i: Int): Any
+  def apply(i: Int): Any = get(i)
 
   /**
    * Returns the value at position i. If the value is null, null is returned. The following
@@ -176,10 +177,10 @@ trait Row extends Serializable {
    *   StructType -> org.apache.spark.sql.Row
    * }}}
    */
-  def get(i: Int): Any = apply(i)
+  def get(i: Int): Any
 
   /** Checks whether the value at position i is null. */
-  def isNullAt(i: Int): Boolean = apply(i) == null
+  def isNullAt(i: Int): Boolean = get(i) == null
 
   /**
    * Returns the value at position i as a primitive boolean.
@@ -311,7 +312,7 @@ trait Row extends Serializable {
    *
    * @throws ClassCastException when data type does not match.
    */
-  def getAs[T](i: Int): T = apply(i).asInstanceOf[T]
+  def getAs[T](i: Int): T = get(i).asInstanceOf[T]
 
   /**
    * Returns the value of a given fieldName.
@@ -363,6 +364,41 @@ trait Row extends Serializable {
     false
   }
 
+  protected def canEqual(other: Any) =
+    other.isInstanceOf[Row] && !other.isInstanceOf[InternalRow]
+
+  override def equals(o: Any): Boolean = {
+    if (o == null || !canEqual(o)) return false
+
+    val other = o.asInstanceOf[Row]
+    if (length != other.length) {
+      return false
+    }
+
+    var i = 0
+    while (i < length) {
+      if (isNullAt(i) != other.isNullAt(i)) {
+        return false
+      }
+      if (!isNullAt(i)) {
+        val o1 = get(i)
+        val o2 = other.get(i)
+        if (o1.isInstanceOf[Array[Byte]]) {
+          // handle equality of Array[Byte]
+          val b1 = o1.asInstanceOf[Array[Byte]]
+          if (!o2.isInstanceOf[Array[Byte]] ||
+            !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
+            return false
+          }
+        } else if (o1 != o2) {
+          return false
+        }
+      }
+      i += 1
+    }
+    return true
+  }
+
   /* ---------------------- utility methods for Scala ---------------------- */
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index 57de0f26a9720..e2fafb88ee43e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -53,41 +53,8 @@ abstract class InternalRow extends Row {
 
   // A default implementation to change the return type
   override def copy(): InternalRow = this
-  override def apply(i: Int): Any = get(i)
 
-  override def equals(o: Any): Boolean = {
-    if (!o.isInstanceOf[Row]) {
-      return false
-    }
-
-    val other = o.asInstanceOf[Row]
-    if (length != other.length) {
-      return false
-    }
-
-    var i = 0
-    while (i < length) {
-      if (isNullAt(i) != other.isNullAt(i)) {
-        return false
-      }
-      if (!isNullAt(i)) {
-        val o1 = apply(i)
-        val o2 = other.apply(i)
-        if (o1.isInstanceOf[Array[Byte]]) {
-          // handle equality of Array[Byte]
-          val b1 = o1.asInstanceOf[Array[Byte]]
-          if (!o2.isInstanceOf[Array[Byte]] ||
-            !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
-            return false
-          }
-        } else if (o1 != o2) {
-          return false
-        }
-      }
-      i += 1
-    }
-    true
-  }
+  protected override def canEqual(other: Any) = other.isInstanceOf[InternalRow]
 
   // Custom hashCode function that matches the efficient code generated version.
   override def hashCode: Int = {
@@ -98,7 +65,7 @@ abstract class InternalRow extends Row {
         if (isNullAt(i)) {
           0
         } else {
-          apply(i) match {
+          get(i) match {
             case b: Boolean => if (b) 0 else 1
             case b: Byte => b.toInt
             case s: Short => s.toInt
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 83d5b3b76b0a3..65ae87fe6d166 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -56,7 +56,6 @@ object Cast {
     case (_, DateType) => true
 
     case (StringType, IntervalType) => true
-    case (IntervalType, StringType) => true
 
     case (StringType, _: NumericType) => true
     case (BooleanType, _: NumericType) => true
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index 886a486bf5ee0..bf47a6c75b809 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -110,7 +110,7 @@ class JoinedRow extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
-  override def apply(i: Int): Any =
+  override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
   override def isNullAt(i: Int): Boolean =
@@ -204,7 +204,7 @@ class JoinedRow2 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
-  override def apply(i: Int): Any =
+  override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
   override def isNullAt(i: Int): Boolean =
@@ -292,7 +292,7 @@ class JoinedRow3 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
-  override def apply(i: Int): Any =
+  override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
   override def isNullAt(i: Int): Boolean =
@@ -380,7 +380,7 @@ class JoinedRow4 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
-  override def apply(i: Int): Any =
+  override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
   override def isNullAt(i: Int): Boolean =
@@ -468,7 +468,7 @@ class JoinedRow5 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
-  override def apply(i: Int): Any =
+  override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
   override def isNullAt(i: Int): Boolean =
@@ -556,7 +556,7 @@ class JoinedRow6 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
-  override def apply(i: Int): Any =
+  override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
   override def isNullAt(i: Int): Boolean =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
index efa24710a5a67..6f291d2c86c1e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
@@ -219,7 +219,7 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
     values(i).isNull = true
   }
 
-  override def apply(i: Int): Any = values(i).boxed
+  override def get(i: Int): Any = values(i).boxed
 
   override def isNullAt(i: Int): Boolean = values(i).isNull
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index 094904bbf9c15..d78be5a5958f9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -66,7 +66,7 @@ trait ArrayBackedRow {
 
   def length: Int = values.length
 
-  override def apply(i: Int): Any = values(i)
+  override def get(i: Int): Any = values(i)
 
   def setNullAt(i: Int): Unit = { values(i) = null}
 
@@ -84,27 +84,6 @@ class GenericRow(protected[sql] val values: Array[Any]) extends Row with ArrayBa
 
   def this(size: Int) = this(new Array[Any](size))
 
-  // This is used by test or outside
-  override def equals(o: Any): Boolean = o match {
-    case other: Row if other.length == length =>
-      var i = 0
-      while (i < length) {
-        if (isNullAt(i) != other.isNullAt(i)) {
-          return false
-        }
-        val equal = (apply(i), other.apply(i)) match {
-          case (a: Array[Byte], b: Array[Byte]) => java.util.Arrays.equals(a, b)
-          case (a, b) => a == b
-        }
-        if (!equal) {
-          return false
-        }
-        i += 1
-      }
-      true
-    case _ => false
-  }
-
   override def copy(): Row = this
 }
 

From 322d286bb7773389ed07df96290e427b21c775bd Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Thu, 16 Jul 2015 22:26:59 -0700
Subject: [PATCH 197/549] [SPARK-7131] [ML] Copy Decision Tree, Random Forest
 impl to spark.ml

This PR copies the RandomForest implementation from spark.mllib to spark.ml.  Note that this includes the DecisionTree implementation, but not the GradientBoostedTrees one (which will come later).

I essentially copied a minimal amount of code to spark.ml, removed the use of bins (and only used splits), and modified code only as much as necessary to get it to compile.  The spark.ml implementation still uses some spark.mllib classes (privately), which can be moved in future PRs.

This refactoring will be helpful in extending the node representation to include more information, such as class probabilities.

Specifically:
* Copied code from spark.mllib to spark.ml:
  * mllib.tree.DecisionTree, mllib.tree.RandomForest copied to ml.tree.impl.RandomForest (main implementation)
  * NodeIdCache (needed to use splits instead of bins)
  * TreePoint (use splits instead of bins)
* Added ml.tree.LearningNode used in RandomForest training (needed vars)
* Removed bins from implementation, and only used splits
* Small fix in JavaDecisionTreeRegressorSuite

CC: mengxr  manishamde  codedeft chouqin

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #7294 from jkbradley/dt-move-impl and squashes the following commits:

48749be [Joseph K. Bradley] cleanups based on code review, mostly style
bea9703 [Joseph K. Bradley] scala style fixes.  added some scala doc
4e6d2a4 [Joseph K. Bradley] removed unnecessary use of copyValues, setParent for trees
9a4d721 [Joseph K. Bradley] cleanups. removed InfoGainStats from ml, using old one for now.
836e7d4 [Joseph K. Bradley] Fixed test suite failures
bd5e063 [Joseph K. Bradley] fixed bucketizing issue
0df3759 [Joseph K. Bradley] Need to remove use of Bucketizer
d5224a9 [Joseph K. Bradley] modified tree and forest to use moved impl
cc01823 [Joseph K. Bradley] still editing RF to get it to work
19143fb [Joseph K. Bradley] More progress, but not done yet.  Rebased with master after 1.4 release.
---
 .../DecisionTreeClassifier.scala              |   13 +-
 .../RandomForestClassifier.scala              |   16 +-
 .../ml/regression/DecisionTreeRegressor.scala |   13 +-
 .../ml/regression/RandomForestRegressor.scala |   15 +-
 .../scala/org/apache/spark/ml/tree/Node.scala |  129 ++
 .../org/apache/spark/ml/tree/Split.scala      |   30 +-
 .../spark/ml/tree/impl/NodeIdCache.scala      |  194 +++
 .../spark/ml/tree/impl/RandomForest.scala     | 1132 +++++++++++++++++
 .../apache/spark/ml/tree/impl/TreePoint.scala |  134 ++
 .../spark/mllib/tree/impl/BaggedPoint.scala   |   10 +-
 .../mllib/tree/impl/DTStatsAggregator.scala   |    2 +-
 .../tree/impl/DecisionTreeMetadata.scala      |    4 +-
 .../spark/mllib/tree/impl/NodeIdCache.scala   |    4 +-
 .../spark/mllib/tree/impl/TimeTracker.scala   |    2 +-
 .../spark/mllib/tree/impl/TreePoint.scala     |    4 +-
 .../spark/mllib/tree/impurity/Impurity.scala  |    4 +-
 .../tree/model/InformationGainStats.scala     |    2 +-
 .../JavaDecisionTreeRegressorSuite.java       |    2 +-
 18 files changed, 1678 insertions(+), 32 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/tree/impl/TreePoint.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 2dc1824964a42..36fe1bd40469c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -21,10 +21,10 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeClassifierParams}
+import org.apache.spark.ml.tree.impl.RandomForest
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree}
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => OldStrategy}
 import org.apache.spark.mllib.tree.model.{DecisionTreeModel => OldDecisionTreeModel}
 import org.apache.spark.rdd.RDD
@@ -75,8 +75,9 @@ final class DecisionTreeClassifier(override val uid: String)
     }
     val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
     val strategy = getOldStrategy(categoricalFeatures, numClasses)
-    val oldModel = OldDecisionTree.train(oldDataset, strategy)
-    DecisionTreeClassificationModel.fromOld(oldModel, this, categoricalFeatures)
+    val trees = RandomForest.run(oldDataset, strategy, numTrees = 1, featureSubsetStrategy = "all",
+      seed = 0L, parentUID = Some(uid))
+    trees.head.asInstanceOf[DecisionTreeClassificationModel]
   }
 
   /** (private[ml]) Create a Strategy instance to use with the old API. */
@@ -112,6 +113,12 @@ final class DecisionTreeClassificationModel private[ml] (
   require(rootNode != null,
     "DecisionTreeClassificationModel given null rootNode, but it requires a non-null rootNode.")
 
+  /**
+   * Construct a decision tree classification model.
+   * @param rootNode  Root node of tree, with other nodes attached.
+   */
+  def this(rootNode: Node) = this(Identifiable.randomUID("dtc"), rootNode)
+
   override protected def predict(features: Vector): Double = {
     rootNode.predict(features)
   }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index d3c67494a31e4..490f04c7c7172 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -20,13 +20,13 @@ package org.apache.spark.ml.classification
 import scala.collection.mutable
 
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.tree.impl.RandomForest
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, RandomForestParams, TreeClassifierParams, TreeEnsembleModel}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.tree.{RandomForest => OldRandomForest}
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.tree.model.{RandomForestModel => OldRandomForestModel}
 import org.apache.spark.rdd.RDD
@@ -93,9 +93,10 @@ final class RandomForestClassifier(override val uid: String)
     val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
     val strategy =
       super.getOldStrategy(categoricalFeatures, numClasses, OldAlgo.Classification, getOldImpurity)
-    val oldModel = OldRandomForest.trainClassifier(
-      oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed.toInt)
-    RandomForestClassificationModel.fromOld(oldModel, this, categoricalFeatures)
+    val trees =
+      RandomForest.run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed)
+        .map(_.asInstanceOf[DecisionTreeClassificationModel])
+    new RandomForestClassificationModel(trees)
   }
 
   override def copy(extra: ParamMap): RandomForestClassifier = defaultCopy(extra)
@@ -128,6 +129,13 @@ final class RandomForestClassificationModel private[ml] (
 
   require(numTrees > 0, "RandomForestClassificationModel requires at least 1 tree.")
 
+  /**
+   * Construct a random forest classification model, with all trees weighted equally.
+   * @param trees  Component trees
+   */
+  def this(trees: Array[DecisionTreeClassificationModel]) =
+    this(Identifiable.randomUID("rfc"), trees)
+
   override def trees: Array[DecisionTreeModel] = _trees.asInstanceOf[Array[DecisionTreeModel]]
 
   // Note: We may add support for weights (based on tree performance) later on.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index be1f8063d41d8..6f3340c2f02be 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -21,10 +21,10 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeRegressorParams}
+import org.apache.spark.ml.tree.impl.RandomForest
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree}
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => OldStrategy}
 import org.apache.spark.mllib.tree.model.{DecisionTreeModel => OldDecisionTreeModel}
 import org.apache.spark.rdd.RDD
@@ -67,8 +67,9 @@ final class DecisionTreeRegressor(override val uid: String)
       MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol)))
     val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
     val strategy = getOldStrategy(categoricalFeatures)
-    val oldModel = OldDecisionTree.train(oldDataset, strategy)
-    DecisionTreeRegressionModel.fromOld(oldModel, this, categoricalFeatures)
+    val trees = RandomForest.run(oldDataset, strategy, numTrees = 1, featureSubsetStrategy = "all",
+      seed = 0L, parentUID = Some(uid))
+    trees.head.asInstanceOf[DecisionTreeRegressionModel]
   }
 
   /** (private[ml]) Create a Strategy instance to use with the old API. */
@@ -102,6 +103,12 @@ final class DecisionTreeRegressionModel private[ml] (
   require(rootNode != null,
     "DecisionTreeClassificationModel given null rootNode, but it requires a non-null rootNode.")
 
+  /**
+   * Construct a decision tree regression model.
+   * @param rootNode  Root node of tree, with other nodes attached.
+   */
+  def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode)
+
   override protected def predict(features: Vector): Double = {
     rootNode.predict(features)
   }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 21c59061a02fa..5fd5c7c7bd3fc 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -21,10 +21,10 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, RandomForestParams, TreeEnsembleModel, TreeRegressorParams}
+import org.apache.spark.ml.tree.impl.RandomForest
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.tree.{RandomForest => OldRandomForest}
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.tree.model.{RandomForestModel => OldRandomForestModel}
 import org.apache.spark.rdd.RDD
@@ -82,9 +82,10 @@ final class RandomForestRegressor(override val uid: String)
     val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
     val strategy =
       super.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, getOldImpurity)
-    val oldModel = OldRandomForest.trainRegressor(
-      oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed.toInt)
-    RandomForestRegressionModel.fromOld(oldModel, this, categoricalFeatures)
+    val trees =
+      RandomForest.run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed)
+        .map(_.asInstanceOf[DecisionTreeRegressionModel])
+    new RandomForestRegressionModel(trees)
   }
 
   override def copy(extra: ParamMap): RandomForestRegressor = defaultCopy(extra)
@@ -115,6 +116,12 @@ final class RandomForestRegressionModel private[ml] (
 
   require(numTrees > 0, "RandomForestRegressionModel requires at least 1 tree.")
 
+  /**
+   * Construct a random forest regression model, with all trees weighted equally.
+   * @param trees  Component trees
+   */
+  def this(trees: Array[DecisionTreeRegressionModel]) = this(Identifiable.randomUID("rfr"), trees)
+
   override def trees: Array[DecisionTreeModel] = _trees.asInstanceOf[Array[DecisionTreeModel]]
 
   // Note: We may add support for weights (based on tree performance) later on.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
index 4242154be14ce..bbc2427ca7d3d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
@@ -209,3 +209,132 @@ private object InternalNode {
     }
   }
 }
+
+/**
+ * Version of a node used in learning.  This uses vars so that we can modify nodes as we split the
+ * tree by adding children, etc.
+ *
+ * For now, we use node IDs.  These will be kept internal since we hope to remove node IDs
+ * in the future, or at least change the indexing (so that we can support much deeper trees).
+ *
+ * This node can either be:
+ *  - a leaf node, with leftChild, rightChild, split set to null, or
+ *  - an internal node, with all values set
+ *
+ * @param id  We currently use the same indexing as the old implementation in
+ *            [[org.apache.spark.mllib.tree.model.Node]], but this will change later.
+ * @param predictionStats  Predicted label + class probability (for classification).
+ *                         We will later modify this to store aggregate statistics for labels
+ *                         to provide all class probabilities (for classification) and maybe a
+ *                         distribution (for regression).
+ * @param isLeaf  Indicates whether this node will definitely be a leaf in the learned tree,
+ *                so that we do not need to consider splitting it further.
+ * @param stats  Old structure for storing stats about information gain, prediction, etc.
+ *               This is legacy and will be modified in the future.
+ */
+private[tree] class LearningNode(
+    var id: Int,
+    var predictionStats: OldPredict,
+    var impurity: Double,
+    var leftChild: Option[LearningNode],
+    var rightChild: Option[LearningNode],
+    var split: Option[Split],
+    var isLeaf: Boolean,
+    var stats: Option[OldInformationGainStats]) extends Serializable {
+
+  /**
+   * Convert this [[LearningNode]] to a regular [[Node]], and recurse on any children.
+   */
+  def toNode: Node = {
+    if (leftChild.nonEmpty) {
+      assert(rightChild.nonEmpty && split.nonEmpty && stats.nonEmpty,
+        "Unknown error during Decision Tree learning.  Could not convert LearningNode to Node.")
+      new InternalNode(predictionStats.predict, impurity, stats.get.gain,
+        leftChild.get.toNode, rightChild.get.toNode, split.get)
+    } else {
+      new LeafNode(predictionStats.predict, impurity)
+    }
+  }
+
+}
+
+private[tree] object LearningNode {
+
+  /** Create a node with some of its fields set. */
+  def apply(
+      id: Int,
+      predictionStats: OldPredict,
+      impurity: Double,
+      isLeaf: Boolean): LearningNode = {
+    new LearningNode(id, predictionStats, impurity, None, None, None, false, None)
+  }
+
+  /** Create an empty node with the given node index.  Values must be set later on. */
+  def emptyNode(nodeIndex: Int): LearningNode = {
+    new LearningNode(nodeIndex, new OldPredict(Double.NaN, Double.NaN), Double.NaN,
+      None, None, None, false, None)
+  }
+
+  // The below indexing methods were copied from spark.mllib.tree.model.Node
+
+  /**
+   * Return the index of the left child of this node.
+   */
+  def leftChildIndex(nodeIndex: Int): Int = nodeIndex << 1
+
+  /**
+   * Return the index of the right child of this node.
+   */
+  def rightChildIndex(nodeIndex: Int): Int = (nodeIndex << 1) + 1
+
+  /**
+   * Get the parent index of the given node, or 0 if it is the root.
+   */
+  def parentIndex(nodeIndex: Int): Int = nodeIndex >> 1
+
+  /**
+   * Return the level of a tree which the given node is in.
+   */
+  def indexToLevel(nodeIndex: Int): Int = if (nodeIndex == 0) {
+    throw new IllegalArgumentException(s"0 is not a valid node index.")
+  } else {
+    java.lang.Integer.numberOfTrailingZeros(java.lang.Integer.highestOneBit(nodeIndex))
+  }
+
+  /**
+   * Returns true if this is a left child.
+   * Note: Returns false for the root.
+   */
+  def isLeftChild(nodeIndex: Int): Boolean = nodeIndex > 1 && nodeIndex % 2 == 0
+
+  /**
+   * Return the maximum number of nodes which can be in the given level of the tree.
+   * @param level  Level of tree (0 = root).
+   */
+  def maxNodesInLevel(level: Int): Int = 1 << level
+
+  /**
+   * Return the index of the first node in the given level.
+   * @param level  Level of tree (0 = root).
+   */
+  def startIndexInLevel(level: Int): Int = 1 << level
+
+  /**
+   * Traces down from a root node to get the node with the given node index.
+   * This assumes the node exists.
+   */
+  def getNode(nodeIndex: Int, rootNode: LearningNode): LearningNode = {
+    var tmpNode: LearningNode = rootNode
+    var levelsToGo = indexToLevel(nodeIndex)
+    while (levelsToGo > 0) {
+      if ((nodeIndex & (1 << levelsToGo - 1)) == 0) {
+        tmpNode = tmpNode.leftChild.asInstanceOf[LearningNode]
+      } else {
+        tmpNode = tmpNode.rightChild.asInstanceOf[LearningNode]
+      }
+      levelsToGo -= 1
+    }
+    tmpNode
+  }
+
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala
index 7acdeeee72d23..78199cc2df582 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala
@@ -34,9 +34,19 @@ sealed trait Split extends Serializable {
   /** Index of feature which this split tests */
   def featureIndex: Int
 
-  /** Return true (split to left) or false (split to right) */
+  /**
+   * Return true (split to left) or false (split to right).
+   * @param features  Vector of features (original values, not binned).
+   */
   private[ml] def shouldGoLeft(features: Vector): Boolean
 
+  /**
+   * Return true (split to left) or false (split to right).
+   * @param binnedFeature Binned feature value.
+   * @param splits All splits for the given feature.
+   */
+  private[tree] def shouldGoLeft(binnedFeature: Int, splits: Array[Split]): Boolean
+
   /** Convert to old Split format */
   private[tree] def toOld: OldSplit
 }
@@ -94,6 +104,14 @@ final class CategoricalSplit private[ml] (
     }
   }
 
+  override private[tree] def shouldGoLeft(binnedFeature: Int, splits: Array[Split]): Boolean = {
+    if (isLeft) {
+      categories.contains(binnedFeature.toDouble)
+    } else {
+      !categories.contains(binnedFeature.toDouble)
+    }
+  }
+
   override def equals(o: Any): Boolean = {
     o match {
       case other: CategoricalSplit => featureIndex == other.featureIndex &&
@@ -144,6 +162,16 @@ final class ContinuousSplit private[ml] (override val featureIndex: Int, val thr
     features(featureIndex) <= threshold
   }
 
+  override private[tree] def shouldGoLeft(binnedFeature: Int, splits: Array[Split]): Boolean = {
+    if (binnedFeature == splits.length) {
+      // > last split, so split right
+      false
+    } else {
+      val featureValueUpperBound = splits(binnedFeature).asInstanceOf[ContinuousSplit].threshold
+      featureValueUpperBound <= threshold
+    }
+  }
+
   override def equals(o: Any): Boolean = {
     o match {
       case other: ContinuousSplit =>
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala
new file mode 100644
index 0000000000000..488e8e4fb5dcd
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/NodeIdCache.scala
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tree.impl
+
+import java.io.IOException
+
+import scala.collection.mutable
+
+import org.apache.hadoop.fs.{Path, FileSystem}
+
+import org.apache.spark.Logging
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.ml.tree.{LearningNode, Split}
+import org.apache.spark.mllib.tree.impl.BaggedPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
+
+
+/**
+ * This is used by the node id cache to find the child id that a data point would belong to.
+ * @param split Split information.
+ * @param nodeIndex The current node index of a data point that this will update.
+ */
+private[tree] case class NodeIndexUpdater(split: Split, nodeIndex: Int) {
+
+  /**
+   * Determine a child node index based on the feature value and the split.
+   * @param binnedFeature Binned feature value.
+   * @param splits Split information to convert the bin indices to approximate feature values.
+   * @return Child node index to update to.
+   */
+  def updateNodeIndex(binnedFeature: Int, splits: Array[Split]): Int = {
+    if (split.shouldGoLeft(binnedFeature, splits)) {
+      LearningNode.leftChildIndex(nodeIndex)
+    } else {
+      LearningNode.rightChildIndex(nodeIndex)
+    }
+  }
+}
+
+/**
+ * Each TreePoint belongs to a particular node per tree.
+ * Each row in the nodeIdsForInstances RDD is an array over trees of the node index
+ * in each tree. Initially, values should all be 1 for root node.
+ * The nodeIdsForInstances RDD needs to be updated at each iteration.
+ * @param nodeIdsForInstances The initial values in the cache
+ *                           (should be an Array of all 1's (meaning the root nodes)).
+ * @param checkpointInterval The checkpointing interval
+ *                           (how often should the cache be checkpointed.).
+ */
+private[spark] class NodeIdCache(
+  var nodeIdsForInstances: RDD[Array[Int]],
+  val checkpointInterval: Int) extends Logging {
+
+  // Keep a reference to a previous node Ids for instances.
+  // Because we will keep on re-persisting updated node Ids,
+  // we want to unpersist the previous RDD.
+  private var prevNodeIdsForInstances: RDD[Array[Int]] = null
+
+  // To keep track of the past checkpointed RDDs.
+  private val checkpointQueue = mutable.Queue[RDD[Array[Int]]]()
+  private var rddUpdateCount = 0
+
+  // Indicates whether we can checkpoint
+  private val canCheckpoint = nodeIdsForInstances.sparkContext.getCheckpointDir.nonEmpty
+
+  // FileSystem instance for deleting checkpoints as needed
+  private val fs = FileSystem.get(nodeIdsForInstances.sparkContext.hadoopConfiguration)
+
+  /**
+   * Update the node index values in the cache.
+   * This updates the RDD and its lineage.
+   * TODO: Passing bin information to executors seems unnecessary and costly.
+   * @param data The RDD of training rows.
+   * @param nodeIdUpdaters A map of node index updaters.
+   *                       The key is the indices of nodes that we want to update.
+   * @param splits  Split information needed to find child node indices.
+   */
+  def updateNodeIndices(
+      data: RDD[BaggedPoint[TreePoint]],
+      nodeIdUpdaters: Array[mutable.Map[Int, NodeIndexUpdater]],
+      splits: Array[Array[Split]]): Unit = {
+    if (prevNodeIdsForInstances != null) {
+      // Unpersist the previous one if one exists.
+      prevNodeIdsForInstances.unpersist()
+    }
+
+    prevNodeIdsForInstances = nodeIdsForInstances
+    nodeIdsForInstances = data.zip(nodeIdsForInstances).map { case (point, ids) =>
+      var treeId = 0
+      while (treeId < nodeIdUpdaters.length) {
+        val nodeIdUpdater = nodeIdUpdaters(treeId).getOrElse(ids(treeId), null)
+        if (nodeIdUpdater != null) {
+          val featureIndex = nodeIdUpdater.split.featureIndex
+          val newNodeIndex = nodeIdUpdater.updateNodeIndex(
+            binnedFeature = point.datum.binnedFeatures(featureIndex),
+            splits = splits(featureIndex))
+          ids(treeId) = newNodeIndex
+        }
+        treeId += 1
+      }
+      ids
+    }
+
+    // Keep on persisting new ones.
+    nodeIdsForInstances.persist(StorageLevel.MEMORY_AND_DISK)
+    rddUpdateCount += 1
+
+    // Handle checkpointing if the directory is not None.
+    if (canCheckpoint && (rddUpdateCount % checkpointInterval) == 0) {
+      // Let's see if we can delete previous checkpoints.
+      var canDelete = true
+      while (checkpointQueue.size > 1 && canDelete) {
+        // We can delete the oldest checkpoint iff
+        // the next checkpoint actually exists in the file system.
+        if (checkpointQueue(1).getCheckpointFile.isDefined) {
+          val old = checkpointQueue.dequeue()
+          // Since the old checkpoint is not deleted by Spark, we'll manually delete it here.
+          try {
+            fs.delete(new Path(old.getCheckpointFile.get), true)
+          } catch {
+            case e: IOException =>
+              logError("Decision Tree learning using cacheNodeIds failed to remove checkpoint" +
+                s" file: ${old.getCheckpointFile.get}")
+          }
+        } else {
+          canDelete = false
+        }
+      }
+
+      nodeIdsForInstances.checkpoint()
+      checkpointQueue.enqueue(nodeIdsForInstances)
+    }
+  }
+
+  /**
+   * Call this after training is finished to delete any remaining checkpoints.
+   */
+  def deleteAllCheckpoints(): Unit = {
+    while (checkpointQueue.nonEmpty) {
+      val old = checkpointQueue.dequeue()
+      if (old.getCheckpointFile.isDefined) {
+        try {
+          fs.delete(new Path(old.getCheckpointFile.get), true)
+        } catch {
+          case e: IOException =>
+            logError("Decision Tree learning using cacheNodeIds failed to remove checkpoint" +
+              s" file: ${old.getCheckpointFile.get}")
+        }
+      }
+    }
+  }
+  if (prevNodeIdsForInstances != null) {
+    // Unpersist the previous one if one exists.
+    prevNodeIdsForInstances.unpersist()
+  }
+}
+
+@DeveloperApi
+private[spark] object NodeIdCache {
+  /**
+   * Initialize the node Id cache with initial node Id values.
+   * @param data The RDD of training rows.
+   * @param numTrees The number of trees that we want to create cache for.
+   * @param checkpointInterval The checkpointing interval
+   *                           (how often should the cache be checkpointed.).
+   * @param initVal The initial values in the cache.
+   * @return A node Id cache containing an RDD of initial root node Indices.
+   */
+  def init(
+      data: RDD[BaggedPoint[TreePoint]],
+      numTrees: Int,
+      checkpointInterval: Int,
+      initVal: Int = 1): NodeIdCache = {
+    new NodeIdCache(
+      data.map(_ => Array.fill[Int](numTrees)(initVal)),
+      checkpointInterval)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
new file mode 100644
index 0000000000000..15b56bd844bad
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
@@ -0,0 +1,1132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tree.impl
+
+import java.io.IOException
+
+import scala.collection.mutable
+import scala.util.Random
+
+import org.apache.spark.Logging
+import org.apache.spark.ml.classification.DecisionTreeClassificationModel
+import org.apache.spark.ml.regression.DecisionTreeRegressionModel
+import org.apache.spark.ml.tree._
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => OldStrategy}
+import org.apache.spark.mllib.tree.impl.{BaggedPoint, DTStatsAggregator, DecisionTreeMetadata,
+  TimeTracker}
+import org.apache.spark.mllib.tree.impurity.ImpurityCalculator
+import org.apache.spark.mllib.tree.model.{InformationGainStats, Predict}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.util.random.{SamplingUtils, XORShiftRandom}
+
+
+private[ml] object RandomForest extends Logging {
+
+  /**
+   * Train a random forest.
+   * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
+   * @return an unweighted set of trees
+   */
+  def run(
+      input: RDD[LabeledPoint],
+      strategy: OldStrategy,
+      numTrees: Int,
+      featureSubsetStrategy: String,
+      seed: Long,
+      parentUID: Option[String] = None): Array[DecisionTreeModel] = {
+
+    val timer = new TimeTracker()
+
+    timer.start("total")
+
+    timer.start("init")
+
+    val retaggedInput = input.retag(classOf[LabeledPoint])
+    val metadata =
+      DecisionTreeMetadata.buildMetadata(retaggedInput, strategy, numTrees, featureSubsetStrategy)
+    logDebug("algo = " + strategy.algo)
+    logDebug("numTrees = " + numTrees)
+    logDebug("seed = " + seed)
+    logDebug("maxBins = " + metadata.maxBins)
+    logDebug("featureSubsetStrategy = " + featureSubsetStrategy)
+    logDebug("numFeaturesPerNode = " + metadata.numFeaturesPerNode)
+    logDebug("subsamplingRate = " + strategy.subsamplingRate)
+
+    // Find the splits and the corresponding bins (interval between the splits) using a sample
+    // of the input data.
+    timer.start("findSplitsBins")
+    val splits = findSplits(retaggedInput, metadata)
+    timer.stop("findSplitsBins")
+    logDebug("numBins: feature: number of bins")
+    logDebug(Range(0, metadata.numFeatures).map { featureIndex =>
+      s"\t$featureIndex\t${metadata.numBins(featureIndex)}"
+    }.mkString("\n"))
+
+    // Bin feature values (TreePoint representation).
+    // Cache input RDD for speedup during multiple passes.
+    val treeInput = TreePoint.convertToTreeRDD(retaggedInput, splits, metadata)
+
+    val withReplacement = numTrees > 1
+
+    val baggedInput = BaggedPoint
+      .convertToBaggedRDD(treeInput, strategy.subsamplingRate, numTrees, withReplacement, seed)
+      .persist(StorageLevel.MEMORY_AND_DISK)
+
+    // depth of the decision tree
+    val maxDepth = strategy.maxDepth
+    require(maxDepth <= 30,
+      s"DecisionTree currently only supports maxDepth <= 30, but was given maxDepth = $maxDepth.")
+
+    // Max memory usage for aggregates
+    // TODO: Calculate memory usage more precisely.
+    val maxMemoryUsage: Long = strategy.maxMemoryInMB * 1024L * 1024L
+    logDebug("max memory usage for aggregates = " + maxMemoryUsage + " bytes.")
+    val maxMemoryPerNode = {
+      val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
+        // Find numFeaturesPerNode largest bins to get an upper bound on memory usage.
+        Some(metadata.numBins.zipWithIndex.sortBy(- _._1)
+          .take(metadata.numFeaturesPerNode).map(_._2))
+      } else {
+        None
+      }
+      RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L
+    }
+    require(maxMemoryPerNode <= maxMemoryUsage,
+      s"RandomForest/DecisionTree given maxMemoryInMB = ${strategy.maxMemoryInMB}," +
+        " which is too small for the given features." +
+        s"  Minimum value = ${maxMemoryPerNode / (1024L * 1024L)}")
+
+    timer.stop("init")
+
+    /*
+     * The main idea here is to perform group-wise training of the decision tree nodes thus
+     * reducing the passes over the data from (# nodes) to (# nodes / maxNumberOfNodesPerGroup).
+     * Each data sample is handled by a particular node (or it reaches a leaf and is not used
+     * in lower levels).
+     */
+
+    // Create an RDD of node Id cache.
+    // At first, all the rows belong to the root nodes (node Id == 1).
+    val nodeIdCache = if (strategy.useNodeIdCache) {
+      Some(NodeIdCache.init(
+        data = baggedInput,
+        numTrees = numTrees,
+        checkpointInterval = strategy.checkpointInterval,
+        initVal = 1))
+    } else {
+      None
+    }
+
+    // FIFO queue of nodes to train: (treeIndex, node)
+    val nodeQueue = new mutable.Queue[(Int, LearningNode)]()
+
+    val rng = new Random()
+    rng.setSeed(seed)
+
+    // Allocate and queue root nodes.
+    val topNodes = Array.fill[LearningNode](numTrees)(LearningNode.emptyNode(nodeIndex = 1))
+    Range(0, numTrees).foreach(treeIndex => nodeQueue.enqueue((treeIndex, topNodes(treeIndex))))
+
+    while (nodeQueue.nonEmpty) {
+      // Collect some nodes to split, and choose features for each node (if subsampling).
+      // Each group of nodes may come from one or multiple trees, and at multiple levels.
+      val (nodesForGroup, treeToNodeToIndexInfo) =
+        RandomForest.selectNodesToSplit(nodeQueue, maxMemoryUsage, metadata, rng)
+      // Sanity check (should never occur):
+      assert(nodesForGroup.nonEmpty,
+        s"RandomForest selected empty nodesForGroup.  Error for unknown reason.")
+
+      // Choose node splits, and enqueue new nodes as needed.
+      timer.start("findBestSplits")
+      RandomForest.findBestSplits(baggedInput, metadata, topNodes, nodesForGroup,
+        treeToNodeToIndexInfo, splits, nodeQueue, timer, nodeIdCache)
+      timer.stop("findBestSplits")
+    }
+
+    baggedInput.unpersist()
+
+    timer.stop("total")
+
+    logInfo("Internal timing for DecisionTree:")
+    logInfo(s"$timer")
+
+    // Delete any remaining checkpoints used for node Id cache.
+    if (nodeIdCache.nonEmpty) {
+      try {
+        nodeIdCache.get.deleteAllCheckpoints()
+      } catch {
+        case e: IOException =>
+          logWarning(s"delete all checkpoints failed. Error reason: ${e.getMessage}")
+      }
+    }
+
+    parentUID match {
+      case Some(uid) =>
+        if (strategy.algo == OldAlgo.Classification) {
+          topNodes.map(rootNode => new DecisionTreeClassificationModel(uid, rootNode.toNode))
+        } else {
+          topNodes.map(rootNode => new DecisionTreeRegressionModel(uid, rootNode.toNode))
+        }
+      case None =>
+        if (strategy.algo == OldAlgo.Classification) {
+          topNodes.map(rootNode => new DecisionTreeClassificationModel(rootNode.toNode))
+        } else {
+          topNodes.map(rootNode => new DecisionTreeRegressionModel(rootNode.toNode))
+        }
+    }
+  }
+
+  /**
+   * Get the node index corresponding to this data point.
+   * This function mimics prediction, passing an example from the root node down to a leaf
+   * or unsplit node; that node's index is returned.
+   *
+   * @param node  Node in tree from which to classify the given data point.
+   * @param binnedFeatures  Binned feature vector for data point.
+   * @param splits possible splits for all features, indexed (numFeatures)(numSplits)
+   * @return  Leaf index if the data point reaches a leaf.
+   *          Otherwise, last node reachable in tree matching this example.
+   *          Note: This is the global node index, i.e., the index used in the tree.
+   *                This index is different from the index used during training a particular
+   *                group of nodes on one call to [[findBestSplits()]].
+   */
+  private def predictNodeIndex(
+      node: LearningNode,
+      binnedFeatures: Array[Int],
+      splits: Array[Array[Split]]): Int = {
+    if (node.isLeaf || node.split.isEmpty) {
+      node.id
+    } else {
+      val split = node.split.get
+      val featureIndex = split.featureIndex
+      val splitLeft = split.shouldGoLeft(binnedFeatures(featureIndex), splits(featureIndex))
+      if (node.leftChild.isEmpty) {
+        // Not yet split. Return index from next layer of nodes to train
+        if (splitLeft) {
+          LearningNode.leftChildIndex(node.id)
+        } else {
+          LearningNode.rightChildIndex(node.id)
+        }
+      } else {
+        if (splitLeft) {
+          predictNodeIndex(node.leftChild.get, binnedFeatures, splits)
+        } else {
+          predictNodeIndex(node.rightChild.get, binnedFeatures, splits)
+        }
+      }
+    }
+  }
+
+  /**
+   * Helper for binSeqOp, for data which can contain a mix of ordered and unordered features.
+   *
+   * For ordered features, a single bin is updated.
+   * For unordered features, bins correspond to subsets of categories; either the left or right bin
+   * for each subset is updated.
+   *
+   * @param agg  Array storing aggregate calculation, with a set of sufficient statistics for
+   *             each (feature, bin).
+   * @param treePoint  Data point being aggregated.
+   * @param splits possible splits indexed (numFeatures)(numSplits)
+   * @param unorderedFeatures  Set of indices of unordered features.
+   * @param instanceWeight  Weight (importance) of instance in dataset.
+   */
+  private def mixedBinSeqOp(
+      agg: DTStatsAggregator,
+      treePoint: TreePoint,
+      splits: Array[Array[Split]],
+      unorderedFeatures: Set[Int],
+      instanceWeight: Double,
+      featuresForNode: Option[Array[Int]]): Unit = {
+    val numFeaturesPerNode = if (featuresForNode.nonEmpty) {
+      // Use subsampled features
+      featuresForNode.get.length
+    } else {
+      // Use all features
+      agg.metadata.numFeatures
+    }
+    // Iterate over features.
+    var featureIndexIdx = 0
+    while (featureIndexIdx < numFeaturesPerNode) {
+      val featureIndex = if (featuresForNode.nonEmpty) {
+        featuresForNode.get.apply(featureIndexIdx)
+      } else {
+        featureIndexIdx
+      }
+      if (unorderedFeatures.contains(featureIndex)) {
+        // Unordered feature
+        val featureValue = treePoint.binnedFeatures(featureIndex)
+        val (leftNodeFeatureOffset, rightNodeFeatureOffset) =
+          agg.getLeftRightFeatureOffsets(featureIndexIdx)
+        // Update the left or right bin for each split.
+        val numSplits = agg.metadata.numSplits(featureIndex)
+        val featureSplits = splits(featureIndex)
+        var splitIndex = 0
+        while (splitIndex < numSplits) {
+          if (featureSplits(splitIndex).shouldGoLeft(featureValue, featureSplits)) {
+            agg.featureUpdate(leftNodeFeatureOffset, splitIndex, treePoint.label, instanceWeight)
+          } else {
+            agg.featureUpdate(rightNodeFeatureOffset, splitIndex, treePoint.label, instanceWeight)
+          }
+          splitIndex += 1
+        }
+      } else {
+        // Ordered feature
+        val binIndex = treePoint.binnedFeatures(featureIndex)
+        agg.update(featureIndexIdx, binIndex, treePoint.label, instanceWeight)
+      }
+      featureIndexIdx += 1
+    }
+  }
+
+  /**
+   * Helper for binSeqOp, for regression and for classification with only ordered features.
+   *
+   * For each feature, the sufficient statistics of one bin are updated.
+   *
+   * @param agg  Array storing aggregate calculation, with a set of sufficient statistics for
+   *             each (feature, bin).
+   * @param treePoint  Data point being aggregated.
+   * @param instanceWeight  Weight (importance) of instance in dataset.
+   */
+  private def orderedBinSeqOp(
+      agg: DTStatsAggregator,
+      treePoint: TreePoint,
+      instanceWeight: Double,
+      featuresForNode: Option[Array[Int]]): Unit = {
+    val label = treePoint.label
+
+    // Iterate over features.
+    if (featuresForNode.nonEmpty) {
+      // Use subsampled features
+      var featureIndexIdx = 0
+      while (featureIndexIdx < featuresForNode.get.length) {
+        val binIndex = treePoint.binnedFeatures(featuresForNode.get.apply(featureIndexIdx))
+        agg.update(featureIndexIdx, binIndex, label, instanceWeight)
+        featureIndexIdx += 1
+      }
+    } else {
+      // Use all features
+      val numFeatures = agg.metadata.numFeatures
+      var featureIndex = 0
+      while (featureIndex < numFeatures) {
+        val binIndex = treePoint.binnedFeatures(featureIndex)
+        agg.update(featureIndex, binIndex, label, instanceWeight)
+        featureIndex += 1
+      }
+    }
+  }
+
+  /**
+   * Given a group of nodes, this finds the best split for each node.
+   *
+   * @param input Training data: RDD of [[org.apache.spark.mllib.tree.impl.TreePoint]]
+   * @param metadata Learning and dataset metadata
+   * @param topNodes Root node for each tree.  Used for matching instances with nodes.
+   * @param nodesForGroup Mapping: treeIndex --> nodes to be split in tree
+   * @param treeToNodeToIndexInfo Mapping: treeIndex --> nodeIndex --> nodeIndexInfo,
+   *                              where nodeIndexInfo stores the index in the group and the
+   *                              feature subsets (if using feature subsets).
+   * @param splits possible splits for all features, indexed (numFeatures)(numSplits)
+   * @param nodeQueue  Queue of nodes to split, with values (treeIndex, node).
+   *                   Updated with new non-leaf nodes which are created.
+   * @param nodeIdCache Node Id cache containing an RDD of Array[Int] where
+   *                    each value in the array is the data point's node Id
+   *                    for a corresponding tree. This is used to prevent the need
+   *                    to pass the entire tree to the executors during
+   *                    the node stat aggregation phase.
+   */
+  private[tree] def findBestSplits(
+      input: RDD[BaggedPoint[TreePoint]],
+      metadata: DecisionTreeMetadata,
+      topNodes: Array[LearningNode],
+      nodesForGroup: Map[Int, Array[LearningNode]],
+      treeToNodeToIndexInfo: Map[Int, Map[Int, NodeIndexInfo]],
+      splits: Array[Array[Split]],
+      nodeQueue: mutable.Queue[(Int, LearningNode)],
+      timer: TimeTracker = new TimeTracker,
+      nodeIdCache: Option[NodeIdCache] = None): Unit = {
+
+    /*
+     * The high-level descriptions of the best split optimizations are noted here.
+     *
+     * *Group-wise training*
+     * We perform bin calculations for groups of nodes to reduce the number of
+     * passes over the data.  Each iteration requires more computation and storage,
+     * but saves several iterations over the data.
+     *
+     * *Bin-wise computation*
+     * We use a bin-wise best split computation strategy instead of a straightforward best split
+     * computation strategy. Instead of analyzing each sample for contribution to the left/right
+     * child node impurity of every split, we first categorize each feature of a sample into a
+     * bin. We exploit this structure to calculate aggregates for bins and then use these aggregates
+     * to calculate information gain for each split.
+     *
+     * *Aggregation over partitions*
+     * Instead of performing a flatMap/reduceByKey operation, we exploit the fact that we know
+     * the number of splits in advance. Thus, we store the aggregates (at the appropriate
+     * indices) in a single array for all bins and rely upon the RDD aggregate method to
+     * drastically reduce the communication overhead.
+     */
+
+    // numNodes:  Number of nodes in this group
+    val numNodes = nodesForGroup.values.map(_.length).sum
+    logDebug("numNodes = " + numNodes)
+    logDebug("numFeatures = " + metadata.numFeatures)
+    logDebug("numClasses = " + metadata.numClasses)
+    logDebug("isMulticlass = " + metadata.isMulticlass)
+    logDebug("isMulticlassWithCategoricalFeatures = " +
+      metadata.isMulticlassWithCategoricalFeatures)
+    logDebug("using nodeIdCache = " + nodeIdCache.nonEmpty.toString)
+
+    /**
+     * Performs a sequential aggregation over a partition for a particular tree and node.
+     *
+     * For each feature, the aggregate sufficient statistics are updated for the relevant
+     * bins.
+     *
+     * @param treeIndex Index of the tree that we want to perform aggregation for.
+     * @param nodeInfo The node info for the tree node.
+     * @param agg Array storing aggregate calculation, with a set of sufficient statistics
+     *            for each (node, feature, bin).
+     * @param baggedPoint Data point being aggregated.
+     */
+    def nodeBinSeqOp(
+        treeIndex: Int,
+        nodeInfo: NodeIndexInfo,
+        agg: Array[DTStatsAggregator],
+        baggedPoint: BaggedPoint[TreePoint]): Unit = {
+      if (nodeInfo != null) {
+        val aggNodeIndex = nodeInfo.nodeIndexInGroup
+        val featuresForNode = nodeInfo.featureSubset
+        val instanceWeight = baggedPoint.subsampleWeights(treeIndex)
+        if (metadata.unorderedFeatures.isEmpty) {
+          orderedBinSeqOp(agg(aggNodeIndex), baggedPoint.datum, instanceWeight, featuresForNode)
+        } else {
+          mixedBinSeqOp(agg(aggNodeIndex), baggedPoint.datum, splits,
+            metadata.unorderedFeatures, instanceWeight, featuresForNode)
+        }
+      }
+    }
+
+    /**
+     * Performs a sequential aggregation over a partition.
+     *
+     * Each data point contributes to one node. For each feature,
+     * the aggregate sufficient statistics are updated for the relevant bins.
+     *
+     * @param agg  Array storing aggregate calculation, with a set of sufficient statistics for
+     *             each (node, feature, bin).
+     * @param baggedPoint   Data point being aggregated.
+     * @return  agg
+     */
+    def binSeqOp(
+        agg: Array[DTStatsAggregator],
+        baggedPoint: BaggedPoint[TreePoint]): Array[DTStatsAggregator] = {
+      treeToNodeToIndexInfo.foreach { case (treeIndex, nodeIndexToInfo) =>
+        val nodeIndex =
+          predictNodeIndex(topNodes(treeIndex), baggedPoint.datum.binnedFeatures, splits)
+        nodeBinSeqOp(treeIndex, nodeIndexToInfo.getOrElse(nodeIndex, null), agg, baggedPoint)
+      }
+      agg
+    }
+
+    /**
+     * Do the same thing as binSeqOp, but with nodeIdCache.
+     */
+    def binSeqOpWithNodeIdCache(
+        agg: Array[DTStatsAggregator],
+        dataPoint: (BaggedPoint[TreePoint], Array[Int])): Array[DTStatsAggregator] = {
+      treeToNodeToIndexInfo.foreach { case (treeIndex, nodeIndexToInfo) =>
+        val baggedPoint = dataPoint._1
+        val nodeIdCache = dataPoint._2
+        val nodeIndex = nodeIdCache(treeIndex)
+        nodeBinSeqOp(treeIndex, nodeIndexToInfo.getOrElse(nodeIndex, null), agg, baggedPoint)
+      }
+
+      agg
+    }
+
+    /**
+     * Get node index in group --> features indices map,
+     * which is a short cut to find feature indices for a node given node index in group.
+     */
+    def getNodeToFeatures(
+        treeToNodeToIndexInfo: Map[Int, Map[Int, NodeIndexInfo]]): Option[Map[Int, Array[Int]]] = {
+      if (!metadata.subsamplingFeatures) {
+        None
+      } else {
+        val mutableNodeToFeatures = new mutable.HashMap[Int, Array[Int]]()
+        treeToNodeToIndexInfo.values.foreach { nodeIdToNodeInfo =>
+          nodeIdToNodeInfo.values.foreach { nodeIndexInfo =>
+            assert(nodeIndexInfo.featureSubset.isDefined)
+            mutableNodeToFeatures(nodeIndexInfo.nodeIndexInGroup) = nodeIndexInfo.featureSubset.get
+          }
+        }
+        Some(mutableNodeToFeatures.toMap)
+      }
+    }
+
+    // array of nodes to train indexed by node index in group
+    val nodes = new Array[LearningNode](numNodes)
+    nodesForGroup.foreach { case (treeIndex, nodesForTree) =>
+      nodesForTree.foreach { node =>
+        nodes(treeToNodeToIndexInfo(treeIndex)(node.id).nodeIndexInGroup) = node
+      }
+    }
+
+    // Calculate best splits for all nodes in the group
+    timer.start("chooseSplits")
+
+    // In each partition, iterate all instances and compute aggregate stats for each node,
+    // yield an (nodeIndex, nodeAggregateStats) pair for each node.
+    // After a `reduceByKey` operation,
+    // stats of a node will be shuffled to a particular partition and be combined together,
+    // then best splits for nodes are found there.
+    // Finally, only best Splits for nodes are collected to driver to construct decision tree.
+    val nodeToFeatures = getNodeToFeatures(treeToNodeToIndexInfo)
+    val nodeToFeaturesBc = input.sparkContext.broadcast(nodeToFeatures)
+
+    val partitionAggregates : RDD[(Int, DTStatsAggregator)] = if (nodeIdCache.nonEmpty) {
+      input.zip(nodeIdCache.get.nodeIdsForInstances).mapPartitions { points =>
+        // Construct a nodeStatsAggregators array to hold node aggregate stats,
+        // each node will have a nodeStatsAggregator
+        val nodeStatsAggregators = Array.tabulate(numNodes) { nodeIndex =>
+          val featuresForNode = nodeToFeaturesBc.value.flatMap { nodeToFeatures =>
+            Some(nodeToFeatures(nodeIndex))
+          }
+          new DTStatsAggregator(metadata, featuresForNode)
+        }
+
+        // iterator all instances in current partition and update aggregate stats
+        points.foreach(binSeqOpWithNodeIdCache(nodeStatsAggregators, _))
+
+        // transform nodeStatsAggregators array to (nodeIndex, nodeAggregateStats) pairs,
+        // which can be combined with other partition using `reduceByKey`
+        nodeStatsAggregators.view.zipWithIndex.map(_.swap).iterator
+      }
+    } else {
+      input.mapPartitions { points =>
+        // Construct a nodeStatsAggregators array to hold node aggregate stats,
+        // each node will have a nodeStatsAggregator
+        val nodeStatsAggregators = Array.tabulate(numNodes) { nodeIndex =>
+          val featuresForNode = nodeToFeaturesBc.value.flatMap { nodeToFeatures =>
+            Some(nodeToFeatures(nodeIndex))
+          }
+          new DTStatsAggregator(metadata, featuresForNode)
+        }
+
+        // iterator all instances in current partition and update aggregate stats
+        points.foreach(binSeqOp(nodeStatsAggregators, _))
+
+        // transform nodeStatsAggregators array to (nodeIndex, nodeAggregateStats) pairs,
+        // which can be combined with other partition using `reduceByKey`
+        nodeStatsAggregators.view.zipWithIndex.map(_.swap).iterator
+      }
+    }
+
+    val nodeToBestSplits = partitionAggregates.reduceByKey((a, b) => a.merge(b)).map {
+      case (nodeIndex, aggStats) =>
+        val featuresForNode = nodeToFeaturesBc.value.flatMap { nodeToFeatures =>
+          Some(nodeToFeatures(nodeIndex))
+        }
+
+        // find best split for each node
+        val (split: Split, stats: InformationGainStats, predict: Predict) =
+          binsToBestSplit(aggStats, splits, featuresForNode, nodes(nodeIndex))
+        (nodeIndex, (split, stats, predict))
+    }.collectAsMap()
+
+    timer.stop("chooseSplits")
+
+    val nodeIdUpdaters = if (nodeIdCache.nonEmpty) {
+      Array.fill[mutable.Map[Int, NodeIndexUpdater]](
+        metadata.numTrees)(mutable.Map[Int, NodeIndexUpdater]())
+    } else {
+      null
+    }
+    // Iterate over all nodes in this group.
+    nodesForGroup.foreach { case (treeIndex, nodesForTree) =>
+      nodesForTree.foreach { node =>
+        val nodeIndex = node.id
+        val nodeInfo = treeToNodeToIndexInfo(treeIndex)(nodeIndex)
+        val aggNodeIndex = nodeInfo.nodeIndexInGroup
+        val (split: Split, stats: InformationGainStats, predict: Predict) =
+          nodeToBestSplits(aggNodeIndex)
+        logDebug("best split = " + split)
+
+        // Extract info for this node.  Create children if not leaf.
+        val isLeaf =
+          (stats.gain <= 0) || (LearningNode.indexToLevel(nodeIndex) == metadata.maxDepth)
+        node.predictionStats = predict
+        node.isLeaf = isLeaf
+        node.stats = Some(stats)
+        node.impurity = stats.impurity
+        logDebug("Node = " + node)
+
+        if (!isLeaf) {
+          node.split = Some(split)
+          val childIsLeaf = (LearningNode.indexToLevel(nodeIndex) + 1) == metadata.maxDepth
+          val leftChildIsLeaf = childIsLeaf || (stats.leftImpurity == 0.0)
+          val rightChildIsLeaf = childIsLeaf || (stats.rightImpurity == 0.0)
+          node.leftChild = Some(LearningNode(LearningNode.leftChildIndex(nodeIndex),
+            stats.leftPredict, stats.leftImpurity, leftChildIsLeaf))
+          node.rightChild = Some(LearningNode(LearningNode.rightChildIndex(nodeIndex),
+            stats.rightPredict, stats.rightImpurity, rightChildIsLeaf))
+
+          if (nodeIdCache.nonEmpty) {
+            val nodeIndexUpdater = NodeIndexUpdater(
+              split = split,
+              nodeIndex = nodeIndex)
+            nodeIdUpdaters(treeIndex).put(nodeIndex, nodeIndexUpdater)
+          }
+
+          // enqueue left child and right child if they are not leaves
+          if (!leftChildIsLeaf) {
+            nodeQueue.enqueue((treeIndex, node.leftChild.get))
+          }
+          if (!rightChildIsLeaf) {
+            nodeQueue.enqueue((treeIndex, node.rightChild.get))
+          }
+
+          logDebug("leftChildIndex = " + node.leftChild.get.id +
+            ", impurity = " + stats.leftImpurity)
+          logDebug("rightChildIndex = " + node.rightChild.get.id +
+            ", impurity = " + stats.rightImpurity)
+        }
+      }
+    }
+
+    if (nodeIdCache.nonEmpty) {
+      // Update the cache if needed.
+      nodeIdCache.get.updateNodeIndices(input, nodeIdUpdaters, splits)
+    }
+  }
+
+  /**
+   * Calculate the information gain for a given (feature, split) based upon left/right aggregates.
+   * @param leftImpurityCalculator left node aggregates for this (feature, split)
+   * @param rightImpurityCalculator right node aggregate for this (feature, split)
+   * @return information gain and statistics for split
+   */
+  private def calculateGainForSplit(
+      leftImpurityCalculator: ImpurityCalculator,
+      rightImpurityCalculator: ImpurityCalculator,
+      metadata: DecisionTreeMetadata,
+      impurity: Double): InformationGainStats = {
+    val leftCount = leftImpurityCalculator.count
+    val rightCount = rightImpurityCalculator.count
+
+    // If left child or right child doesn't satisfy minimum instances per node,
+    // then this split is invalid, return invalid information gain stats.
+    if ((leftCount < metadata.minInstancesPerNode) ||
+      (rightCount < metadata.minInstancesPerNode)) {
+      return InformationGainStats.invalidInformationGainStats
+    }
+
+    val totalCount = leftCount + rightCount
+
+    val leftImpurity = leftImpurityCalculator.calculate() // Note: This equals 0 if count = 0
+    val rightImpurity = rightImpurityCalculator.calculate()
+
+    val leftWeight = leftCount / totalCount.toDouble
+    val rightWeight = rightCount / totalCount.toDouble
+
+    val gain = impurity - leftWeight * leftImpurity - rightWeight * rightImpurity
+
+    // if information gain doesn't satisfy minimum information gain,
+    // then this split is invalid, return invalid information gain stats.
+    if (gain < metadata.minInfoGain) {
+      return InformationGainStats.invalidInformationGainStats
+    }
+
+    // calculate left and right predict
+    val leftPredict = calculatePredict(leftImpurityCalculator)
+    val rightPredict = calculatePredict(rightImpurityCalculator)
+
+    new InformationGainStats(gain, impurity, leftImpurity, rightImpurity,
+      leftPredict, rightPredict)
+  }
+
+  private def calculatePredict(impurityCalculator: ImpurityCalculator): Predict = {
+    val predict = impurityCalculator.predict
+    val prob = impurityCalculator.prob(predict)
+    new Predict(predict, prob)
+  }
+
+  /**
+   * Calculate predict value for current node, given stats of any split.
+   * Note that this function is called only once for each node.
+   * @param leftImpurityCalculator left node aggregates for a split
+   * @param rightImpurityCalculator right node aggregates for a split
+   * @return predict value and impurity for current node
+   */
+  private def calculatePredictImpurity(
+      leftImpurityCalculator: ImpurityCalculator,
+      rightImpurityCalculator: ImpurityCalculator): (Predict, Double) = {
+    val parentNodeAgg = leftImpurityCalculator.copy
+    parentNodeAgg.add(rightImpurityCalculator)
+    val predict = calculatePredict(parentNodeAgg)
+    val impurity = parentNodeAgg.calculate()
+
+    (predict, impurity)
+  }
+
+  /**
+   * Find the best split for a node.
+   * @param binAggregates Bin statistics.
+   * @return tuple for best split: (Split, information gain, prediction at node)
+   */
+  private def binsToBestSplit(
+      binAggregates: DTStatsAggregator,
+      splits: Array[Array[Split]],
+      featuresForNode: Option[Array[Int]],
+      node: LearningNode): (Split, InformationGainStats, Predict) = {
+
+    // Calculate prediction and impurity if current node is top node
+    val level = LearningNode.indexToLevel(node.id)
+    var predictionAndImpurity: Option[(Predict, Double)] = if (level == 0) {
+      None
+    } else {
+      Some((node.predictionStats, node.impurity))
+    }
+
+    // For each (feature, split), calculate the gain, and select the best (feature, split).
+    val (bestSplit, bestSplitStats) =
+      Range(0, binAggregates.metadata.numFeaturesPerNode).map { featureIndexIdx =>
+        val featureIndex = if (featuresForNode.nonEmpty) {
+          featuresForNode.get.apply(featureIndexIdx)
+        } else {
+          featureIndexIdx
+        }
+        val numSplits = binAggregates.metadata.numSplits(featureIndex)
+        if (binAggregates.metadata.isContinuous(featureIndex)) {
+          // Cumulative sum (scanLeft) of bin statistics.
+          // Afterwards, binAggregates for a bin is the sum of aggregates for
+          // that bin + all preceding bins.
+          val nodeFeatureOffset = binAggregates.getFeatureOffset(featureIndexIdx)
+          var splitIndex = 0
+          while (splitIndex < numSplits) {
+            binAggregates.mergeForFeature(nodeFeatureOffset, splitIndex + 1, splitIndex)
+            splitIndex += 1
+          }
+          // Find best split.
+          val (bestFeatureSplitIndex, bestFeatureGainStats) =
+            Range(0, numSplits).map { case splitIdx =>
+              val leftChildStats = binAggregates.getImpurityCalculator(nodeFeatureOffset, splitIdx)
+              val rightChildStats =
+                binAggregates.getImpurityCalculator(nodeFeatureOffset, numSplits)
+              rightChildStats.subtract(leftChildStats)
+              predictionAndImpurity = Some(predictionAndImpurity.getOrElse(
+                calculatePredictImpurity(leftChildStats, rightChildStats)))
+              val gainStats = calculateGainForSplit(leftChildStats,
+                rightChildStats, binAggregates.metadata, predictionAndImpurity.get._2)
+              (splitIdx, gainStats)
+            }.maxBy(_._2.gain)
+          (splits(featureIndex)(bestFeatureSplitIndex), bestFeatureGainStats)
+        } else if (binAggregates.metadata.isUnordered(featureIndex)) {
+          // Unordered categorical feature
+          val (leftChildOffset, rightChildOffset) =
+            binAggregates.getLeftRightFeatureOffsets(featureIndexIdx)
+          val (bestFeatureSplitIndex, bestFeatureGainStats) =
+            Range(0, numSplits).map { splitIndex =>
+              val leftChildStats = binAggregates.getImpurityCalculator(leftChildOffset, splitIndex)
+              val rightChildStats =
+                binAggregates.getImpurityCalculator(rightChildOffset, splitIndex)
+              predictionAndImpurity = Some(predictionAndImpurity.getOrElse(
+                calculatePredictImpurity(leftChildStats, rightChildStats)))
+              val gainStats = calculateGainForSplit(leftChildStats,
+                rightChildStats, binAggregates.metadata, predictionAndImpurity.get._2)
+              (splitIndex, gainStats)
+            }.maxBy(_._2.gain)
+          (splits(featureIndex)(bestFeatureSplitIndex), bestFeatureGainStats)
+        } else {
+          // Ordered categorical feature
+          val nodeFeatureOffset = binAggregates.getFeatureOffset(featureIndexIdx)
+          val numCategories = binAggregates.metadata.numBins(featureIndex)
+
+          /* Each bin is one category (feature value).
+           * The bins are ordered based on centroidForCategories, and this ordering determines which
+           * splits are considered.  (With K categories, we consider K - 1 possible splits.)
+           *
+           * centroidForCategories is a list: (category, centroid)
+           */
+          val centroidForCategories = if (binAggregates.metadata.isMulticlass) {
+            // For categorical variables in multiclass classification,
+            // the bins are ordered by the impurity of their corresponding labels.
+            Range(0, numCategories).map { case featureValue =>
+              val categoryStats =
+                binAggregates.getImpurityCalculator(nodeFeatureOffset, featureValue)
+              val centroid = if (categoryStats.count != 0) {
+                categoryStats.calculate()
+              } else {
+                Double.MaxValue
+              }
+              (featureValue, centroid)
+            }
+          } else { // regression or binary classification
+            // For categorical variables in regression and binary classification,
+            // the bins are ordered by the centroid of their corresponding labels.
+            Range(0, numCategories).map { case featureValue =>
+              val categoryStats =
+                binAggregates.getImpurityCalculator(nodeFeatureOffset, featureValue)
+              val centroid = if (categoryStats.count != 0) {
+                categoryStats.predict
+              } else {
+                Double.MaxValue
+              }
+              (featureValue, centroid)
+            }
+          }
+
+          logDebug("Centroids for categorical variable: " + centroidForCategories.mkString(","))
+
+          // bins sorted by centroids
+          val categoriesSortedByCentroid = centroidForCategories.toList.sortBy(_._2)
+
+          logDebug("Sorted centroids for categorical variable = " +
+            categoriesSortedByCentroid.mkString(","))
+
+          // Cumulative sum (scanLeft) of bin statistics.
+          // Afterwards, binAggregates for a bin is the sum of aggregates for
+          // that bin + all preceding bins.
+          var splitIndex = 0
+          while (splitIndex < numSplits) {
+            val currentCategory = categoriesSortedByCentroid(splitIndex)._1
+            val nextCategory = categoriesSortedByCentroid(splitIndex + 1)._1
+            binAggregates.mergeForFeature(nodeFeatureOffset, nextCategory, currentCategory)
+            splitIndex += 1
+          }
+          // lastCategory = index of bin with total aggregates for this (node, feature)
+          val lastCategory = categoriesSortedByCentroid.last._1
+          // Find best split.
+          val (bestFeatureSplitIndex, bestFeatureGainStats) =
+            Range(0, numSplits).map { splitIndex =>
+              val featureValue = categoriesSortedByCentroid(splitIndex)._1
+              val leftChildStats =
+                binAggregates.getImpurityCalculator(nodeFeatureOffset, featureValue)
+              val rightChildStats =
+                binAggregates.getImpurityCalculator(nodeFeatureOffset, lastCategory)
+              rightChildStats.subtract(leftChildStats)
+              predictionAndImpurity = Some(predictionAndImpurity.getOrElse(
+                calculatePredictImpurity(leftChildStats, rightChildStats)))
+              val gainStats = calculateGainForSplit(leftChildStats,
+                rightChildStats, binAggregates.metadata, predictionAndImpurity.get._2)
+              (splitIndex, gainStats)
+            }.maxBy(_._2.gain)
+          val categoriesForSplit =
+            categoriesSortedByCentroid.map(_._1.toDouble).slice(0, bestFeatureSplitIndex + 1)
+          val bestFeatureSplit =
+            new CategoricalSplit(featureIndex, categoriesForSplit.toArray, numCategories)
+          (bestFeatureSplit, bestFeatureGainStats)
+        }
+      }.maxBy(_._2.gain)
+
+    (bestSplit, bestSplitStats, predictionAndImpurity.get._1)
+  }
+
+  /**
+   * Returns splits and bins for decision tree calculation.
+   * Continuous and categorical features are handled differently.
+   *
+   * Continuous features:
+   *   For each feature, there are numBins - 1 possible splits representing the possible binary
+   *   decisions at each node in the tree.
+   *   This finds locations (feature values) for splits using a subsample of the data.
+   *
+   * Categorical features:
+   *   For each feature, there is 1 bin per split.
+   *   Splits and bins are handled in 2 ways:
+   *   (a) "unordered features"
+   *       For multiclass classification with a low-arity feature
+   *       (i.e., if isMulticlass && isSpaceSufficientForAllCategoricalSplits),
+   *       the feature is split based on subsets of categories.
+   *   (b) "ordered features"
+   *       For regression and binary classification,
+   *       and for multiclass classification with a high-arity feature,
+   *       there is one bin per category.
+   *
+   * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
+   * @param metadata Learning and dataset metadata
+   * @return A tuple of (splits, bins).
+   *         Splits is an Array of [[org.apache.spark.mllib.tree.model.Split]]
+   *          of size (numFeatures, numSplits).
+   *         Bins is an Array of [[org.apache.spark.mllib.tree.model.Bin]]
+   *          of size (numFeatures, numBins).
+   */
+  protected[tree] def findSplits(
+      input: RDD[LabeledPoint],
+      metadata: DecisionTreeMetadata): Array[Array[Split]] = {
+
+    logDebug("isMulticlass = " + metadata.isMulticlass)
+
+    val numFeatures = metadata.numFeatures
+
+    // Sample the input only if there are continuous features.
+    val hasContinuousFeatures = Range(0, numFeatures).exists(metadata.isContinuous)
+    val sampledInput = if (hasContinuousFeatures) {
+      // Calculate the number of samples for approximate quantile calculation.
+      val requiredSamples = math.max(metadata.maxBins * metadata.maxBins, 10000)
+      val fraction = if (requiredSamples < metadata.numExamples) {
+        requiredSamples.toDouble / metadata.numExamples
+      } else {
+        1.0
+      }
+      logDebug("fraction of data used for calculating quantiles = " + fraction)
+      input.sample(withReplacement = false, fraction, new XORShiftRandom(1).nextInt()).collect()
+    } else {
+      new Array[LabeledPoint](0)
+    }
+
+    val splits = new Array[Array[Split]](numFeatures)
+
+    // Find all splits.
+    // Iterate over all features.
+    var featureIndex = 0
+    while (featureIndex < numFeatures) {
+      if (metadata.isContinuous(featureIndex)) {
+        val featureSamples = sampledInput.map(_.features(featureIndex))
+        val featureSplits = findSplitsForContinuousFeature(featureSamples, metadata, featureIndex)
+
+        val numSplits = featureSplits.length
+        logDebug(s"featureIndex = $featureIndex, numSplits = $numSplits")
+        splits(featureIndex) = new Array[Split](numSplits)
+
+        var splitIndex = 0
+        while (splitIndex < numSplits) {
+          val threshold = featureSplits(splitIndex)
+          splits(featureIndex)(splitIndex) = new ContinuousSplit(featureIndex, threshold)
+          splitIndex += 1
+        }
+      } else {
+        // Categorical feature
+        if (metadata.isUnordered(featureIndex)) {
+          val numSplits = metadata.numSplits(featureIndex)
+          val featureArity = metadata.featureArity(featureIndex)
+          // TODO: Use an implicit representation mapping each category to a subset of indices.
+          //       I.e., track indices such that we can calculate the set of bins for which
+          //       feature value x splits to the left.
+          // Unordered features
+          // 2^(maxFeatureValue - 1) - 1 combinations
+          splits(featureIndex) = new Array[Split](numSplits)
+          var splitIndex = 0
+          while (splitIndex < numSplits) {
+            val categories: List[Double] =
+              extractMultiClassCategories(splitIndex + 1, featureArity)
+            splits(featureIndex)(splitIndex) =
+              new CategoricalSplit(featureIndex, categories.toArray, featureArity)
+            splitIndex += 1
+          }
+        } else {
+          // Ordered features
+          //   Bins correspond to feature values, so we do not need to compute splits or bins
+          //   beforehand.  Splits are constructed as needed during training.
+          splits(featureIndex) = new Array[Split](0)
+        }
+      }
+      featureIndex += 1
+    }
+    splits
+  }
+
+  /**
+   * Nested method to extract list of eligible categories given an index. It extracts the
+   * position of ones in a binary representation of the input. If binary
+   * representation of an number is 01101 (13), the output list should (3.0, 2.0,
+   * 0.0). The maxFeatureValue depict the number of rightmost digits that will be tested for ones.
+   */
+  private[tree] def extractMultiClassCategories(
+      input: Int,
+      maxFeatureValue: Int): List[Double] = {
+    var categories = List[Double]()
+    var j = 0
+    var bitShiftedInput = input
+    while (j < maxFeatureValue) {
+      if (bitShiftedInput % 2 != 0) {
+        // updating the list of categories.
+        categories = j.toDouble :: categories
+      }
+      // Right shift by one
+      bitShiftedInput = bitShiftedInput >> 1
+      j += 1
+    }
+    categories
+  }
+
+  /**
+   * Find splits for a continuous feature
+   * NOTE: Returned number of splits is set based on `featureSamples` and
+   *       could be different from the specified `numSplits`.
+   *       The `numSplits` attribute in the `DecisionTreeMetadata` class will be set accordingly.
+   * @param featureSamples feature values of each sample
+   * @param metadata decision tree metadata
+   *                 NOTE: `metadata.numbins` will be changed accordingly
+   *                       if there are not enough splits to be found
+   * @param featureIndex feature index to find splits
+   * @return array of splits
+   */
+  private[tree] def findSplitsForContinuousFeature(
+      featureSamples: Array[Double],
+      metadata: DecisionTreeMetadata,
+      featureIndex: Int): Array[Double] = {
+    require(metadata.isContinuous(featureIndex),
+      "findSplitsForContinuousFeature can only be used to find splits for a continuous feature.")
+
+    val splits = {
+      val numSplits = metadata.numSplits(featureIndex)
+
+      // get count for each distinct value
+      val valueCountMap = featureSamples.foldLeft(Map.empty[Double, Int]) { (m, x) =>
+        m + ((x, m.getOrElse(x, 0) + 1))
+      }
+      // sort distinct values
+      val valueCounts = valueCountMap.toSeq.sortBy(_._1).toArray
+
+      // if possible splits is not enough or just enough, just return all possible splits
+      val possibleSplits = valueCounts.length
+      if (possibleSplits <= numSplits) {
+        valueCounts.map(_._1)
+      } else {
+        // stride between splits
+        val stride: Double = featureSamples.length.toDouble / (numSplits + 1)
+        logDebug("stride = " + stride)
+
+        // iterate `valueCount` to find splits
+        val splitsBuilder = mutable.ArrayBuilder.make[Double]
+        var index = 1
+        // currentCount: sum of counts of values that have been visited
+        var currentCount = valueCounts(0)._2
+        // targetCount: target value for `currentCount`.
+        // If `currentCount` is closest value to `targetCount`,
+        // then current value is a split threshold.
+        // After finding a split threshold, `targetCount` is added by stride.
+        var targetCount = stride
+        while (index < valueCounts.length) {
+          val previousCount = currentCount
+          currentCount += valueCounts(index)._2
+          val previousGap = math.abs(previousCount - targetCount)
+          val currentGap = math.abs(currentCount - targetCount)
+          // If adding count of current value to currentCount
+          // makes the gap between currentCount and targetCount smaller,
+          // previous value is a split threshold.
+          if (previousGap < currentGap) {
+            splitsBuilder += valueCounts(index - 1)._1
+            targetCount += stride
+          }
+          index += 1
+        }
+
+        splitsBuilder.result()
+      }
+    }
+
+    // TODO: Do not fail; just ignore the useless feature.
+    assert(splits.length > 0,
+      s"DecisionTree could not handle feature $featureIndex since it had only 1 unique value." +
+        "  Please remove this feature and then try again.")
+    // set number of splits accordingly
+    metadata.setNumSplits(featureIndex, splits.length)
+
+    splits
+  }
+
+  private[tree] class NodeIndexInfo(
+      val nodeIndexInGroup: Int,
+      val featureSubset: Option[Array[Int]]) extends Serializable
+
+  /**
+   * Pull nodes off of the queue, and collect a group of nodes to be split on this iteration.
+   * This tracks the memory usage for aggregates and stops adding nodes when too much memory
+   * will be needed; this allows an adaptive number of nodes since different nodes may require
+   * different amounts of memory (if featureSubsetStrategy is not "all").
+   *
+   * @param nodeQueue  Queue of nodes to split.
+   * @param maxMemoryUsage  Bound on size of aggregate statistics.
+   * @return  (nodesForGroup, treeToNodeToIndexInfo).
+   *          nodesForGroup holds the nodes to split: treeIndex --> nodes in tree.
+   *
+   *          treeToNodeToIndexInfo holds indices selected features for each node:
+   *            treeIndex --> (global) node index --> (node index in group, feature indices).
+   *          The (global) node index is the index in the tree; the node index in group is the
+   *           index in [0, numNodesInGroup) of the node in this group.
+   *          The feature indices are None if not subsampling features.
+   */
+  private[tree] def selectNodesToSplit(
+      nodeQueue: mutable.Queue[(Int, LearningNode)],
+      maxMemoryUsage: Long,
+      metadata: DecisionTreeMetadata,
+      rng: Random): (Map[Int, Array[LearningNode]], Map[Int, Map[Int, NodeIndexInfo]]) = {
+    // Collect some nodes to split:
+    //  nodesForGroup(treeIndex) = nodes to split
+    val mutableNodesForGroup = new mutable.HashMap[Int, mutable.ArrayBuffer[LearningNode]]()
+    val mutableTreeToNodeToIndexInfo =
+      new mutable.HashMap[Int, mutable.HashMap[Int, NodeIndexInfo]]()
+    var memUsage: Long = 0L
+    var numNodesInGroup = 0
+    while (nodeQueue.nonEmpty && memUsage < maxMemoryUsage) {
+      val (treeIndex, node) = nodeQueue.head
+      // Choose subset of features for node (if subsampling).
+      val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
+        Some(SamplingUtils.reservoirSampleAndCount(Range(0,
+          metadata.numFeatures).iterator, metadata.numFeaturesPerNode, rng.nextLong())._1)
+      } else {
+        None
+      }
+      // Check if enough memory remains to add this node to the group.
+      val nodeMemUsage = RandomForest.aggregateSizeForNode(metadata, featureSubset) * 8L
+      if (memUsage + nodeMemUsage <= maxMemoryUsage) {
+        nodeQueue.dequeue()
+        mutableNodesForGroup.getOrElseUpdate(treeIndex, new mutable.ArrayBuffer[LearningNode]()) +=
+          node
+        mutableTreeToNodeToIndexInfo
+          .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id)
+          = new NodeIndexInfo(numNodesInGroup, featureSubset)
+      }
+      numNodesInGroup += 1
+      memUsage += nodeMemUsage
+    }
+    // Convert mutable maps to immutable ones.
+    val nodesForGroup: Map[Int, Array[LearningNode]] =
+      mutableNodesForGroup.mapValues(_.toArray).toMap
+    val treeToNodeToIndexInfo = mutableTreeToNodeToIndexInfo.mapValues(_.toMap).toMap
+    (nodesForGroup, treeToNodeToIndexInfo)
+  }
+
+  /**
+   * Get the number of values to be stored for this node in the bin aggregates.
+   * @param featureSubset  Indices of features which may be split at this node.
+   *                       If None, then use all features.
+   */
+  private def aggregateSizeForNode(
+      metadata: DecisionTreeMetadata,
+      featureSubset: Option[Array[Int]]): Long = {
+    val totalBins = if (featureSubset.nonEmpty) {
+      featureSubset.get.map(featureIndex => metadata.numBins(featureIndex).toLong).sum
+    } else {
+      metadata.numBins.map(_.toLong).sum
+    }
+    if (metadata.isClassification) {
+      metadata.numClasses * totalBins
+    } else {
+      3 * totalBins
+    }
+  }
+
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/TreePoint.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/TreePoint.scala
new file mode 100644
index 0000000000000..9fa27e5e1f721
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/TreePoint.scala
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tree.impl
+
+import org.apache.spark.ml.tree.{ContinuousSplit, Split}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.tree.impl.DecisionTreeMetadata
+import org.apache.spark.rdd.RDD
+
+
+/**
+ * Internal representation of LabeledPoint for DecisionTree.
+ * This bins feature values based on a subsampled of data as follows:
+ *  (a) Continuous features are binned into ranges.
+ *  (b) Unordered categorical features are binned based on subsets of feature values.
+ *      "Unordered categorical features" are categorical features with low arity used in
+ *      multiclass classification.
+ *  (c) Ordered categorical features are binned based on feature values.
+ *      "Ordered categorical features" are categorical features with high arity,
+ *      or any categorical feature used in regression or binary classification.
+ *
+ * @param label  Label from LabeledPoint
+ * @param binnedFeatures  Binned feature values.
+ *                        Same length as LabeledPoint.features, but values are bin indices.
+ */
+private[spark] class TreePoint(val label: Double, val binnedFeatures: Array[Int])
+  extends Serializable {
+}
+
+private[spark] object TreePoint {
+
+  /**
+   * Convert an input dataset into its TreePoint representation,
+   * binning feature values in preparation for DecisionTree training.
+   * @param input     Input dataset.
+   * @param splits    Splits for features, of size (numFeatures, numSplits).
+   * @param metadata  Learning and dataset metadata
+   * @return  TreePoint dataset representation
+   */
+  def convertToTreeRDD(
+      input: RDD[LabeledPoint],
+      splits: Array[Array[Split]],
+      metadata: DecisionTreeMetadata): RDD[TreePoint] = {
+    // Construct arrays for featureArity for efficiency in the inner loop.
+    val featureArity: Array[Int] = new Array[Int](metadata.numFeatures)
+    var featureIndex = 0
+    while (featureIndex < metadata.numFeatures) {
+      featureArity(featureIndex) = metadata.featureArity.getOrElse(featureIndex, 0)
+      featureIndex += 1
+    }
+    val thresholds: Array[Array[Double]] = featureArity.zipWithIndex.map { case (arity, idx) =>
+      if (arity == 0) {
+        splits(idx).map(_.asInstanceOf[ContinuousSplit].threshold)
+      } else {
+        Array.empty[Double]
+      }
+    }
+    input.map { x =>
+      TreePoint.labeledPointToTreePoint(x, thresholds, featureArity)
+    }
+  }
+
+  /**
+   * Convert one LabeledPoint into its TreePoint representation.
+   * @param thresholds  For each feature, split thresholds for continuous features,
+   *                    empty for categorical features.
+   * @param featureArity  Array indexed by feature, with value 0 for continuous and numCategories
+   *                      for categorical features.
+   */
+  private def labeledPointToTreePoint(
+      labeledPoint: LabeledPoint,
+      thresholds: Array[Array[Double]],
+      featureArity: Array[Int]): TreePoint = {
+    val numFeatures = labeledPoint.features.size
+    val arr = new Array[Int](numFeatures)
+    var featureIndex = 0
+    while (featureIndex < numFeatures) {
+      arr(featureIndex) =
+        findBin(featureIndex, labeledPoint, featureArity(featureIndex), thresholds(featureIndex))
+      featureIndex += 1
+    }
+    new TreePoint(labeledPoint.label, arr)
+  }
+
+  /**
+   * Find discretized value for one (labeledPoint, feature).
+   *
+   * NOTE: We cannot use Bucketizer since it handles split thresholds differently than the old
+   *       (mllib) tree API.  We want to maintain the same behavior as the old tree API.
+   *
+   * @param featureArity  0 for continuous features; number of categories for categorical features.
+   */
+  private def findBin(
+      featureIndex: Int,
+      labeledPoint: LabeledPoint,
+      featureArity: Int,
+      thresholds: Array[Double]): Int = {
+    val featureValue = labeledPoint.features(featureIndex)
+
+    if (featureArity == 0) {
+      val idx = java.util.Arrays.binarySearch(thresholds, featureValue)
+      if (idx >= 0) {
+        idx
+      } else {
+        -idx - 1
+      }
+    } else {
+      // Categorical feature bins are indexed by feature values.
+      if (featureValue < 0 || featureValue >= featureArity) {
+        throw new IllegalArgumentException(
+          s"DecisionTree given invalid data:" +
+            s" Feature $featureIndex is categorical with values in {0,...,${featureArity - 1}," +
+            s" but a data point gives it value $featureValue.\n" +
+            "  Bad data point: " + labeledPoint.toString)
+      }
+      featureValue.toInt
+    }
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/BaggedPoint.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/BaggedPoint.scala
index 089010c81ffb6..572815df0bc4a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/BaggedPoint.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/BaggedPoint.scala
@@ -38,10 +38,10 @@ import org.apache.spark.util.random.XORShiftRandom
  * TODO: This does not currently support (Double) weighted instances.  Once MLlib has weighted
  *       dataset support, update.  (We store subsampleWeights as Double for this future extension.)
  */
-private[tree] class BaggedPoint[Datum](val datum: Datum, val subsampleWeights: Array[Double])
+private[spark] class BaggedPoint[Datum](val datum: Datum, val subsampleWeights: Array[Double])
   extends Serializable
 
-private[tree] object BaggedPoint {
+private[spark] object BaggedPoint {
 
   /**
    * Convert an input dataset into its BaggedPoint representation,
@@ -60,7 +60,7 @@ private[tree] object BaggedPoint {
       subsamplingRate: Double,
       numSubsamples: Int,
       withReplacement: Boolean,
-      seed: Int = Utils.random.nextInt()): RDD[BaggedPoint[Datum]] = {
+      seed: Long = Utils.random.nextLong()): RDD[BaggedPoint[Datum]] = {
     if (withReplacement) {
       convertToBaggedRDDSamplingWithReplacement(input, subsamplingRate, numSubsamples, seed)
     } else {
@@ -76,7 +76,7 @@ private[tree] object BaggedPoint {
       input: RDD[Datum],
       subsamplingRate: Double,
       numSubsamples: Int,
-      seed: Int): RDD[BaggedPoint[Datum]] = {
+      seed: Long): RDD[BaggedPoint[Datum]] = {
     input.mapPartitionsWithIndex { (partitionIndex, instances) =>
       // Use random seed = seed + partitionIndex + 1 to make generation reproducible.
       val rng = new XORShiftRandom
@@ -100,7 +100,7 @@ private[tree] object BaggedPoint {
       input: RDD[Datum],
       subsample: Double,
       numSubsamples: Int,
-      seed: Int): RDD[BaggedPoint[Datum]] = {
+      seed: Long): RDD[BaggedPoint[Datum]] = {
     input.mapPartitionsWithIndex { (partitionIndex, instances) =>
       // Use random seed = seed + partitionIndex + 1 to make generation reproducible.
       val poisson = new PoissonDistribution(subsample)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DTStatsAggregator.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DTStatsAggregator.scala
index ce8825cc03229..7985ed4b4c0fa 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DTStatsAggregator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DTStatsAggregator.scala
@@ -27,7 +27,7 @@ import org.apache.spark.mllib.tree.impurity._
  * and helps with indexing.
  * This class is abstract to support learning with and without feature subsampling.
  */
-private[tree] class DTStatsAggregator(
+private[spark] class DTStatsAggregator(
     val metadata: DecisionTreeMetadata,
     featureSubset: Option[Array[Int]]) extends Serializable {
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
index f73896e37c05e..380291ac22bd3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
@@ -37,7 +37,7 @@ import org.apache.spark.rdd.RDD
  *                      I.e., the feature takes values in {0, ..., arity - 1}.
  * @param numBins  Number of bins for each feature.
  */
-private[tree] class DecisionTreeMetadata(
+private[spark] class DecisionTreeMetadata(
     val numFeatures: Int,
     val numExamples: Long,
     val numClasses: Int,
@@ -94,7 +94,7 @@ private[tree] class DecisionTreeMetadata(
 
 }
 
-private[tree] object DecisionTreeMetadata extends Logging {
+private[spark] object DecisionTreeMetadata extends Logging {
 
   /**
    * Construct a [[DecisionTreeMetadata]] instance for this dataset and parameters.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/NodeIdCache.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/NodeIdCache.scala
index bdd0f576b048d..8f9eb24b57b55 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/NodeIdCache.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/NodeIdCache.scala
@@ -75,7 +75,7 @@ private[tree] case class NodeIndexUpdater(
  *                           (how often should the cache be checkpointed.).
  */
 @DeveloperApi
-private[tree] class NodeIdCache(
+private[spark] class NodeIdCache(
   var nodeIdsForInstances: RDD[Array[Int]],
   val checkpointInterval: Int) {
 
@@ -170,7 +170,7 @@ private[tree] class NodeIdCache(
 }
 
 @DeveloperApi
-private[tree] object NodeIdCache {
+private[spark] object NodeIdCache {
   /**
    * Initialize the node Id cache with initial node Id values.
    * @param data The RDD of training rows.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TimeTracker.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TimeTracker.scala
index d215d68c4279e..aac84243d5ce1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TimeTracker.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TimeTracker.scala
@@ -25,7 +25,7 @@ import org.apache.spark.annotation.Experimental
  * Time tracker implementation which holds labeled timers.
  */
 @Experimental
-private[tree] class TimeTracker extends Serializable {
+private[spark] class TimeTracker extends Serializable {
 
   private val starts: MutableHashMap[String, Long] = new MutableHashMap[String, Long]()
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TreePoint.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TreePoint.scala
index 50b292e71b067..21919d69a38a3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TreePoint.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TreePoint.scala
@@ -37,11 +37,11 @@ import org.apache.spark.rdd.RDD
  * @param binnedFeatures  Binned feature values.
  *                        Same length as LabeledPoint.features, but values are bin indices.
  */
-private[tree] class TreePoint(val label: Double, val binnedFeatures: Array[Int])
+private[spark] class TreePoint(val label: Double, val binnedFeatures: Array[Int])
   extends Serializable {
 }
 
-private[tree] object TreePoint {
+private[spark] object TreePoint {
 
   /**
    * Convert an input dataset into its TreePoint representation,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
index 72eb24c49264a..578749d85a4e6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
@@ -57,7 +57,7 @@ trait Impurity extends Serializable {
  * Note: Instances of this class do not hold the data; they operate on views of the data.
  * @param statsSize  Length of the vector of sufficient statistics for one bin.
  */
-private[tree] abstract class ImpurityAggregator(val statsSize: Int) extends Serializable {
+private[spark] abstract class ImpurityAggregator(val statsSize: Int) extends Serializable {
 
   /**
    * Merge the stats from one bin into another.
@@ -95,7 +95,7 @@ private[tree] abstract class ImpurityAggregator(val statsSize: Int) extends Seri
  * (node, feature, bin).
  * @param stats  Array of sufficient statistics for a (node, feature, bin).
  */
-private[tree] abstract class ImpurityCalculator(val stats: Array[Double]) {
+private[spark] abstract class ImpurityCalculator(val stats: Array[Double]) {
 
   /**
    * Make a deep copy of this [[ImpurityCalculator]].
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
index 2d087c967f679..dc9e0f9f51ffb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
@@ -67,7 +67,7 @@ class InformationGainStats(
 }
 
 
-private[tree] object InformationGainStats {
+private[spark] object InformationGainStats {
   /**
    * An [[org.apache.spark.mllib.tree.model.InformationGainStats]] object to
    * denote that current split doesn't satisfies minimum info gain or
diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaDecisionTreeRegressorSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaDecisionTreeRegressorSuite.java
index 71b041818d7ee..ebe800e749e05 100644
--- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaDecisionTreeRegressorSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaDecisionTreeRegressorSuite.java
@@ -57,7 +57,7 @@ public void runDT() {
     JavaRDD<LabeledPoint> data = sc.parallelize(
       LogisticRegressionSuite.generateLogisticInputAsList(A, B, nPoints, 42), 2).cache();
     Map<Integer, Integer> categoricalFeatures = new HashMap<Integer, Integer>();
-    DataFrame dataFrame = TreeTests.setMetadata(data, categoricalFeatures, 2);
+    DataFrame dataFrame = TreeTests.setMetadata(data, categoricalFeatures, 0);
 
     // This tests setters. Training with various options is tested in Scala.
     DecisionTreeRegressor dt = new DecisionTreeRegressor()

From 358e7bf652d6fedd9377593025cd661c142efeca Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 16 Jul 2015 23:02:06 -0700
Subject: [PATCH 198/549] [SPARK-9126] [MLLIB] do not assert on time taken by
 Thread.sleep()

Measure lower and upper bounds for task time and use them for validation. This PR also implements `Stopwatch.toString`. This suite should finish in less than 1 second.

jkbradley pwendell

Author: Xiangrui Meng <meng@databricks.com>

Closes #7457 from mengxr/SPARK-9126 and squashes the following commits:

4b40faa [Xiangrui Meng] simplify tests
739f5bd [Xiangrui Meng] do not assert on time taken by Thread.sleep()
---
 .../apache/spark/ml/util/stopwatches.scala    |  4 +-
 .../apache/spark/ml/util/StopwatchSuite.scala | 64 ++++++++++++-------
 2 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/stopwatches.scala b/mllib/src/main/scala/org/apache/spark/ml/util/stopwatches.scala
index 5fdf878a3df72..8d4174124b5c4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/stopwatches.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/stopwatches.scala
@@ -67,6 +67,8 @@ private[spark] abstract class Stopwatch extends Serializable {
    */
   def elapsed(): Long
 
+  override def toString: String = s"$name: ${elapsed()}ms"
+
   /**
    * Gets the current time in milliseconds.
    */
@@ -145,7 +147,7 @@ private[spark] class MultiStopwatch(@transient private val sc: SparkContext) ext
 
   override def toString: String = {
     stopwatches.values.toArray.sortBy(_.name)
-      .map(c => s"  ${c.name}: ${c.elapsed()}ms")
+      .map(c => s"  $c")
       .mkString("{\n", ",\n", "\n}")
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/StopwatchSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/util/StopwatchSuite.scala
index 8df6617fe0228..9e6bc7193c13b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/util/StopwatchSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/util/StopwatchSuite.scala
@@ -17,11 +17,15 @@
 
 package org.apache.spark.ml.util
 
+import java.util.Random
+
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 
 class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {
 
+  import StopwatchSuite._
+
   private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
     assert(sw.name === "sw")
     assert(sw.elapsed() === 0L)
@@ -29,18 +33,13 @@ class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {
     intercept[AssertionError] {
       sw.stop()
     }
-    sw.start()
-    Thread.sleep(50)
-    val duration = sw.stop()
-    assert(duration >= 50 && duration < 100) // using a loose upper bound
+    val duration = checkStopwatch(sw)
     val elapsed = sw.elapsed()
     assert(elapsed === duration)
-    sw.start()
-    Thread.sleep(50)
-    val duration2 = sw.stop()
-    assert(duration2 >= 50 && duration2 < 100)
+    val duration2 = checkStopwatch(sw)
     val elapsed2 = sw.elapsed()
     assert(elapsed2 === duration + duration2)
+    assert(sw.toString === s"sw: ${elapsed2}ms")
     sw.start()
     assert(sw.isRunning)
     intercept[AssertionError] {
@@ -61,14 +60,13 @@ class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("DistributedStopwatch on executors") {
     val sw = new DistributedStopwatch(sc, "sw")
     val rdd = sc.parallelize(0 until 4, 4)
+    val acc = sc.accumulator(0L)
     rdd.foreach { i =>
-      sw.start()
-      Thread.sleep(50)
-      sw.stop()
+      acc += checkStopwatch(sw)
     }
     assert(!sw.isRunning)
     val elapsed = sw.elapsed()
-    assert(elapsed >= 200 && elapsed < 400) // using a loose upper bound
+    assert(elapsed === acc.value)
   }
 
   test("MultiStopwatch") {
@@ -81,29 +79,47 @@ class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {
       sw("some")
     }
     assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
-    sw("local").start()
-    sw("spark").start()
-    Thread.sleep(50)
-    sw("local").stop()
-    Thread.sleep(50)
-    sw("spark").stop()
+    val localDuration = checkStopwatch(sw("local"))
+    val sparkDuration = checkStopwatch(sw("spark"))
     val localElapsed = sw("local").elapsed()
     val sparkElapsed = sw("spark").elapsed()
-    assert(localElapsed >= 50 && localElapsed < 100)
-    assert(sparkElapsed >= 100 && sparkElapsed < 200)
+    assert(localElapsed === localDuration)
+    assert(sparkElapsed === sparkDuration)
     assert(sw.toString ===
       s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
     val rdd = sc.parallelize(0 until 4, 4)
+    val acc = sc.accumulator(0L)
     rdd.foreach { i =>
       sw("local").start()
-      sw("spark").start()
-      Thread.sleep(50)
-      sw("spark").stop()
+      val duration = checkStopwatch(sw("spark"))
       sw("local").stop()
+      acc += duration
     }
     val localElapsed2 = sw("local").elapsed()
     assert(localElapsed2 === localElapsed)
     val sparkElapsed2 = sw("spark").elapsed()
-    assert(sparkElapsed2 >= 300 && sparkElapsed2 < 600)
+    assert(sparkElapsed2 === sparkElapsed + acc.value)
   }
 }
+
+private object StopwatchSuite extends SparkFunSuite {
+
+  /**
+   * Checks the input stopwatch on a task that takes a random time (<10ms) to finish. Validates and
+   * returns the duration reported by the stopwatch.
+   */
+  def checkStopwatch(sw: Stopwatch): Long = {
+    val ubStart = now
+    sw.start()
+    val lbStart = now
+    Thread.sleep(new Random().nextInt(10))
+    val lb = now - lbStart
+    val duration = sw.stop()
+    val ub = now - ubStart
+    assert(duration >= lb && duration <= ub)
+    duration
+  }
+
+  /** The current time in milliseconds. */
+  private def now: Long = System.currentTimeMillis()
+}

From 111c05538d9dcee06e918dcd4481104ace712dc3 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 16 Jul 2015 23:13:06 -0700
Subject: [PATCH 199/549] Added inline comment for the canEqual PR by
 @cloud-fan.

---
 sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 5f0592dc1d77b..3623fefbf2604 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -364,8 +364,13 @@ trait Row extends Serializable {
     false
   }
 
-  protected def canEqual(other: Any) =
+  protected def canEqual(other: Any) = {
+    // Note that InternalRow overrides canEqual. These two canEqual's together makes sure that
+    // comparing the external Row and InternalRow will always yield false.
+    // In the future, InternalRow should not extend Row. In that case, we can remove these
+    // canEqual methods.
     other.isInstanceOf[Row] && !other.isInstanceOf[InternalRow]
+  }
 
   override def equals(o: Any): Boolean = {
     if (o == null || !canEqual(o)) return false

From 3f6d28a5ca98cf7d20c2c029094350cc4f9545a0 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Fri, 17 Jul 2015 00:59:15 -0700
Subject: [PATCH 200/549] [SPARK-9102] [SQL] Improve project collapse with
 nondeterministic expressions

Currently we will stop project collapse when the lower projection has nondeterministic expressions. However it's overkill sometimes, we should be able to optimize `df.select(Rand(10)).select('a)` to `df.select('a)`

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7445 from cloud-fan/non-deterministic and squashes the following commits:

0deaef6 [Wenchen Fan] Improve project collapse with nondeterministic expressions
---
 .../sql/catalyst/optimizer/Optimizer.scala    | 38 ++++++++++---------
 .../optimizer/ProjectCollapsingSuite.scala    | 26 +++++++++++++
 .../org/apache/spark/sql/DataFrameSuite.scala | 10 ++---
 3 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 2f94b457f4cdc..d5beeec0ffac1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -206,31 +206,33 @@ object ColumnPruning extends Rule[LogicalPlan] {
  */
 object ProjectCollapsing extends Rule[LogicalPlan] {
 
-  /** Returns true if any expression in projectList is non-deterministic. */
-  private def hasNondeterministic(projectList: Seq[NamedExpression]): Boolean = {
-    projectList.exists(expr => expr.find(!_.deterministic).isDefined)
-  }
-
   def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
-    // We only collapse these two Projects if the child Project's expressions are all
-    // deterministic.
-    case Project(projectList1, Project(projectList2, child))
-         if !hasNondeterministic(projectList2) =>
+    case p @ Project(projectList1, Project(projectList2, child)) =>
       // Create a map of Aliases to their values from the child projection.
       // e.g., 'SELECT ... FROM (SELECT a + b AS c, d ...)' produces Map(c -> Alias(a + b, c)).
       val aliasMap = AttributeMap(projectList2.collect {
-        case a @ Alias(e, _) => (a.toAttribute, a)
+        case a: Alias => (a.toAttribute, a)
       })
 
-      // Substitute any attributes that are produced by the child projection, so that we safely
-      // eliminate it.
-      // e.g., 'SELECT c + 1 FROM (SELECT a + b AS C ...' produces 'SELECT a + b + 1 ...'
-      // TODO: Fix TransformBase to avoid the cast below.
-      val substitutedProjection = projectList1.map(_.transform {
-        case a: Attribute if aliasMap.contains(a) => aliasMap(a)
-      }).asInstanceOf[Seq[NamedExpression]]
+      // We only collapse these two Projects if their overlapped expressions are all
+      // deterministic.
+      val hasNondeterministic = projectList1.flatMap(_.collect {
+        case a: Attribute if aliasMap.contains(a) => aliasMap(a).child
+      }).exists(_.find(!_.deterministic).isDefined)
 
-      Project(substitutedProjection, child)
+      if (hasNondeterministic) {
+        p
+      } else {
+        // Substitute any attributes that are produced by the child projection, so that we safely
+        // eliminate it.
+        // e.g., 'SELECT c + 1 FROM (SELECT a + b AS C ...' produces 'SELECT a + b + 1 ...'
+        // TODO: Fix TransformBase to avoid the cast below.
+        val substitutedProjection = projectList1.map(_.transform {
+          case a: Attribute => aliasMap.getOrElse(a, a)
+        }).asInstanceOf[Seq[NamedExpression]]
+
+        Project(substitutedProjection, child)
+      }
   }
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ProjectCollapsingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ProjectCollapsingSuite.scala
index 151654bffbd66..1aa89991cc698 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ProjectCollapsingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ProjectCollapsingSuite.scala
@@ -70,4 +70,30 @@ class ProjectCollapsingSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
+
+  test("collapse two nondeterministic, independent projects into one") {
+    val query = testRelation
+      .select(Rand(10).as('rand))
+      .select(Rand(20).as('rand2))
+
+    val optimized = Optimize.execute(query.analyze)
+
+    val correctAnswer = testRelation
+      .select(Rand(20).as('rand2)).analyze
+
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("collapse one nondeterministic, one deterministic, independent projects into one") {
+    val query = testRelation
+      .select(Rand(10).as('rand), 'a)
+      .select(('a + 1).as('a_plus_1))
+
+    val optimized = Optimize.execute(query.analyze)
+
+    val correctAnswer = testRelation
+      .select(('a + 1).as('a_plus_1)).analyze
+
+    comparePlans(optimized, correctAnswer)
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 23244fd310d0f..192cc0a6e5d7c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -745,8 +745,8 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
   test("SPARK-8072: Better Exception for Duplicate Columns") {
     // only one duplicate column present
     val e = intercept[org.apache.spark.sql.AnalysisException] {
-      val df1 = Seq((1, 2, 3), (2, 3, 4), (3, 4, 5)).toDF("column1", "column2", "column1")
-                .write.format("parquet").save("temp")
+      Seq((1, 2, 3), (2, 3, 4), (3, 4, 5)).toDF("column1", "column2", "column1")
+        .write.format("parquet").save("temp")
     }
     assert(e.getMessage.contains("Duplicate column(s)"))
     assert(e.getMessage.contains("parquet"))
@@ -755,9 +755,9 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
 
     // multiple duplicate columns present
     val f = intercept[org.apache.spark.sql.AnalysisException] {
-      val df2 = Seq((1, 2, 3, 4, 5), (2, 3, 4, 5, 6), (3, 4, 5, 6, 7))
-                .toDF("column1", "column2", "column3", "column1", "column3")
-                .write.format("json").save("temp")
+      Seq((1, 2, 3, 4, 5), (2, 3, 4, 5, 6), (3, 4, 5, 6, 7))
+        .toDF("column1", "column2", "column3", "column1", "column3")
+        .write.format("json").save("temp")
     }
     assert(f.getMessage.contains("Duplicate column(s)"))
     assert(f.getMessage.contains("JSON"))

From 5a3c1ad087cb645a9496349ca021168e479ffae9 Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA <yuu.ishikawa@gmail.com>
Date: Fri, 17 Jul 2015 17:00:50 +0900
Subject: [PATCH 201/549] [SPARK-9093] [SPARKR] Fix single-quotes strings in
 SparkR

[[SPARK-9093] Fix single-quotes strings in SparkR - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9093)

This is the result of lintr at the rivision:011551620faa87107a787530f074af3d9be7e695
[[SPARK-9093] The result of lintr at 011551620faa87107a787530f074af3d9be7e695](https://gist.github.com/yu-iskw/8c47acf3202796da4d01)

Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com>

Closes #7439 from yu-iskw/SPARK-9093 and squashes the following commits:

61c391e [Yu ISHIKAWA] [SPARK-9093][SparkR] Fix single-quotes strings in SparkR
---
 R/pkg/R/DataFrame.R              | 10 +++++-----
 R/pkg/R/SQLContext.R             |  4 ++--
 R/pkg/R/serialize.R              |  4 ++--
 R/pkg/R/sparkR.R                 |  2 +-
 R/pkg/inst/tests/test_sparkSQL.R |  4 ++--
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 208813768e264..a58433df3c8c1 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1314,7 +1314,7 @@ setMethod("except",
 #' write.df(df, "myfile", "parquet", "overwrite")
 #' }
 setMethod("write.df",
-          signature(df = "DataFrame", path = 'character'),
+          signature(df = "DataFrame", path = "character"),
           function(df, path, source = NULL, mode = "append", ...){
             if (is.null(source)) {
               sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
@@ -1328,7 +1328,7 @@ setMethod("write.df",
             jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
             options <- varargsToEnv(...)
             if (!is.null(path)) {
-                options[['path']] <- path
+                options[["path"]] <- path
             }
             callJMethod(df@sdf, "save", source, jmode, options)
           })
@@ -1337,7 +1337,7 @@ setMethod("write.df",
 #' @aliases saveDF
 #' @export
 setMethod("saveDF",
-          signature(df = "DataFrame", path = 'character'),
+          signature(df = "DataFrame", path = "character"),
           function(df, path, source = NULL, mode = "append", ...){
             write.df(df, path, source, mode, ...)
           })
@@ -1375,8 +1375,8 @@ setMethod("saveDF",
 #' saveAsTable(df, "myfile")
 #' }
 setMethod("saveAsTable",
-          signature(df = "DataFrame", tableName = 'character', source = 'character',
-                    mode = 'character'),
+          signature(df = "DataFrame", tableName = "character", source = "character",
+                    mode = "character"),
           function(df, tableName, source = NULL, mode="append", ...){
             if (is.null(source)) {
               sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index 30978bb50d339..110117a18ccbc 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -457,7 +457,7 @@ dropTempTable <- function(sqlContext, tableName) {
 read.df <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
-    options[['path']] <- path
+    options[["path"]] <- path
   }
   if (is.null(source)) {
     sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
@@ -506,7 +506,7 @@ loadDF <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
 createExternalTable <- function(sqlContext, tableName, path = NULL, source = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
-    options[['path']] <- path
+    options[["path"]] <- path
   }
   sdf <- callJMethod(sqlContext, "createExternalTable", tableName, source, options)
   dataFrame(sdf)
diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R
index 78535eff0d2f6..311021e5d8473 100644
--- a/R/pkg/R/serialize.R
+++ b/R/pkg/R/serialize.R
@@ -140,8 +140,8 @@ writeType <- function(con, class) {
                  jobj = "j",
                  environment = "e",
                  Date = "D",
-                 POSIXlt = 't',
-                 POSIXct = 't',
+                 POSIXlt = "t",
+                 POSIXct = "t",
                  stop(paste("Unsupported type for serialization", class)))
   writeBin(charToRaw(type), con)
 }
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 172335809dec2..79b79d70943cb 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -140,7 +140,7 @@ sparkR.init <- function(
     if (!file.exists(path)) {
       stop("JVM is not ready after 10 seconds")
     }
-    f <- file(path, open='rb')
+    f <- file(path, open="rb")
     backendPort <- readInt(f)
     monitorPort <- readInt(f)
     close(f)
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index cdfe6481f60ea..a3039d36c9402 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -57,9 +57,9 @@ test_that("infer types", {
   expect_equal(infer_type(as.Date("2015-03-11")), "date")
   expect_equal(infer_type(as.POSIXlt("2015-03-11 12:13:04.043")), "timestamp")
   expect_equal(infer_type(c(1L, 2L)),
-               list(type = 'array', elementType = "integer", containsNull = TRUE))
+               list(type = "array", elementType = "integer", containsNull = TRUE))
   expect_equal(infer_type(list(1L, 2L)),
-               list(type = 'array', elementType = "integer", containsNull = TRUE))
+               list(type = "array", elementType = "integer", containsNull = TRUE))
   testStruct <- infer_type(list(a = 1L, b = "2"))
   expect_equal(class(testStruct), "structType")
   checkStructField(testStruct$fields()[[1]], "a", "IntegerType", TRUE)

From ec8973d1245d4a99edeb7365d7f4b0063ac31ddf Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Fri, 17 Jul 2015 01:27:14 -0700
Subject: [PATCH 202/549] [SPARK-9022] [SQL] Generated projections for
 UnsafeRow

Added two projections: GenerateUnsafeProjection and FromUnsafeProjection, which could be used to convert UnsafeRow from/to GenericInternalRow.

They will re-use the buffer during projection, similar to MutableProjection (without all the interface MutableProjection has).

cc rxin JoshRosen

Author: Davies Liu <davies@databricks.com>

Closes #7437 from davies/unsafe_proj2 and squashes the following commits:

dbf538e [Davies Liu] test with all the expression (only for supported types)
dc737b2 [Davies Liu] address comment
e424520 [Davies Liu] fix scala style
70e231c [Davies Liu] address comments
729138d [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_proj2
5a26373 [Davies Liu] unsafe projections
---
 .../execution/UnsafeExternalRowSorter.java    |  27 ++--
 .../spark/sql/catalyst/expressions/Cast.scala |   8 +-
 .../sql/catalyst/expressions/Projection.scala |  35 +++++
 .../expressions/UnsafeRowConverter.scala      |  69 +++++-----
 .../expressions/codegen/CodeGenerator.scala   |  15 ++-
 .../codegen/GenerateProjection.scala          |   4 +-
 .../codegen/GenerateUnsafeProjection.scala    | 125 ++++++++++++++++++
 .../expressions/decimalFunctions.scala        |   2 +-
 .../spark/sql/catalyst/expressions/math.scala |   2 +-
 .../spark/sql/catalyst/expressions/misc.scala |  17 ++-
 .../expressions/ExpressionEvalHelper.scala    |  34 ++++-
 11 files changed, 266 insertions(+), 72 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
index b94601cf6d818..d1d81c87bb052 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
@@ -28,13 +28,11 @@
 import org.apache.spark.TaskContext;
 import org.apache.spark.sql.AbstractScalaRowIterator;
 import org.apache.spark.sql.catalyst.InternalRow;
-import org.apache.spark.sql.catalyst.expressions.ObjectUnsafeColumnWriter;
 import org.apache.spark.sql.catalyst.expressions.UnsafeColumnWriter;
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
-import org.apache.spark.sql.catalyst.expressions.UnsafeRowConverter;
 import org.apache.spark.sql.catalyst.util.ObjectPool;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.types.*;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.util.collection.unsafe.sort.PrefixComparator;
 import org.apache.spark.util.collection.unsafe.sort.RecordComparator;
@@ -52,10 +50,9 @@ final class UnsafeExternalRowSorter {
   private long numRowsInserted = 0;
 
   private final StructType schema;
-  private final UnsafeRowConverter rowConverter;
+  private final UnsafeProjection unsafeProjection;
   private final PrefixComputer prefixComputer;
   private final UnsafeExternalSorter sorter;
-  private byte[] rowConversionBuffer = new byte[1024 * 8];
 
   public static abstract class PrefixComputer {
     abstract long computePrefix(InternalRow row);
@@ -67,7 +64,7 @@ public UnsafeExternalRowSorter(
       PrefixComparator prefixComparator,
       PrefixComputer prefixComputer) throws IOException {
     this.schema = schema;
-    this.rowConverter = new UnsafeRowConverter(schema);
+    this.unsafeProjection = UnsafeProjection.create(schema);
     this.prefixComputer = prefixComputer;
     final SparkEnv sparkEnv = SparkEnv.get();
     final TaskContext taskContext = TaskContext.get();
@@ -94,18 +91,12 @@ void setTestSpillFrequency(int frequency) {
 
   @VisibleForTesting
   void insertRow(InternalRow row) throws IOException {
-    final int sizeRequirement = rowConverter.getSizeRequirement(row);
-    if (sizeRequirement > rowConversionBuffer.length) {
-      rowConversionBuffer = new byte[sizeRequirement];
-    }
-    final int bytesWritten = rowConverter.writeRow(
-      row, rowConversionBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, sizeRequirement, null);
-    assert (bytesWritten == sizeRequirement);
+    UnsafeRow unsafeRow = unsafeProjection.apply(row);
     final long prefix = prefixComputer.computePrefix(row);
     sorter.insertRecord(
-      rowConversionBuffer,
-      PlatformDependent.BYTE_ARRAY_OFFSET,
-      sizeRequirement,
+      unsafeRow.getBaseObject(),
+      unsafeRow.getBaseOffset(),
+      unsafeRow.getSizeInBytes(),
       prefix
     );
     numRowsInserted++;
@@ -186,7 +177,7 @@ public Iterator<InternalRow> sort(Iterator<InternalRow> inputIterator) throws IO
   public static boolean supportsSchema(StructType schema) {
     // TODO: add spilling note to explain why we do this for now:
     for (StructField field : schema.fields()) {
-      if (UnsafeColumnWriter.forType(field.dataType()) instanceof ObjectUnsafeColumnWriter) {
+      if (!UnsafeColumnWriter.canEmbed(field.dataType())) {
         return false;
       }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 65ae87fe6d166..692b9fddbb041 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -424,20 +424,20 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
 
       case (BinaryType, StringType) =>
         defineCodeGen (ctx, ev, c =>
-          s"${ctx.stringType}.fromBytes($c)")
+          s"UTF8String.fromBytes($c)")
 
       case (DateType, StringType) =>
         defineCodeGen(ctx, ev, c =>
-          s"""${ctx.stringType}.fromString(
+          s"""UTF8String.fromString(
                 org.apache.spark.sql.catalyst.util.DateTimeUtils.dateToString($c))""")
 
       case (TimestampType, StringType) =>
         defineCodeGen(ctx, ev, c =>
-          s"""${ctx.stringType}.fromString(
+          s"""UTF8String.fromString(
                 org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampToString($c))""")
 
       case (_, StringType) =>
-        defineCodeGen(ctx, ev, c => s"${ctx.stringType}.fromString(String.valueOf($c))")
+        defineCodeGen(ctx, ev, c => s"UTF8String.fromString(String.valueOf($c))")
 
       case (StringType, IntervalType) =>
         defineCodeGen(ctx, ev, c =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index bf47a6c75b809..24b01ea55110e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -18,6 +18,8 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateUnsafeProjection, GenerateMutableProjection}
+import org.apache.spark.sql.types.{StructType, DataType}
 
 /**
  * A [[Projection]] that is calculated by calling the `eval` of each of the specified expressions.
@@ -73,6 +75,39 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu
   }
 }
 
+/**
+ * A projection that returns UnsafeRow.
+ */
+abstract class UnsafeProjection extends Projection {
+  override def apply(row: InternalRow): UnsafeRow
+}
+
+object UnsafeProjection {
+  def create(schema: StructType): UnsafeProjection = create(schema.fields.map(_.dataType))
+
+  def create(fields: Seq[DataType]): UnsafeProjection = {
+    val exprs = fields.zipWithIndex.map(x => new BoundReference(x._2, x._1, true))
+    GenerateUnsafeProjection.generate(exprs)
+  }
+}
+
+/**
+ * A projection that could turn UnsafeRow into GenericInternalRow
+ */
+case class FromUnsafeProjection(fields: Seq[DataType]) extends Projection {
+
+  private[this] val expressions = fields.zipWithIndex.map { case (dt, idx) =>
+    new BoundReference(idx, dt, true)
+  }
+
+  @transient private[this] lazy val generatedProj =
+    GenerateMutableProjection.generate(expressions)()
+
+  override def apply(input: InternalRow): InternalRow = {
+    generatedProj(input)
+  }
+}
+
 /**
  * A mutable wrapper that makes two rows appear as a single concatenated row.  Designed to
  * be instantiated once per thread and reused.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
index 6af5e6200e57b..885ab091fcdf5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
@@ -147,77 +147,73 @@ private object UnsafeColumnWriter {
       case t => ObjectUnsafeColumnWriter
     }
   }
+
+  /**
+   * Returns whether the dataType can be embedded into UnsafeRow (not using ObjectPool).
+   */
+  def canEmbed(dataType: DataType): Boolean = {
+    forType(dataType) != ObjectUnsafeColumnWriter
+  }
 }
 
 // ------------------------------------------------------------------------------------------------
 
-private object NullUnsafeColumnWriter extends NullUnsafeColumnWriter
-private object BooleanUnsafeColumnWriter extends BooleanUnsafeColumnWriter
-private object ByteUnsafeColumnWriter extends ByteUnsafeColumnWriter
-private object ShortUnsafeColumnWriter extends ShortUnsafeColumnWriter
-private object IntUnsafeColumnWriter extends IntUnsafeColumnWriter
-private object LongUnsafeColumnWriter extends LongUnsafeColumnWriter
-private object FloatUnsafeColumnWriter extends FloatUnsafeColumnWriter
-private object DoubleUnsafeColumnWriter extends DoubleUnsafeColumnWriter
-private object StringUnsafeColumnWriter extends StringUnsafeColumnWriter
-private object BinaryUnsafeColumnWriter extends BinaryUnsafeColumnWriter
-private object ObjectUnsafeColumnWriter extends ObjectUnsafeColumnWriter
 
 private abstract class PrimitiveUnsafeColumnWriter extends UnsafeColumnWriter {
   // Primitives don't write to the variable-length region:
   def getSize(sourceRow: InternalRow, column: Int): Int = 0
 }
 
-private class NullUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
+private object NullUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
     target.setNullAt(column)
     0
   }
 }
 
-private class BooleanUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
+private object BooleanUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
     target.setBoolean(column, source.getBoolean(column))
     0
   }
 }
 
-private class ByteUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
+private object ByteUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
     target.setByte(column, source.getByte(column))
     0
   }
 }
 
-private class ShortUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
+private object ShortUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
     target.setShort(column, source.getShort(column))
     0
   }
 }
 
-private class IntUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
+private object IntUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
     target.setInt(column, source.getInt(column))
     0
   }
 }
 
-private class LongUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
+private object LongUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
     target.setLong(column, source.getLong(column))
     0
   }
 }
 
-private class FloatUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
+private object FloatUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
     target.setFloat(column, source.getFloat(column))
     0
   }
 }
 
-private class DoubleUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWriter {
+private object DoubleUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
     target.setDouble(column, source.getDouble(column))
     0
@@ -226,18 +222,21 @@ private class DoubleUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWr
 
 private abstract class BytesUnsafeColumnWriter extends UnsafeColumnWriter {
 
-  def getBytes(source: InternalRow, column: Int): Array[Byte]
+  protected[this] def isString: Boolean
+  protected[this] def getBytes(source: InternalRow, column: Int): Array[Byte]
 
-  def getSize(source: InternalRow, column: Int): Int = {
+  override def getSize(source: InternalRow, column: Int): Int = {
     val numBytes = getBytes(source, column).length
     ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes)
   }
 
-  protected[this] def isString: Boolean
-
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    val offset = target.getBaseOffset + cursor
     val bytes = getBytes(source, column)
+    write(target, bytes, column, cursor)
+  }
+
+  def write(target: UnsafeRow, bytes: Array[Byte], column: Int, cursor: Int): Int = {
+    val offset = target.getBaseOffset + cursor
     val numBytes = bytes.length
     if ((numBytes & 0x07) > 0) {
       // zero-out the padding bytes
@@ -256,22 +255,32 @@ private abstract class BytesUnsafeColumnWriter extends UnsafeColumnWriter {
   }
 }
 
-private class StringUnsafeColumnWriter private() extends BytesUnsafeColumnWriter {
+private object StringUnsafeColumnWriter extends BytesUnsafeColumnWriter {
   protected[this] def isString: Boolean = true
   def getBytes(source: InternalRow, column: Int): Array[Byte] = {
     source.getAs[UTF8String](column).getBytes
   }
+  // TODO(davies): refactor this
+  // specialized for codegen
+  def getSize(value: UTF8String): Int =
+    ByteArrayMethods.roundNumberOfBytesToNearestWord(value.numBytes())
+  def write(target: UnsafeRow, value: UTF8String, column: Int, cursor: Int): Int = {
+    write(target, value.getBytes, column, cursor)
+  }
 }
 
-private class BinaryUnsafeColumnWriter private() extends BytesUnsafeColumnWriter {
-  protected[this] def isString: Boolean = false
-  def getBytes(source: InternalRow, column: Int): Array[Byte] = {
+private object BinaryUnsafeColumnWriter extends BytesUnsafeColumnWriter {
+  protected[this] override def isString: Boolean = false
+  override def getBytes(source: InternalRow, column: Int): Array[Byte] = {
     source.getAs[Array[Byte]](column)
   }
+  // specialized for codegen
+  def getSize(value: Array[Byte]): Int =
+    ByteArrayMethods.roundNumberOfBytesToNearestWord(value.length)
 }
 
-private class ObjectUnsafeColumnWriter private() extends UnsafeColumnWriter {
-  def getSize(sourceRow: InternalRow, column: Int): Int = 0
+private object ObjectUnsafeColumnWriter extends UnsafeColumnWriter {
+  override def getSize(sourceRow: InternalRow, column: Int): Int = 0
   override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
     val obj = source.get(column)
     val idx = target.getPool.put(obj)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 328d635de8743..45dc146488e12 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -24,6 +24,7 @@ import com.google.common.cache.{CacheBuilder, CacheLoader}
 import org.codehaus.janino.ClassBodyEvaluator
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -68,9 +69,6 @@ class CodeGenContext {
     mutableStates += ((javaType, variableName, initialValue))
   }
 
-  val stringType: String = classOf[UTF8String].getName
-  val decimalType: String = classOf[Decimal].getName
-
   final val JAVA_BOOLEAN = "boolean"
   final val JAVA_BYTE = "byte"
   final val JAVA_SHORT = "short"
@@ -136,9 +134,9 @@ class CodeGenContext {
     case LongType | TimestampType => JAVA_LONG
     case FloatType => JAVA_FLOAT
     case DoubleType => JAVA_DOUBLE
-    case dt: DecimalType => decimalType
+    case dt: DecimalType => "Decimal"
     case BinaryType => "byte[]"
-    case StringType => stringType
+    case StringType => "UTF8String"
     case _: StructType => "InternalRow"
     case _: ArrayType => s"scala.collection.Seq"
     case _: MapType => s"scala.collection.Map"
@@ -262,7 +260,12 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
   private[this] def doCompile(code: String): GeneratedClass = {
     val evaluator = new ClassBodyEvaluator()
     evaluator.setParentClassLoader(getClass.getClassLoader)
-    evaluator.setDefaultImports(Array("org.apache.spark.sql.catalyst.InternalRow"))
+    evaluator.setDefaultImports(Array(
+      classOf[InternalRow].getName,
+      classOf[UnsafeRow].getName,
+      classOf[UTF8String].getName,
+      classOf[Decimal].getName
+    ))
     evaluator.setExtendedClass(classOf[GeneratedClass])
     try {
       evaluator.cook(code)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 3e5ca308dc31d..8f9fcbf810554 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.types._
 /**
  * Java can not access Projection (in package object)
  */
-abstract class BaseProject extends Projection {}
+abstract class BaseProjection extends Projection {}
 
 /**
  * Generates bytecode that produces a new [[InternalRow]] object based on a fixed set of input
@@ -160,7 +160,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
       return new SpecificProjection(expr);
     }
 
-    class SpecificProjection extends ${classOf[BaseProject].getName} {
+    class SpecificProjection extends ${classOf[BaseProjection].getName} {
       private $exprType[] expressions = null;
       $mutableStates
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
new file mode 100644
index 0000000000000..a81d545a8ec63
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.{NullType, BinaryType, StringType}
+
+
+/**
+ * Generates a [[Projection]] that returns an [[UnsafeRow]].
+ *
+ * It generates the code for all the expressions, compute the total length for all the columns
+ * (can be accessed via variables), and then copy the data into a scratch buffer space in the
+ * form of UnsafeRow (the scratch buffer will grow as needed).
+ *
+ * Note: The returned UnsafeRow will be pointed to a scratch buffer inside the projection.
+ */
+object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafeProjection] {
+
+  protected def canonicalize(in: Seq[Expression]): Seq[Expression] =
+    in.map(ExpressionCanonicalizer.execute)
+
+  protected def bind(in: Seq[Expression], inputSchema: Seq[Attribute]): Seq[Expression] =
+    in.map(BindReferences.bindReference(_, inputSchema))
+
+  protected def create(expressions: Seq[Expression]): UnsafeProjection = {
+    val ctx = newCodeGenContext()
+    val exprs = expressions.map(_.gen(ctx))
+    val allExprs = exprs.map(_.code).mkString("\n")
+    val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length)
+    val stringWriter = "org.apache.spark.sql.catalyst.expressions.StringUnsafeColumnWriter"
+    val binaryWriter = "org.apache.spark.sql.catalyst.expressions.BinaryUnsafeColumnWriter"
+    val additionalSize = expressions.zipWithIndex.map { case (e, i) =>
+      e.dataType match {
+        case StringType =>
+          s" + (${exprs(i).isNull} ? 0 : $stringWriter.getSize(${exprs(i).primitive}))"
+        case BinaryType =>
+          s" + (${exprs(i).isNull} ? 0 : $binaryWriter.getSize(${exprs(i).primitive}))"
+        case _ => ""
+      }
+    }.mkString("")
+
+    val writers = expressions.zipWithIndex.map { case (e, i) =>
+      val update = e.dataType match {
+        case dt if ctx.isPrimitiveType(dt) =>
+          s"${ctx.setColumn("target", dt, i, exprs(i).primitive)}"
+        case StringType =>
+          s"cursor += $stringWriter.write(target, ${exprs(i).primitive}, $i, cursor)"
+        case BinaryType =>
+          s"cursor += $binaryWriter.write(target, ${exprs(i).primitive}, $i, cursor)"
+        case NullType => ""
+        case _ =>
+          throw new UnsupportedOperationException(s"Not supported DataType: ${e.dataType}")
+      }
+      s"""if (${exprs(i).isNull}) {
+            target.setNullAt($i);
+          } else {
+            $update;
+          }"""
+    }.mkString("\n          ")
+
+    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
+      s"private $javaType $variableName = $initialValue;"
+    }.mkString("\n      ")
+
+    val code = s"""
+    private $exprType[] expressions;
+
+    public Object generate($exprType[] expr) {
+      this.expressions = expr;
+      return new SpecificProjection();
+    }
+
+    class SpecificProjection extends ${classOf[UnsafeProjection].getName} {
+
+      private UnsafeRow target = new UnsafeRow();
+      private byte[] buffer = new byte[64];
+
+      $mutableStates
+
+      public SpecificProjection() {}
+
+      // Scala.Function1 need this
+      public Object apply(Object row) {
+        return apply((InternalRow) row);
+      }
+
+      public UnsafeRow apply(InternalRow i) {
+        ${allExprs}
+
+        // additionalSize had '+' in the beginning
+        int numBytes = $fixedSize $additionalSize;
+        if (numBytes > buffer.length) {
+          buffer = new byte[numBytes];
+        }
+        target.pointTo(buffer, org.apache.spark.unsafe.PlatformDependent.BYTE_ARRAY_OFFSET,
+          ${expressions.size}, numBytes, null);
+        int cursor = $fixedSize;
+        $writers
+        return target;
+      }
+    }
+    """
+
+    logDebug(s"code for ${expressions.mkString(",")}:\n$code")
+
+    val c = compile(code)
+    c.generate(ctx.references.toArray).asInstanceOf[UnsafeProjection]
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
index 2fa74b4ffc5da..b9d4736a65e26 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
@@ -54,7 +54,7 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     nullSafeCodeGen(ctx, ev, eval => {
       s"""
-        ${ev.primitive} = (new ${ctx.decimalType}()).setOrNull($eval, $precision, $scale);
+        ${ev.primitive} = (new Decimal()).setOrNull($eval, $precision, $scale);
         ${ev.isNull} = ${ev.primitive} == null;
       """
     })
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index a7ad452ef4943..84b289c4d1a68 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -263,7 +263,7 @@ case class Bin(child: Expression)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, (c) =>
-      s"${ctx.stringType}.fromString(java.lang.Long.toBinaryString($c))")
+      s"UTF8String.fromString(java.lang.Long.toBinaryString($c))")
   }
 }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
index a269ec4a1e6dc..8d8d66ddeb341 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
@@ -17,12 +17,11 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import java.security.MessageDigest
-import java.security.NoSuchAlgorithmException
+import java.security.{MessageDigest, NoSuchAlgorithmException}
 import java.util.zip.CRC32
 
 import org.apache.commons.codec.digest.DigestUtils
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -42,7 +41,7 @@ case class Md5(child: Expression) extends UnaryExpression with ImplicitCastInput
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c =>
-      s"${ctx.stringType}.fromString(org.apache.commons.codec.digest.DigestUtils.md5Hex($c))")
+      s"UTF8String.fromString(org.apache.commons.codec.digest.DigestUtils.md5Hex($c))")
   }
 }
 
@@ -93,19 +92,19 @@ case class Sha2(left: Expression, right: Expression)
           try {
             java.security.MessageDigest md = java.security.MessageDigest.getInstance("SHA-224");
             md.update($eval1);
-            ${ev.primitive} = ${ctx.stringType}.fromBytes(md.digest());
+            ${ev.primitive} = UTF8String.fromBytes(md.digest());
           } catch (java.security.NoSuchAlgorithmException e) {
             ${ev.isNull} = true;
           }
         } else if ($eval2 == 256 || $eval2 == 0) {
           ${ev.primitive} =
-            ${ctx.stringType}.fromString($digestUtils.sha256Hex($eval1));
+            UTF8String.fromString($digestUtils.sha256Hex($eval1));
         } else if ($eval2 == 384) {
           ${ev.primitive} =
-            ${ctx.stringType}.fromString($digestUtils.sha384Hex($eval1));
+            UTF8String.fromString($digestUtils.sha384Hex($eval1));
         } else if ($eval2 == 512) {
           ${ev.primitive} =
-            ${ctx.stringType}.fromString($digestUtils.sha512Hex($eval1));
+            UTF8String.fromString($digestUtils.sha512Hex($eval1));
         } else {
           ${ev.isNull} = true;
         }
@@ -129,7 +128,7 @@ case class Sha1(child: Expression) extends UnaryExpression with ImplicitCastInpu
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     defineCodeGen(ctx, ev, c =>
-      s"${ctx.stringType}.fromString(org.apache.commons.codec.digest.DigestUtils.shaHex($c))"
+      s"UTF8String.fromString(org.apache.commons.codec.digest.DigestUtils.shaHex($c))"
     )
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 43392df4bec2e..c43486b3ddcf5 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -23,7 +23,7 @@ import org.scalatest.Matchers._
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
-import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateProjection, GenerateMutableProjection}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateUnsafeProjection, GenerateProjection, GenerateMutableProjection}
 import org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer
 import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
 
@@ -43,6 +43,9 @@ trait ExpressionEvalHelper {
     checkEvaluationWithoutCodegen(expression, catalystValue, inputRow)
     checkEvaluationWithGeneratedMutableProjection(expression, catalystValue, inputRow)
     checkEvaluationWithGeneratedProjection(expression, catalystValue, inputRow)
+    if (UnsafeColumnWriter.canEmbed(expression.dataType)) {
+      checkEvalutionWithUnsafeProjection(expression, catalystValue, inputRow)
+    }
     checkEvaluationWithOptimization(expression, catalystValue, inputRow)
   }
 
@@ -142,6 +145,35 @@ trait ExpressionEvalHelper {
     }
   }
 
+  protected def checkEvalutionWithUnsafeProjection(
+      expression: Expression,
+      expected: Any,
+      inputRow: InternalRow = EmptyRow): Unit = {
+    val ctx = GenerateUnsafeProjection.newCodeGenContext()
+    lazy val evaluated = expression.gen(ctx)
+
+    val plan = try {
+      GenerateUnsafeProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)
+    } catch {
+      case e: Throwable =>
+        fail(
+          s"""
+            |Code generation of $expression failed:
+            |${evaluated.code}
+            |$e
+          """.stripMargin)
+    }
+
+    val unsafeRow = plan(inputRow)
+    // UnsafeRow cannot be compared with GenericInternalRow directly
+    val actual = FromUnsafeProjection(expression.dataType :: Nil)(unsafeRow)
+    val expectedRow = InternalRow(expected)
+    if (actual != expectedRow) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
+      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+    }
+  }
+
   protected def checkEvaluationWithOptimization(
       expression: Expression,
       expected: Any,

From c043a3e9df55721f21332f7c44ff351832d20324 Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Fri, 17 Jul 2015 09:38:08 -0500
Subject: [PATCH 203/549] [SPARK-8851] [YARN] In Client mode, make sure the
 client logs in and updates tokens

In client side, the flow is SparkSubmit -> SparkContext -> yarn/Client. Since the yarn client only gets a cloned config and the staging dir is set here, it is not really possible to do re-logins in the SparkContext. So, do the initial logins in Spark Submit and do re-logins as we do now in the AM, but the Client behaves like an executor in this specific context and reads the credentials file to update the tokens. This way, even if the streaming context is started up from checkpoint - it is fine since we have logged in from SparkSubmit itself itself.

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #7394 from harishreedharan/yarn-client-login and squashes the following commits:

9a2166f [Hari Shreedharan] make it possible to use command line args and config parameters together.
de08f57 [Hari Shreedharan] Fix import order.
5c4fa63 [Hari Shreedharan] Add a comment explaining what is being done in YarnClientSchedulerBackend.
c872caa [Hari Shreedharan] Fix typo in log message.
2c80540 [Hari Shreedharan] Move token renewal to YarnClientSchedulerBackend.
0c48ac2 [Hari Shreedharan] Remove direct use of ExecutorDelegationTokenUpdater in Client.
26f8bfa [Hari Shreedharan] [SPARK-8851][YARN] In Client mode, make sure the client logs in and updates tokens.
58b1969 [Hari Shreedharan] Simple attempt 1.
---
 .../apache/spark/deploy/SparkHadoopUtil.scala | 29 ++++++++++-------
 .../org/apache/spark/deploy/SparkSubmit.scala | 10 ++++--
 .../org/apache/spark/deploy/yarn/Client.scala | 32 ++++++++++++-------
 .../cluster/YarnClientSchedulerBackend.scala  | 11 +++++--
 4 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 9f94118829ff1..6b14d407a6380 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -25,6 +25,7 @@ import java.util.{Arrays, Comparator}
 import scala.collection.JavaConversions._
 import scala.concurrent.duration._
 import scala.language.postfixOps
+import scala.util.control.NonFatal
 
 import com.google.common.primitives.Longs
 import org.apache.hadoop.conf.Configuration
@@ -248,19 +249,25 @@ class SparkHadoopUtil extends Logging {
       dir: Path,
       prefix: String,
       exclusionSuffix: String): Array[FileStatus] = {
-    val fileStatuses = remoteFs.listStatus(dir,
-      new PathFilter {
-        override def accept(path: Path): Boolean = {
-          val name = path.getName
-          name.startsWith(prefix) && !name.endsWith(exclusionSuffix)
+    try {
+      val fileStatuses = remoteFs.listStatus(dir,
+        new PathFilter {
+          override def accept(path: Path): Boolean = {
+            val name = path.getName
+            name.startsWith(prefix) && !name.endsWith(exclusionSuffix)
+          }
+        })
+      Arrays.sort(fileStatuses, new Comparator[FileStatus] {
+        override def compare(o1: FileStatus, o2: FileStatus): Int = {
+          Longs.compare(o1.getModificationTime, o2.getModificationTime)
         }
       })
-    Arrays.sort(fileStatuses, new Comparator[FileStatus] {
-      override def compare(o1: FileStatus, o2: FileStatus): Int = {
-        Longs.compare(o1.getModificationTime, o2.getModificationTime)
-      }
-    })
-    fileStatuses
+      fileStatuses
+    } catch {
+      case NonFatal(e) =>
+        logWarning("Error while attempting to list files from application staging dir", e)
+        Array.empty
+    }
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 036cb6e054791..0b39ee8fe3ba0 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -508,8 +508,14 @@ object SparkSubmit {
     }
 
     // Let YARN know it's a pyspark app, so it distributes needed libraries.
-    if (clusterManager == YARN && args.isPython) {
-      sysProps.put("spark.yarn.isPython", "true")
+    if (clusterManager == YARN) {
+      if (args.isPython) {
+        sysProps.put("spark.yarn.isPython", "true")
+      }
+      if (args.principal != null) {
+        require(args.keytab != null, "Keytab must be specified when the keytab is specified")
+        UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab)
+      }
     }
 
     // In yarn-cluster mode, use yarn.Client as a wrapper around the user class
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index b74ea9a10afb2..bc28ce5eeae72 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -80,10 +80,12 @@ private[spark] class Client(
   private val isClusterMode = args.isClusterMode
 
   private var loginFromKeytab = false
+  private var principal: String = null
+  private var keytab: String = null
+
   private val fireAndForget = isClusterMode &&
     !sparkConf.getBoolean("spark.yarn.submit.waitAppCompletion", true)
 
-
   def stop(): Unit = yarnClient.stop()
 
   /**
@@ -339,7 +341,7 @@ private[spark] class Client(
     if (loginFromKeytab) {
       logInfo("To enable the AM to login from keytab, credentials are being copied over to the AM" +
         " via the YARN Secure Distributed Cache.")
-      val (_, localizedPath) = distribute(args.keytab,
+      val (_, localizedPath) = distribute(keytab,
         destName = Some(sparkConf.get("spark.yarn.keytab")),
         appMasterOnly = true)
       require(localizedPath != null, "Keytab file already distributed.")
@@ -785,19 +787,27 @@ private[spark] class Client(
   }
 
   def setupCredentials(): Unit = {
-    if (args.principal != null) {
-      require(args.keytab != null, "Keytab must be specified when principal is specified.")
+    loginFromKeytab = args.principal != null || sparkConf.contains("spark.yarn.principal")
+    if (loginFromKeytab) {
+      principal =
+        if (args.principal != null) args.principal else sparkConf.get("spark.yarn.principal")
+      keytab = {
+        if (args.keytab != null) {
+          args.keytab
+        } else {
+          sparkConf.getOption("spark.yarn.keytab").orNull
+        }
+      }
+
+      require(keytab != null, "Keytab must be specified when principal is specified.")
       logInfo("Attempting to login to the Kerberos" +
-        s" using principal: ${args.principal} and keytab: ${args.keytab}")
-      val f = new File(args.keytab)
+        s" using principal: $principal and keytab: $keytab")
+      val f = new File(keytab)
       // Generate a file name that can be used for the keytab file, that does not conflict
       // with any user file.
       val keytabFileName = f.getName + "-" + UUID.randomUUID().toString
-      UserGroupInformation.loginUserFromKeytab(args.principal, args.keytab)
-      loginFromKeytab = true
       sparkConf.set("spark.yarn.keytab", keytabFileName)
-      sparkConf.set("spark.yarn.principal", args.principal)
-      logInfo("Successfully logged into the KDC.")
+      sparkConf.set("spark.yarn.principal", principal)
     }
     credentials = UserGroupInformation.getCurrentUser.getCredentials
   }
@@ -1162,7 +1172,7 @@ object Client extends Logging {
    *
    * If not a "local:" file and no alternate name, the environment is not modified.
    *
-   * @parma conf      Spark configuration.
+   * @param conf      Spark configuration.
    * @param uri       URI to add to classpath (optional).
    * @param fileName  Alternate name for the file (optional).
    * @param env       Map holding the environment variables.
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
index 3a0b9443d2d7b..d97fa2e2151bc 100644
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
+++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
@@ -20,10 +20,9 @@ package org.apache.spark.scheduler.cluster
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.hadoop.yarn.api.records.{ApplicationId, YarnApplicationState}
-import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException
 
 import org.apache.spark.{SparkException, Logging, SparkContext}
-import org.apache.spark.deploy.yarn.{Client, ClientArguments}
+import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil}
 import org.apache.spark.scheduler.TaskSchedulerImpl
 
 private[spark] class YarnClientSchedulerBackend(
@@ -62,6 +61,13 @@ private[spark] class YarnClientSchedulerBackend(
     super.start()
 
     waitForApplication()
+
+    // SPARK-8851: In yarn-client mode, the AM still does the credentials refresh. The driver
+    // reads the credentials from HDFS, just like the executors and updates its own credentials
+    // cache.
+    if (conf.contains("spark.yarn.credentials.file")) {
+      YarnSparkHadoopUtil.get.startExecutorDelegationTokenRenewer(conf)
+    }
     monitorThread = asyncMonitorApplication()
     monitorThread.start()
   }
@@ -158,6 +164,7 @@ private[spark] class YarnClientSchedulerBackend(
     }
     super.stop()
     client.stop()
+    YarnSparkHadoopUtil.get.stopExecutorDelegationTokenRenewer()
     logInfo("Stopped")
   }
 

From 441e072a227378cae31afc45a608318b58ce2ac4 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 17 Jul 2015 09:00:41 -0700
Subject: [PATCH 204/549] [MINOR] [ML] fix wrong annotation of RFormula.formula

fix wrong annotation of RFormula.formula

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7470 from yanboliang/RFormula and squashes the following commits:

61f1919 [Yanbo Liang] fix wrong annotation
---
 mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index d9a36bda386b3..56169f2a01fc9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -42,7 +42,7 @@ class RFormula(override val uid: String)
 
   /**
    * R formula parameter. The formula is provided in string form.
-   * @group setParam
+   * @group param
    */
   val formula: Param[String] = new Param(this, "formula", "R model formula")
 

From 59d24c226a441db5f08c58ec407ba5873bd3b954 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Fri, 17 Jul 2015 09:31:13 -0700
Subject: [PATCH 205/549] [SPARK-9130][SQL] throw exception when check equality
 between external and internal row

instead of return false, throw exception when check equality between external and internal row is better.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7460 from cloud-fan/row-compare and squashes the following commits:

8a20911 [Wenchen Fan] improve equals
402daa8 [Wenchen Fan] throw exception when check equality between external and internal row
---
 .../main/scala/org/apache/spark/sql/Row.scala | 27 ++++++++++++++-----
 .../spark/sql/catalyst/InternalRow.scala      |  7 ++++-
 .../scala/org/apache/spark/sql/RowTest.scala  | 26 ++++++++++++++++++
 3 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 3623fefbf2604..2cb64d00935de 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -364,18 +364,33 @@ trait Row extends Serializable {
     false
   }
 
-  protected def canEqual(other: Any) = {
-    // Note that InternalRow overrides canEqual. These two canEqual's together makes sure that
-    // comparing the external Row and InternalRow will always yield false.
+  /**
+   * Returns true if we can check equality for these 2 rows.
+   * Equality check between external row and internal row is not allowed.
+   * Here we do this check to prevent call `equals` on external row with internal row.
+   */
+  protected def canEqual(other: Row) = {
+    // Note that `Row` is not only the interface of external row but also the parent
+    // of `InternalRow`, so we have to ensure `other` is not a internal row here to prevent
+    // call `equals` on external row with internal row.
+    // `InternalRow` overrides canEqual, and these two canEquals together makes sure that
+    // equality check between external Row and InternalRow will always fail.
     // In the future, InternalRow should not extend Row. In that case, we can remove these
     // canEqual methods.
-    other.isInstanceOf[Row] && !other.isInstanceOf[InternalRow]
+    !other.isInstanceOf[InternalRow]
   }
 
   override def equals(o: Any): Boolean = {
-    if (o == null || !canEqual(o)) return false
-
+    if (!o.isInstanceOf[Row]) return false
     val other = o.asInstanceOf[Row]
+
+    if (!canEqual(other)) {
+      throw new UnsupportedOperationException(
+        "cannot check equality between external and internal rows")
+    }
+
+    if (other eq null) return false
+
     if (length != other.length) {
       return false
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index e2fafb88ee43e..024973a6b9fcd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -54,7 +54,12 @@ abstract class InternalRow extends Row {
   // A default implementation to change the return type
   override def copy(): InternalRow = this
 
-  protected override def canEqual(other: Any) = other.isInstanceOf[InternalRow]
+  /**
+   * Returns true if we can check equality for these 2 rows.
+   * Equality check between external row and internal row is not allowed.
+   * Here we do this check to prevent call `equals` on internal row with external row.
+   */
+  protected override def canEqual(other: Row) = other.isInstanceOf[InternalRow]
 
   // Custom hashCode function that matches the efficient code generated version.
   override def hashCode: Int = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala
index bbb9739e9cc76..878a1bb9b7e6d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
 import org.apache.spark.sql.types._
 import org.scalatest.{Matchers, FunSpec}
@@ -68,4 +69,29 @@ class RowTest extends FunSpec with Matchers {
       sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
     }
   }
+
+  describe("row equals") {
+    val externalRow = Row(1, 2)
+    val externalRow2 = Row(1, 2)
+    val internalRow = InternalRow(1, 2)
+    val internalRow2 = InternalRow(1, 2)
+
+    it("equality check for external rows") {
+      externalRow shouldEqual externalRow2
+    }
+
+    it("equality check for internal rows") {
+      internalRow shouldEqual internalRow2
+    }
+
+    it("throws an exception when check equality between external and internal rows") {
+      def assertError(f: => Unit): Unit = {
+        val e = intercept[UnsupportedOperationException](f)
+        e.getMessage.contains("cannot check equality between external and internal rows")
+      }
+
+      assertError(internalRow.equals(externalRow))
+      assertError(externalRow.equals(internalRow))
+    }
+  }
 }

From 305e77cd83f3dbe680a920d5329c2e8c58452d5b Mon Sep 17 00:00:00 2001
From: "zhichao.li" <zhichao.li@intel.com>
Date: Fri, 17 Jul 2015 09:32:27 -0700
Subject: [PATCH 206/549] [SPARK-8209[SQL]Add function conv

cc chenghao-intel  adrian-wang

Author: zhichao.li <zhichao.li@intel.com>

Closes #6872 from zhichao-li/conv and squashes the following commits:

6ef3b37 [zhichao.li] add unittest and comments
78d9836 [zhichao.li] polish dataframe api and add unittest
e2bace3 [zhichao.li] update to use ImplicitCastInputTypes
cbcad3f [zhichao.li] add function conv
---
 .../catalyst/analysis/FunctionRegistry.scala  |   1 +
 .../spark/sql/catalyst/expressions/math.scala | 191 ++++++++++++++++++
 .../expressions/MathFunctionsSuite.scala      |  21 +-
 .../org/apache/spark/sql/functions.scala      |  18 ++
 .../spark/sql/MathExpressionsSuite.scala      |  13 ++
 5 files changed, 242 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index e0beafe710079..a45181712dbdf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -99,6 +99,7 @@ object FunctionRegistry {
     expression[Ceil]("ceil"),
     expression[Ceil]("ceiling"),
     expression[Cos]("cos"),
+    expression[Conv]("conv"),
     expression[EulerNumber]("e"),
     expression[Exp]("exp"),
     expression[Expm1]("expm1"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 84b289c4d1a68..7a543ff36afd1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import java.{lang => jl}
+import java.util.Arrays
 
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckSuccess, TypeCheckFailure}
@@ -139,6 +140,196 @@ case class Cos(child: Expression) extends UnaryMathExpression(math.cos, "COS")
 
 case class Cosh(child: Expression) extends UnaryMathExpression(math.cosh, "COSH")
 
+/**
+ * Convert a num from one base to another
+ * @param numExpr the number to be converted
+ * @param fromBaseExpr from which base
+ * @param toBaseExpr to which base
+ */
+case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expression)
+  extends Expression with ImplicitCastInputTypes{
+
+  override def foldable: Boolean = numExpr.foldable && fromBaseExpr.foldable && toBaseExpr.foldable
+
+  override def nullable: Boolean = numExpr.nullable || fromBaseExpr.nullable || toBaseExpr.nullable
+
+  override def children: Seq[Expression] = Seq(numExpr, fromBaseExpr, toBaseExpr)
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, IntegerType, IntegerType)
+
+  /** Returns the result of evaluating this expression on a given input Row */
+  override def eval(input: InternalRow): Any = {
+    val num = numExpr.eval(input)
+    val fromBase = fromBaseExpr.eval(input)
+    val toBase = toBaseExpr.eval(input)
+    if (num == null || fromBase == null || toBase == null) {
+      null
+    } else {
+      conv(num.asInstanceOf[UTF8String].getBytes,
+        fromBase.asInstanceOf[Int], toBase.asInstanceOf[Int])
+    }
+  }
+
+  /**
+   * Returns the [[DataType]] of the result of evaluating this expression.  It is
+   * invalid to query the dataType of an unresolved expression (i.e., when `resolved` == false).
+   */
+  override def dataType: DataType = StringType
+
+  private val value = new Array[Byte](64)
+
+  /**
+   * Divide x by m as if x is an unsigned 64-bit integer. Examples:
+   * unsignedLongDiv(-1, 2) == Long.MAX_VALUE unsignedLongDiv(6, 3) == 2
+   * unsignedLongDiv(0, 5) == 0
+   *
+   * @param x is treated as unsigned
+   * @param m is treated as signed
+   */
+  private def unsignedLongDiv(x: Long, m: Int): Long = {
+    if (x >= 0) {
+      x / m
+    } else {
+      // Let uval be the value of the unsigned long with the same bits as x
+      // Two's complement => x = uval - 2*MAX - 2
+      // => uval = x + 2*MAX + 2
+      // Now, use the fact: (a+b)/c = a/c + b/c + (a%c+b%c)/c
+      (x / m + 2 * (Long.MaxValue / m) + 2 / m + (x % m + 2 * (Long.MaxValue % m) + 2 % m) / m)
+    }
+  }
+
+  /**
+   * Decode v into value[].
+   *
+   * @param v is treated as an unsigned 64-bit integer
+   * @param radix must be between MIN_RADIX and MAX_RADIX
+   */
+  private def decode(v: Long, radix: Int): Unit = {
+    var tmpV = v
+    Arrays.fill(value, 0.asInstanceOf[Byte])
+    var i = value.length - 1
+    while (tmpV != 0) {
+      val q = unsignedLongDiv(tmpV, radix)
+      value(i) = (tmpV - q * radix).asInstanceOf[Byte]
+      tmpV = q
+      i -= 1
+    }
+  }
+
+  /**
+   * Convert value[] into a long. On overflow, return -1 (as mySQL does). If a
+   * negative digit is found, ignore the suffix starting there.
+   *
+   * @param radix  must be between MIN_RADIX and MAX_RADIX
+   * @param fromPos is the first element that should be conisdered
+   * @return the result should be treated as an unsigned 64-bit integer.
+   */
+  private def encode(radix: Int, fromPos: Int): Long = {
+    var v: Long = 0L
+    val bound = unsignedLongDiv(-1 - radix, radix) // Possible overflow once
+    // val
+    // exceeds this value
+    var i = fromPos
+    while (i < value.length && value(i) >= 0) {
+      if (v >= bound) {
+        // Check for overflow
+        if (unsignedLongDiv(-1 - value(i), radix) < v) {
+          return -1
+        }
+      }
+      v = v * radix + value(i)
+      i += 1
+    }
+    return v
+  }
+
+  /**
+   * Convert the bytes in value[] to the corresponding chars.
+   *
+   * @param radix must be between MIN_RADIX and MAX_RADIX
+   * @param fromPos is the first nonzero element
+   */
+  private def byte2char(radix: Int, fromPos: Int): Unit = {
+    var i = fromPos
+    while (i < value.length) {
+      value(i) = Character.toUpperCase(Character.forDigit(value(i), radix)).asInstanceOf[Byte]
+      i += 1
+    }
+  }
+
+  /**
+   * Convert the chars in value[] to the corresponding integers. Convert invalid
+   * characters to -1.
+   *
+   * @param radix must be between MIN_RADIX and MAX_RADIX
+   * @param fromPos is the first nonzero element
+   */
+  private def char2byte(radix: Int, fromPos: Int): Unit = {
+    var i = fromPos
+    while ( i < value.length) {
+      value(i) = Character.digit(value(i), radix).asInstanceOf[Byte]
+      i += 1
+    }
+  }
+
+  /**
+   * Convert numbers between different number bases. If toBase>0 the result is
+   * unsigned, otherwise it is signed.
+   * NB: This logic is borrowed from org.apache.hadoop.hive.ql.ud.UDFConv
+   */
+  private def conv(n: Array[Byte] , fromBase: Int, toBase: Int ): UTF8String = {
+    if (n == null || fromBase == null || toBase == null || n.isEmpty) {
+      return null
+    }
+
+    if (fromBase < Character.MIN_RADIX || fromBase > Character.MAX_RADIX
+      || Math.abs(toBase) < Character.MIN_RADIX
+      || Math.abs(toBase) > Character.MAX_RADIX) {
+      return null
+    }
+
+    var (negative, first) = if (n(0) == '-') (true, 1) else (false, 0)
+
+    // Copy the digits in the right side of the array
+    var i = 1
+    while (i <= n.length - first) {
+      value(value.length - i) = n(n.length - i)
+      i += 1
+    }
+    char2byte(fromBase, value.length - n.length + first)
+
+    // Do the conversion by going through a 64 bit integer
+    var v = encode(fromBase, value.length - n.length + first)
+    if (negative && toBase > 0) {
+      if (v < 0) {
+        v = -1
+      } else {
+        v = -v
+      }
+    }
+    if (toBase < 0 && v < 0) {
+      v = -v
+      negative = true
+    }
+    decode(v, Math.abs(toBase))
+
+    // Find the first non-zero digit or the last digits if all are zero.
+    val firstNonZeroPos = {
+      val firstNonZero = value.indexWhere( _ != 0)
+      if (firstNonZero != -1) firstNonZero else value.length - 1
+    }
+
+    byte2char(Math.abs(toBase), firstNonZeroPos)
+
+    var resultStartPos = firstNonZeroPos
+    if (negative && toBase < 0) {
+      resultStartPos = firstNonZeroPos - 1
+      value(resultStartPos) = '-'
+    }
+    UTF8String.fromBytes( Arrays.copyOfRange(value, resultStartPos, value.length))
+  }
+}
+
 case class Exp(child: Expression) extends UnaryMathExpression(math.exp, "EXP")
 
 case class Expm1(child: Expression) extends UnaryMathExpression(math.expm1, "EXPM1")
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 52a874a9d89ef..ca35c7ef8ae5d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -17,14 +17,13 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import scala.math.BigDecimal.RoundingMode
-
 import com.google.common.math.LongMath
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types._
 
+
 class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   /**
@@ -95,6 +94,24 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(c(Literal(1.0), Literal.create(null, DoubleType)), null, create_row(null))
   }
 
+  test("conv") {
+    checkEvaluation(Conv(Literal("3"), Literal(10), Literal(2)), "11")
+    checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(-16)), "-F")
+    checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(16)), "FFFFFFFFFFFFFFF1")
+    checkEvaluation(Conv(Literal("big"), Literal(36), Literal(16)), "3A48")
+    checkEvaluation(Conv(Literal(null), Literal(36), Literal(16)), null)
+    checkEvaluation(Conv(Literal("3"), Literal(null), Literal(16)), null)
+    checkEvaluation(
+      Conv(Literal("1234"), Literal(10), Literal(37)), null)
+    checkEvaluation(
+      Conv(Literal(""), Literal(10), Literal(16)), null)
+    checkEvaluation(
+      Conv(Literal("9223372036854775807"), Literal(36), Literal(16)), "FFFFFFFFFFFFFFFF")
+    // If there is an invalid digit in the number, the longest valid prefix should be converted.
+    checkEvaluation(
+      Conv(Literal("11abc"), Literal(10), Literal(16)), "B")
+  }
+
   test("e") {
     testLeaf(EulerNumber, math.E)
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index d6da284a4c788..fe511c296cfd2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -68,6 +68,24 @@ object functions {
    */
   def column(colName: String): Column = Column(colName)
 
+  /**
+   * Convert a number from one base to another for the specified expressions
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def conv(num: Column, fromBase: Int, toBase: Int): Column =
+    Conv(num.expr, lit(fromBase).expr, lit(toBase).expr)
+
+  /**
+   * Convert a number from one base to another for the specified expressions
+   *
+   * @group math_funcs
+   * @since 1.5.0
+   */
+  def conv(numColName: String, fromBase: Int, toBase: Int): Column =
+    conv(Column(numColName), fromBase, toBase)
+
   /**
    * Creates a [[Column]] of literal value.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index 087126bb2e513..8eb3fec756b4c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -178,6 +178,19 @@ class MathExpressionsSuite extends QueryTest {
       Row(0.0, 1.0, 2.0))
   }
 
+  test("conv") {
+    val df = Seq(("333", 10, 2)).toDF("num", "fromBase", "toBase")
+    checkAnswer(df.select(conv('num, 10, 16)), Row("14D"))
+    checkAnswer(df.select(conv("num", 10, 16)), Row("14D"))
+    checkAnswer(df.select(conv(lit(100), 2, 16)), Row("4"))
+    checkAnswer(df.select(conv(lit(3122234455L), 10, 16)), Row("BA198457"))
+    checkAnswer(df.selectExpr("conv(num, fromBase, toBase)"), Row("101001101"))
+    checkAnswer(df.selectExpr("""conv("100", 2, 10)"""), Row("4"))
+    checkAnswer(df.selectExpr("""conv("-10", 16, -10)"""), Row("-16"))
+    checkAnswer(
+      df.selectExpr("""conv("9223372036854775807", 36, -16)"""), Row("-1")) // for overflow
+  }
+
   test("floor") {
     testOneToOneMathFunction(floor, math.floor)
   }

From eba6a1af4c8ffb21934a59a61a419d625f37cceb Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Fri, 17 Jul 2015 09:38:08 -0700
Subject: [PATCH 207/549] [SPARK-8945][SQL] Add add and subtract expressions
 for IntervalType

JIRA: https://issues.apache.org/jira/browse/SPARK-8945

Add add and subtract expressions for IntervalType.

Author: Liang-Chi Hsieh <viirya@appier.com>

This patch had conflicts when merged, resolved by
Committer: Reynold Xin <rxin@databricks.com>

Closes #7398 from viirya/interval_add_subtract and squashes the following commits:

acd1f1e [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into interval_add_subtract
5abae28 [Liang-Chi Hsieh] For comments.
6f5b72e [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into interval_add_subtract
dbe3906 [Liang-Chi Hsieh] For comments.
13a2fc5 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into interval_add_subtract
83ec129 [Liang-Chi Hsieh] Remove intervalMethod.
acfe1ab [Liang-Chi Hsieh] Fix scala style.
d3e9d0e [Liang-Chi Hsieh] Add add and subtract expressions for IntervalType.
---
 .../sql/catalyst/expressions/arithmetic.scala | 60 ++++++++++++++++---
 .../expressions/codegen/CodeGenerator.scala   |  4 +-
 .../sql/catalyst/expressions/literals.scala   |  3 +-
 .../spark/sql/types/AbstractDataType.scala    |  6 ++
 .../ExpressionTypeCheckingSuite.scala         |  6 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 17 ++++++
 .../apache/spark/unsafe/types/Interval.java   | 16 +++++
 .../spark/unsafe/types/IntervalSuite.java     | 38 ++++++++++++
 8 files changed, 136 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 382cbe3b84a07..1616d1bc0aed5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -21,11 +21,12 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.Interval
 
 
 case class UnaryMinus(child: Expression) extends UnaryExpression with ExpectsInputTypes {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection.NumericAndInterval)
 
   override def dataType: DataType = child.dataType
 
@@ -36,15 +37,22 @@ case class UnaryMinus(child: Expression) extends UnaryExpression with ExpectsInp
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = dataType match {
     case dt: DecimalType => defineCodeGen(ctx, ev, c => s"$c.unary_$$minus()")
     case dt: NumericType => defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})(-($c))")
+    case dt: IntervalType => defineCodeGen(ctx, ev, c => s"$c.negate()")
   }
 
-  protected override def nullSafeEval(input: Any): Any = numeric.negate(input)
+  protected override def nullSafeEval(input: Any): Any = {
+    if (dataType.isInstanceOf[IntervalType]) {
+      input.asInstanceOf[Interval].negate()
+    } else {
+      numeric.negate(input)
+    }
+  }
 }
 
 case class UnaryPositive(child: Expression) extends UnaryExpression with ExpectsInputTypes {
   override def prettyName: String = "positive"
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection.NumericAndInterval)
 
   override def dataType: DataType = child.dataType
 
@@ -95,32 +103,66 @@ private[sql] object BinaryArithmetic {
 
 case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
 
-  override def inputType: AbstractDataType = NumericType
+  override def inputType: AbstractDataType = TypeCollection.NumericAndInterval
 
   override def symbol: String = "+"
-  override def decimalMethod: String = "$plus"
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
-  protected override def nullSafeEval(input1: Any, input2: Any): Any = numeric.plus(input1, input2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    if (dataType.isInstanceOf[IntervalType]) {
+      input1.asInstanceOf[Interval].add(input2.asInstanceOf[Interval])
+    } else {
+      numeric.plus(input1, input2)
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = dataType match {
+    case dt: DecimalType =>
+      defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1.$$plus($eval2)")
+    case ByteType | ShortType =>
+      defineCodeGen(ctx, ev,
+        (eval1, eval2) => s"(${ctx.javaType(dataType)})($eval1 $symbol $eval2)")
+    case IntervalType =>
+      defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1.add($eval2)")
+    case _ =>
+      defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1 $symbol $eval2")
+  }
 }
 
 case class Subtract(left: Expression, right: Expression) extends BinaryArithmetic {
 
-  override def inputType: AbstractDataType = NumericType
+  override def inputType: AbstractDataType = TypeCollection.NumericAndInterval
 
   override def symbol: String = "-"
-  override def decimalMethod: String = "$minus"
 
   override lazy val resolved =
     childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
-  protected override def nullSafeEval(input1: Any, input2: Any): Any = numeric.minus(input1, input2)
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    if (dataType.isInstanceOf[IntervalType]) {
+      input1.asInstanceOf[Interval].subtract(input2.asInstanceOf[Interval])
+    } else {
+      numeric.minus(input1, input2)
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = dataType match {
+    case dt: DecimalType =>
+      defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1.$$minus($eval2)")
+    case ByteType | ShortType =>
+      defineCodeGen(ctx, ev,
+        (eval1, eval2) => s"(${ctx.javaType(dataType)})($eval1 $symbol $eval2)")
+    case IntervalType =>
+      defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1.subtract($eval2)")
+    case _ =>
+      defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1 $symbol $eval2")
+  }
 }
 
 case class Multiply(left: Expression, right: Expression) extends BinaryArithmetic {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 45dc146488e12..7c388bc346306 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -27,7 +27,7 @@ import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.unsafe.types._
 
 
 // These classes are here to avoid issues with serialization and integration with quasiquotes.
@@ -69,6 +69,7 @@ class CodeGenContext {
     mutableStates += ((javaType, variableName, initialValue))
   }
 
+  final val intervalType: String = classOf[Interval].getName
   final val JAVA_BOOLEAN = "boolean"
   final val JAVA_BYTE = "byte"
   final val JAVA_SHORT = "short"
@@ -137,6 +138,7 @@ class CodeGenContext {
     case dt: DecimalType => "Decimal"
     case BinaryType => "byte[]"
     case StringType => "UTF8String"
+    case IntervalType => intervalType
     case _: StructType => "InternalRow"
     case _: ArrayType => s"scala.collection.Seq"
     case _: MapType => s"scala.collection.Map"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 3a7a7ae440036..e1fdb29541fa8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.unsafe.types._
 
 object Literal {
   def apply(v: Any): Literal = v match {
@@ -42,6 +42,7 @@ object Literal {
     case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType)
     case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType)
     case a: Array[Byte] => Literal(a, BinaryType)
+    case i: Interval => Literal(i, IntervalType)
     case null => Literal(null, NullType)
     case _ =>
       throw new RuntimeException("Unsupported literal type " + v.getClass + " " + v)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
index 076d7b5a5118d..40bf4b299c990 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
@@ -91,6 +91,12 @@ private[sql] object TypeCollection {
     TimestampType, DateType,
     StringType, BinaryType)
 
+  /**
+   * Types that include numeric types and interval type. They are only used in unary_minus,
+   * unary_positive, add and subtract operations.
+   */
+  val NumericAndInterval = TypeCollection(NumericType, IntervalType)
+
   def apply(types: AbstractDataType*): TypeCollection = new TypeCollection(types)
 
   def unapply(typ: AbstractDataType): Option[Seq[AbstractDataType]] = typ match {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
index ed0d20e7de80e..ad15136ee9a2f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
@@ -53,7 +53,7 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
   }
 
   test("check types for unary arithmetic") {
-    assertError(UnaryMinus('stringField), "expected to be of type numeric")
+    assertError(UnaryMinus('stringField), "type (numeric or interval)")
     assertError(Abs('stringField), "expected to be of type numeric")
     assertError(BitwiseNot('stringField), "expected to be of type integral")
   }
@@ -78,8 +78,8 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
     assertErrorForDifferingTypes(MaxOf('intField, 'booleanField))
     assertErrorForDifferingTypes(MinOf('intField, 'booleanField))
 
-    assertError(Add('booleanField, 'booleanField), "accepts numeric type")
-    assertError(Subtract('booleanField, 'booleanField), "accepts numeric type")
+    assertError(Add('booleanField, 'booleanField), "accepts (numeric or interval) type")
+    assertError(Subtract('booleanField, 'booleanField), "accepts (numeric or interval) type")
     assertError(Multiply('booleanField, 'booleanField), "accepts numeric type")
     assertError(Divide('booleanField, 'booleanField), "accepts numeric type")
     assertError(Remainder('booleanField, 'booleanField), "accepts numeric type")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 231440892bf0b..5b8b70ed5ae11 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -1492,4 +1492,21 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     // Currently we don't yet support nanosecond
     checkIntervalParseError("select interval 23 nanosecond")
   }
+
+  test("SPARK-8945: add and subtract expressions for interval type") {
+    import org.apache.spark.unsafe.types.Interval
+
+    val df = sql("select interval 3 years -3 month 7 week 123 microseconds as i")
+    checkAnswer(df, Row(new Interval(12 * 3 - 3, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123)))
+
+    checkAnswer(df.select(df("i") + new Interval(2, 123)),
+      Row(new Interval(12 * 3 - 3 + 2, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123 + 123)))
+
+    checkAnswer(df.select(df("i") - new Interval(2, 123)),
+      Row(new Interval(12 * 3 - 3 - 2, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123 - 123)))
+
+    // unary minus
+    checkAnswer(df.select(-df("i")),
+      Row(new Interval(-(12 * 3 - 3), -(7L * 1000 * 1000 * 3600 * 24 * 7 + 123))))
+  }
 }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
index 905ea0b7b878c..71b1a85a818ea 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
@@ -87,6 +87,22 @@ public Interval(int months, long microseconds) {
     this.microseconds = microseconds;
   }
 
+  public Interval add(Interval that) {
+    int months = this.months + that.months;
+    long microseconds = this.microseconds + that.microseconds;
+    return new Interval(months, microseconds);
+  }
+
+  public Interval subtract(Interval that) {
+    int months = this.months - that.months;
+    long microseconds = this.microseconds - that.microseconds;
+    return new Interval(months, microseconds);
+  }
+
+  public Interval negate() {
+    return new Interval(-this.months, -this.microseconds);
+  }
+
   @Override
   public boolean equals(Object other) {
     if (this == other) return true;
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
index 1832d0bc65551..d29517cda66a3 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
@@ -101,6 +101,44 @@ public void fromStringTest() {
     assertEquals(Interval.fromString(input), null);
   }
 
+  @Test
+  public void addTest() {
+    String input = "interval 3 month 1 hour";
+    String input2 = "interval 2 month 100 hour";
+
+    Interval interval = Interval.fromString(input);
+    Interval interval2 = Interval.fromString(input2);
+
+    assertEquals(interval.add(interval2), new Interval(5, 101 * MICROS_PER_HOUR));
+
+    input = "interval -10 month -81 hour";
+    input2 = "interval 75 month 200 hour";
+
+    interval = Interval.fromString(input);
+    interval2 = Interval.fromString(input2);
+
+    assertEquals(interval.add(interval2), new Interval(65, 119 * MICROS_PER_HOUR));
+  }
+
+  @Test
+  public void subtractTest() {
+    String input = "interval 3 month 1 hour";
+    String input2 = "interval 2 month 100 hour";
+
+    Interval interval = Interval.fromString(input);
+    Interval interval2 = Interval.fromString(input2);
+
+    assertEquals(interval.subtract(interval2), new Interval(1, -99 * MICROS_PER_HOUR));
+
+    input = "interval -10 month -81 hour";
+    input2 = "interval 75 month 200 hour";
+
+    interval = Interval.fromString(input);
+    interval2 = Interval.fromString(input2);
+
+    assertEquals(interval.subtract(interval2), new Interval(-85, -281 * MICROS_PER_HOUR));
+  }
+
   private void testSingleUnit(String unit, int number, int months, long microseconds) {
     String input1 = "interval " + number + " " + unit;
     String input2 = "interval " + number + " " + unit + "s";

From 587c315b204f1439f696620543c38166d95f8a3d Mon Sep 17 00:00:00 2001
From: tien-dungle <tien-dung.le@realimpactanalytics.com>
Date: Fri, 17 Jul 2015 12:11:32 -0700
Subject: [PATCH 208/549] [SPARK-9109] [GRAPHX] Keep the cached edge in the
 graph

The change here is to keep the cached RDDs in the graph object so that when the graph.unpersist() is called these RDDs are correctly unpersisted.

```java
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.slf4j.LoggerFactory
import org.apache.spark.graphx.util.GraphGenerators

// Create an RDD for the vertices
val users: RDD[(VertexId, (String, String))] =
  sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
                       (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))
// Create an RDD for edges
val relationships: RDD[Edge[String]] =
  sc.parallelize(Array(Edge(3L, 7L, "collab"),    Edge(5L, 3L, "advisor"),
                       Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
// Define a default user in case there are relationship with missing user
val defaultUser = ("John Doe", "Missing")
// Build the initial Graph
val graph = Graph(users, relationships, defaultUser)
graph.cache().numEdges

graph.unpersist()

sc.getPersistentRDDs.foreach( r => println( r._2.toString))
```

Author: tien-dungle <tien-dung.le@realimpactanalytics.com>

Closes #7469 from tien-dungle/SPARK-9109_Graphx-unpersist and squashes the following commits:

8d87997 [tien-dungle] Keep the cached edge in the graph
---
 .../scala/org/apache/spark/graphx/impl/GraphImpl.scala   | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala
index 90a74d23a26cc..da95314440d86 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala
@@ -332,9 +332,9 @@ object GraphImpl {
       edgeStorageLevel: StorageLevel,
       vertexStorageLevel: StorageLevel): GraphImpl[VD, ED] = {
     val edgeRDD = EdgeRDD.fromEdges(edges)(classTag[ED], classTag[VD])
-      .withTargetStorageLevel(edgeStorageLevel).cache()
+      .withTargetStorageLevel(edgeStorageLevel)
     val vertexRDD = VertexRDD(vertices, edgeRDD, defaultVertexAttr)
-      .withTargetStorageLevel(vertexStorageLevel).cache()
+      .withTargetStorageLevel(vertexStorageLevel)
     GraphImpl(vertexRDD, edgeRDD)
   }
 
@@ -346,9 +346,14 @@ object GraphImpl {
   def apply[VD: ClassTag, ED: ClassTag](
       vertices: VertexRDD[VD],
       edges: EdgeRDD[ED]): GraphImpl[VD, ED] = {
+
+    vertices.cache()
+
     // Convert the vertex partitions in edges to the correct type
     val newEdges = edges.asInstanceOf[EdgeRDDImpl[ED, _]]
       .mapEdgePartitions((pid, part) => part.withoutVertexAttributes[VD])
+      .cache()
+
     GraphImpl.fromExistingRDDs(vertices, newEdges)
   }
 

From f9a82a884e7cb2a466a33ab64912924ce7ee30c1 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Fri, 17 Jul 2015 12:43:58 -0700
Subject: [PATCH 209/549] [SPARK-9138] [MLLIB] fix Vectors.dense

Vectors.dense() should accept numbers directly, like the one in Scala. We already use it in doctests, it worked by luck.

cc mengxr jkbradley

Author: Davies Liu <davies@databricks.com>

Closes #7476 from davies/fix_vectors_dense and squashes the following commits:

e0fd292 [Davies Liu] fix Vectors.dense
---
 python/pyspark/mllib/linalg.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 040886f71775b..529bd75894c96 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -30,6 +30,7 @@
     basestring = str
     xrange = range
     import copyreg as copy_reg
+    long = int
 else:
     from itertools import izip as zip
     import copy_reg
@@ -770,14 +771,18 @@ def sparse(size, *args):
         return SparseVector(size, *args)
 
     @staticmethod
-    def dense(elements):
+    def dense(*elements):
         """
-        Create a dense vector of 64-bit floats from a Python list. Always
-        returns a NumPy array.
+        Create a dense vector of 64-bit floats from a Python list or numbers.
 
         >>> Vectors.dense([1, 2, 3])
         DenseVector([1.0, 2.0, 3.0])
+        >>> Vectors.dense(1.0, 2.0)
+        DenseVector([1.0, 2.0])
         """
+        if len(elements) == 1 and not isinstance(elements[0], (float, int, long)):
+            # it's list, numpy.array or other iterable object.
+            elements = elements[0]
         return DenseVector(elements)
 
     @staticmethod

From 806c579f43ce66ac1398200cbc773fa3b69b5cb6 Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Fri, 17 Jul 2015 13:43:19 -0700
Subject: [PATCH 210/549] [SPARK-9062] [ML] Change output type of Tokenizer to
 Array(String, true)

jira: https://issues.apache.org/jira/browse/SPARK-9062

Currently output type of Tokenizer is Array(String, false), which is not compatible with Word2Vec and Other transformers since their input type is Array(String, true). Seq[String] in udf will be treated as Array(String, true) by default.

I'm not sure what's the recommended way for Tokenizer to handle the null value in the input. Any suggestion will be welcome.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #7414 from hhbyyh/tokenizer and squashes the following commits:

c01bd7a [Yuhao Yang] change output type of tokenizer
---
 .../main/scala/org/apache/spark/ml/feature/Tokenizer.scala    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 5f9f57a2ebcfa..0b3af4747e693 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -42,7 +42,7 @@ class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[S
     require(inputType == StringType, s"Input type must be string type but got $inputType.")
   }
 
-  override protected def outputDataType: DataType = new ArrayType(StringType, false)
+  override protected def outputDataType: DataType = new ArrayType(StringType, true)
 
   override def copy(extra: ParamMap): Tokenizer = defaultCopy(extra)
 }
@@ -113,7 +113,7 @@ class RegexTokenizer(override val uid: String)
     require(inputType == StringType, s"Input type must be string type but got $inputType.")
   }
 
-  override protected def outputDataType: DataType = new ArrayType(StringType, false)
+  override protected def outputDataType: DataType = new ArrayType(StringType, true)
 
   override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra)
 }

From 9974642870404381fa425fadb966c6dd3ac4a94f Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 17 Jul 2015 13:55:17 -0700
Subject: [PATCH 211/549] [SPARK-8600] [ML] Naive Bayes API for spark.ml
 Pipelines

Naive Bayes API for spark.ml Pipelines

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7284 from yanboliang/spark-8600 and squashes the following commits:

bc890f7 [Yanbo Liang] remove labels valid check
c3de687 [Yanbo Liang] remove labels from ml.NaiveBayesModel
a2b3088 [Yanbo Liang] address comments
3220b82 [Yanbo Liang] trigger jenkins
3018a41 [Yanbo Liang] address comments
208e166 [Yanbo Liang] Naive Bayes API for spark.ml Pipelines
---
 .../spark/ml/classification/NaiveBayes.scala  | 178 ++++++++++++++++++
 .../mllib/classification/NaiveBayes.scala     |  10 +-
 .../apache/spark/mllib/linalg/Matrices.scala  |   6 +-
 .../classification/JavaNaiveBayesSuite.java   |  98 ++++++++++
 .../ml/classification/NaiveBayesSuite.scala   | 116 ++++++++++++
 5 files changed, 400 insertions(+), 8 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
new file mode 100644
index 0000000000000..1f547e4a98af7
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.classification
+
+import org.apache.spark.SparkException
+import org.apache.spark.ml.{PredictorParams, PredictionModel, Predictor}
+import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param, DoubleParam}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.mllib.classification.{NaiveBayes => OldNaiveBayes}
+import org.apache.spark.mllib.classification.{NaiveBayesModel => OldNaiveBayesModel}
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
+
+/**
+ * Params for Naive Bayes Classifiers.
+ */
+private[ml] trait NaiveBayesParams extends PredictorParams {
+
+  /**
+   * The smoothing parameter.
+   * (default = 1.0).
+   * @group param
+   */
+  final val lambda: DoubleParam = new DoubleParam(this, "lambda", "The smoothing parameter.",
+    ParamValidators.gtEq(0))
+
+  /** @group getParam */
+  final def getLambda: Double = $(lambda)
+
+  /**
+   * The model type which is a string (case-sensitive).
+   * Supported options: "multinomial" and "bernoulli".
+   * (default = multinomial)
+   * @group param
+   */
+  final val modelType: Param[String] = new Param[String](this, "modelType", "The model type " +
+    "which is a string (case-sensitive). Supported options: multinomial (default) and bernoulli.",
+    ParamValidators.inArray[String](OldNaiveBayes.supportedModelTypes.toArray))
+
+  /** @group getParam */
+  final def getModelType: String = $(modelType)
+}
+
+/**
+ * Naive Bayes Classifiers.
+ * It supports both Multinomial NB
+ * ([[http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html]])
+ * which can handle finitely supported discrete data. For example, by converting documents into
+ * TF-IDF vectors, it can be used for document classification. By making every vector a
+ * binary (0/1) data, it can also be used as Bernoulli NB
+ * ([[http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html]]).
+ * The input feature values must be nonnegative.
+ */
+class NaiveBayes(override val uid: String)
+  extends Predictor[Vector, NaiveBayes, NaiveBayesModel]
+  with NaiveBayesParams {
+
+  def this() = this(Identifiable.randomUID("nb"))
+
+  /**
+   * Set the smoothing parameter.
+   * Default is 1.0.
+   * @group setParam
+   */
+  def setLambda(value: Double): this.type = set(lambda, value)
+  setDefault(lambda -> 1.0)
+
+  /**
+   * Set the model type using a string (case-sensitive).
+   * Supported options: "multinomial" and "bernoulli".
+   * Default is "multinomial"
+   */
+  def setModelType(value: String): this.type = set(modelType, value)
+  setDefault(modelType -> OldNaiveBayes.Multinomial)
+
+  override protected def train(dataset: DataFrame): NaiveBayesModel = {
+    val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
+    val oldModel = OldNaiveBayes.train(oldDataset, $(lambda), $(modelType))
+    NaiveBayesModel.fromOld(oldModel, this)
+  }
+
+  override def copy(extra: ParamMap): NaiveBayes = defaultCopy(extra)
+}
+
+/**
+ * Model produced by [[NaiveBayes]]
+ */
+class NaiveBayesModel private[ml] (
+    override val uid: String,
+    val pi: Vector,
+    val theta: Matrix)
+  extends PredictionModel[Vector, NaiveBayesModel] with NaiveBayesParams {
+
+  import OldNaiveBayes.{Bernoulli, Multinomial}
+
+  /**
+   * Bernoulli scoring requires log(condprob) if 1, log(1-condprob) if 0.
+   * This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra
+   * application of this condition (in predict function).
+   */
+  private lazy val (thetaMinusNegTheta, negThetaSum) = $(modelType) match {
+    case Multinomial => (None, None)
+    case Bernoulli =>
+      val negTheta = theta.map(value => math.log(1.0 - math.exp(value)))
+      val ones = new DenseVector(Array.fill(theta.numCols){1.0})
+      val thetaMinusNegTheta = theta.map { value =>
+        value - math.log(1.0 - math.exp(value))
+      }
+      (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
+    case _ =>
+      // This should never happen.
+      throw new UnknownError(s"Invalid modelType: ${$(modelType)}.")
+  }
+
+  override protected def predict(features: Vector): Double = {
+    $(modelType) match {
+      case Multinomial =>
+        val prob = theta.multiply(features)
+        BLAS.axpy(1.0, pi, prob)
+        prob.argmax
+      case Bernoulli =>
+        features.foreachActive{ (index, value) =>
+          if (value != 0.0 && value != 1.0) {
+            throw new SparkException(
+              s"Bernoulli naive Bayes requires 0 or 1 feature values but found $features")
+          }
+        }
+        val prob = thetaMinusNegTheta.get.multiply(features)
+        BLAS.axpy(1.0, pi, prob)
+        BLAS.axpy(1.0, negThetaSum.get, prob)
+        prob.argmax
+      case _ =>
+        // This should never happen.
+        throw new UnknownError(s"Invalid modelType: ${$(modelType)}.")
+    }
+  }
+
+  override def copy(extra: ParamMap): NaiveBayesModel = {
+    copyValues(new NaiveBayesModel(uid, pi, theta).setParent(this.parent), extra)
+  }
+
+  override def toString: String = {
+    s"NaiveBayesModel with ${pi.size} classes"
+  }
+
+}
+
+private[ml] object NaiveBayesModel {
+
+  /** Convert a model from the old API */
+  def fromOld(
+      oldModel: OldNaiveBayesModel,
+      parent: NaiveBayes): NaiveBayesModel = {
+    val uid = if (parent != null) parent.uid else Identifiable.randomUID("nb")
+    val labels = Vectors.dense(oldModel.labels)
+    val pi = Vectors.dense(oldModel.pi)
+    val theta = new DenseMatrix(oldModel.labels.length, oldModel.theta(0).length,
+      oldModel.theta.flatten, true)
+    new NaiveBayesModel(uid, pi, theta)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 9e379d7d74b2f..8cf4e15efe7a7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -40,7 +40,7 @@ import org.apache.spark.sql.{DataFrame, SQLContext}
  *              where D is number of features
  * @param modelType The type of NB model to fit  can be "multinomial" or "bernoulli"
  */
-class NaiveBayesModel private[mllib] (
+class NaiveBayesModel private[spark] (
     val labels: Array[Double],
     val pi: Array[Double],
     val theta: Array[Array[Double]],
@@ -382,7 +382,7 @@ class NaiveBayes private (
         BLAS.axpy(1.0, c2._2, c1._2)
         (c1._1 + c2._1, c1._2)
       }
-    ).collect()
+    ).collect().sortBy(_._1)
 
     val numLabels = aggregated.length
     var numDocuments = 0L
@@ -425,13 +425,13 @@ class NaiveBayes private (
 object NaiveBayes {
 
   /** String name for multinomial model type. */
-  private[classification] val Multinomial: String = "multinomial"
+  private[spark] val Multinomial: String = "multinomial"
 
   /** String name for Bernoulli model type. */
-  private[classification] val Bernoulli: String = "bernoulli"
+  private[spark] val Bernoulli: String = "bernoulli"
 
   /* Set of modelTypes that NaiveBayes supports */
-  private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)
+  private[spark] val supportedModelTypes = Set(Multinomial, Bernoulli)
 
   /**
    * Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 0df07663405a3..55da0e094d132 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -98,7 +98,7 @@ sealed trait Matrix extends Serializable {
   /** Map the values of this matrix using a function. Generates a new matrix. Performs the
     * function on only the backing array. For example, an operation such as addition or
     * subtraction will only be performed on the non-zero values in a `SparseMatrix`. */
-  private[mllib] def map(f: Double => Double): Matrix
+  private[spark] def map(f: Double => Double): Matrix
 
   /** Update all the values of this matrix using the function f. Performed in-place on the
     * backing array. For example, an operation such as addition or subtraction will only be
@@ -289,7 +289,7 @@ class DenseMatrix(
 
   override def copy: DenseMatrix = new DenseMatrix(numRows, numCols, values.clone())
 
-  private[mllib] def map(f: Double => Double) = new DenseMatrix(numRows, numCols, values.map(f),
+  private[spark] def map(f: Double => Double) = new DenseMatrix(numRows, numCols, values.map(f),
     isTransposed)
 
   private[mllib] def update(f: Double => Double): DenseMatrix = {
@@ -555,7 +555,7 @@ class SparseMatrix(
     new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.clone())
   }
 
-  private[mllib] def map(f: Double => Double) =
+  private[spark] def map(f: Double => Double) =
     new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.map(f), isTransposed)
 
   private[mllib] def update(f: Double => Double): SparseMatrix = {
diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
new file mode 100644
index 0000000000000..09a9fba0c19cf
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.classification;
+
+import java.io.Serializable;
+
+import com.google.common.collect.Lists;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+public class JavaNaiveBayesSuite implements Serializable {
+
+  private transient JavaSparkContext jsc;
+  private transient SQLContext jsql;
+
+  @Before
+  public void setUp() {
+    jsc = new JavaSparkContext("local", "JavaLogisticRegressionSuite");
+    jsql = new SQLContext(jsc);
+  }
+
+  @After
+  public void tearDown() {
+    jsc.stop();
+    jsc = null;
+  }
+
+  public void validatePrediction(DataFrame predictionAndLabels) {
+    for (Row r : predictionAndLabels.collect()) {
+      double prediction = r.getAs(0);
+      double label = r.getAs(1);
+      assert(prediction == label);
+    }
+  }
+
+  @Test
+  public void naiveBayesDefaultParams() {
+    NaiveBayes nb = new NaiveBayes();
+    assert(nb.getLabelCol() == "label");
+    assert(nb.getFeaturesCol() == "features");
+    assert(nb.getPredictionCol() == "prediction");
+    assert(nb.getLambda() == 1.0);
+    assert(nb.getModelType() == "multinomial");
+  }
+
+  @Test
+  public void testNaiveBayes() {
+    JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+      RowFactory.create(0.0, Vectors.dense(1.0, 0.0, 0.0)),
+      RowFactory.create(0.0, Vectors.dense(2.0, 0.0, 0.0)),
+      RowFactory.create(1.0, Vectors.dense(0.0, 1.0, 0.0)),
+      RowFactory.create(1.0, Vectors.dense(0.0, 2.0, 0.0)),
+      RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 1.0)),
+      RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 2.0))
+    ));
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("features", new VectorUDT(), false, Metadata.empty())
+    });
+
+    DataFrame dataset = jsql.createDataFrame(jrdd, schema);
+    NaiveBayes nb = new NaiveBayes().setLambda(0.5).setModelType("multinomial");
+    NaiveBayesModel model = nb.fit(dataset);
+
+    DataFrame predictionAndLabels = model.transform(dataset).select("prediction", "label");
+    validatePrediction(predictionAndLabels);
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
new file mode 100644
index 0000000000000..76381a2741296
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.classification
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+import org.apache.spark.mllib.classification.NaiveBayesSuite._
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.Row
+
+class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  def validatePrediction(predictionAndLabels: DataFrame): Unit = {
+    val numOfErrorPredictions = predictionAndLabels.collect().count {
+      case Row(prediction: Double, label: Double) =>
+        prediction != label
+    }
+    // At least 80% of the predictions should be on.
+    assert(numOfErrorPredictions < predictionAndLabels.count() / 5)
+  }
+
+  def validateModelFit(
+      piData: Vector,
+      thetaData: Matrix,
+      model: NaiveBayesModel): Unit = {
+    assert(Vectors.dense(model.pi.toArray.map(math.exp)) ~==
+      Vectors.dense(piData.toArray.map(math.exp)) absTol 0.05, "pi mismatch")
+    assert(model.theta.map(math.exp) ~== thetaData.map(math.exp) absTol 0.05, "theta mismatch")
+  }
+
+  test("params") {
+    ParamsSuite.checkParams(new NaiveBayes)
+    val model = new NaiveBayesModel("nb", pi = Vectors.dense(Array(0.2, 0.8)),
+      theta = new DenseMatrix(2, 3, Array(0.1, 0.2, 0.3, 0.4, 0.6, 0.4)))
+    ParamsSuite.checkParams(model)
+  }
+
+  test("naive bayes: default params") {
+    val nb = new NaiveBayes
+    assert(nb.getLabelCol === "label")
+    assert(nb.getFeaturesCol === "features")
+    assert(nb.getPredictionCol === "prediction")
+    assert(nb.getLambda === 1.0)
+    assert(nb.getModelType === "multinomial")
+  }
+
+  test("Naive Bayes Multinomial") {
+    val nPoints = 1000
+    val piArray = Array(0.5, 0.1, 0.4).map(math.log)
+    val thetaArray = Array(
+      Array(0.70, 0.10, 0.10, 0.10), // label 0
+      Array(0.10, 0.70, 0.10, 0.10), // label 1
+      Array(0.10, 0.10, 0.70, 0.10)  // label 2
+    ).map(_.map(math.log))
+    val pi = Vectors.dense(piArray)
+    val theta = new DenseMatrix(3, 4, thetaArray.flatten, true)
+
+    val testDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
+      piArray, thetaArray, nPoints, 42, "multinomial"))
+    val nb = new NaiveBayes().setLambda(1.0).setModelType("multinomial")
+    val model = nb.fit(testDataset)
+
+    validateModelFit(pi, theta, model)
+    assert(model.hasParent)
+
+    val validationDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
+      piArray, thetaArray, nPoints, 17, "multinomial"))
+    val predictionAndLabels = model.transform(validationDataset).select("prediction", "label")
+
+    validatePrediction(predictionAndLabels)
+  }
+
+  test("Naive Bayes Bernoulli") {
+    val nPoints = 10000
+    val piArray = Array(0.5, 0.3, 0.2).map(math.log)
+    val thetaArray = Array(
+      Array(0.50, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.40), // label 0
+      Array(0.02, 0.70, 0.10, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02), // label 1
+      Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30)  // label 2
+    ).map(_.map(math.log))
+    val pi = Vectors.dense(piArray)
+    val theta = new DenseMatrix(3, 12, thetaArray.flatten, true)
+
+    val testDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
+      piArray, thetaArray, nPoints, 45, "bernoulli"))
+    val nb = new NaiveBayes().setLambda(1.0).setModelType("bernoulli")
+    val model = nb.fit(testDataset)
+
+    validateModelFit(pi, theta, model)
+    assert(model.hasParent)
+
+    val validationDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
+      piArray, thetaArray, nPoints, 20, "bernoulli"))
+    val predictionAndLabels = model.transform(validationDataset).select("prediction", "label")
+
+    validatePrediction(predictionAndLabels)
+  }
+}

From 074085d6781a580017a45101b8b54ffd7bd31294 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Fri, 17 Jul 2015 13:57:31 -0700
Subject: [PATCH 212/549] [SPARK-9136] [SQL] fix several bugs in
 DateTimeUtils.stringToTimestamp

a follow up of https://github.com/apache/spark/pull/7353

1. we should use `Calendar.HOUR_OF_DAY` instead of `Calendar.HOUR`(this is for AM, PM).
2. we should call `c.set(Calendar.MILLISECOND, 0)` after `Calendar.getInstance`

I'm not sure why the tests didn't fail in jenkins, but I ran latest spark master branch locally and `DateTimeUtilsSuite` failed.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7473 from cloud-fan/datetime and squashes the following commits:

66cdaf2 [Wenchen Fan] fix several bugs in DateTimeUtils.stringToTimestamp
---
 .../spark/sql/catalyst/util/DateTimeUtils.scala     |  5 +++--
 .../sql/catalyst/util/DateTimeUtilsSuite.scala      | 13 +++++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 53c32a0a9802b..f33e34b380bcf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -320,16 +320,17 @@ object DateTimeUtils {
       Calendar.getInstance(
         TimeZone.getTimeZone(f"GMT${timeZone.get.toChar}${segments(7)}%02d:${segments(8)}%02d"))
     }
+    c.set(Calendar.MILLISECOND, 0)
 
     if (justTime) {
-      c.set(Calendar.HOUR, segments(3))
+      c.set(Calendar.HOUR_OF_DAY, segments(3))
       c.set(Calendar.MINUTE, segments(4))
       c.set(Calendar.SECOND, segments(5))
     } else {
       c.set(segments(0), segments(1) - 1, segments(2), segments(3), segments(4), segments(5))
     }
 
-    Some(c.getTimeInMillis / 1000 * 1000000 + segments(6))
+    Some(c.getTimeInMillis * 1000 + segments(6))
   }
 
   /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index c65fcbc4d1bc1..5c3a621c6d11f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -243,8 +243,17 @@ class DateTimeUtilsSuite extends SparkFunSuite {
       UTF8String.fromString("2015-03-18T12:03:17.12312+7:30")).get ===
         c.getTimeInMillis * 1000 + 120)
 
+    c = Calendar.getInstance()
+    c.set(Calendar.HOUR_OF_DAY, 18)
+    c.set(Calendar.MINUTE, 12)
+    c.set(Calendar.SECOND, 15)
+    c.set(Calendar.MILLISECOND, 0)
+    assert(DateTimeUtils.stringToTimestamp(
+      UTF8String.fromString("18:12:15")).get ===
+      c.getTimeInMillis * 1000)
+
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
-    c.set(Calendar.HOUR, 18)
+    c.set(Calendar.HOUR_OF_DAY, 18)
     c.set(Calendar.MINUTE, 12)
     c.set(Calendar.SECOND, 15)
     c.set(Calendar.MILLISECOND, 123)
@@ -253,7 +262,7 @@ class DateTimeUtilsSuite extends SparkFunSuite {
       c.getTimeInMillis * 1000 + 120)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
-    c.set(Calendar.HOUR, 18)
+    c.set(Calendar.HOUR_OF_DAY, 18)
     c.set(Calendar.MINUTE, 12)
     c.set(Calendar.SECOND, 15)
     c.set(Calendar.MILLISECOND, 123)

From ad0954f6de29761e0e7e543212c5bfe1fdcbed9f Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Fri, 17 Jul 2015 14:00:31 -0700
Subject: [PATCH 213/549] [SPARK-5681] [STREAMING] Move 'stopReceivers' to the
 event loop to resolve the race condition

This is an alternative way to fix `SPARK-5681`. It minimizes the changes.

Closes #4467

Author: zsxwing <zsxwing@gmail.com>
Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6294 from zsxwing/pr4467 and squashes the following commits:

709ac1f [zsxwing] Fix the comment
e103e8a [zsxwing] Move ReceiverTracker.stop into ReceiverTracker.stop
f637142 [zsxwing] Address minor code style comments
a178d37 [zsxwing] Move 'stopReceivers' to the event looop to resolve the race condition
51fb07e [zsxwing] Fix the code style
3cb19a3 [zsxwing] Merge branch 'master' into pr4467
b4c29e7 [zsxwing] Stop receiver only if we start it
c41ee94 [zsxwing] Make stopReceivers private
7c73c1f [zsxwing] Use trackerStateLock to protect trackerState
a8120c0 [zsxwing] Merge branch 'master' into pr4467
7b1d9af [zsxwing] "case Throwable" => "case NonFatal"
15ed4a1 [zsxwing] Register before starting the receiver
fff63f9 [zsxwing] Use a lock to eliminate the race condition when stopping receivers and registering receivers happen at the same time.
e0ef72a [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into tracker_status_timeout
19b76d9 [Liang-Chi Hsieh] Remove timeout.
34c18dc [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into tracker_status_timeout
c419677 [Liang-Chi Hsieh] Fix style.
9e1a760 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into tracker_status_timeout
355f9ce [Liang-Chi Hsieh] Separate register and start events for receivers.
3d568e8 [Liang-Chi Hsieh] Let receivers get registered first before going started.
ae0d9fd [Liang-Chi Hsieh] Merge branch 'master' into tracker_status_timeout
77983f3 [Liang-Chi Hsieh] Add tracker status and stop to receive messages when stopping tracker.
---
 .../receiver/ReceiverSupervisor.scala         |  42 ++++--
 .../receiver/ReceiverSupervisorImpl.scala     |   2 +-
 .../streaming/scheduler/ReceiverTracker.scala | 139 ++++++++++++------
 .../spark/streaming/ReceiverSuite.scala       |   2 +
 .../streaming/StreamingContextSuite.scala     |  15 ++
 5 files changed, 138 insertions(+), 62 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
index eeb14ca3a49e9..6467029a277b2 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
@@ -22,6 +22,7 @@ import java.util.concurrent.CountDownLatch
 
 import scala.collection.mutable.ArrayBuffer
 import scala.concurrent._
+import scala.util.control.NonFatal
 
 import org.apache.spark.{Logging, SparkConf}
 import org.apache.spark.storage.StreamBlockId
@@ -36,7 +37,7 @@ private[streaming] abstract class ReceiverSupervisor(
     conf: SparkConf
   ) extends Logging {
 
-  /** Enumeration to identify current state of the StreamingContext */
+  /** Enumeration to identify current state of the Receiver */
   object ReceiverState extends Enumeration {
     type CheckpointState = Value
     val Initialized, Started, Stopped = Value
@@ -97,8 +98,8 @@ private[streaming] abstract class ReceiverSupervisor(
   /** Called when supervisor is stopped */
   protected def onStop(message: String, error: Option[Throwable]) { }
 
-  /** Called when receiver is started */
-  protected def onReceiverStart() { }
+  /** Called when receiver is started. Return true if the driver accepts us */
+  protected def onReceiverStart(): Boolean
 
   /** Called when receiver is stopped */
   protected def onReceiverStop(message: String, error: Option[Throwable]) { }
@@ -121,13 +122,17 @@ private[streaming] abstract class ReceiverSupervisor(
   /** Start receiver */
   def startReceiver(): Unit = synchronized {
     try {
-      logInfo("Starting receiver")
-      receiver.onStart()
-      logInfo("Called receiver onStart")
-      onReceiverStart()
-      receiverState = Started
+      if (onReceiverStart()) {
+        logInfo("Starting receiver")
+        receiverState = Started
+        receiver.onStart()
+        logInfo("Called receiver onStart")
+      } else {
+        // The driver refused us
+        stop("Registered unsuccessfully because Driver refused to start receiver " + streamId, None)
+      }
     } catch {
-      case t: Throwable =>
+      case NonFatal(t) =>
         stop("Error starting receiver " + streamId, Some(t))
     }
   }
@@ -136,12 +141,19 @@ private[streaming] abstract class ReceiverSupervisor(
   def stopReceiver(message: String, error: Option[Throwable]): Unit = synchronized {
     try {
       logInfo("Stopping receiver with message: " + message + ": " + error.getOrElse(""))
-      receiverState = Stopped
-      receiver.onStop()
-      logInfo("Called receiver onStop")
-      onReceiverStop(message, error)
+      receiverState match {
+        case Initialized =>
+          logWarning("Skip stopping receiver because it has not yet stared")
+        case Started =>
+          receiverState = Stopped
+          receiver.onStop()
+          logInfo("Called receiver onStop")
+          onReceiverStop(message, error)
+        case Stopped =>
+          logWarning("Receiver has been stopped")
+      }
     } catch {
-      case t: Throwable =>
+      case NonFatal(t) =>
         logError("Error stopping receiver " + streamId + t.getStackTraceString)
     }
   }
@@ -167,7 +179,7 @@ private[streaming] abstract class ReceiverSupervisor(
     }(futureExecutionContext)
   }
 
-  /** Check if receiver has been marked for stopping */
+  /** Check if receiver has been marked for starting */
   def isReceiverStarted(): Boolean = {
     logDebug("state = " + receiverState)
     receiverState == Started
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
index 6078cdf8f8790..f6ba66b3ae036 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
@@ -162,7 +162,7 @@ private[streaming] class ReceiverSupervisorImpl(
     env.rpcEnv.stop(endpoint)
   }
 
-  override protected def onReceiverStart() {
+  override protected def onReceiverStart(): Boolean = {
     val msg = RegisterReceiver(
       streamId, receiver.getClass.getSimpleName, Utils.localHostName(), endpoint)
     trackerEndpoint.askWithRetry[Boolean](msg)
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
index 644e581cd8279..6910d81d9866e 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
@@ -20,7 +20,6 @@ package org.apache.spark.streaming.scheduler
 import scala.collection.mutable.{ArrayBuffer, HashMap, SynchronizedMap}
 import scala.language.existentials
 import scala.math.max
-import org.apache.spark.rdd._
 
 import org.apache.spark.streaming.util.WriteAheadLogUtils
 import org.apache.spark.{Logging, SparkEnv, SparkException}
@@ -47,6 +46,8 @@ private[streaming] case class ReportError(streamId: Int, message: String, error:
 private[streaming] case class DeregisterReceiver(streamId: Int, msg: String, error: String)
   extends ReceiverTrackerMessage
 
+private[streaming] case object StopAllReceivers extends ReceiverTrackerMessage
+
 /**
  * This class manages the execution of the receivers of ReceiverInputDStreams. Instance of
  * this class must be created after all input streams have been added and StreamingContext.start()
@@ -71,13 +72,23 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
   )
   private val listenerBus = ssc.scheduler.listenerBus
 
+  /** Enumeration to identify current state of the ReceiverTracker */
+  object TrackerState extends Enumeration {
+    type TrackerState = Value
+    val Initialized, Started, Stopping, Stopped = Value
+  }
+  import TrackerState._
+
+  /** State of the tracker. Protected by "trackerStateLock" */
+  @volatile private var trackerState = Initialized
+
   // endpoint is created when generator starts.
   // This not being null means the tracker has been started and not stopped
   private var endpoint: RpcEndpointRef = null
 
   /** Start the endpoint and receiver execution thread. */
   def start(): Unit = synchronized {
-    if (endpoint != null) {
+    if (isTrackerStarted) {
       throw new SparkException("ReceiverTracker already started")
     }
 
@@ -86,20 +97,46 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
         "ReceiverTracker", new ReceiverTrackerEndpoint(ssc.env.rpcEnv))
       if (!skipReceiverLaunch) receiverExecutor.start()
       logInfo("ReceiverTracker started")
+      trackerState = Started
     }
   }
 
   /** Stop the receiver execution thread. */
   def stop(graceful: Boolean): Unit = synchronized {
-    if (!receiverInputStreams.isEmpty && endpoint != null) {
+    if (isTrackerStarted) {
       // First, stop the receivers
-      if (!skipReceiverLaunch) receiverExecutor.stop(graceful)
+      trackerState = Stopping
+      if (!skipReceiverLaunch) {
+        // Send the stop signal to all the receivers
+        endpoint.askWithRetry[Boolean](StopAllReceivers)
+
+        // Wait for the Spark job that runs the receivers to be over
+        // That is, for the receivers to quit gracefully.
+        receiverExecutor.awaitTermination(10000)
+
+        if (graceful) {
+          val pollTime = 100
+          logInfo("Waiting for receiver job to terminate gracefully")
+          while (receiverInfo.nonEmpty || receiverExecutor.running) {
+            Thread.sleep(pollTime)
+          }
+          logInfo("Waited for receiver job to terminate gracefully")
+        }
+
+        // Check if all the receivers have been deregistered or not
+        if (receiverInfo.nonEmpty) {
+          logWarning("Not all of the receivers have deregistered, " + receiverInfo)
+        } else {
+          logInfo("All of the receivers have deregistered successfully")
+        }
+      }
 
       // Finally, stop the endpoint
       ssc.env.rpcEnv.stop(endpoint)
       endpoint = null
       receivedBlockTracker.stop()
       logInfo("ReceiverTracker stopped")
+      trackerState = Stopped
     }
   }
 
@@ -145,14 +182,23 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
       host: String,
       receiverEndpoint: RpcEndpointRef,
       senderAddress: RpcAddress
-    ) {
+    ): Boolean = {
     if (!receiverInputStreamIds.contains(streamId)) {
       throw new SparkException("Register received for unexpected id " + streamId)
     }
-    receiverInfo(streamId) = ReceiverInfo(
-      streamId, s"${typ}-${streamId}", receiverEndpoint, true, host)
-    listenerBus.post(StreamingListenerReceiverStarted(receiverInfo(streamId)))
-    logInfo("Registered receiver for stream " + streamId + " from " + senderAddress)
+
+    if (isTrackerStopping || isTrackerStopped) {
+      false
+    } else {
+      // "stopReceivers" won't happen at the same time because both "registerReceiver" and are
+      // called in the event loop. So here we can assume "stopReceivers" has not yet been called. If
+      // "stopReceivers" is called later, it should be able to see this receiver.
+      receiverInfo(streamId) = ReceiverInfo(
+        streamId, s"${typ}-${streamId}", receiverEndpoint, true, host)
+      listenerBus.post(StreamingListenerReceiverStarted(receiverInfo(streamId)))
+      logInfo("Registered receiver for stream " + streamId + " from " + senderAddress)
+      true
+    }
   }
 
   /** Deregister a receiver */
@@ -220,20 +266,33 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
 
     override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
       case RegisterReceiver(streamId, typ, host, receiverEndpoint) =>
-        registerReceiver(streamId, typ, host, receiverEndpoint, context.sender.address)
-        context.reply(true)
+        val successful =
+          registerReceiver(streamId, typ, host, receiverEndpoint, context.sender.address)
+        context.reply(successful)
       case AddBlock(receivedBlockInfo) =>
         context.reply(addBlock(receivedBlockInfo))
       case DeregisterReceiver(streamId, message, error) =>
         deregisterReceiver(streamId, message, error)
         context.reply(true)
+      case StopAllReceivers =>
+        assert(isTrackerStopping || isTrackerStopped)
+        stopReceivers()
+        context.reply(true)
+    }
+
+    /** Send stop signal to the receivers. */
+    private def stopReceivers() {
+      // Signal the receivers to stop
+      receiverInfo.values.flatMap { info => Option(info.endpoint)}
+        .foreach { _.send(StopReceiver) }
+      logInfo("Sent stop signal to all " + receiverInfo.size + " receivers")
     }
   }
 
   /** This thread class runs all the receivers on the cluster.  */
   class ReceiverLauncher {
     @transient val env = ssc.env
-    @volatile @transient private var running = false
+    @volatile @transient var running = false
     @transient val thread = new Thread() {
       override def run() {
         try {
@@ -249,31 +308,6 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
       thread.start()
     }
 
-    def stop(graceful: Boolean) {
-      // Send the stop signal to all the receivers
-      stopReceivers()
-
-      // Wait for the Spark job that runs the receivers to be over
-      // That is, for the receivers to quit gracefully.
-      thread.join(10000)
-
-      if (graceful) {
-        val pollTime = 100
-        logInfo("Waiting for receiver job to terminate gracefully")
-        while (receiverInfo.nonEmpty || running) {
-          Thread.sleep(pollTime)
-        }
-        logInfo("Waited for receiver job to terminate gracefully")
-      }
-
-      // Check if all the receivers have been deregistered or not
-      if (receiverInfo.nonEmpty) {
-        logWarning("Not all of the receivers have deregistered, " + receiverInfo)
-      } else {
-        logInfo("All of the receivers have deregistered successfully")
-      }
-    }
-
     /**
      * Get the list of executors excluding driver
      */
@@ -358,17 +392,30 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
       // Distribute the receivers and start them
       logInfo("Starting " + receivers.length + " receivers")
       running = true
-      ssc.sparkContext.runJob(tempRDD, ssc.sparkContext.clean(startReceiver))
-      running = false
-      logInfo("All of the receivers have been terminated")
+      try {
+        ssc.sparkContext.runJob(tempRDD, ssc.sparkContext.clean(startReceiver))
+        logInfo("All of the receivers have been terminated")
+      } finally {
+        running = false
+      }
     }
 
-    /** Stops the receivers. */
-    private def stopReceivers() {
-      // Signal the receivers to stop
-      receiverInfo.values.flatMap { info => Option(info.endpoint)}
-                         .foreach { _.send(StopReceiver) }
-      logInfo("Sent stop signal to all " + receiverInfo.size + " receivers")
+    /**
+     * Wait until the Spark job that runs the receivers is terminated, or return when
+     * `milliseconds` elapses
+     */
+    def awaitTermination(milliseconds: Long): Unit = {
+      thread.join(milliseconds)
     }
   }
+
+  /** Check if tracker has been marked for starting */
+  private def isTrackerStarted(): Boolean = trackerState == Started
+
+  /** Check if tracker has been marked for stopping */
+  private def isTrackerStopping(): Boolean = trackerState == Stopping
+
+  /** Check if tracker has been marked for stopped */
+  private def isTrackerStopped(): Boolean = trackerState == Stopped
+
 }
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala
index 5d7127627eea5..13b4d17c86183 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala
@@ -346,6 +346,8 @@ class ReceiverSuite extends TestSuiteBase with Timeouts with Serializable {
     def reportError(message: String, throwable: Throwable) {
       errors += throwable
     }
+
+    override protected def onReceiverStart(): Boolean = true
   }
 
   /**
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index f588cf5bc1e7c..4bba9691f8aa5 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -285,6 +285,21 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo
     }
   }
 
+  test("stop gracefully even if a receiver misses StopReceiver") {
+    // This is not a deterministic unit. But if this unit test is flaky, then there is definitely
+    // something wrong. See SPARK-5681
+    val conf = new SparkConf().setMaster(master).setAppName(appName)
+    sc = new SparkContext(conf)
+    ssc = new StreamingContext(sc, Milliseconds(100))
+    val input = ssc.receiverStream(new TestReceiver)
+    input.foreachRDD(_ => {})
+    ssc.start()
+    // Call `ssc.stop` at once so that it's possible that the receiver will miss "StopReceiver"
+    failAfter(30000 millis) {
+      ssc.stop(stopSparkContext = true, stopGracefully = true)
+    }
+  }
+
   test("stop slow receiver gracefully") {
     val conf = new SparkConf().setMaster(master).setAppName(appName)
     conf.set("spark.streaming.gracefulStopTimeout", "20000s")

From 6da1069696186572c66cbd83947c1a1dbd2bc827 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Fri, 17 Jul 2015 14:00:53 -0700
Subject: [PATCH 214/549] [SPARK-9090] [ML] Fix definition of residual in
 LinearRegressionSummary, EnsembleTestHelper, and SquaredError

Make the definition of residuals in Spark consistent with literature. We have been using `prediction - label` for residuals, but literature usually defines `residual = label - prediction`.

Author: Feynman Liang <fliang@databricks.com>

Closes #7435 from feynmanliang/SPARK-9090-Fix-LinearRegressionSummary-Residuals and squashes the following commits:

f4b39d8 [Feynman Liang] Fix doc
bc12a92 [Feynman Liang] Tweak EnsembleTestHelper and SquaredError residuals
63f0d60 [Feynman Liang] Fix definition of residual
---
 .../org/apache/spark/ml/regression/LinearRegression.scala     | 4 ++--
 .../scala/org/apache/spark/mllib/tree/loss/SquaredError.scala | 4 ++--
 .../apache/spark/ml/regression/LinearRegressionSuite.scala    | 4 ++--
 .../org/apache/spark/mllib/tree/EnsembleTestHelper.scala      | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 8fc986056657d..89718e0f3e15a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -355,9 +355,9 @@ class LinearRegressionSummary private[regression] (
    */
   val r2: Double = metrics.r2
 
-  /** Residuals (predicted value - label value) */
+  /** Residuals (label - predicted value) */
   @transient lazy val residuals: DataFrame = {
-    val t = udf { (pred: Double, label: Double) => pred - label}
+    val t = udf { (pred: Double, label: Double) => label - pred }
     predictions.select(t(col(predictionCol), col(labelCol)).as("residuals"))
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
index a5582d3ef3324..011a5d57422f7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
@@ -42,11 +42,11 @@ object SquaredError extends Loss {
    * @return Loss gradient
    */
   override def gradient(prediction: Double, label: Double): Double = {
-    2.0 * (prediction - label)
+    - 2.0 * (label - prediction)
   }
 
   override private[mllib] def computeError(prediction: Double, label: Double): Double = {
-    val err = prediction - label
+    val err = label - prediction
     err * err
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index cf120cf2a4b47..374002c5b4fdd 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -302,7 +302,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       .map { case Row(features: DenseVector, label: Double) =>
       val prediction =
         features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept
-      prediction - label
+      label - prediction
     }
       .zip(model.summary.residuals.map(_.getDouble(0)))
       .collect()
@@ -314,7 +314,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
        Use the following R code to generate model training results.
 
        predictions <- predict(fit, newx=features)
-       residuals <- predictions - label
+       residuals <- label - predictions
        > mean(residuals^2) # MSE
        [1] 0.009720325
        > mean(abs(residuals)) # MAD
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
index 8972c229b7ecb..334bf3790fc7a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
@@ -70,7 +70,7 @@ object EnsembleTestHelper {
       metricName: String = "mse") {
     val predictions = input.map(x => model.predict(x.features))
     val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) =>
-      prediction - label
+      label - prediction
     }
     val metric = metricName match {
       case "mse" =>

From 830666f6fe1e77faa39eed2c1c3cd8e83bc93ef9 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 17 Jul 2015 14:08:06 -0700
Subject: [PATCH 215/549] [SPARK-8792] [ML] Add Python API for PCA transformer

Add Python API for PCA transformer

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7190 from yanboliang/spark-8792 and squashes the following commits:

8f4ac31 [Yanbo Liang] address comments
8a79cc0 [Yanbo Liang] Add Python API for PCA transformer
---
 python/pyspark/ml/feature.py | 64 +++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 9bca7cc000aa5..86e654dd0779f 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -24,7 +24,7 @@
 __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder',
            'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel',
            'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
-           'Word2Vec', 'Word2VecModel']
+           'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel']
 
 
 @inherit_doc
@@ -1048,6 +1048,68 @@ class Word2VecModel(JavaModel):
     """
 
 
+@inherit_doc
+class PCA(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    PCA trains a model to project vectors to a low-dimensional space using PCA.
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+    ...     (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+    ...     (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+    >>> df = sqlContext.createDataFrame(data,["features"])
+    >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features")
+    >>> model = pca.fit(df)
+    >>> model.transform(df).collect()[0].pca_features
+    DenseVector([1.648..., -4.013...])
+    """
+
+    # a placeholder to make it appear in the generated doc
+    k = Param(Params._dummy(), "k", "the number of principal components")
+
+    @keyword_only
+    def __init__(self, k=None, inputCol=None, outputCol=None):
+        """
+        __init__(self, k=None, inputCol=None, outputCol=None)
+        """
+        super(PCA, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid)
+        self.k = Param(self, "k", "the number of principal components")
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, k=None, inputCol=None, outputCol=None):
+        """
+        setParams(self, k=None, inputCol=None, outputCol=None)
+        Set params for this PCA.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setK(self, value):
+        """
+        Sets the value of :py:attr:`k`.
+        """
+        self._paramMap[self.k] = value
+        return self
+
+    def getK(self):
+        """
+        Gets the value of k or its default value.
+        """
+        return self.getOrDefault(self.k)
+
+    def _create_model(self, java_model):
+        return PCAModel(java_model)
+
+
+class PCAModel(JavaModel):
+    """
+    Model fitted by PCA.
+    """
+
+
 if __name__ == "__main__":
     import doctest
     from pyspark.context import SparkContext

From 8b8be1f5d698e796b96a92f1ed2c13162a90944e Mon Sep 17 00:00:00 2001
From: Bryan Cutler <bjcutler@us.ibm.com>
Date: Fri, 17 Jul 2015 14:10:16 -0700
Subject: [PATCH 216/549] [SPARK-7127] [MLLIB] Adding broadcast of model before
 prediction for ensembles

Broadcast of ensemble models in transformImpl before call to predict

Author: Bryan Cutler <bjcutler@us.ibm.com>

Closes #6300 from BryanCutler/bcast-ensemble-models-7127 and squashes the following commits:

86e73de [Bryan Cutler] [SPARK-7127] Replaced deprecated callUDF with udf
40a139d [Bryan Cutler] Merge branch 'master' into bcast-ensemble-models-7127
9afad56 [Bryan Cutler] [SPARK-7127] Simplified calls by overriding transformImpl and using broadcasted model in callUDF to make prediction
1f34be4 [Bryan Cutler] [SPARK-7127] Removed accidental newline
171a6ce [Bryan Cutler] [SPARK-7127] Used modelAccessor parameter in predictImpl to access broadcasted model
6fd153c [Bryan Cutler] [SPARK-7127] Applied broadcasting to remaining ensemble models
aaad77b [Bryan Cutler] [SPARK-7127] Removed abstract class for broadcasting model, instead passing a prediction function as param to transform
83904bb [Bryan Cutler] [SPARK-7127] Adding broadcast of model before prediction in RandomForestClassifier
---
 .../main/scala/org/apache/spark/ml/Predictor.scala   | 12 ++++++++----
 .../spark/ml/classification/GBTClassifier.scala      | 11 ++++++++++-
 .../ml/classification/RandomForestClassifier.scala   | 11 ++++++++++-
 .../apache/spark/ml/regression/GBTRegressor.scala    | 11 ++++++++++-
 .../spark/ml/regression/RandomForestRegressor.scala  | 11 ++++++++++-
 5 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
index 333b42711ec52..19fe039b8fd03 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
@@ -169,10 +169,7 @@ abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType,
   override def transform(dataset: DataFrame): DataFrame = {
     transformSchema(dataset.schema, logging = true)
     if ($(predictionCol).nonEmpty) {
-      val predictUDF = udf { (features: Any) =>
-        predict(features.asInstanceOf[FeaturesType])
-      }
-      dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol))))
+      transformImpl(dataset)
     } else {
       this.logWarning(s"$uid: Predictor.transform() was called as NOOP" +
         " since no output columns were set.")
@@ -180,6 +177,13 @@ abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType,
     }
   }
 
+  protected def transformImpl(dataset: DataFrame): DataFrame = {
+    val predictUDF = udf { (features: Any) =>
+      predict(features.asInstanceOf[FeaturesType])
+    }
+    dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol))))
+  }
+
   /**
    * Predict label for the given features.
    * This internal method is used to implement [[transform()]] and output [[predictionCol]].
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index 554e3b8e052b2..eb0b1a0a405fc 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -34,6 +34,8 @@ import org.apache.spark.mllib.tree.loss.{LogLoss => OldLogLoss, Loss => OldLoss}
 import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel => OldGBTModel}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.DoubleType
 
 /**
  * :: Experimental ::
@@ -177,8 +179,15 @@ final class GBTClassificationModel(
 
   override def treeWeights: Array[Double] = _treeWeights
 
+  override protected def transformImpl(dataset: DataFrame): DataFrame = {
+    val bcastModel = dataset.sqlContext.sparkContext.broadcast(this)
+    val predictUDF = udf { (features: Any) =>
+      bcastModel.value.predict(features.asInstanceOf[Vector])
+    }
+    dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol))))
+  }
+
   override protected def predict(features: Vector): Double = {
-    // TODO: Override transform() to broadcast model: SPARK-7127
     // TODO: When we add a generic Boosting class, handle transform there?  SPARK-7129
     // Classifies by thresholding sum of weighted tree predictions
     val treePredictions = _trees.map(_.rootNode.predict(features))
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 490f04c7c7172..fc0693f67cc2e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -31,6 +31,8 @@ import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.tree.model.{RandomForestModel => OldRandomForestModel}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.DoubleType
 
 /**
  * :: Experimental ::
@@ -143,8 +145,15 @@ final class RandomForestClassificationModel private[ml] (
 
   override def treeWeights: Array[Double] = _treeWeights
 
+  override protected def transformImpl(dataset: DataFrame): DataFrame = {
+    val bcastModel = dataset.sqlContext.sparkContext.broadcast(this)
+    val predictUDF = udf { (features: Any) =>
+      bcastModel.value.predict(features.asInstanceOf[Vector])
+    }
+    dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol))))
+  }
+
   override protected def predict(features: Vector): Double = {
-    // TODO: Override transform() to broadcast model.  SPARK-7127
     // TODO: When we add a generic Bagging class, handle transform there: SPARK-7128
     // Classifies using majority votes.
     // Ignore the weights since all are 1.0 for now.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index 47c110d027d67..e38dc73ee0ba7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -33,6 +33,8 @@ import org.apache.spark.mllib.tree.loss.{AbsoluteError => OldAbsoluteError, Loss
 import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel => OldGBTModel}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.DoubleType
 
 /**
  * :: Experimental ::
@@ -167,8 +169,15 @@ final class GBTRegressionModel(
 
   override def treeWeights: Array[Double] = _treeWeights
 
+  override protected def transformImpl(dataset: DataFrame): DataFrame = {
+    val bcastModel = dataset.sqlContext.sparkContext.broadcast(this)
+    val predictUDF = udf { (features: Any) =>
+      bcastModel.value.predict(features.asInstanceOf[Vector])
+    }
+    dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol))))
+  }
+
   override protected def predict(features: Vector): Double = {
-    // TODO: Override transform() to broadcast model. SPARK-7127
     // TODO: When we add a generic Boosting class, handle transform there?  SPARK-7129
     // Classifies by thresholding sum of weighted tree predictions
     val treePredictions = _trees.map(_.rootNode.predict(features))
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 5fd5c7c7bd3fc..506a878c2553b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -29,6 +29,8 @@ import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.tree.model.{RandomForestModel => OldRandomForestModel}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.DoubleType
 
 /**
  * :: Experimental ::
@@ -129,8 +131,15 @@ final class RandomForestRegressionModel private[ml] (
 
   override def treeWeights: Array[Double] = _treeWeights
 
+  override protected def transformImpl(dataset: DataFrame): DataFrame = {
+    val bcastModel = dataset.sqlContext.sparkContext.broadcast(this)
+    val predictUDF = udf { (features: Any) =>
+      bcastModel.value.predict(features.asInstanceOf[Vector])
+    }
+    dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol))))
+  }
+
   override protected def predict(features: Vector): Double = {
-    // TODO: Override transform() to broadcast model.  SPARK-7127
     // TODO: When we add a generic Bagging class, handle transform there.  SPARK-7128
     // Predict average of tree predictions.
     // Ignore the weights since all are 1.0 for now.

From 42d8a012f6652df1fa3f560f87c53731ea070640 Mon Sep 17 00:00:00 2001
From: Joshi <rekhajoshm@gmail.com>
Date: Fri, 17 Jul 2015 22:47:28 +0100
Subject: [PATCH 217/549] [SPARK-8593] [CORE] Sort app attempts by start time.

This makes sure attempts are listed in the order they were executed, and that the
app's state matches the state of the most current attempt.

Author: Joshi <rekhajoshm@gmail.com>
Author: Rekha Joshi <rekhajoshm@gmail.com>

Closes #7253 from rekhajoshm/SPARK-8593 and squashes the following commits:

874dd80 [Joshi] History Server: updated order for multiple attempts(logcleaner)
716e0b1 [Joshi] History Server: updated order for multiple attempts(descending start time works everytime)
548c753 [Joshi] History Server: updated order for multiple attempts(descending start time works everytime)
83306a8 [Joshi] History Server: updated order for multiple attempts(descending start time)
b0fc922 [Joshi] History Server: updated order for multiple attempts(updated comment)
cc0fda7 [Joshi] History Server: updated order for multiple attempts(updated test)
304cb0b [Joshi] History Server: updated order for multiple attempts(reverted HistoryPage)
85024e8 [Joshi] History Server: updated order for multiple attempts
a41ac4b [Joshi] History Server: updated order for multiple attempts
ab65fa1 [Joshi] History Server: some attempt completed to work with showIncomplete
0be142d [Rekha Joshi] Merge pull request #3 from apache/master
106fd8e [Rekha Joshi] Merge pull request #2 from apache/master
e3677c9 [Rekha Joshi] Merge pull request #1 from apache/master
---
 .../deploy/history/FsHistoryProvider.scala    | 10 +++-----
 .../history/FsHistoryProviderSuite.scala      | 24 +++++++++----------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
index 2cc465e55fceb..e3060ac3fa1a9 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -407,8 +407,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
 
   /**
    * Comparison function that defines the sort order for application attempts within the same
-   * application. Order is: running attempts before complete attempts, running attempts sorted
-   * by start time, completed attempts sorted by end time.
+   * application. Order is: attempts are sorted by descending start time.
+   * Most recent attempt state matches with current state of the app.
    *
    * Normally applications should have a single running attempt; but failure to call sc.stop()
    * may cause multiple running attempts to show up.
@@ -418,11 +418,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
   private def compareAttemptInfo(
       a1: FsApplicationAttemptInfo,
       a2: FsApplicationAttemptInfo): Boolean = {
-    if (a1.completed == a2.completed) {
-      if (a1.completed) a1.endTime >= a2.endTime else a1.startTime >= a2.startTime
-    } else {
-      !a1.completed
-    }
+    a1.startTime >= a2.startTime
   }
 
   /**
diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
index 2a62450bcdbad..73cff89544dc3 100644
--- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
@@ -243,13 +243,12 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
     appListAfterRename.size should be (1)
   }
 
-  test("apps with multiple attempts") {
+  test("apps with multiple attempts with order") {
     val provider = new FsHistoryProvider(createTestConf())
 
-    val attempt1 = newLogFile("app1", Some("attempt1"), inProgress = false)
+    val attempt1 = newLogFile("app1", Some("attempt1"), inProgress = true)
     writeFile(attempt1, true, None,
-      SparkListenerApplicationStart("app1", Some("app1"), 1L, "test", Some("attempt1")),
-      SparkListenerApplicationEnd(2L)
+      SparkListenerApplicationStart("app1", Some("app1"), 1L, "test", Some("attempt1"))
       )
 
     updateAndCheck(provider) { list =>
@@ -259,7 +258,7 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
 
     val attempt2 = newLogFile("app1", Some("attempt2"), inProgress = true)
     writeFile(attempt2, true, None,
-      SparkListenerApplicationStart("app1", Some("app1"), 3L, "test", Some("attempt2"))
+      SparkListenerApplicationStart("app1", Some("app1"), 2L, "test", Some("attempt2"))
       )
 
     updateAndCheck(provider) { list =>
@@ -268,22 +267,21 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
       list.head.attempts.head.attemptId should be (Some("attempt2"))
     }
 
-    val completedAttempt2 = newLogFile("app1", Some("attempt2"), inProgress = false)
-    attempt2.delete()
-    writeFile(attempt2, true, None,
-      SparkListenerApplicationStart("app1", Some("app1"), 3L, "test", Some("attempt2")),
+    val attempt3 = newLogFile("app1", Some("attempt3"), inProgress = false)
+    writeFile(attempt3, true, None,
+      SparkListenerApplicationStart("app1", Some("app1"), 3L, "test", Some("attempt3")),
       SparkListenerApplicationEnd(4L)
       )
 
     updateAndCheck(provider) { list =>
       list should not be (null)
       list.size should be (1)
-      list.head.attempts.size should be (2)
-      list.head.attempts.head.attemptId should be (Some("attempt2"))
+      list.head.attempts.size should be (3)
+      list.head.attempts.head.attemptId should be (Some("attempt3"))
     }
 
     val app2Attempt1 = newLogFile("app2", Some("attempt1"), inProgress = false)
-    writeFile(attempt2, true, None,
+    writeFile(attempt1, true, None,
       SparkListenerApplicationStart("app2", Some("app2"), 5L, "test", Some("attempt1")),
       SparkListenerApplicationEnd(6L)
       )
@@ -291,7 +289,7 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
     updateAndCheck(provider) { list =>
       list.size should be (2)
       list.head.attempts.size should be (1)
-      list.last.attempts.size should be (2)
+      list.last.attempts.size should be (3)
       list.head.attempts.head.attemptId should be (Some("attempt1"))
 
       list.foreach { case app =>

From b2aa490bb60176631c94ecadf87c14564960f12c Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 17 Jul 2015 15:02:13 -0700
Subject: [PATCH 218/549] [SPARK-9142] [SQL] Removing unnecessary self types in
 Catalyst.

Just a small change to add Product type to the base expression/plan abstract classes, based on suggestions on #7434 and offline discussions.

Author: Reynold Xin <rxin@databricks.com>

Closes #7479 from rxin/remove-self-types and squashes the following commits:

e407ffd [Reynold Xin] [SPARK-9142][SQL] Removing unnecessary self types in Catalyst.
---
 .../apache/spark/sql/catalyst/analysis/unresolved.scala  | 1 -
 .../spark/sql/catalyst/expressions/Expression.scala      | 7 +------
 .../spark/sql/catalyst/expressions/aggregates.scala      | 3 ---
 .../spark/sql/catalyst/expressions/arithmetic.scala      | 1 -
 .../spark/sql/catalyst/expressions/conditionals.scala    | 1 -
 .../spark/sql/catalyst/expressions/generators.scala      | 2 +-
 .../org/apache/spark/sql/catalyst/expressions/math.scala | 5 ++---
 .../sql/catalyst/expressions/namedExpressions.scala      | 4 ++--
 .../spark/sql/catalyst/expressions/predicates.scala      | 3 ---
 .../apache/spark/sql/catalyst/expressions/random.scala   | 1 -
 .../sql/catalyst/expressions/windowExpressions.scala     | 2 --
 .../spark/sql/catalyst/plans/logical/LogicalPlan.scala   | 9 +--------
 .../sql/catalyst/plans/logical/basicOperators.scala      | 2 +-
 .../spark/sql/catalyst/plans/logical/partitioning.scala  | 2 --
 .../scala/org/apache/spark/sql/execution/SparkPlan.scala | 9 +--------
 .../scala/org/apache/spark/sql/execution/commands.scala  | 2 --
 .../org/apache/spark/sql/parquet/ParquetRelation.scala   | 2 --
 .../org/apache/spark/sql/hive/HiveMetastoreCatalog.scala | 2 --
 18 files changed, 9 insertions(+), 49 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index 7089f079b6dde..4a1a1ed61ebe7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -96,7 +96,6 @@ case class UnresolvedFunction(name: String, children: Seq[Expression]) extends E
  * "SELECT * FROM ...". A [[Star]] gets automatically expanded during analysis.
  */
 abstract class Star extends LeafExpression with NamedExpression {
-  self: Product =>
 
   override def name: String = throw new UnresolvedException(this, "name")
   override def exprId: ExprId = throw new UnresolvedException(this, "exprId")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index f396bd08a8238..c70b5af4aa448 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -43,8 +43,7 @@ import org.apache.spark.sql.types._
  *
  * See [[Substring]] for an example.
  */
-abstract class Expression extends TreeNode[Expression] {
-  self: Product =>
+abstract class Expression extends TreeNode[Expression] with Product {
 
   /**
    * Returns true when an expression is a candidate for static evaluation before the query is
@@ -187,7 +186,6 @@ abstract class Expression extends TreeNode[Expression] {
  * A leaf expression, i.e. one without any child expressions.
  */
 abstract class LeafExpression extends Expression {
-  self: Product =>
 
   def children: Seq[Expression] = Nil
 }
@@ -198,7 +196,6 @@ abstract class LeafExpression extends Expression {
  * if the input is evaluated to null.
  */
 abstract class UnaryExpression extends Expression {
-  self: Product =>
 
   def child: Expression
 
@@ -277,7 +274,6 @@ abstract class UnaryExpression extends Expression {
  * if any input is evaluated to null.
  */
 abstract class BinaryExpression extends Expression {
-  self: Product =>
 
   def left: Expression
   def right: Expression
@@ -370,7 +366,6 @@ abstract class BinaryExpression extends Expression {
  *    the analyzer will find the tightest common type and do the proper type casting.
  */
 abstract class BinaryOperator extends BinaryExpression with ExpectsInputTypes {
-  self: Product =>
 
   /**
    * Expected input type from both left/right child expressions, similar to the
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index 71c943dc79e9e..af9a674ab4958 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -27,7 +27,6 @@ import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
 
 trait AggregateExpression extends Expression {
-  self: Product =>
 
   /**
    * Aggregate expressions should not be foldable.
@@ -65,7 +64,6 @@ case class SplitEvaluation(
  * These partial evaluations can then be combined to compute the actual answer.
  */
 trait PartialAggregate extends AggregateExpression {
-  self: Product =>
 
   /**
    * Returns a [[SplitEvaluation]] that computes this aggregation using partial aggregation.
@@ -79,7 +77,6 @@ trait PartialAggregate extends AggregateExpression {
  */
 abstract class AggregateFunction
   extends LeafExpression with AggregateExpression with Serializable {
-  self: Product =>
 
   /** Base should return the generic aggregate expression that this function is computing */
   val base: AggregateExpression
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 1616d1bc0aed5..c5960eb390ea4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -77,7 +77,6 @@ case class Abs(child: Expression) extends UnaryExpression with ExpectsInputTypes
 }
 
 abstract class BinaryArithmetic extends BinaryOperator {
-  self: Product =>
 
   override def dataType: DataType = left.dataType
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index 9162b73fe56eb..15b33da884dcb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -77,7 +77,6 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi
 }
 
 trait CaseWhenLike extends Expression {
-  self: Product =>
 
   // Note that `branches` are considered in consecutive pairs (cond, val), and the optional last
   // element is the value for the default catch-all case (if provided).
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index 51dc77ee3fc5f..c58a6d36141c1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -40,7 +40,7 @@ import org.apache.spark.sql.types._
  * requested.  The attributes produced by this function will be automatically copied anytime rules
  * result in changes to the Generator or its children.
  */
-trait Generator extends Expression { self: Product =>
+trait Generator extends Expression {
 
   // TODO ideally we should return the type of ArrayType(StructType),
   // however, we don't keep the output field names in the Generator.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 7a543ff36afd1..b05a7b3ed0ea4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -34,7 +34,6 @@ import org.apache.spark.unsafe.types.UTF8String
  */
 abstract class LeafMathExpression(c: Double, name: String)
   extends LeafExpression with Serializable {
-  self: Product =>
 
   override def dataType: DataType = DoubleType
   override def foldable: Boolean = true
@@ -58,7 +57,7 @@ abstract class LeafMathExpression(c: Double, name: String)
  * @param name The short name of the function
  */
 abstract class UnaryMathExpression(f: Double => Double, name: String)
-  extends UnaryExpression with Serializable with ImplicitCastInputTypes { self: Product =>
+  extends UnaryExpression with Serializable with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[DataType] = Seq(DoubleType)
   override def dataType: DataType = DoubleType
@@ -92,7 +91,7 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
  * @param name The short name of the function
  */
 abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
-  extends BinaryExpression with Serializable with ImplicitCastInputTypes { self: Product =>
+  extends BinaryExpression with Serializable with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[DataType] = Seq(DoubleType, DoubleType)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index 8bf7a7ce4e647..c083ac08ded21 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -40,7 +40,7 @@ case class ExprId(id: Long)
 /**
  * An [[Expression]] that is named.
  */
-trait NamedExpression extends Expression { self: Product =>
+trait NamedExpression extends Expression {
 
   /** We should never fold named expressions in order to not remove the alias. */
   override def foldable: Boolean = false
@@ -83,7 +83,7 @@ trait NamedExpression extends Expression { self: Product =>
     }
 }
 
-abstract class Attribute extends LeafExpression with NamedExpression { self: Product =>
+abstract class Attribute extends LeafExpression with NamedExpression {
 
   override def references: AttributeSet = AttributeSet(this)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index aa6c30e2f79f2..7a6fb2b3788ca 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -38,8 +38,6 @@ object InterpretedPredicate {
  * An [[Expression]] that returns a boolean value.
  */
 trait Predicate extends Expression {
-  self: Product =>
-
   override def dataType: DataType = BooleanType
 }
 
@@ -222,7 +220,6 @@ case class Or(left: Expression, right: Expression) extends BinaryOperator with P
 
 
 abstract class BinaryComparison extends BinaryOperator with Predicate {
-  self: Product =>
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     if (ctx.isPrimitiveType(left.dataType)) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index e10ba55396664..65093dc72264b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -33,7 +33,6 @@ import org.apache.spark.util.random.XORShiftRandom
  * Since this expression is stateful, it cannot be a case object.
  */
 abstract class RDG(seed: Long) extends LeafExpression with Serializable {
-  self: Product =>
 
   /**
    * Record ID within each partition. By being transient, the Random Number Generator is
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
index 344361685853f..c8aa571df64fc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -254,8 +254,6 @@ object SpecifiedWindowFrame {
  * to retrieve value corresponding with these n rows.
  */
 trait WindowFunction extends Expression {
-  self: Product =>
-
   def init(): Unit
 
   def reset(): Unit
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index adac37231cc4a..dd6c5d43f5714 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -25,8 +25,7 @@ import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.trees.TreeNode
 
 
-abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
-  self: Product =>
+abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging with Product{
 
   /**
    * Computes [[Statistics]] for this plan. The default implementation assumes the output
@@ -277,8 +276,6 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
  * A logical plan node with no children.
  */
 abstract class LeafNode extends LogicalPlan {
-  self: Product =>
-
   override def children: Seq[LogicalPlan] = Nil
 }
 
@@ -286,8 +283,6 @@ abstract class LeafNode extends LogicalPlan {
  * A logical plan node with single child.
  */
 abstract class UnaryNode extends LogicalPlan {
-  self: Product =>
-
   def child: LogicalPlan
 
   override def children: Seq[LogicalPlan] = child :: Nil
@@ -297,8 +292,6 @@ abstract class UnaryNode extends LogicalPlan {
  * A logical plan node with a left and right child.
  */
 abstract class BinaryNode extends LogicalPlan {
-  self: Product =>
-
   def left: LogicalPlan
   def right: LogicalPlan
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index fae339808c233..fbe104db016d6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -298,7 +298,7 @@ case class Expand(
 }
 
 trait GroupingAnalytics extends UnaryNode {
-  self: Product =>
+
   def groupByExprs: Seq[Expression]
   def aggregations: Seq[NamedExpression]
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/partitioning.scala
index 63df2c1ee72ff..1f76b03bcb0f6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/partitioning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/partitioning.scala
@@ -24,8 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, SortOrd
  * result have expectations about the distribution and ordering of partitioned input data.
  */
 abstract class RedistributeData extends UnaryNode {
-  self: Product =>
-
   override def output: Seq[Attribute] = child.output
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index 632f633d82a2e..ba12056ee7a1b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -39,8 +39,7 @@ object SparkPlan {
  * :: DeveloperApi ::
  */
 @DeveloperApi
-abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializable {
-  self: Product =>
+abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Product with Serializable {
 
   /**
    * A handle to the SQL Context that was used to create this plan.   Since many operators need
@@ -239,14 +238,10 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
 }
 
 private[sql] trait LeafNode extends SparkPlan {
-  self: Product =>
-
   override def children: Seq[SparkPlan] = Nil
 }
 
 private[sql] trait UnaryNode extends SparkPlan {
-  self: Product =>
-
   def child: SparkPlan
 
   override def children: Seq[SparkPlan] = child :: Nil
@@ -255,8 +250,6 @@ private[sql] trait UnaryNode extends SparkPlan {
 }
 
 private[sql] trait BinaryNode extends SparkPlan {
-  self: Product =>
-
   def left: SparkPlan
   def right: SparkPlan
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
index 5e9951f248ff2..bace3f8a9c8d4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
@@ -35,8 +35,6 @@ import org.apache.spark.sql.{DataFrame, Row, SQLConf, SQLContext}
  * wrapped in `ExecutedCommand` during execution.
  */
 private[sql] trait RunnableCommand extends LogicalPlan with logical.Command {
-  self: Product =>
-
   override def output: Seq[Attribute] = Seq.empty
   override def children: Seq[LogicalPlan] = Seq.empty
   def run(sqlContext: SQLContext): Seq[Row]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
index e0bea65a15f36..086559e9f7658 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -54,8 +54,6 @@ private[sql] case class ParquetRelation(
     partitioningAttributes: Seq[Attribute] = Nil)
   extends LeafNode with MultiInstanceRelation {
 
-  self: Product =>
-
   /** Schema derived from ParquetFile */
   def parquetSchema: MessageType =
     ParquetTypesConverter
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 4b7a782c805a0..6589bc6ea2921 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -596,8 +596,6 @@ private[hive] case class MetastoreRelation
     (@transient sqlContext: SQLContext)
   extends LeafNode with MultiInstanceRelation {
 
-  self: Product =>
-
   override def equals(other: Any): Boolean = other match {
     case relation: MetastoreRelation =>
       databaseName == relation.databaseName &&

From 15fc2ffe5530c43c64cfc37f2d1ce83f04ce3bd9 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Fri, 17 Jul 2015 15:49:31 -0700
Subject: [PATCH 219/549] [SPARK-9080][SQL] add isNaN predicate expression

JIRA: https://issues.apache.org/jira/browse/SPARK-9080

cc rxin

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7464 from yijieshen/isNaN and squashes the following commits:

11ae039 [Yijie Shen] add isNaN in functions
666718e [Yijie Shen] add isNaN predicate expression
---
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../sql/catalyst/expressions/predicates.scala | 50 +++++++++++++++++++
 .../catalyst/expressions/PredicateSuite.scala | 12 ++++-
 .../scala/org/apache/spark/sql/Column.scala   |  8 +++
 .../org/apache/spark/sql/functions.scala      | 10 +++-
 .../spark/sql/ColumnExpressionSuite.scala     | 21 ++++++++
 6 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index a45181712dbdf..7bb2579506a8a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -79,6 +79,7 @@ object FunctionRegistry {
     expression[Explode]("explode"),
     expression[Greatest]("greatest"),
     expression[If]("if"),
+    expression[IsNaN]("isnan"),
     expression[IsNull]("isnull"),
     expression[IsNotNull]("isnotnull"),
     expression[Least]("least"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 7a6fb2b3788ca..2751c8e75f357 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -120,6 +120,56 @@ case class InSet(child: Expression, hset: Set[Any])
   }
 }
 
+/**
+ * Evaluates to `true` if it's NaN or null
+ */
+case class IsNaN(child: Expression) extends UnaryExpression
+    with Predicate with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(DoubleType, FloatType))
+
+  override def nullable: Boolean = false
+
+  override def eval(input: InternalRow): Any = {
+    val value = child.eval(input)
+    if (value == null) {
+      true
+    } else {
+      child.dataType match {
+        case DoubleType => value.asInstanceOf[Double].isNaN
+        case FloatType => value.asInstanceOf[Float].isNaN
+      }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val eval = child.gen(ctx)
+    child.dataType match {
+      case FloatType =>
+        s"""
+          ${eval.code}
+          boolean ${ev.isNull} = false;
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (${eval.isNull}) {
+            ${ev.primitive} = true;
+          } else {
+            ${ev.primitive} = Float.isNaN(${eval.primitive});
+          }
+        """
+      case DoubleType =>
+        s"""
+          ${eval.code}
+          boolean ${ev.isNull} = false;
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (${eval.isNull}) {
+            ${ev.primitive} = true;
+          } else {
+            ${ev.primitive} = Double.isNaN(${eval.primitive});
+          }
+        """
+    }
+  }
+}
 
 case class And(left: Expression, right: Expression) extends BinaryOperator with Predicate {
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
index 188ecef9e7679..052abc51af5fd 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -21,7 +21,7 @@ import scala.collection.immutable.HashSet
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.{Decimal, IntegerType, BooleanType}
+import org.apache.spark.sql.types.{Decimal, DoubleType, IntegerType, BooleanType}
 
 
 class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -116,6 +116,16 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
       true)
   }
 
+  test("IsNaN") {
+    checkEvaluation(IsNaN(Literal(Double.NaN)), true)
+    checkEvaluation(IsNaN(Literal(Float.NaN)), true)
+    checkEvaluation(IsNaN(Literal(math.log(-3))), true)
+    checkEvaluation(IsNaN(Literal.create(null, DoubleType)), true)
+    checkEvaluation(IsNaN(Literal(Double.PositiveInfinity)), false)
+    checkEvaluation(IsNaN(Literal(Float.MaxValue)), false)
+    checkEvaluation(IsNaN(Literal(5.5f)), false)
+  }
+
   test("INSET") {
     val hS = HashSet[Any]() + 1 + 2
     val nS = HashSet[Any]() + 1 + 2 + null
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index 10250264625b2..221cd04c6d288 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -400,6 +400,14 @@ class Column(protected[sql] val expr: Expression) extends Logging {
     (this >= lowerBound) && (this <= upperBound)
   }
 
+  /**
+   * True if the current expression is NaN or null
+   *
+   * @group expr_ops
+   * @since 1.5.0
+   */
+  def isNaN: Column = IsNaN(expr)
+
   /**
    * True if the current expression is null.
    *
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index fe511c296cfd2..b56fd9a71b321 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -620,7 +620,15 @@ object functions {
   def explode(e: Column): Column = Explode(e.expr)
 
   /**
-   * Converts a string exprsesion to lower case.
+   * Return true if the column is NaN or null
+   *
+   * @group normal_funcs
+   * @since 1.5.0
+   */
+  def isNaN(e: Column): Column = IsNaN(e.expr)
+
+  /**
+   * Converts a string expression to lower case.
    *
    * @group normal_funcs
    * @since 1.3.0
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index 88bb743ab0bc9..8f15479308391 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -201,6 +201,27 @@ class ColumnExpressionSuite extends QueryTest {
       Row(false, true))
   }
 
+  test("isNaN") {
+    val testData = ctx.createDataFrame(ctx.sparkContext.parallelize(
+      Row(Double.NaN, Float.NaN) ::
+      Row(math.log(-1), math.log(-3).toFloat) ::
+      Row(null, null) ::
+      Row(Double.MaxValue, Float.MinValue):: Nil),
+      StructType(Seq(StructField("a", DoubleType), StructField("b", FloatType))))
+
+    checkAnswer(
+      testData.select($"a".isNaN, $"b".isNaN),
+      Row(true, true) :: Row(true, true) :: Row(true, true) :: Row(false, false) :: Nil)
+
+    checkAnswer(
+      testData.select(isNaN($"a"), isNaN($"b")),
+      Row(true, true) :: Row(true, true) :: Row(true, true) :: Row(false, false) :: Nil)
+
+    checkAnswer(
+      ctx.sql("select isnan(15), isnan('invalid')"),
+      Row(false, true))
+  }
+
   test("===") {
     checkAnswer(
       testData2.filter($"a" === 1),

From fd6b3101fbb0a8c3ebcf89ce9b4e8664406d9869 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Fri, 17 Jul 2015 16:03:33 -0700
Subject: [PATCH 220/549] [SPARK-9113] [SQL] enable analysis check code for
 self join

The check was unreachable before, as `case operator: LogicalPlan` catches everything already.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7449 from cloud-fan/tmp and squashes the following commits:

2bb6637 [Wenchen Fan] add test
5493aea [Wenchen Fan] add the check back
27221a7 [Wenchen Fan] remove unnecessary analysis check code for self join
---
 .../sql/catalyst/analysis/Analyzer.scala      |  2 +-
 .../sql/catalyst/analysis/CheckAnalysis.scala | 28 +++++++++----------
 .../plans/logical/basicOperators.scala        |  6 ++--
 .../analysis/AnalysisErrorSuite.scala         | 14 ++++++++--
 4 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index df8e7f2381fbd..e58f3f64947f3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -316,7 +316,7 @@ class Analyzer(
         )
 
       // Special handling for cases when self-join introduce duplicate expression ids.
-      case j @ Join(left, right, _, _) if left.outputSet.intersect(right.outputSet).nonEmpty =>
+      case j @ Join(left, right, _, _) if !j.selfJoinResolved =>
         val conflictingAttributes = left.outputSet.intersect(right.outputSet)
         logDebug(s"Conflicting attributes ${conflictingAttributes.mkString(",")} in $j")
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 476ac2b7cb474..c7f9713344c50 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -109,29 +109,27 @@ trait CheckAnalysis {
               s"resolved attribute(s) $missingAttributes missing from $input " +
                 s"in operator ${operator.simpleString}")
 
-          case o if !o.resolved =>
-            failAnalysis(
-              s"unresolved operator ${operator.simpleString}")
-
           case p @ Project(exprs, _) if containsMultipleGenerators(exprs) =>
             failAnalysis(
               s"""Only a single table generating function is allowed in a SELECT clause, found:
                  | ${exprs.map(_.prettyString).mkString(",")}""".stripMargin)
 
+          // Special handling for cases when self-join introduce duplicate expression ids.
+          case j @ Join(left, right, _, _) if left.outputSet.intersect(right.outputSet).nonEmpty =>
+            val conflictingAttributes = left.outputSet.intersect(right.outputSet)
+            failAnalysis(
+              s"""
+                 |Failure when resolving conflicting references in Join:
+                 |$plan
+                  |Conflicting attributes: ${conflictingAttributes.mkString(",")}
+                  |""".stripMargin)
+
+          case o if !o.resolved =>
+            failAnalysis(
+              s"unresolved operator ${operator.simpleString}")
 
           case _ => // Analysis successful!
         }
-
-      // Special handling for cases when self-join introduce duplicate expression ids.
-      case j @ Join(left, right, _, _) if left.outputSet.intersect(right.outputSet).nonEmpty =>
-        val conflictingAttributes = left.outputSet.intersect(right.outputSet)
-        failAnalysis(
-          s"""
-             |Failure when resolving conflicting references in Join:
-             |$plan
-             |Conflicting attributes: ${conflictingAttributes.mkString(",")}
-             |""".stripMargin)
-
     }
     extendedCheckRules.foreach(_(plan))
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index fbe104db016d6..17a91247327f7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -123,11 +123,11 @@ case class Join(
     }
   }
 
-  private def selfJoinResolved: Boolean = left.outputSet.intersect(right.outputSet).isEmpty
+  def selfJoinResolved: Boolean = left.outputSet.intersect(right.outputSet).isEmpty
 
-  // Joins are only resolved if they don't introduce ambiguious expression ids.
+  // Joins are only resolved if they don't introduce ambiguous expression ids.
   override lazy val resolved: Boolean = {
-    childrenResolved && !expressions.exists(!_.resolved) && selfJoinResolved
+    childrenResolved && expressions.forall(_.resolved) && selfJoinResolved
   }
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
index f0f17103991ef..2147d07e09bd3 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -23,10 +23,11 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.types._
-import org.apache.spark.sql.catalyst.{InternalRow, SimpleCatalystConf}
+import org.apache.spark.sql.catalyst.plans.Inner
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.types._
 
 case class TestFunction(
     children: Seq[Expression],
@@ -164,4 +165,13 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter {
 
     assert(message.contains("resolved attribute(s) a#1 missing from a#2"))
   }
+
+  test("error test for self-join") {
+    val join = Join(testRelation, testRelation, Inner, None)
+    val error = intercept[AnalysisException] {
+      SimpleAnalyzer.checkAnalysis(join)
+    }
+    error.message.contains("Failure when resolving conflicting references in Join")
+    error.message.contains("Conflicting attributes")
+  }
 }

From bd903ee89f1d1bc4daf63f1f07958cb86d667e1e Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Fri, 17 Jul 2015 16:28:24 -0700
Subject: [PATCH 221/549] [SPARK-9117] [SQL] fix BooleanSimplification in
 case-insensitive

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7452 from cloud-fan/boolean-simplify and squashes the following commits:

2a6e692 [Wenchen Fan] fix style
d3cfd26 [Wenchen Fan] fix BooleanSimplification in case-insensitive
---
 .../sql/catalyst/optimizer/Optimizer.scala    | 28 +++++-----
 .../BooleanSimplificationSuite.scala          | 55 +++++++++----------
 2 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index d5beeec0ffac1..0f28a0d2c8fff 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -393,26 +393,26 @@ object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper {
         // (a || b) && (a || c)  =>  a || (b && c)
         case _ =>
           // 1. Split left and right to get the disjunctive predicates,
-          //   i.e. lhsSet = (a, b), rhsSet = (a, c)
+          //   i.e. lhs = (a, b), rhs = (a, c)
           // 2. Find the common predict between lhsSet and rhsSet, i.e. common = (a)
           // 3. Remove common predict from lhsSet and rhsSet, i.e. ldiff = (b), rdiff = (c)
           // 4. Apply the formula, get the optimized predicate: common || (ldiff && rdiff)
-          val lhsSet = splitDisjunctivePredicates(left).toSet
-          val rhsSet = splitDisjunctivePredicates(right).toSet
-          val common = lhsSet.intersect(rhsSet)
+          val lhs = splitDisjunctivePredicates(left)
+          val rhs = splitDisjunctivePredicates(right)
+          val common = lhs.filter(e => rhs.exists(e.semanticEquals(_)))
           if (common.isEmpty) {
             // No common factors, return the original predicate
             and
           } else {
-            val ldiff = lhsSet.diff(common)
-            val rdiff = rhsSet.diff(common)
+            val ldiff = lhs.filterNot(e => common.exists(e.semanticEquals(_)))
+            val rdiff = rhs.filterNot(e => common.exists(e.semanticEquals(_)))
             if (ldiff.isEmpty || rdiff.isEmpty) {
               // (a || b || c || ...) && (a || b) => (a || b)
               common.reduce(Or)
             } else {
               // (a || b || c || ...) && (a || b || d || ...) =>
               // ((c || ...) && (d || ...)) || a || b
-              (common + And(ldiff.reduce(Or), rdiff.reduce(Or))).reduce(Or)
+              (common :+ And(ldiff.reduce(Or), rdiff.reduce(Or))).reduce(Or)
             }
           }
       }  // end of And(left, right)
@@ -431,26 +431,26 @@ object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper {
         // (a && b) || (a && c)  =>  a && (b || c)
         case _ =>
            // 1. Split left and right to get the conjunctive predicates,
-           //   i.e.  lhsSet = (a, b), rhsSet = (a, c)
+           //   i.e.  lhs = (a, b), rhs = (a, c)
            // 2. Find the common predict between lhsSet and rhsSet, i.e. common = (a)
            // 3. Remove common predict from lhsSet and rhsSet, i.e. ldiff = (b), rdiff = (c)
            // 4. Apply the formula, get the optimized predicate: common && (ldiff || rdiff)
-          val lhsSet = splitConjunctivePredicates(left).toSet
-          val rhsSet = splitConjunctivePredicates(right).toSet
-          val common = lhsSet.intersect(rhsSet)
+          val lhs = splitConjunctivePredicates(left)
+          val rhs = splitConjunctivePredicates(right)
+          val common = lhs.filter(e => rhs.exists(e.semanticEquals(_)))
           if (common.isEmpty) {
             // No common factors, return the original predicate
             or
           } else {
-            val ldiff = lhsSet.diff(common)
-            val rdiff = rhsSet.diff(common)
+            val ldiff = lhs.filterNot(e => common.exists(e.semanticEquals(_)))
+            val rdiff = rhs.filterNot(e => common.exists(e.semanticEquals(_)))
             if (ldiff.isEmpty || rdiff.isEmpty) {
               // (a && b) || (a && b && c && ...) => a && b
               common.reduce(And)
             } else {
               // (a && b && c && ...) || (a && b && d && ...) =>
               // ((c && ...) || (d && ...)) && a && b
-              (common + Or(ldiff.reduce(And), rdiff.reduce(And))).reduce(And)
+              (common :+ Or(ldiff.reduce(And), rdiff.reduce(And))).reduce(And)
             }
           }
       }  // end of Or(left, right)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala
index 465a5e6914204..d4916ea8d273a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
+import org.apache.spark.sql.catalyst.analysis.{AnalysisSuite, EliminateSubQueries}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.plans.PlanTest
@@ -40,29 +40,11 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper {
 
   val testRelation = LocalRelation('a.int, 'b.int, 'c.int, 'd.string)
 
-  // The `foldLeft` is required to handle cases like comparing `a && (b && c)` and `(a && b) && c`
-  def compareConditions(e1: Expression, e2: Expression): Boolean = (e1, e2) match {
-    case (lhs: And, rhs: And) =>
-      val lhsSet = splitConjunctivePredicates(lhs).toSet
-      val rhsSet = splitConjunctivePredicates(rhs).toSet
-      lhsSet.foldLeft(rhsSet) { (set, e) =>
-        set.find(compareConditions(_, e)).map(set - _).getOrElse(set)
-      }.isEmpty
-
-    case (lhs: Or, rhs: Or) =>
-      val lhsSet = splitDisjunctivePredicates(lhs).toSet
-      val rhsSet = splitDisjunctivePredicates(rhs).toSet
-      lhsSet.foldLeft(rhsSet) { (set, e) =>
-        set.find(compareConditions(_, e)).map(set - _).getOrElse(set)
-      }.isEmpty
-
-    case (l, r) => l == r
-  }
-
-  def checkCondition(input: Expression, expected: Expression): Unit = {
+  private def checkCondition(input: Expression, expected: Expression): Unit = {
     val plan = testRelation.where(input).analyze
-    val actual = Optimize.execute(plan).expressions.head
-    compareConditions(actual, expected)
+    val actual = Optimize.execute(plan)
+    val correctAnswer = testRelation.where(expected).analyze
+    comparePlans(actual, correctAnswer)
   }
 
   test("a && a => a") {
@@ -86,10 +68,8 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper {
       ('a === 'b && 'c < 1 && 'a === 5) ||
       ('a === 'b && 'b < 5 && 'a > 1)
 
-    val expected =
-      (((('b > 3) && ('c > 2)) ||
-        (('c < 1) && ('a === 5))) ||
-        (('b < 5) && ('a > 1))) && ('a === 'b)
+    val expected = 'a === 'b && (
+      ('b > 3 && 'c > 2) || ('c < 1 && 'a === 5) || ('b < 5 && 'a > 1))
 
     checkCondition(input, expected)
   }
@@ -101,10 +81,27 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper {
 
     checkCondition('a < 2 && ('a < 2 || 'a > 3 || 'b > 5) , 'a < 2)
 
-    checkCondition(('a < 2 || 'b > 3) && ('a < 2 || 'c > 5), ('b > 3 && 'c > 5) || 'a < 2)
+    checkCondition(('a < 2 || 'b > 3) && ('a < 2 || 'c > 5), 'a < 2 || ('b > 3 && 'c > 5))
 
     checkCondition(
       ('a === 'b || 'b > 3) && ('a === 'b || 'a > 3) && ('a === 'b || 'a < 5),
-      ('b > 3 && 'a > 3 && 'a < 5) || 'a === 'b)
+      ('a === 'b || 'b > 3 && 'a > 3 && 'a < 5))
+  }
+
+  private def caseInsensitiveAnalyse(plan: LogicalPlan) =
+    AnalysisSuite.caseInsensitiveAnalyzer.execute(plan)
+
+  test("(a && b) || (a && c) => a && (b || c) when case insensitive") {
+    val plan = caseInsensitiveAnalyse(testRelation.where(('a > 2 && 'b > 3) || ('A > 2 && 'b < 5)))
+    val actual = Optimize.execute(plan)
+    val expected = caseInsensitiveAnalyse(testRelation.where('a > 2 && ('b > 3 || 'b < 5)))
+    comparePlans(actual, expected)
+  }
+
+  test("(a || b) && (a || c) => a || (b && c) when case insensitive") {
+    val plan = caseInsensitiveAnalyse(testRelation.where(('a > 2 || 'b > 3) && ('A > 2 || 'b < 5)))
+    val actual = Optimize.execute(plan)
+    val expected = caseInsensitiveAnalyse(testRelation.where('a > 2 || ('b > 3 && 'b < 5)))
+    comparePlans(actual, expected)
   }
 }

From b13ef7723f254c10c685b93eb8dc08a52527ec73 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Fri, 17 Jul 2015 16:43:18 -0700
Subject: [PATCH 222/549] [SPARK-9030] [STREAMING] Add Kinesis.createStream
 unit tests that actual sends data

Current Kinesis unit tests do not test createStream by sending data. This PR is to add such unit test. Note that this unit will not run by default. It will only run when the relevant environment variables are set.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #7413 from tdas/kinesis-tests and squashes the following commits:

0e16db5 [Tathagata Das] Added more comments regarding testOrIgnore
1ea5ce0 [Tathagata Das] Added more comments
c7caef7 [Tathagata Das] Address comments
a297b59 [Tathagata Das] Reverted unnecessary change in KafkaStreamSuite
90c9bde [Tathagata Das] Removed scalatest.FunSuite
deb7f4f [Tathagata Das] Removed scalatest.FunSuite
18c2208 [Tathagata Das] Changed how SparkFunSuite is inherited
dbb33a5 [Tathagata Das] Added license
88f6dab [Tathagata Das] Added scala docs
c6be0d7 [Tathagata Das] minor changes
24a992b [Tathagata Das] Moved KinesisTestUtils to src instead of test for future python usage
465b55d [Tathagata Das] Made unit tests optional in a nice way
4d70703 [Tathagata Das] Added license
129d436 [Tathagata Das] Minor updates
cc36510 [Tathagata Das] Added KinesisStreamSuite
---
 .../streaming/kinesis/KinesisTestUtils.scala  | 197 ++++++++++++++++++
 .../streaming/kinesis/KinesisFunSuite.scala   |  37 ++++
 .../kinesis/KinesisReceiverSuite.scala        |  17 --
 .../kinesis/KinesisStreamSuite.scala          | 120 +++++++++++
 4 files changed, 354 insertions(+), 17 deletions(-)
 create mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
 create mode 100644 extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala
 create mode 100644 extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala

diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
new file mode 100644
index 0000000000000..f6bf552e6bb8e
--- /dev/null
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.kinesis
+
+import java.nio.ByteBuffer
+import java.util.concurrent.TimeUnit
+
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+import scala.util.{Failure, Random, Success, Try}
+
+import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain}
+import com.amazonaws.regions.RegionUtils
+import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClient
+import com.amazonaws.services.dynamodbv2.document.DynamoDB
+import com.amazonaws.services.kinesis.AmazonKinesisClient
+import com.amazonaws.services.kinesis.model._
+
+import org.apache.spark.Logging
+
+/**
+ * Shared utility methods for performing Kinesis tests that actually transfer data
+ */
+private class KinesisTestUtils(
+    val endpointUrl: String = "https://kinesis.us-west-2.amazonaws.com",
+    _regionName: String = "") extends Logging {
+
+  val regionName = if (_regionName.length == 0) {
+    RegionUtils.getRegionByEndpoint(endpointUrl).getName()
+  } else {
+    RegionUtils.getRegion(_regionName).getName()
+  }
+
+  val streamShardCount = 2
+
+  private val createStreamTimeoutSeconds = 300
+  private val describeStreamPollTimeSeconds = 1
+
+  @volatile
+  private var streamCreated = false
+  private var _streamName: String = _
+
+  private lazy val kinesisClient = {
+    val client = new AmazonKinesisClient(KinesisTestUtils.getAWSCredentials())
+    client.setEndpoint(endpointUrl)
+    client
+  }
+
+  private lazy val dynamoDB = {
+    val dynamoDBClient = new AmazonDynamoDBClient(new DefaultAWSCredentialsProviderChain())
+    dynamoDBClient.setRegion(RegionUtils.getRegion(regionName))
+    new DynamoDB(dynamoDBClient)
+  }
+
+  def streamName: String = {
+    require(streamCreated, "Stream not yet created, call createStream() to create one")
+    _streamName
+  }
+
+  def createStream(): Unit = {
+    logInfo("Creating stream")
+    require(!streamCreated, "Stream already created")
+    _streamName = findNonExistentStreamName()
+
+    // Create a stream. The number of shards determines the provisioned throughput.
+    val createStreamRequest = new CreateStreamRequest()
+    createStreamRequest.setStreamName(_streamName)
+    createStreamRequest.setShardCount(2)
+    kinesisClient.createStream(createStreamRequest)
+
+    // The stream is now being created. Wait for it to become active.
+    waitForStreamToBeActive(_streamName)
+    streamCreated = true
+    logInfo("Created stream")
+  }
+
+  /**
+   * Push data to Kinesis stream and return a map of
+   * shardId -> seq of (data, seq number) pushed to corresponding shard
+   */
+  def pushData(testData: Seq[Int]): Map[String, Seq[(Int, String)]] = {
+    require(streamCreated, "Stream not yet created, call createStream() to create one")
+    val shardIdToSeqNumbers = new mutable.HashMap[String, ArrayBuffer[(Int, String)]]()
+
+    testData.foreach { num =>
+      val str = num.toString
+      val putRecordRequest = new PutRecordRequest().withStreamName(streamName)
+        .withData(ByteBuffer.wrap(str.getBytes()))
+        .withPartitionKey(str)
+
+      val putRecordResult = kinesisClient.putRecord(putRecordRequest)
+      val shardId = putRecordResult.getShardId
+      val seqNumber = putRecordResult.getSequenceNumber()
+      val sentSeqNumbers = shardIdToSeqNumbers.getOrElseUpdate(shardId,
+        new ArrayBuffer[(Int, String)]())
+      sentSeqNumbers += ((num, seqNumber))
+    }
+
+    logInfo(s"Pushed $testData:\n\t ${shardIdToSeqNumbers.mkString("\n\t")}")
+    shardIdToSeqNumbers.toMap
+  }
+
+  def describeStream(streamNameToDescribe: String = streamName): Option[StreamDescription] = {
+    try {
+      val describeStreamRequest = new DescribeStreamRequest().withStreamName(streamNameToDescribe)
+      val desc = kinesisClient.describeStream(describeStreamRequest).getStreamDescription()
+      Some(desc)
+    } catch {
+      case rnfe: ResourceNotFoundException =>
+        None
+    }
+  }
+
+  def deleteStream(): Unit = {
+    try {
+      if (describeStream().nonEmpty) {
+        val deleteStreamRequest = new DeleteStreamRequest()
+        kinesisClient.deleteStream(streamName)
+      }
+    } catch {
+      case e: Exception =>
+        logWarning(s"Could not delete stream $streamName")
+    }
+  }
+
+  def deleteDynamoDBTable(tableName: String): Unit = {
+    try {
+      val table = dynamoDB.getTable(tableName)
+      table.delete()
+      table.waitForDelete()
+    } catch {
+      case e: Exception =>
+        logWarning(s"Could not delete DynamoDB table $tableName")
+    }
+  }
+
+  private def findNonExistentStreamName(): String = {
+    var testStreamName: String = null
+    do {
+      Thread.sleep(TimeUnit.SECONDS.toMillis(describeStreamPollTimeSeconds))
+      testStreamName = s"KinesisTestUtils-${math.abs(Random.nextLong())}"
+    } while (describeStream(testStreamName).nonEmpty)
+    testStreamName
+  }
+
+  private def waitForStreamToBeActive(streamNameToWaitFor: String): Unit = {
+    val startTime = System.currentTimeMillis()
+    val endTime = startTime + TimeUnit.SECONDS.toMillis(createStreamTimeoutSeconds)
+    while (System.currentTimeMillis() < endTime) {
+      Thread.sleep(TimeUnit.SECONDS.toMillis(describeStreamPollTimeSeconds))
+      describeStream(streamNameToWaitFor).foreach { description =>
+        val streamStatus = description.getStreamStatus()
+        logDebug(s"\t- current state: $streamStatus\n")
+        if ("ACTIVE".equals(streamStatus)) {
+          return
+        }
+      }
+    }
+    require(false, s"Stream $streamName never became active")
+  }
+}
+
+private[kinesis] object KinesisTestUtils {
+
+  val envVarName = "RUN_KINESIS_TESTS"
+
+  val shouldRunTests = sys.env.get(envVarName) == Some("1")
+
+  def isAWSCredentialsPresent: Boolean = {
+    Try { new DefaultAWSCredentialsProviderChain().getCredentials() }.isSuccess
+  }
+
+  def getAWSCredentials(): AWSCredentials = {
+    assert(shouldRunTests,
+      "Kinesis test not enabled, should not attempt to get AWS credentials")
+    Try { new DefaultAWSCredentialsProviderChain().getCredentials() } match {
+      case Success(cred) => cred
+      case Failure(e) =>
+        throw new Exception("Kinesis tests enabled, but could get not AWS credentials")
+    }
+  }
+}
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala
new file mode 100644
index 0000000000000..6d011f295e7f7
--- /dev/null
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.kinesis
+
+import org.apache.spark.SparkFunSuite
+
+/**
+ * Helper class that runs Kinesis real data transfer tests or
+ * ignores them based on env variable is set or not.
+ */
+trait KinesisSuiteHelper { self: SparkFunSuite =>
+  import KinesisTestUtils._
+
+  /** Run the test if environment variable is set or ignore the test */
+  def testOrIgnore(testName: String)(testBody: => Unit) {
+    if (shouldRunTests) {
+      test(testName)(testBody)
+    } else {
+      ignore(s"$testName [enable by setting env var $envVarName=1]")(testBody)
+    }
+  }
+}
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
index 2103dca6b766f..98f2c7c4f1bfb 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala
@@ -73,23 +73,6 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft
       checkpointStateMock, currentClockMock)
   }
 
-  test("KinesisUtils API") {
-    val ssc = new StreamingContext(master, framework, batchDuration)
-    // Tests the API, does not actually test data receiving
-    val kinesisStream1 = KinesisUtils.createStream(ssc, "mySparkStream",
-      "https://kinesis.us-west-2.amazonaws.com", Seconds(2),
-      InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2)
-    val kinesisStream2 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream",
-      "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
-      InitialPositionInStream.LATEST, Seconds(2), StorageLevel.MEMORY_AND_DISK_2)
-    val kinesisStream3 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream",
-      "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
-      InitialPositionInStream.LATEST, Seconds(2), StorageLevel.MEMORY_AND_DISK_2,
-      "awsAccessKey", "awsSecretKey")
-
-    ssc.stop()
-  }
-
   test("check serializability of SerializableAWSCredentials") {
     Utils.deserialize[SerializableAWSCredentials](
       Utils.serialize(new SerializableAWSCredentials("x", "y")))
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
new file mode 100644
index 0000000000000..d3dd541fe4371
--- /dev/null
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.kinesis
+
+import scala.collection.mutable
+import scala.concurrent.duration._
+import scala.language.postfixOps
+import scala.util.Random
+
+import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
+import org.scalatest.concurrent.Eventually
+import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
+
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.streaming._
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
+
+class KinesisStreamSuite extends SparkFunSuite with KinesisSuiteHelper
+  with Eventually with BeforeAndAfter with BeforeAndAfterAll {
+
+  private val kinesisTestUtils = new KinesisTestUtils()
+
+  // This is the name that KCL uses to save metadata to DynamoDB
+  private val kinesisAppName = s"KinesisStreamSuite-${math.abs(Random.nextLong())}"
+
+  private var ssc: StreamingContext = _
+  private var sc: SparkContext = _
+
+  override def beforeAll(): Unit = {
+    kinesisTestUtils.createStream()
+    val conf = new SparkConf()
+      .setMaster("local[4]")
+      .setAppName("KinesisStreamSuite") // Setting Spark app name to Kinesis app name
+    sc = new SparkContext(conf)
+  }
+
+  override def afterAll(): Unit = {
+    sc.stop()
+    // Delete the Kinesis stream as well as the DynamoDB table generated by
+    // Kinesis Client Library when consuming the stream
+    kinesisTestUtils.deleteStream()
+    kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
+  }
+
+  before {
+    // Delete the DynamoDB table generated by Kinesis Client Library when
+    // consuming from the stream, so that each unit test can start from
+    // scratch without prior history of data consumption
+    kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
+  }
+
+  after {
+    if (ssc != null) {
+      ssc.stop(stopSparkContext = false)
+      ssc = null
+    }
+  }
+
+  test("KinesisUtils API") {
+    ssc = new StreamingContext(sc, Seconds(1))
+    // Tests the API, does not actually test data receiving
+    val kinesisStream1 = KinesisUtils.createStream(ssc, "mySparkStream",
+      "https://kinesis.us-west-2.amazonaws.com", Seconds(2),
+      InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2)
+    val kinesisStream2 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream",
+      "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
+      InitialPositionInStream.LATEST, Seconds(2), StorageLevel.MEMORY_AND_DISK_2)
+    val kinesisStream3 = KinesisUtils.createStream(ssc, "myAppNam", "mySparkStream",
+      "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
+      InitialPositionInStream.LATEST, Seconds(2), StorageLevel.MEMORY_AND_DISK_2,
+      "awsAccessKey", "awsSecretKey")
+  }
+
+
+  /**
+   * Test the stream by sending data to a Kinesis stream and receiving from it.
+   * This test is not run by default as it requires AWS credentials that the test
+   * environment may not have. Even if there is AWS credentials available, the user
+   * may not want to run these tests to avoid the Kinesis costs. To enable this test,
+   * you must have AWS credentials available through the default AWS provider chain,
+   * and you have to set the system environment variable RUN_KINESIS_TESTS=1 .
+   */
+  testOrIgnore("basic operation") {
+    ssc = new StreamingContext(sc, Seconds(1))
+    val aWSCredentials = KinesisTestUtils.getAWSCredentials()
+    val stream = KinesisUtils.createStream(ssc, kinesisAppName, kinesisTestUtils.streamName,
+      kinesisTestUtils.endpointUrl, kinesisTestUtils.regionName, InitialPositionInStream.LATEST,
+      Seconds(10), StorageLevel.MEMORY_ONLY,
+      aWSCredentials.getAWSAccessKeyId, aWSCredentials.getAWSSecretKey)
+
+    val collected = new mutable.HashSet[Int] with mutable.SynchronizedSet[Int]
+    stream.map { bytes => new String(bytes).toInt }.foreachRDD { rdd =>
+      collected ++= rdd.collect()
+      logInfo("Collected = " + rdd.collect().toSeq.mkString(", "))
+    }
+    ssc.start()
+
+    val testData = 1 to 10
+    eventually(timeout(120 seconds), interval(10 second)) {
+      kinesisTestUtils.pushData(testData)
+      assert(collected === testData.toSet, "\nData received does not match data sent")
+    }
+    ssc.stop()
+  }
+}

From 1707238601690fd0e8e173e2c47f1b4286644a29 Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Fri, 17 Jul 2015 16:45:46 -0700
Subject: [PATCH 223/549] [SPARK-7026] [SQL] fix left semi join with equi key
 and non-equi condition

When the `condition` extracted by `ExtractEquiJoinKeys` contain join Predicate for left semi join, we can not plan it as semiJoin. Such as

    SELECT * FROM testData2 x
    LEFT SEMI JOIN testData2 y
    ON x.b = y.b
    AND x.a >= y.a + 2
Condition `x.a >= y.a + 2` can not evaluate on table `x`, so it throw errors

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #5643 from adrian-wang/spark7026 and squashes the following commits:

cc09809 [Daoyuan Wang] refactor semijoin and add plan test
575a7c8 [Daoyuan Wang] fix notserializable
27841de [Daoyuan Wang] fix rebase
10bf124 [Daoyuan Wang] fix style
72baa02 [Daoyuan Wang] fix style
8e0afca [Daoyuan Wang] merge commits for rebase
---
 .../spark/sql/execution/SparkStrategies.scala | 10 +-
 .../joins/BroadcastLeftSemiJoinHash.scala     | 42 ++++-----
 .../sql/execution/joins/HashOuterJoin.scala   |  3 +-
 .../sql/execution/joins/HashSemiJoin.scala    | 91 +++++++++++++++++++
 .../execution/joins/LeftSemiJoinHash.scala    | 35 ++-----
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 12 +++
 .../sql/execution/joins/SemiJoinSuite.scala   | 74 +++++++++++++++
 7 files changed, 208 insertions(+), 59 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 73b463471ec5a..240332a80af0f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -38,14 +38,12 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case ExtractEquiJoinKeys(LeftSemi, leftKeys, rightKeys, condition, left, right)
         if sqlContext.conf.autoBroadcastJoinThreshold > 0 &&
           right.statistics.sizeInBytes <= sqlContext.conf.autoBroadcastJoinThreshold =>
-        val semiJoin = joins.BroadcastLeftSemiJoinHash(
-          leftKeys, rightKeys, planLater(left), planLater(right))
-        condition.map(Filter(_, semiJoin)).getOrElse(semiJoin) :: Nil
+        joins.BroadcastLeftSemiJoinHash(
+          leftKeys, rightKeys, planLater(left), planLater(right), condition) :: Nil
       // Find left semi joins where at least some predicates can be evaluated by matching join keys
       case ExtractEquiJoinKeys(LeftSemi, leftKeys, rightKeys, condition, left, right) =>
-        val semiJoin = joins.LeftSemiJoinHash(
-          leftKeys, rightKeys, planLater(left), planLater(right))
-        condition.map(Filter(_, semiJoin)).getOrElse(semiJoin) :: Nil
+        joins.LeftSemiJoinHash(
+          leftKeys, rightKeys, planLater(left), planLater(right), condition) :: Nil
       // no predicate can be evaluated by matching hash keys
       case logical.Join(left, right, LeftSemi, condition) =>
         joins.LeftSemiJoinBNL(planLater(left), planLater(right), condition) :: Nil
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
index f7b46d6888d7d..2750f58b005ac 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
@@ -33,37 +33,27 @@ case class BroadcastLeftSemiJoinHash(
     leftKeys: Seq[Expression],
     rightKeys: Seq[Expression],
     left: SparkPlan,
-    right: SparkPlan) extends BinaryNode with HashJoin {
-
-  override val buildSide: BuildSide = BuildRight
-
-  override def output: Seq[Attribute] = left.output
+    right: SparkPlan,
+    condition: Option[Expression]) extends BinaryNode with HashSemiJoin {
 
   protected override def doExecute(): RDD[InternalRow] = {
-    val buildIter = buildPlan.execute().map(_.copy()).collect().toIterator
-    val hashSet = new java.util.HashSet[InternalRow]()
-    var currentRow: InternalRow = null
+    val buildIter = right.execute().map(_.copy()).collect().toIterator
 
-    // Create a Hash set of buildKeys
-    while (buildIter.hasNext) {
-      currentRow = buildIter.next()
-      val rowKey = buildSideKeyGenerator(currentRow)
-      if (!rowKey.anyNull) {
-        val keyExists = hashSet.contains(rowKey)
-        if (!keyExists) {
-          // rowKey may be not serializable (from codegen)
-          hashSet.add(rowKey.copy())
-        }
-      }
-    }
+    if (condition.isEmpty) {
+      // rowKey may be not serializable (from codegen)
+      val hashSet = buildKeyHashSet(buildIter, copy = true)
+      val broadcastedRelation = sparkContext.broadcast(hashSet)
 
-    val broadcastedRelation = sparkContext.broadcast(hashSet)
+      left.execute().mapPartitions { streamIter =>
+        hashSemiJoin(streamIter, broadcastedRelation.value)
+      }
+    } else {
+      val hashRelation = HashedRelation(buildIter, rightKeyGenerator)
+      val broadcastedRelation = sparkContext.broadcast(hashRelation)
 
-    streamedPlan.execute().mapPartitions { streamIter =>
-      val joinKeys = streamSideKeyGenerator()
-      streamIter.filter(current => {
-        !joinKeys(current).anyNull && broadcastedRelation.value.contains(joinKeys.currentValue)
-      })
+      left.execute().mapPartitions { streamIter =>
+        hashSemiJoin(streamIter, broadcastedRelation.value)
+      }
     }
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
index 0522ee85eeb8a..74a7db7761758 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
@@ -65,8 +65,7 @@ override def outputPartitioning: Partitioning = joinType match {
   @transient private[this] lazy val leftNullRow = new GenericInternalRow(left.output.length)
   @transient private[this] lazy val rightNullRow = new GenericInternalRow(right.output.length)
   @transient private[this] lazy val boundCondition =
-    condition.map(
-      newPredicate(_, left.output ++ right.output)).getOrElse((row: InternalRow) => true)
+    newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output)
 
   // TODO we need to rewrite all of the iterators with our own implementation instead of the Scala
   // iterator for performance purpose.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala
new file mode 100644
index 0000000000000..1b983bc3a90f9
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.joins
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.execution.SparkPlan
+
+
+trait HashSemiJoin {
+  self: SparkPlan =>
+  val leftKeys: Seq[Expression]
+  val rightKeys: Seq[Expression]
+  val left: SparkPlan
+  val right: SparkPlan
+  val condition: Option[Expression]
+
+  override def output: Seq[Attribute] = left.output
+
+  @transient protected lazy val rightKeyGenerator: Projection =
+    newProjection(rightKeys, right.output)
+
+  @transient protected lazy val leftKeyGenerator: () => MutableProjection =
+    newMutableProjection(leftKeys, left.output)
+
+  @transient private lazy val boundCondition =
+    newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output)
+
+  protected def buildKeyHashSet(
+      buildIter: Iterator[InternalRow],
+      copy: Boolean): java.util.Set[InternalRow] = {
+    val hashSet = new java.util.HashSet[InternalRow]()
+    var currentRow: InternalRow = null
+
+    // Create a Hash set of buildKeys
+    while (buildIter.hasNext) {
+      currentRow = buildIter.next()
+      val rowKey = rightKeyGenerator(currentRow)
+      if (!rowKey.anyNull) {
+        val keyExists = hashSet.contains(rowKey)
+        if (!keyExists) {
+          if (copy) {
+            hashSet.add(rowKey.copy())
+          } else {
+            // rowKey may be not serializable (from codegen)
+            hashSet.add(rowKey)
+          }
+        }
+      }
+    }
+    hashSet
+  }
+
+  protected def hashSemiJoin(
+      streamIter: Iterator[InternalRow],
+      hashedRelation: HashedRelation): Iterator[InternalRow] = {
+    val joinKeys = leftKeyGenerator()
+    val joinedRow = new JoinedRow
+    streamIter.filter(current => {
+      lazy val rowBuffer = hashedRelation.get(joinKeys.currentValue)
+      !joinKeys(current).anyNull && rowBuffer != null && rowBuffer.exists {
+        (build: InternalRow) => boundCondition(joinedRow(current, build))
+      }
+    })
+  }
+
+  protected def hashSemiJoin(
+      streamIter: Iterator[InternalRow],
+      hashSet: java.util.Set[InternalRow]): Iterator[InternalRow] = {
+    val joinKeys = leftKeyGenerator()
+    val joinedRow = new JoinedRow
+    streamIter.filter(current => {
+      !joinKeys(current.copy()).anyNull && hashSet.contains(joinKeys.currentValue)
+    })
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
index 611ba928a16ec..9eaac817d9268 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.joins
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 
@@ -34,36 +34,21 @@ case class LeftSemiJoinHash(
     leftKeys: Seq[Expression],
     rightKeys: Seq[Expression],
     left: SparkPlan,
-    right: SparkPlan) extends BinaryNode with HashJoin {
-
-  override val buildSide: BuildSide = BuildRight
+    right: SparkPlan,
+    condition: Option[Expression]) extends BinaryNode with HashSemiJoin {
 
   override def requiredChildDistribution: Seq[ClusteredDistribution] =
     ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil
 
-  override def output: Seq[Attribute] = left.output
-
   protected override def doExecute(): RDD[InternalRow] = {
-    buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) =>
-      val hashSet = new java.util.HashSet[InternalRow]()
-      var currentRow: InternalRow = null
-
-      // Create a Hash set of buildKeys
-      while (buildIter.hasNext) {
-        currentRow = buildIter.next()
-        val rowKey = buildSideKeyGenerator(currentRow)
-        if (!rowKey.anyNull) {
-          val keyExists = hashSet.contains(rowKey)
-          if (!keyExists) {
-            hashSet.add(rowKey)
-          }
-        }
+    right.execute().zipPartitions(left.execute()) { (buildIter, streamIter) =>
+      if (condition.isEmpty) {
+        val hashSet = buildKeyHashSet(buildIter, copy = false)
+        hashSemiJoin(streamIter, hashSet)
+      } else {
+        val hashRelation = HashedRelation(buildIter, rightKeyGenerator)
+        hashSemiJoin(streamIter, hashRelation)
       }
-
-      val joinKeys = streamSideKeyGenerator()
-      streamIter.filter(current => {
-        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
-      })
     }
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 5b8b70ed5ae11..61d5f2061ae18 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -395,6 +395,18 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     )
   }
 
+  test("left semi greater than predicate and equal operator") {
+    checkAnswer(
+      sql("SELECT * FROM testData2 x LEFT SEMI JOIN testData2 y ON x.b = y.b and x.a >= y.a + 2"),
+      Seq(Row(3, 1), Row(3, 2))
+    )
+
+    checkAnswer(
+      sql("SELECT * FROM testData2 x LEFT SEMI JOIN testData2 y ON x.b = y.a and x.a >= y.b + 1"),
+      Seq(Row(2, 1), Row(2, 2), Row(3, 1), Row(3, 2))
+    )
+  }
+
   test("index into array of arrays") {
     checkAnswer(
       sql(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala
new file mode 100644
index 0000000000000..927e85a7db3dc
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/SemiJoinSuite.scala
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.joins
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions.{LessThan, Expression}
+import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest}
+
+
+class SemiJoinSuite extends SparkPlanTest{
+  val left = Seq(
+    (1, 2.0),
+    (1, 2.0),
+    (2, 1.0),
+    (2, 1.0),
+    (3, 3.0)
+  ).toDF("a", "b")
+
+  val right = Seq(
+    (2, 3.0),
+    (2, 3.0),
+    (3, 2.0),
+    (4, 1.0)
+  ).toDF("c", "d")
+
+  val leftKeys: List[Expression] = 'a :: Nil
+  val rightKeys: List[Expression] = 'c :: Nil
+  val condition = Some(LessThan('b, 'd))
+
+  test("left semi join hash") {
+    checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) =>
+      LeftSemiJoinHash(leftKeys, rightKeys, left, right, condition),
+      Seq(
+        (2, 1.0),
+        (2, 1.0)
+      ).map(Row.fromTuple))
+  }
+
+  test("left semi join BNL") {
+    checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) =>
+      LeftSemiJoinBNL(left, right, condition),
+      Seq(
+        (1, 2.0),
+        (1, 2.0),
+        (2, 1.0),
+        (2, 1.0)
+      ).map(Row.fromTuple))
+  }
+
+  test("broadcast left semi join hash") {
+    checkAnswer2(left, right, (left: SparkPlan, right: SparkPlan) =>
+      BroadcastLeftSemiJoinHash(leftKeys, rightKeys, left, right, condition),
+      Seq(
+        (2, 1.0),
+        (2, 1.0)
+      ).map(Row.fromTuple))
+  }
+}

From 529a2c2d92fef062e0078a8608fa3a8ae848c139 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Fri, 17 Jul 2015 17:33:19 -0700
Subject: [PATCH 224/549] [SPARK-8280][SPARK-8281][SQL]Handle NaN, null and
 Infinity in math

JIRA:
https://issues.apache.org/jira/browse/SPARK-8280
https://issues.apache.org/jira/browse/SPARK-8281

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7451 from yijieshen/nan_null2 and squashes the following commits:

47a529d [Yijie Shen] style fix
63dee44 [Yijie Shen] handle log expressions similar to Hive
188be51 [Yijie Shen] null to nan in Math Expression
---
 .../catalyst/analysis/FunctionRegistry.scala  |   2 +-
 .../spark/sql/catalyst/expressions/math.scala |  97 ++++++++++-------
 .../expressions/MathFunctionsSuite.scala      | 102 +++++++++++++++---
 .../spark/sql/MathExpressionsSuite.scala      |   7 +-
 .../execution/HiveCompatibilitySuite.scala    |  12 ++-
 5 files changed, 157 insertions(+), 63 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 7bb2579506a8a..ce552a1d65eda 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -112,9 +112,9 @@ object FunctionRegistry {
     expression[Log]("ln"),
     expression[Log10]("log10"),
     expression[Log1p]("log1p"),
+    expression[Log2]("log2"),
     expression[UnaryMinus]("negative"),
     expression[Pi]("pi"),
-    expression[Log2]("log2"),
     expression[Pow]("pow"),
     expression[Pow]("power"),
     expression[Pmod]("pmod"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index b05a7b3ed0ea4..9101f11052218 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -65,22 +65,38 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
   override def toString: String = s"$name($child)"
 
   protected override def nullSafeEval(input: Any): Any = {
-    val result = f(input.asInstanceOf[Double])
-    if (result.isNaN) null else result
+    f(input.asInstanceOf[Double])
   }
 
   // name of function in java.lang.Math
   def funcName: String = name.toLowerCase
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, eval => {
+    defineCodeGen(ctx, ev, c => s"java.lang.Math.${funcName}($c)")
+  }
+}
+
+abstract class UnaryLogExpression(f: Double => Double, name: String)
+    extends UnaryMathExpression(f, name) { self: Product =>
+
+  // values less than or equal to yAsymptote eval to null in Hive, instead of NaN or -Infinity
+  protected val yAsymptote: Double = 0.0
+
+  protected override def nullSafeEval(input: Any): Any = {
+    val d = input.asInstanceOf[Double]
+    if (d <= yAsymptote) null else f(d)
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, c =>
       s"""
-        ${ev.primitive} = java.lang.Math.${funcName}($eval);
-        if (Double.valueOf(${ev.primitive}).isNaN()) {
+        if ($c <= $yAsymptote) {
           ${ev.isNull} = true;
+        } else {
+          ${ev.primitive} = java.lang.Math.${funcName}($c);
         }
       """
-    })
+    )
   }
 }
 
@@ -100,8 +116,7 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
   override def dataType: DataType = DoubleType
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
-    val result = f(input1.asInstanceOf[Double], input2.asInstanceOf[Double])
-    if (result.isNaN) null else result
+    f(input1.asInstanceOf[Double], input2.asInstanceOf[Double])
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
@@ -398,25 +413,28 @@ case class Factorial(child: Expression) extends UnaryExpression with ImplicitCas
   }
 }
 
-case class Log(child: Expression) extends UnaryMathExpression(math.log, "LOG")
+case class Log(child: Expression) extends UnaryLogExpression(math.log, "LOG")
 
 case class Log2(child: Expression)
-  extends UnaryMathExpression((x: Double) => math.log(x) / math.log(2), "LOG2") {
+  extends UnaryLogExpression((x: Double) => math.log(x) / math.log(2), "LOG2") {
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, eval => {
+    nullSafeCodeGen(ctx, ev, c =>
       s"""
-        ${ev.primitive} = java.lang.Math.log($eval) / java.lang.Math.log(2);
-        if (Double.valueOf(${ev.primitive}).isNaN()) {
+        if ($c <= $yAsymptote) {
           ${ev.isNull} = true;
+        } else {
+          ${ev.primitive} = java.lang.Math.log($c) / java.lang.Math.log(2);
         }
       """
-    })
+    )
   }
 }
 
-case class Log10(child: Expression) extends UnaryMathExpression(math.log10, "LOG10")
+case class Log10(child: Expression) extends UnaryLogExpression(math.log10, "LOG10")
 
-case class Log1p(child: Expression) extends UnaryMathExpression(math.log1p, "LOG1P")
+case class Log1p(child: Expression) extends UnaryLogExpression(math.log1p, "LOG1P") {
+  protected override val yAsymptote: Double = -1.0
+}
 
 case class Rint(child: Expression) extends UnaryMathExpression(math.rint, "ROUND") {
   override def funcName: String = "rint"
@@ -577,27 +595,18 @@ case class Atan2(left: Expression, right: Expression)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
     // With codegen, the values returned by -0.0 and 0.0 are different. Handled with +0.0
-    val result = math.atan2(input1.asInstanceOf[Double] + 0.0, input2.asInstanceOf[Double] + 0.0)
-    if (result.isNaN) null else result
+    math.atan2(input1.asInstanceOf[Double] + 0.0, input2.asInstanceOf[Double] + 0.0)
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.atan2($c1 + 0.0, $c2 + 0.0)") + s"""
-      if (Double.valueOf(${ev.primitive}).isNaN()) {
-        ${ev.isNull} = true;
-      }
-      """
+    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.atan2($c1 + 0.0, $c2 + 0.0)")
   }
 }
 
 case class Pow(left: Expression, right: Expression)
   extends BinaryMathExpression(math.pow, "POWER") {
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.pow($c1, $c2)") + s"""
-      if (Double.valueOf(${ev.primitive}).isNaN()) {
-        ${ev.isNull} = true;
-      }
-      """
+    defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.pow($c1, $c2)")
   }
 }
 
@@ -699,17 +708,33 @@ case class Logarithm(left: Expression, right: Expression)
     this(EulerNumber(), child)
   }
 
+  protected override def nullSafeEval(input1: Any, input2: Any): Any = {
+    val dLeft = input1.asInstanceOf[Double]
+    val dRight = input2.asInstanceOf[Double]
+    // Unlike Hive, we support Log base in (0.0, 1.0]
+    if (dLeft <= 0.0 || dRight <= 0.0) null else math.log(dRight) / math.log(dLeft)
+  }
+
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val logCode = if (left.isInstanceOf[EulerNumber]) {
-      defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.log($c2)")
+    if (left.isInstanceOf[EulerNumber]) {
+      nullSafeCodeGen(ctx, ev, (c1, c2) =>
+        s"""
+          if ($c2 <= 0.0) {
+            ${ev.isNull} = true;
+          } else {
+            ${ev.primitive} = java.lang.Math.log($c2);
+          }
+        """)
     } else {
-      defineCodeGen(ctx, ev, (c1, c2) => s"java.lang.Math.log($c2) / java.lang.Math.log($c1)")
+      nullSafeCodeGen(ctx, ev, (c1, c2) =>
+        s"""
+          if ($c1 <= 0.0 || $c2 <= 0.0) {
+            ${ev.isNull} = true;
+          } else {
+            ${ev.primitive} = java.lang.Math.log($c2) / java.lang.Math.log($c1);
+          }
+        """)
     }
-    logCode + s"""
-      if (Double.isNaN(${ev.primitive})) {
-        ${ev.isNull} = true;
-      }
-    """
   }
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index ca35c7ef8ae5d..df988f57fbfde 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -21,6 +21,10 @@ import com.google.common.math.LongMath
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateProjection, GenerateMutableProjection}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer
+import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
 import org.apache.spark.sql.types._
 
 
@@ -47,6 +51,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
    * @param f The functions in scala.math or elsewhere used to generate expected results
    * @param domain The set of values to run the function with
    * @param expectNull Whether the given values should return null or not
+   * @param expectNaN Whether the given values should eval to NaN or not
    * @tparam T Generic type for primitives
    * @tparam U Generic type for the output of the given function `f`
    */
@@ -55,11 +60,16 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       f: T => U,
       domain: Iterable[T] = (-20 to 20).map(_ * 0.1),
       expectNull: Boolean = false,
+      expectNaN: Boolean = false,
       evalType: DataType = DoubleType): Unit = {
     if (expectNull) {
       domain.foreach { value =>
         checkEvaluation(c(Literal(value)), null, EmptyRow)
       }
+    } else if (expectNaN) {
+      domain.foreach { value =>
+        checkNaN(c(Literal(value)), EmptyRow)
+      }
     } else {
       domain.foreach { value =>
         checkEvaluation(c(Literal(value)), f(value), EmptyRow)
@@ -74,16 +84,22 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
    * @param c The DataFrame function
    * @param f The functions in scala.math
    * @param domain The set of values to run the function with
+   * @param expectNull Whether the given values should return null or not
+   * @param expectNaN Whether the given values should eval to NaN or not
    */
   private def testBinary(
       c: (Expression, Expression) => Expression,
       f: (Double, Double) => Double,
       domain: Iterable[(Double, Double)] = (-20 to 20).map(v => (v * 0.1, v * -0.1)),
-      expectNull: Boolean = false): Unit = {
+      expectNull: Boolean = false, expectNaN: Boolean = false): Unit = {
     if (expectNull) {
       domain.foreach { case (v1, v2) =>
         checkEvaluation(c(Literal(v1), Literal(v2)), null, create_row(null))
       }
+    } else if (expectNaN) {
+      domain.foreach { case (v1, v2) =>
+        checkNaN(c(Literal(v1), Literal(v2)), EmptyRow)
+      }
     } else {
       domain.foreach { case (v1, v2) =>
         checkEvaluation(c(Literal(v1), Literal(v2)), f(v1 + 0.0, v2 + 0.0), EmptyRow)
@@ -112,6 +128,62 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       Conv(Literal("11abc"), Literal(10), Literal(16)), "B")
   }
 
+  private def checkNaN(
+      expression: Expression, inputRow: InternalRow = EmptyRow): Unit = {
+    checkNaNWithoutCodegen(expression, inputRow)
+    checkNaNWithGeneratedProjection(expression, inputRow)
+    checkNaNWithOptimization(expression, inputRow)
+  }
+
+  private def checkNaNWithoutCodegen(
+      expression: Expression,
+      expected: Any,
+      inputRow: InternalRow = EmptyRow): Unit = {
+    val actual = try evaluate(expression, inputRow) catch {
+      case e: Exception => fail(s"Exception evaluating $expression", e)
+    }
+    if (!actual.asInstanceOf[Double].isNaN) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
+      fail(s"Incorrect evaluation (codegen off): $expression, " +
+        s"actual: $actual, " +
+        s"expected: NaN")
+    }
+  }
+
+
+  private def checkNaNWithGeneratedProjection(
+    expression: Expression,
+    inputRow: InternalRow = EmptyRow): Unit = {
+
+    val plan = try {
+      GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)()
+    } catch {
+      case e: Throwable =>
+        val ctx = GenerateProjection.newCodeGenContext()
+        val evaluated = expression.gen(ctx)
+        fail(
+          s"""
+             |Code generation of $expression failed:
+             |${evaluated.code}
+             |$e
+          """.stripMargin)
+    }
+
+    val actual = plan(inputRow).apply(0)
+    if (!actual.asInstanceOf[Double].isNaN) {
+      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
+      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: NaN")
+    }
+  }
+
+  private def checkNaNWithOptimization(
+    expression: Expression,
+    inputRow: InternalRow = EmptyRow): Unit = {
+    val plan = Project(Alias(expression, s"Optimized($expression)")() :: Nil, OneRowRelation)
+    val optimizedPlan = DefaultOptimizer.execute(plan)
+    checkNaNWithoutCodegen(optimizedPlan.expressions.head, inputRow)
+  }
+
   test("e") {
     testLeaf(EulerNumber, math.E)
   }
@@ -126,7 +198,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("asin") {
     testUnary(Asin, math.asin, (-10 to 10).map(_ * 0.1))
-    testUnary(Asin, math.asin, (11 to 20).map(_ * 0.1), expectNull = true)
+    testUnary(Asin, math.asin, (11 to 20).map(_ * 0.1), expectNaN = true)
   }
 
   test("sinh") {
@@ -139,7 +211,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("acos") {
     testUnary(Acos, math.acos, (-10 to 10).map(_ * 0.1))
-    testUnary(Acos, math.acos, (11 to 20).map(_ * 0.1), expectNull = true)
+    testUnary(Acos, math.acos, (11 to 20).map(_ * 0.1), expectNaN = true)
   }
 
   test("cosh") {
@@ -204,18 +276,18 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("log") {
-    testUnary(Log, math.log, (0 to 20).map(_ * 0.1))
-    testUnary(Log, math.log, (-5 to -1).map(_ * 0.1), expectNull = true)
+    testUnary(Log, math.log, (1 to 20).map(_ * 0.1))
+    testUnary(Log, math.log, (-5 to 0).map(_ * 0.1), expectNull = true)
   }
 
   test("log10") {
-    testUnary(Log10, math.log10, (0 to 20).map(_ * 0.1))
-    testUnary(Log10, math.log10, (-5 to -1).map(_ * 0.1), expectNull = true)
+    testUnary(Log10, math.log10, (1 to 20).map(_ * 0.1))
+    testUnary(Log10, math.log10, (-5 to 0).map(_ * 0.1), expectNull = true)
   }
 
   test("log1p") {
-    testUnary(Log1p, math.log1p, (-1 to 20).map(_ * 0.1))
-    testUnary(Log1p, math.log1p, (-10 to -2).map(_ * 1.0), expectNull = true)
+    testUnary(Log1p, math.log1p, (0 to 20).map(_ * 0.1))
+    testUnary(Log1p, math.log1p, (-10 to -1).map(_ * 1.0), expectNull = true)
   }
 
   test("bin") {
@@ -237,22 +309,22 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("log2") {
     def f: (Double) => Double = (x: Double) => math.log(x) / math.log(2)
-    testUnary(Log2, f, (0 to 20).map(_ * 0.1))
-    testUnary(Log2, f, (-5 to -1).map(_ * 1.0), expectNull = true)
+    testUnary(Log2, f, (1 to 20).map(_ * 0.1))
+    testUnary(Log2, f, (-5 to 0).map(_ * 1.0), expectNull = true)
   }
 
   test("sqrt") {
     testUnary(Sqrt, math.sqrt, (0 to 20).map(_ * 0.1))
-    testUnary(Sqrt, math.sqrt, (-5 to -1).map(_ * 1.0), expectNull = true)
+    testUnary(Sqrt, math.sqrt, (-5 to -1).map(_ * 1.0), expectNaN = true)
 
     checkEvaluation(Sqrt(Literal.create(null, DoubleType)), null, create_row(null))
-    checkEvaluation(Sqrt(Literal(-1.0)), null, EmptyRow)
-    checkEvaluation(Sqrt(Literal(-1.5)), null, EmptyRow)
+    checkNaN(Sqrt(Literal(-1.0)), EmptyRow)
+    checkNaN(Sqrt(Literal(-1.5)), EmptyRow)
   }
 
   test("pow") {
     testBinary(Pow, math.pow, (-5 to 5).map(v => (v * 1.0, v * 1.0)))
-    testBinary(Pow, math.pow, Seq((-1.0, 0.9), (-2.2, 1.7), (-2.2, -1.7)), expectNull = true)
+    testBinary(Pow, math.pow, Seq((-1.0, 0.9), (-2.2, 1.7), (-2.2, -1.7)), expectNaN = true)
   }
 
   test("shift left") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index 8eb3fec756b4c..a51523f1a7a0f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -68,12 +68,7 @@ class MathExpressionsSuite extends QueryTest {
     if (f(-1) === math.log1p(-1)) {
       checkAnswer(
         nnDoubleData.select(c('b)),
-        (1 to 9).map(n => Row(f(n * -0.1))) :+ Row(Double.NegativeInfinity)
-      )
-    } else {
-      checkAnswer(
-        nnDoubleData.select(c('b)),
-        (1 to 10).map(n => Row(null))
+        (1 to 9).map(n => Row(f(n * -0.1))) :+ Row(null)
       )
     }
 
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 4ada64bc21966..6b8f2f6217a54 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -254,7 +254,10 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     // Spark SQL use Long for TimestampType, lose the precision under 1us
     "timestamp_1",
     "timestamp_2",
-    "timestamp_udf"
+    "timestamp_udf",
+
+    // Unlike Hive, we do support log base in (0, 1.0], therefore disable this
+    "udf7"
   )
 
   /**
@@ -816,19 +819,18 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf2",
     "udf5",
     "udf6",
-    // "udf7",  turn this on after we figure out null vs nan vs infinity
     "udf8",
     "udf9",
     "udf_10_trims",
     "udf_E",
     "udf_PI",
     "udf_abs",
-    // "udf_acos",  turn this on after we figure out null vs nan vs infinity
+    "udf_acos",
     "udf_add",
     "udf_array",
     "udf_array_contains",
     "udf_ascii",
-    // "udf_asin",  turn this on after we figure out null vs nan vs infinity
+    "udf_asin",
     "udf_atan",
     "udf_avg",
     "udf_bigint",
@@ -915,7 +917,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_regexp_replace",
     "udf_repeat",
     "udf_rlike",
-    // "udf_round",  turn this on after we figure out null vs nan vs infinity
+    "udf_round",
     "udf_round_3",
     "udf_rpad",
     "udf_rtrim",

From 34a889db857f8752a0a78dcedec75ac6cd6cd48d Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA <yuu.ishikawa@gmail.com>
Date: Fri, 17 Jul 2015 18:30:04 -0700
Subject: [PATCH 225/549] [SPARK-7879] [MLLIB] KMeans API for spark.ml
 Pipelines

I Implemented the KMeans API for spark.ml Pipelines. But it doesn't include clustering abstractions for spark.ml (SPARK-7610). It would fit for another issues. And I'll try it later, since we are trying to add the hierarchical clustering algorithms in another issue. Thanks.

[SPARK-7879] KMeans API for spark.ml Pipelines - ASF JIRA https://issues.apache.org/jira/browse/SPARK-7879

Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com>

Closes #6756 from yu-iskw/SPARK-7879 and squashes the following commits:

be752de [Yu ISHIKAWA] Add assertions
a14939b [Yu ISHIKAWA] Fix the dashed line's length in pyspark.ml.rst
4c61693 [Yu ISHIKAWA] Remove the test about whether "features" and "prediction" columns exist or not in Python
fb2417c [Yu ISHIKAWA] Use getInt, instead of get
f397be4 [Yu ISHIKAWA] Switch the comparisons.
ca78b7d [Yu ISHIKAWA] Add the Scala docs about the constraints of each parameter.
effc650 [Yu ISHIKAWA] Using expertSetParam and expertGetParam
c8dc6e6 [Yu ISHIKAWA] Remove an unnecessary test
19a9d63 [Yu ISHIKAWA] Include spark.ml.clustering to python tests
1abb19c [Yu ISHIKAWA] Add the statements about spark.ml.clustering into pyspark.ml.rst
f8338bc [Yu ISHIKAWA] Add the placeholders in Python
4a03003 [Yu ISHIKAWA] Test for contains in Python
6566c8b [Yu ISHIKAWA] Use `get`, instead of `apply`
288e8d5 [Yu ISHIKAWA] Using `contains` to check the column names
5a7d574 [Yu ISHIKAWA] Renamce `validateInitializationMode` to `validateInitMode` and remove throwing exception
97cfae3 [Yu ISHIKAWA] Fix the type of return value of `KMeans.copy`
e933723 [Yu ISHIKAWA] Remove the default value of seed from the Model class
978ee2c [Yu ISHIKAWA] Modify the docs of KMeans, according to mllib's KMeans
2ec80bc [Yu ISHIKAWA] Fit on 1 line
e186be1 [Yu ISHIKAWA] Make a few variables, setters and getters be expert ones
b2c205c [Yu ISHIKAWA] Rename the method `getInitializationSteps` to `getInitSteps` and `setInitializationSteps` to `setInitSteps` in Scala and Python
f43f5b4 [Yu ISHIKAWA] Rename the method `getInitializationMode` to `getInitMode` and `setInitializationMode` to `setInitMode` in Scala and Python
3cb5ba4 [Yu ISHIKAWA] Modify the description about epsilon and the validation
4fa409b [Yu ISHIKAWA] Add a comment about the default value of epsilon
2f392e1 [Yu ISHIKAWA] Make some variables `final` and Use `IntParam` and `DoubleParam`
19326f8 [Yu ISHIKAWA] Use `udf`, instead of callUDF
4d2ad1e [Yu ISHIKAWA] Modify the indentations
0ae422f [Yu ISHIKAWA] Add a test for `setParams`
4ff7913 [Yu ISHIKAWA] Add "ml.clustering" to `javacOptions` in SparkBuild.scala
11ffdf1 [Yu ISHIKAWA] Use `===` and the variable
220a176 [Yu ISHIKAWA] Set a random seed in the unit testing
92c3efc [Yu ISHIKAWA] Make the points for a test be fewer
c758692 [Yu ISHIKAWA] Modify the parameters of KMeans in Python
6aca147 [Yu ISHIKAWA] Add some unit testings to validate the setter methods
687cacc [Yu ISHIKAWA] Alias mllib.KMeans as MLlibKMeans in KMeansSuite.scala
a4dfbef [Yu ISHIKAWA] Modify the last brace and indentations
5bedc51 [Yu ISHIKAWA] Remve an extra new line
444c289 [Yu ISHIKAWA] Add the validation for `runs`
e41989c [Yu ISHIKAWA] Modify how to validate `initStep`
7ea133a [Yu ISHIKAWA] Change how to validate `initMode`
7991e15 [Yu ISHIKAWA] Add a validation for `k`
c2df35d [Yu ISHIKAWA] Make `predict` private
93aa2ff [Yu ISHIKAWA] Use `withColumn` in `transform`
d3a79f7 [Yu ISHIKAWA] Remove the inhefited docs
e9532e1 [Yu ISHIKAWA] make `parentModel` of KMeansModel private
8559772 [Yu ISHIKAWA] Remove the `paramMap` parameter of KMeans
6684850 [Yu ISHIKAWA] Rename `initializationSteps` to `initSteps`
99b1b96 [Yu ISHIKAWA] Rename `initializationMode` to `initMode`
79ea82b [Yu ISHIKAWA] Modify the parameters of KMeans docs
6569bcd [Yu ISHIKAWA] Change how to set the default values with `setDefault`
20a795a [Yu ISHIKAWA] Change how to set the default values with `setDefault`
11c2a12 [Yu ISHIKAWA] Limit the imports
badb481 [Yu ISHIKAWA] Alias spark.mllib.{KMeans, KMeansModel}
f80319a [Yu ISHIKAWA] Rebase mater branch and add copy methods
85d92b1 [Yu ISHIKAWA] Add `KMeans.setPredictionCol`
aa9469d [Yu ISHIKAWA] Fix a python test suite error caused by python 3.x
c2d6bcb [Yu ISHIKAWA] ADD Java test suites of the KMeans API for spark.ml Pipeline
598ed2e [Yu ISHIKAWA] Implement the KMeans API for spark.ml Pipelines in Python
63ad785 [Yu ISHIKAWA] Implement the KMeans API for spark.ml Pipelines in Scala
---
 dev/sparktestsupport/modules.py               |   1 +
 .../apache/spark/ml/clustering/KMeans.scala   | 205 +++++++++++++++++
 .../spark/mllib/clustering/KMeans.scala       |  12 +-
 .../spark/ml/clustering/JavaKMeansSuite.java  |  72 ++++++
 .../spark/ml/clustering/KMeansSuite.scala     | 114 ++++++++++
 project/SparkBuild.scala                      |   4 +-
 python/docs/pyspark.ml.rst                    |   8 +
 python/pyspark/ml/clustering.py               | 206 ++++++++++++++++++
 8 files changed, 617 insertions(+), 5 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/clustering/JavaKMeansSuite.java
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
 create mode 100644 python/pyspark/ml/clustering.py

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 993583e2f4119..3073d489bad4a 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -338,6 +338,7 @@ def contains_file(self, filename):
     python_test_goals=[
         "pyspark.ml.feature",
         "pyspark.ml.classification",
+        "pyspark.ml.clustering",
         "pyspark.ml.recommendation",
         "pyspark.ml.regression",
         "pyspark.ml.tuning",
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
new file mode 100644
index 0000000000000..dc192add6ca13
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.clustering
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.param.{Param, Params, IntParam, DoubleParam, ParamMap}
+import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasMaxIter, HasPredictionCol, HasSeed}
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.mllib.clustering.{KMeans => MLlibKMeans, KMeansModel => MLlibKMeansModel}
+import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
+import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.types.{IntegerType, StructType}
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.util.Utils
+
+
+/**
+ * Common params for KMeans and KMeansModel
+ */
+private[clustering] trait KMeansParams
+    extends Params with HasMaxIter with HasFeaturesCol with HasSeed with HasPredictionCol {
+
+  /**
+   * Set the number of clusters to create (k). Must be > 1. Default: 2.
+   * @group param
+   */
+  final val k = new IntParam(this, "k", "number of clusters to create", (x: Int) => x > 1)
+
+  /** @group getParam */
+  def getK: Int = $(k)
+
+  /**
+   * Param the number of runs of the algorithm to execute in parallel. We initialize the algorithm
+   * this many times with random starting conditions (configured by the initialization mode), then
+   * return the best clustering found over any run. Must be >= 1. Default: 1.
+   * @group param
+   */
+  final val runs = new IntParam(this, "runs",
+    "number of runs of the algorithm to execute in parallel", (value: Int) => value >= 1)
+
+  /** @group getParam */
+  def getRuns: Int = $(runs)
+
+  /**
+   * Param the distance threshold within which we've consider centers to have converged.
+   * If all centers move less than this Euclidean distance, we stop iterating one run.
+   * Must be >= 0.0. Default: 1e-4
+   * @group param
+   */
+  final val epsilon = new DoubleParam(this, "epsilon",
+    "distance threshold within which we've consider centers to have converge",
+    (value: Double) => value >= 0.0)
+
+  /** @group getParam */
+  def getEpsilon: Double = $(epsilon)
+
+  /**
+   * Param for the initialization algorithm. This can be either "random" to choose random points as
+   * initial cluster centers, or "k-means||" to use a parallel variant of k-means++
+   * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||.
+   * @group expertParam
+   */
+  final val initMode = new Param[String](this, "initMode", "initialization algorithm",
+    (value: String) => MLlibKMeans.validateInitMode(value))
+
+  /** @group expertGetParam */
+  def getInitMode: String = $(initMode)
+
+  /**
+   * Param for the number of steps for the k-means|| initialization mode. This is an advanced
+   * setting -- the default of 5 is almost always enough. Must be > 0. Default: 5.
+   * @group expertParam
+   */
+  final val initSteps = new IntParam(this, "initSteps", "number of steps for k-means||",
+    (value: Int) => value > 0)
+
+  /** @group expertGetParam */
+  def getInitSteps: Int = $(initSteps)
+
+  /**
+   * Validates and transforms the input schema.
+   * @param schema input schema
+   * @return output schema
+   */
+  protected def validateAndTransformSchema(schema: StructType): StructType = {
+    SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT)
+    SchemaUtils.appendColumn(schema, $(predictionCol), IntegerType)
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Model fitted by KMeans.
+ *
+ * @param parentModel a model trained by spark.mllib.clustering.KMeans.
+ */
+@Experimental
+class KMeansModel private[ml] (
+    override val uid: String,
+    private val parentModel: MLlibKMeansModel) extends Model[KMeansModel] with KMeansParams {
+
+  override def copy(extra: ParamMap): KMeansModel = {
+    val copied = new KMeansModel(uid, parentModel)
+    copyValues(copied, extra)
+  }
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    val predictUDF = udf((vector: Vector) => predict(vector))
+    dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol))))
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    validateAndTransformSchema(schema)
+  }
+
+  private[clustering] def predict(features: Vector): Int = parentModel.predict(features)
+
+  def clusterCenters: Array[Vector] = parentModel.clusterCenters
+}
+
+/**
+ * :: Experimental ::
+ * K-means clustering with support for multiple parallel runs and a k-means++ like initialization
+ * mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested,
+ * they are executed together with joint passes over the data for efficiency.
+ */
+@Experimental
+class KMeans(override val uid: String) extends Estimator[KMeansModel] with KMeansParams {
+
+  setDefault(
+    k -> 2,
+    maxIter -> 20,
+    runs -> 1,
+    initMode -> MLlibKMeans.K_MEANS_PARALLEL,
+    initSteps -> 5,
+    epsilon -> 1e-4)
+
+  override def copy(extra: ParamMap): KMeans = defaultCopy(extra)
+
+  def this() = this(Identifiable.randomUID("kmeans"))
+
+  /** @group setParam */
+  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
+
+  /** @group setParam */
+  def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
+  /** @group setParam */
+  def setK(value: Int): this.type = set(k, value)
+
+  /** @group expertSetParam */
+  def setInitMode(value: String): this.type = set(initMode, value)
+
+  /** @group expertSetParam */
+  def setInitSteps(value: Int): this.type = set(initSteps, value)
+
+  /** @group setParam */
+  def setMaxIter(value: Int): this.type = set(maxIter, value)
+
+  /** @group setParam */
+  def setRuns(value: Int): this.type = set(runs, value)
+
+  /** @group setParam */
+  def setEpsilon(value: Double): this.type = set(epsilon, value)
+
+  /** @group setParam */
+  def setSeed(value: Long): this.type = set(seed, value)
+
+  override def fit(dataset: DataFrame): KMeansModel = {
+    val rdd = dataset.select(col($(featuresCol))).map { case Row(point: Vector) => point }
+
+    val algo = new MLlibKMeans()
+      .setK($(k))
+      .setInitializationMode($(initMode))
+      .setInitializationSteps($(initSteps))
+      .setMaxIterations($(maxIter))
+      .setSeed($(seed))
+      .setEpsilon($(epsilon))
+      .setRuns($(runs))
+    val parentModel = algo.run(rdd)
+    val model = new KMeansModel(uid, parentModel)
+    copyValues(model)
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    validateAndTransformSchema(schema)
+  }
+}
+
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 68297130a7b03..0a65403f4ec95 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -85,9 +85,7 @@ class KMeans private (
    * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||.
    */
   def setInitializationMode(initializationMode: String): this.type = {
-    if (initializationMode != KMeans.RANDOM && initializationMode != KMeans.K_MEANS_PARALLEL) {
-      throw new IllegalArgumentException("Invalid initialization mode: " + initializationMode)
-    }
+    KMeans.validateInitMode(initializationMode)
     this.initializationMode = initializationMode
     this
   }
@@ -550,6 +548,14 @@ object KMeans {
       v2: VectorWithNorm): Double = {
     MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm)
   }
+
+  private[spark] def validateInitMode(initMode: String): Boolean = {
+    initMode match {
+      case KMeans.RANDOM => true
+      case KMeans.K_MEANS_PARALLEL => true
+      case _ => false
+    }
+  }
 }
 
 /**
diff --git a/mllib/src/test/java/org/apache/spark/ml/clustering/JavaKMeansSuite.java b/mllib/src/test/java/org/apache/spark/ml/clustering/JavaKMeansSuite.java
new file mode 100644
index 0000000000000..d09fa7fd5637c
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/clustering/JavaKMeansSuite.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.clustering;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+
+public class JavaKMeansSuite implements Serializable {
+
+  private transient int k = 5;
+  private transient JavaSparkContext sc;
+  private transient DataFrame dataset;
+  private transient SQLContext sql;
+
+  @Before
+  public void setUp() {
+    sc = new JavaSparkContext("local", "JavaKMeansSuite");
+    sql = new SQLContext(sc);
+
+    dataset = KMeansSuite.generateKMeansData(sql, 50, 3, k);
+  }
+
+  @After
+  public void tearDown() {
+    sc.stop();
+    sc = null;
+  }
+
+  @Test
+  public void fitAndTransform() {
+    KMeans kmeans = new KMeans().setK(k).setSeed(1);
+    KMeansModel model = kmeans.fit(dataset);
+
+    Vector[] centers = model.clusterCenters();
+    assertEquals(k, centers.length);
+
+    DataFrame transformed = model.transform(dataset);
+    List<String> columns = Arrays.asList(transformed.columns());
+    List<String> expectedColumns = Arrays.asList("features", "prediction");
+    for (String column: expectedColumns) {
+      assertTrue(columns.contains(column));
+    }
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
new file mode 100644
index 0000000000000..1f15ac02f4008
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.clustering
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.clustering.{KMeans => MLlibKMeans}
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.sql.{DataFrame, SQLContext}
+
+private[clustering] case class TestRow(features: Vector)
+
+object KMeansSuite {
+  def generateKMeansData(sql: SQLContext, rows: Int, dim: Int, k: Int): DataFrame = {
+    val sc = sql.sparkContext
+    val rdd = sc.parallelize(1 to rows).map(i => Vectors.dense(Array.fill(dim)((i % k).toDouble)))
+      .map(v => new TestRow(v))
+    sql.createDataFrame(rdd)
+  }
+}
+
+class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  final val k = 5
+  @transient var dataset: DataFrame = _
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+
+    dataset = KMeansSuite.generateKMeansData(sqlContext, 50, 3, k)
+  }
+
+  test("default parameters") {
+    val kmeans = new KMeans()
+
+    assert(kmeans.getK === 2)
+    assert(kmeans.getFeaturesCol === "features")
+    assert(kmeans.getPredictionCol === "prediction")
+    assert(kmeans.getMaxIter === 20)
+    assert(kmeans.getRuns === 1)
+    assert(kmeans.getInitMode === MLlibKMeans.K_MEANS_PARALLEL)
+    assert(kmeans.getInitSteps === 5)
+    assert(kmeans.getEpsilon === 1e-4)
+  }
+
+  test("set parameters") {
+    val kmeans = new KMeans()
+      .setK(9)
+      .setFeaturesCol("test_feature")
+      .setPredictionCol("test_prediction")
+      .setMaxIter(33)
+      .setRuns(7)
+      .setInitMode(MLlibKMeans.RANDOM)
+      .setInitSteps(3)
+      .setSeed(123)
+      .setEpsilon(1e-3)
+
+    assert(kmeans.getK === 9)
+    assert(kmeans.getFeaturesCol === "test_feature")
+    assert(kmeans.getPredictionCol === "test_prediction")
+    assert(kmeans.getMaxIter === 33)
+    assert(kmeans.getRuns === 7)
+    assert(kmeans.getInitMode === MLlibKMeans.RANDOM)
+    assert(kmeans.getInitSteps === 3)
+    assert(kmeans.getSeed === 123)
+    assert(kmeans.getEpsilon === 1e-3)
+  }
+
+  test("parameters validation") {
+    intercept[IllegalArgumentException] {
+      new KMeans().setK(1)
+    }
+    intercept[IllegalArgumentException] {
+      new KMeans().setInitMode("no_such_a_mode")
+    }
+    intercept[IllegalArgumentException] {
+      new KMeans().setInitSteps(0)
+    }
+    intercept[IllegalArgumentException] {
+      new KMeans().setRuns(0)
+    }
+  }
+
+  test("fit & transform") {
+    val predictionColName = "kmeans_prediction"
+    val kmeans = new KMeans().setK(k).setPredictionCol(predictionColName).setSeed(1)
+    val model = kmeans.fit(dataset)
+    assert(model.clusterCenters.length === k)
+
+    val transformed = model.transform(dataset)
+    val expectedColumns = Array("features", predictionColName)
+    expectedColumns.foreach { column =>
+      assert(transformed.columns.contains(column))
+    }
+    val clusters = transformed.select(predictionColName).map(_.getInt(0)).distinct().collect().toSet
+    assert(clusters.size === k)
+    assert(clusters === Set(0, 1, 2, 3, 4))
+  }
+}
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 4291b0be2a616..12828547d7077 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -481,8 +481,8 @@ object Unidoc {
         "mllib.tree.impurity", "mllib.tree.model", "mllib.util",
         "mllib.evaluation", "mllib.feature", "mllib.random", "mllib.stat.correlation",
         "mllib.stat.test", "mllib.tree.impl", "mllib.tree.loss",
-        "ml", "ml.attribute", "ml.classification", "ml.evaluation", "ml.feature", "ml.param",
-        "ml.recommendation", "ml.regression", "ml.tuning"
+        "ml", "ml.attribute", "ml.classification", "ml.clustering", "ml.evaluation", "ml.feature",
+        "ml.param", "ml.recommendation", "ml.regression", "ml.tuning"
       ),
       "-group", "Spark SQL", packageList("sql.api.java", "sql.api.java.types", "sql.hive.api.java"),
       "-noqualifier", "java.lang"
diff --git a/python/docs/pyspark.ml.rst b/python/docs/pyspark.ml.rst
index 518b8e774dd5f..86d4186a2c798 100644
--- a/python/docs/pyspark.ml.rst
+++ b/python/docs/pyspark.ml.rst
@@ -33,6 +33,14 @@ pyspark.ml.classification module
     :undoc-members:
     :inherited-members:
 
+pyspark.ml.clustering module
+----------------------------
+
+.. automodule:: pyspark.ml.clustering
+    :members:
+    :undoc-members:
+    :inherited-members:
+
 pyspark.ml.recommendation module
 --------------------------------
 
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
new file mode 100644
index 0000000000000..b5e9b6549d9f1
--- /dev/null
+++ b/python/pyspark/ml/clustering.py
@@ -0,0 +1,206 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark.ml.util import keyword_only
+from pyspark.ml.wrapper import JavaEstimator, JavaModel
+from pyspark.ml.param.shared import *
+from pyspark.mllib.common import inherit_doc
+from pyspark.mllib.linalg import _convert_to_vector
+
+__all__ = ['KMeans', 'KMeansModel']
+
+
+class KMeansModel(JavaModel):
+    """
+    Model fitted by KMeans.
+    """
+
+    def clusterCenters(self):
+        """Get the cluster centers, represented as a list of NumPy arrays."""
+        return [c.toArray() for c in self._call_java("clusterCenters")]
+
+
+@inherit_doc
+class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed):
+    """
+    K-means Clustering
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
+    ...         (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
+    >>> df = sqlContext.createDataFrame(data, ["features"])
+    >>> kmeans = KMeans().setK(2).setSeed(1).setFeaturesCol("features")
+    >>> model = kmeans.fit(df)
+    >>> centers = model.clusterCenters()
+    >>> len(centers)
+    2
+    >>> transformed = model.transform(df).select("features", "prediction")
+    >>> rows = transformed.collect()
+    >>> rows[0].prediction == rows[1].prediction
+    True
+    >>> rows[2].prediction == rows[3].prediction
+    True
+    """
+
+    # a placeholder to make it appear in the generated doc
+    k = Param(Params._dummy(), "k", "number of clusters to create")
+    epsilon = Param(Params._dummy(), "epsilon",
+                    "distance threshold within which " +
+                    "we've consider centers to have converged")
+    runs = Param(Params._dummy(), "runs", "number of runs of the algorithm to execute in parallel")
+    initMode = Param(Params._dummy(), "initMode",
+                     "the initialization algorithm. This can be either \"random\" to " +
+                     "choose random points as initial cluster centers, or \"k-means||\" " +
+                     "to use a parallel variant of k-means++")
+    initSteps = Param(Params._dummy(), "initSteps", "steps for k-means initialization mode")
+
+    @keyword_only
+    def __init__(self, k=2, maxIter=20, runs=1, epsilon=1e-4, initMode="k-means||", initStep=5):
+        super(KMeans, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid)
+        self.k = Param(self, "k", "number of clusters to create")
+        self.epsilon = Param(self, "epsilon",
+                             "distance threshold within which " +
+                             "we've consider centers to have converged")
+        self.runs = Param(self, "runs", "number of runs of the algorithm to execute in parallel")
+        self.seed = Param(self, "seed", "random seed")
+        self.initMode = Param(self, "initMode",
+                              "the initialization algorithm. This can be either \"random\" to " +
+                              "choose random points as initial cluster centers, or \"k-means||\" " +
+                              "to use a parallel variant of k-means++")
+        self.initSteps = Param(self, "initSteps", "steps for k-means initialization mode")
+        self._setDefault(k=2, maxIter=20, runs=1, epsilon=1e-4, initMode="k-means||", initSteps=5)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    def _create_model(self, java_model):
+        return KMeansModel(java_model)
+
+    @keyword_only
+    def setParams(self, k=2, maxIter=20, runs=1, epsilon=1e-4, initMode="k-means||", initSteps=5):
+        """
+        setParams(self, k=2, maxIter=20, runs=1, epsilon=1e-4, initMode="k-means||", initSteps=5):
+
+        Sets params for KMeans.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setK(self, value):
+        """
+        Sets the value of :py:attr:`k`.
+
+        >>> algo = KMeans().setK(10)
+        >>> algo.getK()
+        10
+        """
+        self._paramMap[self.k] = value
+        return self
+
+    def getK(self):
+        """
+        Gets the value of `k`
+        """
+        return self.getOrDefault(self.k)
+
+    def setEpsilon(self, value):
+        """
+        Sets the value of :py:attr:`epsilon`.
+
+        >>> algo = KMeans().setEpsilon(1e-5)
+        >>> abs(algo.getEpsilon() - 1e-5) < 1e-5
+        True
+        """
+        self._paramMap[self.epsilon] = value
+        return self
+
+    def getEpsilon(self):
+        """
+        Gets the value of `epsilon`
+        """
+        return self.getOrDefault(self.epsilon)
+
+    def setRuns(self, value):
+        """
+        Sets the value of :py:attr:`runs`.
+
+        >>> algo = KMeans().setRuns(10)
+        >>> algo.getRuns()
+        10
+        """
+        self._paramMap[self.runs] = value
+        return self
+
+    def getRuns(self):
+        """
+        Gets the value of `runs`
+        """
+        return self.getOrDefault(self.runs)
+
+    def setInitMode(self, value):
+        """
+        Sets the value of :py:attr:`initMode`.
+
+        >>> algo = KMeans()
+        >>> algo.getInitMode()
+        'k-means||'
+        >>> algo = algo.setInitMode("random")
+        >>> algo.getInitMode()
+        'random'
+        """
+        self._paramMap[self.initMode] = value
+        return self
+
+    def getInitMode(self):
+        """
+        Gets the value of `initMode`
+        """
+        return self.getOrDefault(self.initMode)
+
+    def setInitSteps(self, value):
+        """
+        Sets the value of :py:attr:`initSteps`.
+
+        >>> algo = KMeans().setInitSteps(10)
+        >>> algo.getInitSteps()
+        10
+        """
+        self._paramMap[self.initSteps] = value
+        return self
+
+    def getInitSteps(self):
+        """
+        Gets the value of `initSteps`
+        """
+        return self.getOrDefault(self.initSteps)
+
+
+if __name__ == "__main__":
+    import doctest
+    from pyspark.context import SparkContext
+    from pyspark.sql import SQLContext
+    globs = globals().copy()
+    # The small batch size here ensures that we see multiple batches,
+    # even in these small test examples:
+    sc = SparkContext("local[2]", "ml.clustering tests")
+    sqlContext = SQLContext(sc)
+    globs['sc'] = sc
+    globs['sqlContext'] = sqlContext
+    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
+    sc.stop()
+    if failure_count:
+        exit(-1)

From 1017908205b7690dc0b0ed4753b36fab5641f7ac Mon Sep 17 00:00:00 2001
From: Rekha Joshi <rekhajoshm@gmail.com>
Date: Fri, 17 Jul 2015 20:02:05 -0700
Subject: [PATCH 226/549] [SPARK-9118] [ML] Implement IntArrayParam in mllib

Implement IntArrayParam in mllib

Author: Rekha Joshi <rekhajoshm@gmail.com>
Author: Joshi <rekhajoshm@gmail.com>

Closes #7481 from rekhajoshm/SPARK-9118 and squashes the following commits:

d3b1766 [Joshi] Implement IntArrayParam
0be142d [Rekha Joshi] Merge pull request #3 from apache/master
106fd8e [Rekha Joshi] Merge pull request #2 from apache/master
e3677c9 [Rekha Joshi] Merge pull request #1 from apache/master
---
 .../scala/org/apache/spark/ml/param/params.scala | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index d034d7ec6b60e..824efa5ed4b28 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -295,6 +295,22 @@ class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array
     w(value.asScala.map(_.asInstanceOf[Double]).toArray)
 }
 
+/**
+ * :: DeveloperApi ::
+ * Specialized version of [[Param[Array[Int]]]] for Java.
+ */
+@DeveloperApi
+class IntArrayParam(parent: Params, name: String, doc: String, isValid: Array[Int] => Boolean)
+  extends Param[Array[Int]](parent, name, doc, isValid) {
+
+  def this(parent: Params, name: String, doc: String) =
+    this(parent, name, doc, ParamValidators.alwaysTrue)
+
+  /** Creates a param pair with a [[java.util.List]] of values (for Java and Python). */
+  def w(value: java.util.List[java.lang.Integer]): ParamPair[Array[Int]] =
+    w(value.asScala.map(_.asInstanceOf[Int]).toArray)
+}
+
 /**
  * :: Experimental ::
  * A param and its value.

From b9ef7ac98c3dee3256c4a393e563b42b4612a4bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Kozikowski?= <mupakoz@gmail.com>
Date: Sat, 18 Jul 2015 10:12:48 -0700
Subject: [PATCH 227/549] [MLLIB] [DOC] Seed fix in mllib naive bayes example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous seed resulted in empty test data set.

Author: Paweł Kozikowski <mupakoz@gmail.com>

Closes #7477 from mupakoz/patch-1 and squashes the following commits:

f5d41ee [Paweł Kozikowski] Mllib Naive Bayes example data set enlarged
---
 data/mllib/sample_naive_bayes_data.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/data/mllib/sample_naive_bayes_data.txt b/data/mllib/sample_naive_bayes_data.txt
index 981da382d6ac8..bd22bea3a59d6 100644
--- a/data/mllib/sample_naive_bayes_data.txt
+++ b/data/mllib/sample_naive_bayes_data.txt
@@ -1,6 +1,12 @@
 0,1 0 0
 0,2 0 0
+0,3 0 0
+0,4 0 0
 1,0 1 0
 1,0 2 0
+1,0 3 0
+1,0 4 0
 2,0 0 1
 2,0 0 2
+2,0 0 3
+2,0 0 4
\ No newline at end of file

From fba3f5ba85673336c0556ef8731dcbcd175c7418 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 18 Jul 2015 11:06:46 -0700
Subject: [PATCH 228/549] [SPARK-9169][SQL] Improve unit test coverage for null
 expressions.

Author: Reynold Xin <rxin@databricks.com>

Closes #7490 from rxin/unit-test-null-funcs and squashes the following commits:

7b276f0 [Reynold Xin] Move isNaN.
8307287 [Reynold Xin] [SPARK-9169][SQL] Improve unit test coverage for null expressions.
---
 .../catalyst/expressions/nullFunctions.scala  | 81 +++++++++++++++++--
 .../sql/catalyst/expressions/predicates.scala | 51 ------------
 .../expressions/NullFunctionsSuite.scala      | 78 +++++++++---------
 .../catalyst/expressions/PredicateSuite.scala | 12 +--
 4 files changed, 119 insertions(+), 103 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index 1522bcae08d17..98c67084642e3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -21,8 +21,19 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.TypeUtils
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types._
 
+
+/**
+ * An expression that is evaluated to the first non-null input.
+ *
+ * {{{
+ *   coalesce(1, 2) => 1
+ *   coalesce(null, 1, 2) => 1
+ *   coalesce(null, null, 2) => 2
+ *   coalesce(null, null, null) => null
+ * }}}
+ */
 case class Coalesce(children: Seq[Expression]) extends Expression {
 
   /** Coalesce is nullable if all of its children are nullable, or if it has no children. */
@@ -70,6 +81,62 @@ case class Coalesce(children: Seq[Expression]) extends Expression {
   }
 }
 
+
+/**
+ * Evaluates to `true` if it's NaN or null
+ */
+case class IsNaN(child: Expression) extends UnaryExpression
+  with Predicate with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(DoubleType, FloatType))
+
+  override def nullable: Boolean = false
+
+  override def eval(input: InternalRow): Any = {
+    val value = child.eval(input)
+    if (value == null) {
+      true
+    } else {
+      child.dataType match {
+        case DoubleType => value.asInstanceOf[Double].isNaN
+        case FloatType => value.asInstanceOf[Float].isNaN
+      }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val eval = child.gen(ctx)
+    child.dataType match {
+      case FloatType =>
+        s"""
+          ${eval.code}
+          boolean ${ev.isNull} = false;
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (${eval.isNull}) {
+            ${ev.primitive} = true;
+          } else {
+            ${ev.primitive} = Float.isNaN(${eval.primitive});
+          }
+        """
+      case DoubleType =>
+        s"""
+          ${eval.code}
+          boolean ${ev.isNull} = false;
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (${eval.isNull}) {
+            ${ev.primitive} = true;
+          } else {
+            ${ev.primitive} = Double.isNaN(${eval.primitive});
+          }
+        """
+    }
+  }
+}
+
+
+/**
+ * An expression that is evaluated to true if the input is null.
+ */
 case class IsNull(child: Expression) extends UnaryExpression with Predicate {
   override def nullable: Boolean = false
 
@@ -83,13 +150,14 @@ case class IsNull(child: Expression) extends UnaryExpression with Predicate {
     ev.primitive = eval.isNull
     eval.code
   }
-
-  override def toString: String = s"IS NULL $child"
 }
 
+
+/**
+ * An expression that is evaluated to true if the input is not null.
+ */
 case class IsNotNull(child: Expression) extends UnaryExpression with Predicate {
   override def nullable: Boolean = false
-  override def toString: String = s"IS NOT NULL $child"
 
   override def eval(input: InternalRow): Any = {
     child.eval(input) != null
@@ -103,12 +171,13 @@ case class IsNotNull(child: Expression) extends UnaryExpression with Predicate {
   }
 }
 
+
 /**
- * A predicate that is evaluated to be true if there are at least `n` non-null values.
+ * A predicate that is evaluated to be true if there are at least `n` non-null and non-NaN values.
  */
 case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate {
   override def nullable: Boolean = false
-  override def foldable: Boolean = false
+  override def foldable: Boolean = children.forall(_.foldable)
   override def toString: String = s"AtLeastNNulls(n, ${children.mkString(",")})"
 
   private[this] val childrenArray = children.toArray
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 2751c8e75f357..bddd2a9eccfc0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
@@ -120,56 +119,6 @@ case class InSet(child: Expression, hset: Set[Any])
   }
 }
 
-/**
- * Evaluates to `true` if it's NaN or null
- */
-case class IsNaN(child: Expression) extends UnaryExpression
-    with Predicate with ImplicitCastInputTypes {
-
-  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(DoubleType, FloatType))
-
-  override def nullable: Boolean = false
-
-  override def eval(input: InternalRow): Any = {
-    val value = child.eval(input)
-    if (value == null) {
-      true
-    } else {
-      child.dataType match {
-        case DoubleType => value.asInstanceOf[Double].isNaN
-        case FloatType => value.asInstanceOf[Float].isNaN
-      }
-    }
-  }
-
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val eval = child.gen(ctx)
-    child.dataType match {
-      case FloatType =>
-        s"""
-          ${eval.code}
-          boolean ${ev.isNull} = false;
-          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-          if (${eval.isNull}) {
-            ${ev.primitive} = true;
-          } else {
-            ${ev.primitive} = Float.isNaN(${eval.primitive});
-          }
-        """
-      case DoubleType =>
-        s"""
-          ${eval.code}
-          boolean ${ev.isNull} = false;
-          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-          if (${eval.isNull}) {
-            ${ev.primitive} = true;
-          } else {
-            ${ev.primitive} = Double.isNaN(${eval.primitive});
-          }
-        """
-    }
-  }
-}
 
 case class And(left: Expression, right: Expression) extends BinaryOperator with Predicate {
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
index ccdada8b56f83..765cc7a969b5d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
@@ -18,48 +18,52 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.{BooleanType, StringType, ShortType}
+import org.apache.spark.sql.types._
 
 class NullFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
-  test("null checking") {
-    val row = create_row("^Ba*n", null, true, null)
-    val c1 = 'a.string.at(0)
-    val c2 = 'a.string.at(1)
-    val c3 = 'a.boolean.at(2)
-    val c4 = 'a.boolean.at(3)
-
-    checkEvaluation(c1.isNull, false, row)
-    checkEvaluation(c1.isNotNull, true, row)
-
-    checkEvaluation(c2.isNull, true, row)
-    checkEvaluation(c2.isNotNull, false, row)
-
-    checkEvaluation(Literal.create(1, ShortType).isNull, false)
-    checkEvaluation(Literal.create(1, ShortType).isNotNull, true)
-
-    checkEvaluation(Literal.create(null, ShortType).isNull, true)
-    checkEvaluation(Literal.create(null, ShortType).isNotNull, false)
+  def testAllTypes(testFunc: (Any, DataType) => Unit): Unit = {
+    testFunc(false, BooleanType)
+    testFunc(1.toByte, ByteType)
+    testFunc(1.toShort, ShortType)
+    testFunc(1, IntegerType)
+    testFunc(1L, LongType)
+    testFunc(1.0F, FloatType)
+    testFunc(1.0, DoubleType)
+    testFunc(Decimal(1.5), DecimalType.Unlimited)
+    testFunc(new java.sql.Date(10), DateType)
+    testFunc(new java.sql.Timestamp(10), TimestampType)
+    testFunc("abcd", StringType)
+  }
 
-    checkEvaluation(Coalesce(c1 :: c2 :: Nil), "^Ba*n", row)
-    checkEvaluation(Coalesce(Literal.create(null, StringType) :: Nil), null, row)
-    checkEvaluation(Coalesce(Literal.create(null, StringType) :: c1 :: c2 :: Nil), "^Ba*n", row)
+  test("isnull and isnotnull") {
+    testAllTypes { (value: Any, tpe: DataType) =>
+      checkEvaluation(IsNull(Literal.create(value, tpe)), false)
+      checkEvaluation(IsNotNull(Literal.create(value, tpe)), true)
+      checkEvaluation(IsNull(Literal.create(null, tpe)), true)
+      checkEvaluation(IsNotNull(Literal.create(null, tpe)), false)
+    }
+  }
 
-    checkEvaluation(
-      If(c3, Literal.create("a", StringType), Literal.create("b", StringType)), "a", row)
-    checkEvaluation(If(c3, c1, c2), "^Ba*n", row)
-    checkEvaluation(If(c4, c2, c1), "^Ba*n", row)
-    checkEvaluation(If(Literal.create(null, BooleanType), c2, c1), "^Ba*n", row)
-    checkEvaluation(If(Literal.create(true, BooleanType), c1, c2), "^Ba*n", row)
-    checkEvaluation(If(Literal.create(false, BooleanType), c2, c1), "^Ba*n", row)
-    checkEvaluation(If(Literal.create(false, BooleanType),
-      Literal.create("a", StringType), Literal.create("b", StringType)), "b", row)
+  test("IsNaN") {
+    checkEvaluation(IsNaN(Literal(Double.NaN)), true)
+    checkEvaluation(IsNaN(Literal(Float.NaN)), true)
+    checkEvaluation(IsNaN(Literal(math.log(-3))), true)
+    checkEvaluation(IsNaN(Literal.create(null, DoubleType)), true)
+    checkEvaluation(IsNaN(Literal(Double.PositiveInfinity)), false)
+    checkEvaluation(IsNaN(Literal(Float.MaxValue)), false)
+    checkEvaluation(IsNaN(Literal(5.5f)), false)
+  }
 
-    checkEvaluation(c1 in (c1, c2), true, row)
-    checkEvaluation(
-      Literal.create("^Ba*n", StringType) in (Literal.create("^Ba*n", StringType)), true, row)
-    checkEvaluation(
-      Literal.create("^Ba*n", StringType) in (Literal.create("^Ba*n", StringType), c2), true, row)
+  test("coalesce") {
+    testAllTypes { (value: Any, tpe: DataType) =>
+      val lit = Literal.create(value, tpe)
+      val nullLit = Literal.create(null, tpe)
+      checkEvaluation(Coalesce(Seq(nullLit)), null)
+      checkEvaluation(Coalesce(Seq(lit)), value)
+      checkEvaluation(Coalesce(Seq(nullLit, lit)), value)
+      checkEvaluation(Coalesce(Seq(nullLit, lit, lit)), value)
+      checkEvaluation(Coalesce(Seq(nullLit, nullLit, lit)), value)
+    }
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
index 052abc51af5fd..2173a0c25c645 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -114,16 +114,10 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       And(In(Literal(1), Seq(Literal(1), Literal(2))), In(Literal(2), Seq(Literal(1), Literal(2)))),
       true)
-  }
 
-  test("IsNaN") {
-    checkEvaluation(IsNaN(Literal(Double.NaN)), true)
-    checkEvaluation(IsNaN(Literal(Float.NaN)), true)
-    checkEvaluation(IsNaN(Literal(math.log(-3))), true)
-    checkEvaluation(IsNaN(Literal.create(null, DoubleType)), true)
-    checkEvaluation(IsNaN(Literal(Double.PositiveInfinity)), false)
-    checkEvaluation(IsNaN(Literal(Float.MaxValue)), false)
-    checkEvaluation(IsNaN(Literal(5.5f)), false)
+    checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("^Ba*n"))), true)
+    checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("aa"), Literal("^Ba*n"))), true)
+    checkEvaluation(In(Literal("^Ba*n"), Seq(Literal("aa"), Literal("^n"))), false)
   }
 
   test("INSET") {

From b8aec6cd236f09881cad0fff9a6f1a5692934e21 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Sat, 18 Jul 2015 11:08:18 -0700
Subject: [PATCH 229/549] [SPARK-9143] [SQL] Add planner rule for automatically
 inserting Unsafe <-> Safe row format converters

Now that we have two different internal row formats, UnsafeRow and the old Java-object-based row format, we end up having to perform conversions between these two formats. These conversions should not be performed by the operators themselves; instead, the planner should be responsible for inserting appropriate format conversions when they are needed.

This patch makes the following changes:

- Add two new physical operators for performing row format conversions, `ConvertToUnsafe` and `ConvertFromUnsafe`.
- Add new methods to `SparkPlan` to allow operators to express whether they output UnsafeRows and whether they can handle safe or unsafe rows as inputs.
- Implement an `EnsureRowFormats` rule to automatically insert converter operators where necessary.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7482 from JoshRosen/unsafe-converter-planning and squashes the following commits:

7450fa5 [Josh Rosen] Resolve conflicts in favor of choosing UnsafeRow
5220cce [Josh Rosen] Add roundtrip converter test
2bb8da8 [Josh Rosen] Add Union unsafe support + tests to bump up test coverage
6f79449 [Josh Rosen] Add even more assertions to execute()
08ce199 [Josh Rosen] Rename ConvertFromUnsafe -> ConvertToSafe
0e2d548 [Josh Rosen] Add assertion if operators' input rows are in different formats
cabb703 [Josh Rosen] Add tests for Filter
3b11ce3 [Josh Rosen] Add missing test file.
ae2195a [Josh Rosen] Fixes
0fef0f8 [Josh Rosen] Rename file.
d5f9005 [Josh Rosen] Finish writing EnsureRowFormats planner rule
b5df19b [Josh Rosen] Merge remote-tracking branch 'origin/master' into unsafe-converter-planning
9ba3038 [Josh Rosen] WIP
---
 .../org/apache/spark/sql/SQLContext.scala     |   9 +-
 .../spark/sql/execution/SparkPlan.scala       |  24 ++++
 .../spark/sql/execution/basicOperators.scala  |  11 ++
 .../sql/execution/rowFormatConverters.scala   | 107 ++++++++++++++++++
 .../execution/RowFormatConvertersSuite.scala  |  91 +++++++++++++++
 5 files changed, 239 insertions(+), 3 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 46bd60daa1f78..2dda3ad1211fa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -921,12 +921,15 @@ class SQLContext(@transient val sparkContext: SparkContext)
   protected[sql] lazy val emptyResult = sparkContext.parallelize(Seq.empty[InternalRow], 1)
 
   /**
-   * Prepares a planned SparkPlan for execution by inserting shuffle operations as needed.
+   * Prepares a planned SparkPlan for execution by inserting shuffle operations and internal
+   * row format conversions as needed.
    */
   @transient
   protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
-    val batches =
-      Batch("Add exchange", Once, EnsureRequirements(self)) :: Nil
+    val batches = Seq(
+      Batch("Add exchange", Once, EnsureRequirements(self)),
+      Batch("Add row converters", Once, EnsureRowFormats)
+    )
   }
 
   protected[sql] def openSession(): SQLSession = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index ba12056ee7a1b..f363e9947d5f6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -79,12 +79,36 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Product
   /** Specifies sort order for each partition requirements on the input data for this operator. */
   def requiredChildOrdering: Seq[Seq[SortOrder]] = Seq.fill(children.size)(Nil)
 
+  /** Specifies whether this operator outputs UnsafeRows */
+  def outputsUnsafeRows: Boolean = false
+
+  /** Specifies whether this operator is capable of processing UnsafeRows */
+  def canProcessUnsafeRows: Boolean = false
+
+  /**
+   * Specifies whether this operator is capable of processing Java-object-based Rows (i.e. rows
+   * that are not UnsafeRows).
+   */
+  def canProcessSafeRows: Boolean = true
+
   /**
    * Returns the result of this query as an RDD[InternalRow] by delegating to doExecute
    * after adding query plan information to created RDDs for visualization.
    * Concrete implementations of SparkPlan should override doExecute instead.
    */
   final def execute(): RDD[InternalRow] = {
+    if (children.nonEmpty) {
+      val hasUnsafeInputs = children.exists(_.outputsUnsafeRows)
+      val hasSafeInputs = children.exists(!_.outputsUnsafeRows)
+      assert(!(hasSafeInputs && hasUnsafeInputs),
+        "Child operators should output rows in the same format")
+      assert(canProcessSafeRows || canProcessUnsafeRows,
+        "Operator must be able to process at least one row format")
+      assert(!hasSafeInputs || canProcessSafeRows,
+        "Operator will receive safe rows as input but cannot process safe rows")
+      assert(!hasUnsafeInputs || canProcessUnsafeRows,
+        "Operator will receive unsafe rows as input but cannot process unsafe rows")
+    }
     RDDOperationScope.withScope(sparkContext, nodeName, false, true) {
       doExecute()
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index 4c063c299ba53..82bef269b069f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -64,6 +64,12 @@ case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
   }
 
   override def outputOrdering: Seq[SortOrder] = child.outputOrdering
+
+  override def outputsUnsafeRows: Boolean = child.outputsUnsafeRows
+
+  override def canProcessUnsafeRows: Boolean = true
+
+  override def canProcessSafeRows: Boolean = true
 }
 
 /**
@@ -104,6 +110,9 @@ case class Sample(
 case class Union(children: Seq[SparkPlan]) extends SparkPlan {
   // TODO: attributes output by union should be distinct for nullability purposes
   override def output: Seq[Attribute] = children.head.output
+  override def outputsUnsafeRows: Boolean = children.forall(_.outputsUnsafeRows)
+  override def canProcessUnsafeRows: Boolean = true
+  override def canProcessSafeRows: Boolean = true
   protected override def doExecute(): RDD[InternalRow] =
     sparkContext.union(children.map(_.execute()))
 }
@@ -306,6 +315,8 @@ case class UnsafeExternalSort(
   override def output: Seq[Attribute] = child.output
 
   override def outputOrdering: Seq[SortOrder] = sortOrder
+
+  override def outputsUnsafeRows: Boolean = true
 }
 
 @DeveloperApi
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala
new file mode 100644
index 0000000000000..421d510e6782d
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.rules.Rule
+
+/**
+ * :: DeveloperApi ::
+ * Converts Java-object-based rows into [[UnsafeRow]]s.
+ */
+@DeveloperApi
+case class ConvertToUnsafe(child: SparkPlan) extends UnaryNode {
+  override def output: Seq[Attribute] = child.output
+  override def outputsUnsafeRows: Boolean = true
+  override def canProcessUnsafeRows: Boolean = false
+  override def canProcessSafeRows: Boolean = true
+  override protected def doExecute(): RDD[InternalRow] = {
+    child.execute().mapPartitions { iter =>
+      val convertToUnsafe = UnsafeProjection.create(child.schema)
+      iter.map(convertToUnsafe)
+    }
+  }
+}
+
+/**
+ * :: DeveloperApi ::
+ * Converts [[UnsafeRow]]s back into Java-object-based rows.
+ */
+@DeveloperApi
+case class ConvertToSafe(child: SparkPlan) extends UnaryNode {
+  override def output: Seq[Attribute] = child.output
+  override def outputsUnsafeRows: Boolean = false
+  override def canProcessUnsafeRows: Boolean = true
+  override def canProcessSafeRows: Boolean = false
+  override protected def doExecute(): RDD[InternalRow] = {
+    child.execute().mapPartitions { iter =>
+      val convertToSafe = FromUnsafeProjection(child.output.map(_.dataType))
+      iter.map(convertToSafe)
+    }
+  }
+}
+
+private[sql] object EnsureRowFormats extends Rule[SparkPlan] {
+
+  private def onlyHandlesSafeRows(operator: SparkPlan): Boolean =
+    operator.canProcessSafeRows && !operator.canProcessUnsafeRows
+
+  private def onlyHandlesUnsafeRows(operator: SparkPlan): Boolean =
+    operator.canProcessUnsafeRows && !operator.canProcessSafeRows
+
+  private def handlesBothSafeAndUnsafeRows(operator: SparkPlan): Boolean =
+    operator.canProcessSafeRows && operator.canProcessUnsafeRows
+
+  override def apply(operator: SparkPlan): SparkPlan = operator.transformUp {
+    case operator: SparkPlan if onlyHandlesSafeRows(operator) =>
+      if (operator.children.exists(_.outputsUnsafeRows)) {
+        operator.withNewChildren {
+          operator.children.map {
+            c => if (c.outputsUnsafeRows) ConvertToSafe(c) else c
+          }
+        }
+      } else {
+        operator
+      }
+    case operator: SparkPlan if onlyHandlesUnsafeRows(operator) =>
+      if (operator.children.exists(!_.outputsUnsafeRows)) {
+        operator.withNewChildren {
+          operator.children.map {
+            c => if (!c.outputsUnsafeRows) ConvertToUnsafe(c) else c
+          }
+        }
+      } else {
+        operator
+      }
+    case operator: SparkPlan if handlesBothSafeAndUnsafeRows(operator) =>
+      if (operator.children.map(_.outputsUnsafeRows).toSet.size != 1) {
+        // If this operator's children produce both unsafe and safe rows, then convert everything
+        // to unsafe rows
+        operator.withNewChildren {
+          operator.children.map {
+            c => if (!c.outputsUnsafeRows) ConvertToUnsafe(c) else c
+          }
+        }
+      } else {
+        operator
+      }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
new file mode 100644
index 0000000000000..7b75f755918c1
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.expressions.IsNull
+import org.apache.spark.sql.test.TestSQLContext
+
+class RowFormatConvertersSuite extends SparkPlanTest {
+
+  private def getConverters(plan: SparkPlan): Seq[SparkPlan] = plan.collect {
+    case c: ConvertToUnsafe => c
+    case c: ConvertToSafe => c
+  }
+
+  private val outputsSafe = ExternalSort(Nil, false, PhysicalRDD(Seq.empty, null))
+  assert(!outputsSafe.outputsUnsafeRows)
+  private val outputsUnsafe = UnsafeExternalSort(Nil, false, PhysicalRDD(Seq.empty, null))
+  assert(outputsUnsafe.outputsUnsafeRows)
+
+  test("planner should insert unsafe->safe conversions when required") {
+    val plan = Limit(10, outputsUnsafe)
+    val preparedPlan = TestSQLContext.prepareForExecution.execute(plan)
+    assert(preparedPlan.children.head.isInstanceOf[ConvertToSafe])
+  }
+
+  test("filter can process unsafe rows") {
+    val plan = Filter(IsNull(null), outputsUnsafe)
+    val preparedPlan = TestSQLContext.prepareForExecution.execute(plan)
+    assert(getConverters(preparedPlan).isEmpty)
+    assert(preparedPlan.outputsUnsafeRows)
+  }
+
+  test("filter can process safe rows") {
+    val plan = Filter(IsNull(null), outputsSafe)
+    val preparedPlan = TestSQLContext.prepareForExecution.execute(plan)
+    assert(getConverters(preparedPlan).isEmpty)
+    assert(!preparedPlan.outputsUnsafeRows)
+  }
+
+  test("execute() fails an assertion if inputs rows are of different formats") {
+    val e = intercept[AssertionError] {
+      Union(Seq(outputsSafe, outputsUnsafe)).execute()
+    }
+    assert(e.getMessage.contains("format"))
+  }
+
+  test("union requires all of its input rows' formats to agree") {
+    val plan = Union(Seq(outputsSafe, outputsUnsafe))
+    assert(plan.canProcessSafeRows && plan.canProcessUnsafeRows)
+    val preparedPlan = TestSQLContext.prepareForExecution.execute(plan)
+    assert(preparedPlan.outputsUnsafeRows)
+  }
+
+  test("union can process safe rows") {
+    val plan = Union(Seq(outputsSafe, outputsSafe))
+    val preparedPlan = TestSQLContext.prepareForExecution.execute(plan)
+    assert(!preparedPlan.outputsUnsafeRows)
+  }
+
+  test("union can process unsafe rows") {
+    val plan = Union(Seq(outputsUnsafe, outputsUnsafe))
+    val preparedPlan = TestSQLContext.prepareForExecution.execute(plan)
+    assert(preparedPlan.outputsUnsafeRows)
+  }
+
+  test("round trip with ConvertToUnsafe and ConvertToSafe") {
+    val input = Seq(("hello", 1), ("world", 2))
+    checkAnswer(
+      TestSQLContext.createDataFrame(input),
+      plan => ConvertToSafe(ConvertToUnsafe(plan)),
+      input.map(Row.fromTuple)
+    )
+  }
+}

From 1b4ff05538fbcfe10ca4fa97606bd6e39a8450cb Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sat, 18 Jul 2015 11:13:49 -0700
Subject: [PATCH 230/549] [SPARK-9142][SQL] remove more self type in catalyst

a follow up of https://github.com/apache/spark/pull/7479.
The `TreeNode` is the root case of the requirement of `self: Product =>` stuff, so why not make `TreeNode` extend `Product`?

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7495 from cloud-fan/self-type and squashes the following commits:

8676af7 [Wenchen Fan] remove more self type
---
 .../apache/spark/sql/catalyst/expressions/Expression.scala    | 2 +-
 .../org/apache/spark/sql/catalyst/expressions/math.scala      | 2 +-
 .../scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala | 2 +-
 .../apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala | 2 +-
 .../scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala  | 4 ++--
 .../main/scala/org/apache/spark/sql/execution/SparkPlan.scala | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index c70b5af4aa448..0e128d8bdcd96 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -43,7 +43,7 @@ import org.apache.spark.sql.types._
  *
  * See [[Substring]] for an example.
  */
-abstract class Expression extends TreeNode[Expression] with Product {
+abstract class Expression extends TreeNode[Expression] {
 
   /**
    * Returns true when an expression is a candidate for static evaluation before the query is
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 9101f11052218..eb5c065a34123 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -77,7 +77,7 @@ abstract class UnaryMathExpression(f: Double => Double, name: String)
 }
 
 abstract class UnaryLogExpression(f: Double => Double, name: String)
-    extends UnaryMathExpression(f, name) { self: Product =>
+    extends UnaryMathExpression(f, name) {
 
   // values less than or equal to yAsymptote eval to null in Hive, instead of NaN or -Infinity
   protected val yAsymptote: Double = 0.0
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index b89e3382f06a9..d06a7a2add754 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType}
 
 abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanType] {
-  self: PlanType with Product =>
+  self: PlanType =>
 
   def output: Seq[Attribute]
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index dd6c5d43f5714..bedeaf06adf12 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.trees.TreeNode
 
 
-abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging with Product{
+abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
 
   /**
    * Computes [[Statistics]] for this plan. The default implementation assumes the output
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 0f95ca688a7a8..122e9fc5ed77f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -54,8 +54,8 @@ object CurrentOrigin {
   }
 }
 
-abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
-  self: BaseType with Product =>
+abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
+  self: BaseType =>
 
   val origin: Origin = CurrentOrigin.get
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index f363e9947d5f6..b0d56b7bf0b86 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -39,7 +39,7 @@ object SparkPlan {
  * :: DeveloperApi ::
  */
 @DeveloperApi
-abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Product with Serializable {
+abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializable {
 
   /**
    * A handle to the SQL Context that was used to create this plan.   Since many operators need

From 692378c01d949dfe2b2a884add153cd5f8054b5a Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sat, 18 Jul 2015 11:25:16 -0700
Subject: [PATCH 231/549] [SPARK-9167][SQL] use UTC Calendar in `stringToDate`

fix 2 bugs introduced in https://github.com/apache/spark/pull/7353

1. we should use UTC Calendar when cast string to date . Before #7353 , we use `DateTimeUtils.fromJavaDate(Date.valueOf(s.toString))` to cast string to date, and `fromJavaDate` will call `millisToDays` to avoid the time zone issue. Now we use `DateTimeUtils.stringToDate(s)`, we should create a Calendar with UTC in the begging.
2. we should not change the default time zone in test cases. The `threadLocalLocalTimeZone` and `threadLocalTimestampFormat` in `DateTimeUtils` will only be evaluated once for each thread, so we can't set the default time zone back anymore.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7488 from cloud-fan/datetime and squashes the following commits:

9cd6005 [Wenchen Fan] address comments
21ef293 [Wenchen Fan] fix 2 bugs in datetime
---
 .../sql/catalyst/util/DateTimeUtils.scala     |  9 +++++----
 .../sql/catalyst/expressions/CastSuite.scala  |  3 ---
 .../catalyst/util/DateTimeUtilsSuite.scala    | 19 ++++++++++---------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index f33e34b380bcf..45e45aef1a349 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -65,8 +65,8 @@ object DateTimeUtils {
   def millisToDays(millisUtc: Long): Int = {
     // SPARK-6785: use Math.floor so negative number of days (dates before 1970)
     // will correctly work as input for function toJavaDate(Int)
-    val millisLocal = millisUtc.toDouble + threadLocalLocalTimeZone.get().getOffset(millisUtc)
-    Math.floor(millisLocal / MILLIS_PER_DAY).toInt
+    val millisLocal = millisUtc + threadLocalLocalTimeZone.get().getOffset(millisUtc)
+    Math.floor(millisLocal.toDouble / MILLIS_PER_DAY).toInt
   }
 
   // reverse of millisToDays
@@ -375,8 +375,9 @@ object DateTimeUtils {
         segments(2) < 1 || segments(2) > 31) {
       return None
     }
-    val c = Calendar.getInstance()
+    val c = Calendar.getInstance(TimeZone.getTimeZone("GMT"))
     c.set(segments(0), segments(1) - 1, segments(2), 0, 0, 0)
-    Some((c.getTimeInMillis / 1000 / 3600 / 24).toInt)
+    c.set(Calendar.MILLISECOND, 0)
+    Some((c.getTimeInMillis / MILLIS_PER_DAY).toInt)
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index ef8bcd41f7280..ccf448eee0688 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -281,8 +281,6 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     val nts = sts + ".1"
     val ts = Timestamp.valueOf(nts)
 
-    val defaultTimeZone = TimeZone.getDefault
-    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))
     var c = Calendar.getInstance()
     c.set(2015, 2, 8, 2, 30, 0)
     checkEvaluation(cast(cast(new Timestamp(c.getTimeInMillis), StringType), TimestampType),
@@ -291,7 +289,6 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     c.set(2015, 10, 1, 2, 30, 0)
     checkEvaluation(cast(cast(new Timestamp(c.getTimeInMillis), StringType), TimestampType),
       c.getTimeInMillis * 1000)
-    TimeZone.setDefault(defaultTimeZone)
 
     checkEvaluation(cast("abdef", StringType), "abdef")
     checkEvaluation(cast("abdef", DecimalType.Unlimited), null)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index 5c3a621c6d11f..04c5f09792ac3 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -90,34 +90,35 @@ class DateTimeUtilsSuite extends SparkFunSuite {
   }
 
   test("string to date") {
-    val millisPerDay = 1000L * 3600L * 24L
+    import DateTimeUtils.millisToDays
+
     var c = Calendar.getInstance()
     c.set(2015, 0, 28, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-01-28")).get ===
-      c.getTimeInMillis / millisPerDay)
+      millisToDays(c.getTimeInMillis))
     c.set(2015, 0, 1, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015")).get ===
-      c.getTimeInMillis / millisPerDay)
+      millisToDays(c.getTimeInMillis))
     c = Calendar.getInstance()
     c.set(2015, 2, 1, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03")).get ===
-      c.getTimeInMillis / millisPerDay)
+      millisToDays(c.getTimeInMillis))
     c = Calendar.getInstance()
     c.set(2015, 2, 18, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18")).get ===
-      c.getTimeInMillis / millisPerDay)
+      millisToDays(c.getTimeInMillis))
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18 ")).get ===
-      c.getTimeInMillis / millisPerDay)
+      millisToDays(c.getTimeInMillis))
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18 123142")).get ===
-      c.getTimeInMillis / millisPerDay)
+      millisToDays(c.getTimeInMillis))
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18T123123")).get ===
-      c.getTimeInMillis / millisPerDay)
+      millisToDays(c.getTimeInMillis))
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18T")).get ===
-      c.getTimeInMillis / millisPerDay)
+      millisToDays(c.getTimeInMillis))
 
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18X")).isEmpty)
     assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015/03/18")).isEmpty)

From 86c50bf72c41d95107a55c16a6853dcda7f3e143 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sat, 18 Jul 2015 11:58:53 -0700
Subject: [PATCH 232/549] [SPARK-9171][SQL] add and improve tests for
 nondeterministic expressions

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7496 from cloud-fan/tests and squashes the following commits:

0958f90 [Wenchen Fan] improve test for nondeterministic expressions
---
 .../scala/org/apache/spark/TaskContext.scala  |   2 +-
 .../expressions/ExpressionEvalHelper.scala    | 108 ++++++++++--------
 .../expressions/MathFunctionsSuite.scala      |  18 +--
 .../catalyst/expressions/RandomSuite.scala    |   6 +-
 .../spark/sql/ColumnExpressionSuite.scala     |   9 +-
 .../expression/NondeterministicSuite.scala    |  32 ++++++
 6 files changed, 103 insertions(+), 72 deletions(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala

diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala
index 345bb500a7dec..e93eb93124e51 100644
--- a/core/src/main/scala/org/apache/spark/TaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContext.scala
@@ -38,7 +38,7 @@ object TaskContext {
    */
   def getPartitionId(): Int = {
     val tc = taskContext.get()
-    if (tc == null) {
+    if (tc eq null) {
       0
     } else {
       tc.partitionId()
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index c43486b3ddcf5..7a96044d35a09 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -23,7 +23,7 @@ import org.scalatest.Matchers._
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
-import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateUnsafeProjection, GenerateProjection, GenerateMutableProjection}
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer
 import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
 
@@ -38,7 +38,7 @@ trait ExpressionEvalHelper {
   }
 
   protected def checkEvaluation(
-      expression: Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = {
+      expression: => Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = {
     val catalystValue = CatalystTypeConverters.convertToCatalyst(expected)
     checkEvaluationWithoutCodegen(expression, catalystValue, inputRow)
     checkEvaluationWithGeneratedMutableProjection(expression, catalystValue, inputRow)
@@ -51,12 +51,14 @@ trait ExpressionEvalHelper {
 
   /**
    * Check the equality between result of expression and expected value, it will handle
-   * Array[Byte].
+   * Array[Byte] and Spread[Double].
    */
   protected def checkResult(result: Any, expected: Any): Boolean = {
     (result, expected) match {
       case (result: Array[Byte], expected: Array[Byte]) =>
         java.util.Arrays.equals(result, expected)
+      case (result: Double, expected: Spread[Double]) =>
+        expected.isWithin(result)
       case _ => result == expected
     }
   }
@@ -65,10 +67,29 @@ trait ExpressionEvalHelper {
     expression.eval(inputRow)
   }
 
+  protected def generateProject(
+      generator: => Projection,
+      expression: Expression): Projection = {
+    try {
+      generator
+    } catch {
+      case e: Throwable =>
+        val ctx = new CodeGenContext
+        val evaluated = expression.gen(ctx)
+        fail(
+          s"""
+            |Code generation of $expression failed:
+            |${evaluated.code}
+            |$e
+          """.stripMargin)
+    }
+  }
+
   protected def checkEvaluationWithoutCodegen(
       expression: Expression,
       expected: Any,
       inputRow: InternalRow = EmptyRow): Unit = {
+
     val actual = try evaluate(expression, inputRow) catch {
       case e: Exception => fail(s"Exception evaluating $expression", e)
     }
@@ -85,21 +106,11 @@ trait ExpressionEvalHelper {
       expected: Any,
       inputRow: InternalRow = EmptyRow): Unit = {
 
-    val plan = try {
-      GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)()
-    } catch {
-      case e: Throwable =>
-        val ctx = GenerateProjection.newCodeGenContext()
-        val evaluated = expression.gen(ctx)
-        fail(
-          s"""
-            |Code generation of $expression failed:
-            |${evaluated.code}
-            |$e
-          """.stripMargin)
-    }
+    val plan = generateProject(
+      GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)(),
+      expression)
 
-    val actual = plan(inputRow).apply(0)
+    val actual = plan(inputRow).get(0)
     if (!checkResult(actual, expected)) {
       val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
       fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
@@ -110,24 +121,19 @@ trait ExpressionEvalHelper {
       expression: Expression,
       expected: Any,
       inputRow: InternalRow = EmptyRow): Unit = {
-    val ctx = GenerateProjection.newCodeGenContext()
-    lazy val evaluated = expression.gen(ctx)
 
-    val plan = try {
-      GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)
-    } catch {
-      case e: Throwable =>
-        fail(
-          s"""
-            |Code generation of $expression failed:
-            |${evaluated.code}
-            |$e
-          """.stripMargin)
-    }
+    val plan = generateProject(
+      GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil),
+      expression)
 
     val actual = plan(inputRow)
     val expectedRow = InternalRow(expected)
+
+    // We reimplement hashCode in generated `SpecificRow`, make sure it's consistent with our
+    // interpreted version.
     if (actual.hashCode() != expectedRow.hashCode()) {
+      val ctx = new CodeGenContext
+      val evaluated = expression.gen(ctx)
       fail(
         s"""
           |Mismatched hashCodes for values: $actual, $expectedRow
@@ -136,9 +142,10 @@ trait ExpressionEvalHelper {
           |Code: $evaluated
         """.stripMargin)
     }
+
     if (actual != expectedRow) {
       val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expectedRow$input")
     }
     if (actual.copy() != expectedRow) {
       fail(s"Copy of generated Row is wrong: actual: ${actual.copy()}, expected: $expectedRow")
@@ -149,20 +156,10 @@ trait ExpressionEvalHelper {
       expression: Expression,
       expected: Any,
       inputRow: InternalRow = EmptyRow): Unit = {
-    val ctx = GenerateUnsafeProjection.newCodeGenContext()
-    lazy val evaluated = expression.gen(ctx)
 
-    val plan = try {
-      GenerateUnsafeProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)
-    } catch {
-      case e: Throwable =>
-        fail(
-          s"""
-            |Code generation of $expression failed:
-            |${evaluated.code}
-            |$e
-          """.stripMargin)
-    }
+    val plan = generateProject(
+      GenerateUnsafeProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil),
+      expression)
 
     val unsafeRow = plan(inputRow)
     // UnsafeRow cannot be compared with GenericInternalRow directly
@@ -170,7 +167,7 @@ trait ExpressionEvalHelper {
     val expectedRow = InternalRow(expected)
     if (actual != expectedRow) {
       val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expectedRow$input")
     }
   }
 
@@ -184,12 +181,23 @@ trait ExpressionEvalHelper {
   }
 
   protected def checkDoubleEvaluation(
-      expression: Expression,
+      expression: => Expression,
       expected: Spread[Double],
       inputRow: InternalRow = EmptyRow): Unit = {
-    val actual = try evaluate(expression, inputRow) catch {
-      case e: Exception => fail(s"Exception evaluating $expression", e)
-    }
-    actual.asInstanceOf[Double] shouldBe expected
+    checkEvaluationWithoutCodegen(expression, expected)
+    checkEvaluationWithGeneratedMutableProjection(expression, expected)
+    checkEvaluationWithOptimization(expression, expected)
+
+    var plan = generateProject(
+      GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil),
+      expression)
+    var actual = plan(inputRow).get(0)
+    assert(checkResult(actual, expected))
+
+    plan = generateProject(
+      GenerateUnsafeProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil),
+      expression)
+    actual = FromUnsafeProjection(expression.dataType :: Nil)(plan(inputRow)).get(0)
+    assert(checkResult(actual, expected))
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index df988f57fbfde..04acd5b5ff4d1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -143,7 +143,6 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       case e: Exception => fail(s"Exception evaluating $expression", e)
     }
     if (!actual.asInstanceOf[Double].isNaN) {
-      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
       fail(s"Incorrect evaluation (codegen off): $expression, " +
         s"actual: $actual, " +
         s"expected: NaN")
@@ -155,23 +154,12 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     expression: Expression,
     inputRow: InternalRow = EmptyRow): Unit = {
 
-    val plan = try {
-      GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)()
-    } catch {
-      case e: Throwable =>
-        val ctx = GenerateProjection.newCodeGenContext()
-        val evaluated = expression.gen(ctx)
-        fail(
-          s"""
-             |Code generation of $expression failed:
-             |${evaluated.code}
-             |$e
-          """.stripMargin)
-    }
+    val plan = generateProject(
+      GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)(),
+      expression)
 
     val actual = plan(inputRow).apply(0)
     if (!actual.asInstanceOf[Double].isNaN) {
-      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
       fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: NaN")
     }
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
index 9be2b23a53f27..698c81ba24482 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
@@ -21,13 +21,13 @@ import org.scalatest.Matchers._
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.{DoubleType, IntegerType}
+import org.apache.spark.sql.types.DoubleType
 
 
 class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("random") {
-    val row = create_row(1.1, 2.0, 3.1, null)
-    checkDoubleEvaluation(Rand(30), (0.7363714192755834 +- 0.001), row)
+    checkDoubleEvaluation(Rand(30), 0.7363714192755834 +- 0.001)
+    checkDoubleEvaluation(Randn(30), 0.5181478766595276 +- 0.001)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index 8f15479308391..6bd5804196853 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -450,7 +450,7 @@ class ColumnExpressionSuite extends QueryTest {
 
   test("monotonicallyIncreasingId") {
     // Make sure we have 2 partitions, each with 2 records.
-    val df = ctx.sparkContext.parallelize(1 to 2, 2).mapPartitions { iter =>
+    val df = ctx.sparkContext.parallelize(Seq[Int](), 2).mapPartitions { _ =>
       Iterator(Tuple1(1), Tuple1(2))
     }.toDF("a")
     checkAnswer(
@@ -460,10 +460,13 @@ class ColumnExpressionSuite extends QueryTest {
   }
 
   test("sparkPartitionId") {
-    val df = ctx.sparkContext.parallelize(1 to 1, 1).map(i => (i, i)).toDF("a", "b")
+    // Make sure we have 2 partitions, each with 2 records.
+    val df = ctx.sparkContext.parallelize(Seq[Int](), 2).mapPartitions { _ =>
+      Iterator(Tuple1(1), Tuple1(2))
+    }.toDF("a")
     checkAnswer(
       df.select(sparkPartitionId()),
-      Row(0)
+      Row(0) :: Row(0) :: Row(1) :: Row(1) :: Nil
     )
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
new file mode 100644
index 0000000000000..99e11fd64b2b9
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.expression
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.expressions. ExpressionEvalHelper
+import org.apache.spark.sql.execution.expressions.{SparkPartitionID, MonotonicallyIncreasingID}
+
+class NondeterministicSuite extends SparkFunSuite with ExpressionEvalHelper {
+  test("MonotonicallyIncreasingID") {
+    checkEvaluation(MonotonicallyIncreasingID(), 0)
+  }
+
+  test("SparkPartitionID") {
+    checkEvaluation(SparkPartitionID, 0)
+  }
+}

From 225de8da2b20ba03b358e222411610e8567aa88d Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Sat, 18 Jul 2015 12:11:37 -0700
Subject: [PATCH 233/549] [SPARK-9151][SQL] Implement code generation for Abs

JIRA: https://issues.apache.org/jira/browse/SPARK-9151

Add codegen support for `Abs`.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7498 from viirya/abs_codegen and squashes the following commits:

0c8410f [Liang-Chi Hsieh] Implement code generation for Abs.
---
 .../apache/spark/sql/catalyst/expressions/arithmetic.scala | 7 +++++++
 .../main/scala/org/apache/spark/sql/types/Decimal.scala    | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index c5960eb390ea4..e83650fc8cb0e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -73,6 +73,13 @@ case class Abs(child: Expression) extends UnaryExpression with ExpectsInputTypes
 
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = dataType match {
+    case dt: DecimalType =>
+      defineCodeGen(ctx, ev, c => s"$c.abs()")
+    case dt: NumericType =>
+      defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})(java.lang.Math.abs($c))")
+  }
+
   protected override def nullSafeEval(input: Any): Any = numeric.abs(input)
 }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
index a85af9e04aedb..bc689810bc292 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
@@ -278,6 +278,8 @@ final class Decimal extends Ordered[Decimal] with Serializable {
       Decimal(-longVal, precision, scale)
     }
   }
+
+  def abs: Decimal = if (this.compare(Decimal(0)) < 0) this.unary_- else this
 }
 
 object Decimal {

From cdc36eef4160dbae32e19a1eadbb4cf062f2fb2b Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 18 Jul 2015 12:25:04 -0700
Subject: [PATCH 234/549] Closes #6122


From 3d2134fc0d90379b89da08de7614aef1ac674b1b Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Sat, 18 Jul 2015 12:57:53 -0700
Subject: [PATCH 235/549] [SPARK-9055][SQL] WidenTypes should also support
 Intersect and Except

JIRA: https://issues.apache.org/jira/browse/SPARK-9055

cc rxin

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7491 from yijieshen/widen and squashes the following commits:

079fa52 [Yijie Shen] widenType support for intersect and expect
---
 .../catalyst/analysis/HiveTypeCoercion.scala  | 93 +++++++++++--------
 .../plans/logical/basicOperators.scala        |  8 ++
 .../analysis/HiveTypeCoercionSuite.scala      | 34 ++++++-
 3 files changed, 94 insertions(+), 41 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 50db7d21f01ca..ff20835e82ba7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis
 import javax.annotation.Nullable
 
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Union}
+import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.types._
 
@@ -168,52 +168,65 @@ object HiveTypeCoercion {
    * - LongType to DoubleType
    */
   object WidenTypes extends Rule[LogicalPlan] {
-    def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-      // TODO: unions with fixed-precision decimals
-      case u @ Union(left, right) if u.childrenResolved && !u.resolved =>
-        val castedInput = left.output.zip(right.output).map {
-          // When a string is found on one side, make the other side a string too.
-          case (lhs, rhs) if lhs.dataType == StringType && rhs.dataType != StringType =>
-            (lhs, Alias(Cast(rhs, StringType), rhs.name)())
-          case (lhs, rhs) if lhs.dataType != StringType && rhs.dataType == StringType =>
-            (Alias(Cast(lhs, StringType), lhs.name)(), rhs)
 
-          case (lhs, rhs) if lhs.dataType != rhs.dataType =>
-            logDebug(s"Resolving mismatched union input ${lhs.dataType}, ${rhs.dataType}")
-            findTightestCommonTypeOfTwo(lhs.dataType, rhs.dataType).map { widestType =>
-              val newLeft =
-                if (lhs.dataType == widestType) lhs else Alias(Cast(lhs, widestType), lhs.name)()
-              val newRight =
-                if (rhs.dataType == widestType) rhs else Alias(Cast(rhs, widestType), rhs.name)()
-
-              (newLeft, newRight)
-            }.getOrElse {
-              // If there is no applicable conversion, leave expression unchanged.
-              (lhs, rhs)
-            }
+    private[this] def widenOutputTypes(planName: String, left: LogicalPlan, right: LogicalPlan):
+        (LogicalPlan, LogicalPlan) = {
+
+      // TODO: with fixed-precision decimals
+      val castedInput = left.output.zip(right.output).map {
+        // When a string is found on one side, make the other side a string too.
+        case (lhs, rhs) if lhs.dataType == StringType && rhs.dataType != StringType =>
+          (lhs, Alias(Cast(rhs, StringType), rhs.name)())
+        case (lhs, rhs) if lhs.dataType != StringType && rhs.dataType == StringType =>
+          (Alias(Cast(lhs, StringType), lhs.name)(), rhs)
+
+        case (lhs, rhs) if lhs.dataType != rhs.dataType =>
+          logDebug(s"Resolving mismatched $planName input ${lhs.dataType}, ${rhs.dataType}")
+          findTightestCommonTypeOfTwo(lhs.dataType, rhs.dataType).map { widestType =>
+            val newLeft =
+              if (lhs.dataType == widestType) lhs else Alias(Cast(lhs, widestType), lhs.name)()
+            val newRight =
+              if (rhs.dataType == widestType) rhs else Alias(Cast(rhs, widestType), rhs.name)()
+
+            (newLeft, newRight)
+          }.getOrElse {
+            // If there is no applicable conversion, leave expression unchanged.
+            (lhs, rhs)
+          }
 
-          case other => other
-        }
+        case other => other
+      }
 
-        val (castedLeft, castedRight) = castedInput.unzip
+      val (castedLeft, castedRight) = castedInput.unzip
 
-        val newLeft =
-          if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) {
-            logDebug(s"Widening numeric types in union $castedLeft ${left.output}")
-            Project(castedLeft, left)
-          } else {
-            left
-          }
+      val newLeft =
+        if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) {
+          logDebug(s"Widening numeric types in $planName $castedLeft ${left.output}")
+          Project(castedLeft, left)
+        } else {
+          left
+        }
 
-        val newRight =
-          if (castedRight.map(_.dataType) != right.output.map(_.dataType)) {
-            logDebug(s"Widening numeric types in union $castedRight ${right.output}")
-            Project(castedRight, right)
-          } else {
-            right
-          }
+      val newRight =
+        if (castedRight.map(_.dataType) != right.output.map(_.dataType)) {
+          logDebug(s"Widening numeric types in $planName $castedRight ${right.output}")
+          Project(castedRight, right)
+        } else {
+          right
+        }
+      (newLeft, newRight)
+    }
 
+    def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+      case u @ Union(left, right) if u.childrenResolved && !u.resolved =>
+        val (newLeft, newRight) = widenOutputTypes(u.nodeName, left, right)
         Union(newLeft, newRight)
+      case e @ Except(left, right) if e.childrenResolved && !e.resolved =>
+        val (newLeft, newRight) = widenOutputTypes(e.nodeName, left, right)
+        Except(newLeft, newRight)
+      case i @ Intersect(left, right) if i.childrenResolved && !i.resolved =>
+        val (newLeft, newRight) = widenOutputTypes(i.nodeName, left, right)
+        Intersect(newLeft, newRight)
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index 17a91247327f7..986c315b3173a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -141,6 +141,10 @@ case class BroadcastHint(child: LogicalPlan) extends UnaryNode {
 
 case class Except(left: LogicalPlan, right: LogicalPlan) extends BinaryNode {
   override def output: Seq[Attribute] = left.output
+
+  override lazy val resolved: Boolean =
+    childrenResolved &&
+      left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType }
 }
 
 case class InsertIntoTable(
@@ -437,4 +441,8 @@ case object OneRowRelation extends LeafNode {
 
 case class Intersect(left: LogicalPlan, right: LogicalPlan) extends BinaryNode {
   override def output: Seq[Attribute] = left.output
+
+  override lazy val resolved: Boolean =
+    childrenResolved &&
+      left.output.zip(right.output).forall { case (l, r) => l.dataType == r.dataType }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index d0fd033b981c8..c9b3c69c6de89 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis
 import org.apache.spark.sql.catalyst.plans.PlanTest
 
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, LocalRelation, Project}
+import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.types._
 
@@ -305,6 +305,38 @@ class HiveTypeCoercionSuite extends PlanTest {
     )
   }
 
+  test("WidenTypes for union except and intersect") {
+    def checkOutput(logical: LogicalPlan, expectTypes: Seq[DataType]): Unit = {
+      logical.output.zip(expectTypes).foreach { case (attr, dt) =>
+        assert(attr.dataType === dt)
+      }
+    }
+
+    val left = LocalRelation(
+      AttributeReference("i", IntegerType)(),
+      AttributeReference("u", DecimalType.Unlimited)(),
+      AttributeReference("b", ByteType)(),
+      AttributeReference("d", DoubleType)())
+    val right = LocalRelation(
+      AttributeReference("s", StringType)(),
+      AttributeReference("d", DecimalType(2, 1))(),
+      AttributeReference("f", FloatType)(),
+      AttributeReference("l", LongType)())
+
+    val wt = HiveTypeCoercion.WidenTypes
+    val expectedTypes = Seq(StringType, DecimalType.Unlimited, FloatType, DoubleType)
+
+    val r1 = wt(Union(left, right)).asInstanceOf[Union]
+    val r2 = wt(Except(left, right)).asInstanceOf[Except]
+    val r3 = wt(Intersect(left, right)).asInstanceOf[Intersect]
+    checkOutput(r1.left, expectedTypes)
+    checkOutput(r1.right, expectedTypes)
+    checkOutput(r2.left, expectedTypes)
+    checkOutput(r2.right, expectedTypes)
+    checkOutput(r3.left, expectedTypes)
+    checkOutput(r3.right, expectedTypes)
+  }
+
   /**
    * There are rules that need to not fire before child expressions get resolved.
    * We use this test to make sure those rules do not fire early.

From 6e1e2eba696e89ba57bf5450b9c72c4386e43dc8 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 18 Jul 2015 14:07:56 -0700
Subject: [PATCH 236/549] [SPARK-8240][SQL] string function: concat

Author: Reynold Xin <rxin@databricks.com>

Closes #7486 from rxin/concat and squashes the following commits:

5217d6e [Reynold Xin] Removed Hive's concat test.
f5cb7a3 [Reynold Xin] Concat is never nullable.
ae4e61f [Reynold Xin] Removed extra import.
fddcbbd [Reynold Xin] Fixed NPE.
22e831c [Reynold Xin] Added missing file.
57a2352 [Reynold Xin] [SPARK-8240][SQL] string function: concat
---
 .../catalyst/analysis/FunctionRegistry.scala  |   1 +
 .../expressions/stringOperations.scala        |  37 +++
 ...ite.scala => StringExpressionsSuite.scala} |  24 +-
 .../org/apache/spark/sql/functions.scala      |  22 ++
 .../spark/sql/DataFrameFunctionsSuite.scala   | 242 ---------------
 .../spark/sql/StringFunctionsSuite.scala      | 284 ++++++++++++++++++
 .../execution/HiveCompatibilitySuite.scala    |   4 +-
 .../apache/spark/unsafe/types/UTF8String.java |  40 ++-
 .../spark/unsafe/types/UTF8StringSuite.java   |  14 +
 9 files changed, 421 insertions(+), 247 deletions(-)
 rename sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/{StringFunctionsSuite.scala => StringExpressionsSuite.scala} (96%)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index ce552a1d65eda..d1cda6bc27095 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -152,6 +152,7 @@ object FunctionRegistry {
     // string functions
     expression[Ascii]("ascii"),
     expression[Base64]("base64"),
+    expression[Concat]("concat"),
     expression[Encode]("encode"),
     expression[Decode]("decode"),
     expression[FormatNumber]("format_number"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index c64afe7b3f19a..b36354eff092a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -27,6 +27,43 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// This file defines expressions for string operations.
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/**
+ * An expression that concatenates multiple input strings into a single string.
+ * Input expressions that are evaluated to nulls are skipped.
+ *
+ * For example, `concat("a", null, "b")` is evaluated to `"ab"`.
+ *
+ * Note that this is different from Hive since Hive outputs null if any input is null.
+ * We never output null.
+ */
+case class Concat(children: Seq[Expression]) extends Expression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringType)
+  override def dataType: DataType = StringType
+
+  override def nullable: Boolean = false
+  override def foldable: Boolean = children.forall(_.foldable)
+
+  override def eval(input: InternalRow): Any = {
+    val inputs = children.map(_.eval(input).asInstanceOf[UTF8String])
+    UTF8String.concat(inputs : _*)
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val evals = children.map(_.gen(ctx))
+    val inputs = evals.map { eval => s"${eval.isNull} ? null : ${eval.primitive}" }.mkString(", ")
+    evals.map(_.code).mkString("\n") + s"""
+      boolean ${ev.isNull} = false;
+      UTF8String ${ev.primitive} = UTF8String.concat($inputs);
+    """
+  }
+}
+
 
 trait StringRegexExpression extends ImplicitCastInputTypes {
   self: BinaryExpression =>
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
similarity index 96%
rename from sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
rename to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 5d7763bedf6bd..0ed567a90dd1f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -22,7 +22,29 @@ import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types._
 
 
-class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("concat") {
+    def testConcat(inputs: String*): Unit = {
+      val expected = inputs.filter(_ != null).mkString
+      checkEvaluation(Concat(inputs.map(Literal.create(_, StringType))), expected, EmptyRow)
+    }
+
+    testConcat()
+    testConcat(null)
+    testConcat("")
+    testConcat("ab")
+    testConcat("a", "b")
+    testConcat("a", "b", "C")
+    testConcat("a", null, "C")
+    testConcat("a", null, null)
+    testConcat(null, null, null)
+
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
+    testConcat("数据", null, "砖头")
+    // scalastyle:on
+  }
 
   test("StringComparison") {
     val row = create_row("abc", null)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index b56fd9a71b321..c180407389136 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1710,6 +1710,28 @@ object functions {
   // String functions
   //////////////////////////////////////////////////////////////////////////////////////////////
 
+  /**
+   * Concatenates input strings together into a single string.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def concat(exprs: Column*): Column = Concat(exprs.map(_.expr))
+
+  /**
+   * Concatenates input strings together into a single string.
+   *
+   * This is the variant of concat that takes in the column names.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def concat(columnName: String, columnNames: String*): Column = {
+    concat((columnName +: columnNames).map(Column.apply): _*)
+  }
+
   /**
    * Computes the length of a given string / binary value.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 6dccdd857b453..29f1197a8543c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -208,169 +208,6 @@ class DataFrameFunctionsSuite extends QueryTest {
       Row(2743272264L, 2180413220L))
   }
 
-  test("Levenshtein distance") {
-    val df = Seq(("kitten", "sitting"), ("frog", "fog")).toDF("l", "r")
-    checkAnswer(df.select(levenshtein("l", "r")), Seq(Row(3), Row(1)))
-    checkAnswer(df.selectExpr("levenshtein(l, r)"), Seq(Row(3), Row(1)))
-  }
-
-  test("string ascii function") {
-    val df = Seq(("abc", "")).toDF("a", "b")
-    checkAnswer(
-      df.select(ascii($"a"), ascii("b")),
-      Row(97, 0))
-
-    checkAnswer(
-      df.selectExpr("ascii(a)", "ascii(b)"),
-      Row(97, 0))
-  }
-
-  test("string base64/unbase64 function") {
-    val bytes = Array[Byte](1, 2, 3, 4)
-    val df = Seq((bytes, "AQIDBA==")).toDF("a", "b")
-    checkAnswer(
-      df.select(base64("a"), base64($"a"), unbase64("b"), unbase64($"b")),
-      Row("AQIDBA==", "AQIDBA==", bytes, bytes))
-
-    checkAnswer(
-      df.selectExpr("base64(a)", "unbase64(b)"),
-      Row("AQIDBA==", bytes))
-  }
-
-  test("string encode/decode function") {
-    val bytes = Array[Byte](-27, -92, -89, -27, -115, -125, -28, -72, -106, -25, -107, -116)
-    // scalastyle:off  
-    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
-    val df = Seq(("大千世界", "utf-8", bytes)).toDF("a", "b", "c")
-    checkAnswer(
-      df.select(
-        encode($"a", "utf-8"),
-        encode("a", "utf-8"),
-        decode($"c", "utf-8"),
-        decode("c", "utf-8")),
-      Row(bytes, bytes, "大千世界", "大千世界"))
-
-    checkAnswer(
-      df.selectExpr("encode(a, 'utf-8')", "decode(c, 'utf-8')"),
-      Row(bytes, "大千世界"))
-    // scalastyle:on
-  }
-
-  test("string trim functions") {
-    val df = Seq(("  example  ", "")).toDF("a", "b")
-
-    checkAnswer(
-      df.select(ltrim($"a"), rtrim($"a"), trim($"a")),
-      Row("example  ", "  example", "example"))
-
-    checkAnswer(
-      df.selectExpr("ltrim(a)", "rtrim(a)", "trim(a)"),
-      Row("example  ", "  example", "example"))
-  }
-
-  test("string formatString function") {
-    val df = Seq(("aa%d%s", 123, "cc")).toDF("a", "b", "c")
-
-    checkAnswer(
-      df.select(formatString($"a", $"b", $"c"), formatString("aa%d%s", "b", "c")),
-      Row("aa123cc", "aa123cc"))
-
-    checkAnswer(
-      df.selectExpr("printf(a, b, c)"),
-      Row("aa123cc"))
-  }
-
-  test("string instr function") {
-    val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c")
-
-    checkAnswer(
-      df.select(instr($"a", $"b"), instr("a", "b")),
-      Row(1, 1))
-
-    checkAnswer(
-      df.selectExpr("instr(a, b)"),
-      Row(1))
-  }
-
-  test("string locate function") {
-    val df = Seq(("aaads", "aa", "zz", 1)).toDF("a", "b", "c", "d")
-
-    checkAnswer(
-      df.select(
-        locate($"b", $"a"), locate("b", "a"), locate($"b", $"a", 1),
-        locate("b", "a", 1), locate($"b", $"a", $"d"), locate("b", "a", "d")),
-      Row(1, 1, 2, 2, 2, 2))
-
-    checkAnswer(
-      df.selectExpr("locate(b, a)", "locate(b, a, d)"),
-      Row(1, 2))
-  }
-
-  test("string padding functions") {
-    val df = Seq(("hi", 5, "??")).toDF("a", "b", "c")
-
-    checkAnswer(
-          df.select(
-            lpad($"a", $"b", $"c"), rpad("a", "b", "c"),
-            lpad($"a", 1, $"c"), rpad("a", 1, "c")),
-          Row("???hi", "hi???", "h", "h"))
-
-    checkAnswer(
-      df.selectExpr("lpad(a, b, c)", "rpad(a, b, c)", "lpad(a, 1, c)", "rpad(a, 1, c)"),
-      Row("???hi", "hi???", "h", "h"))
-  }
-
-  test("string repeat function") {
-    val df = Seq(("hi", 2)).toDF("a", "b")
-
-    checkAnswer(
-      df.select(
-        repeat($"a", 2), repeat("a", 2), repeat($"a", $"b"), repeat("a", "b")),
-      Row("hihi", "hihi", "hihi", "hihi"))
-
-    checkAnswer(
-      df.selectExpr("repeat(a, 2)", "repeat(a, b)"),
-      Row("hihi", "hihi"))
-  }
-
-  test("string reverse function") {
-    val df = Seq(("hi", "hhhi")).toDF("a", "b")
-
-    checkAnswer(
-      df.select(reverse($"a"), reverse("b")),
-      Row("ih", "ihhh"))
-
-    checkAnswer(
-      df.selectExpr("reverse(b)"),
-      Row("ihhh"))
-  }
-
-  test("string space function") {
-    val df = Seq((2, 3)).toDF("a", "b")
-
-    checkAnswer(
-      df.select(space($"a"), space("b")),
-      Row("  ", "   "))
-
-    checkAnswer(
-      df.selectExpr("space(b)"),
-      Row("   "))
-  }
-
-  test("string split function") {
-    val df = Seq(("aa2bb3cc", "[1-9]+")).toDF("a", "b")
-
-    checkAnswer(
-      df.select(
-        split($"a", "[1-9]+"),
-        split("a", "[1-9]+")),
-      Row(Seq("aa", "bb", "cc"), Seq("aa", "bb", "cc")))
-
-    checkAnswer(
-      df.selectExpr("split(a, '[1-9]+')"),
-      Row(Seq("aa", "bb", "cc")))
-  }
-
   test("conditional function: least") {
     checkAnswer(
       testData2.select(least(lit(-1), lit(0), col("a"), col("b"))).limit(1),
@@ -430,83 +267,4 @@ class DataFrameFunctionsSuite extends QueryTest {
     )
   }
 
-  test("string / binary length function") {
-    val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123)).toDF("a", "b", "c")
-    checkAnswer(
-      df.select(length($"a"), length("a"), length($"b"), length("b")),
-      Row(3, 3, 4, 4))
-
-    checkAnswer(
-      df.selectExpr("length(a)", "length(b)"),
-      Row(3, 4))
-
-    intercept[AnalysisException] {
-      checkAnswer(
-        df.selectExpr("length(c)"), // int type of the argument is unacceptable
-        Row("5.0000"))
-    }
-  }
-
-  test("number format function") {
-    val tuple =
-      ("aa", 1.asInstanceOf[Byte], 2.asInstanceOf[Short],
-        3.13223f, 4, 5L, 6.48173d, Decimal(7.128381))
-    val df =
-      Seq(tuple)
-        .toDF(
-          "a", // string "aa"
-          "b", // byte    1
-          "c", // short   2
-          "d", // float   3.13223f
-          "e", // integer 4
-          "f", // long    5L
-          "g", // double  6.48173d
-          "h") // decimal 7.128381
-
-    checkAnswer(
-      df.select(
-        format_number($"f", 4),
-        format_number("f", 4)),
-      Row("5.0000", "5.0000"))
-
-    checkAnswer(
-      df.selectExpr("format_number(b, e)"), // convert the 1st argument to integer
-      Row("1.0000"))
-
-    checkAnswer(
-      df.selectExpr("format_number(c, e)"), // convert the 1st argument to integer
-      Row("2.0000"))
-
-    checkAnswer(
-      df.selectExpr("format_number(d, e)"), // convert the 1st argument to double
-      Row("3.1322"))
-
-    checkAnswer(
-      df.selectExpr("format_number(e, e)"), // not convert anything
-      Row("4.0000"))
-
-    checkAnswer(
-      df.selectExpr("format_number(f, e)"), // not convert anything
-      Row("5.0000"))
-
-    checkAnswer(
-      df.selectExpr("format_number(g, e)"), // not convert anything
-      Row("6.4817"))
-
-    checkAnswer(
-      df.selectExpr("format_number(h, e)"), // not convert anything
-      Row("7.1284"))
-
-    intercept[AnalysisException] {
-      checkAnswer(
-        df.selectExpr("format_number(a, e)"), // string type of the 1st argument is unacceptable
-        Row("5.0000"))
-    }
-
-    intercept[AnalysisException] {
-      checkAnswer(
-        df.selectExpr("format_number(e, g)"), // decimal type of the 2nd argument is unacceptable
-        Row("5.0000"))
-    }
-  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
new file mode 100644
index 0000000000000..4eff33ed45042
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.Decimal
+
+
+class StringFunctionsSuite extends QueryTest {
+
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
+  test("string concat") {
+    val df = Seq[(String, String, String)](("a", "b", null)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(concat($"a", $"b", $"c")),
+      Row("ab"))
+
+    checkAnswer(
+      df.selectExpr("concat(a, b, c)"),
+      Row("ab"))
+  }
+
+
+  test("string Levenshtein distance") {
+    val df = Seq(("kitten", "sitting"), ("frog", "fog")).toDF("l", "r")
+    checkAnswer(df.select(levenshtein("l", "r")), Seq(Row(3), Row(1)))
+    checkAnswer(df.selectExpr("levenshtein(l, r)"), Seq(Row(3), Row(1)))
+  }
+
+  test("string ascii function") {
+    val df = Seq(("abc", "")).toDF("a", "b")
+    checkAnswer(
+      df.select(ascii($"a"), ascii("b")),
+      Row(97, 0))
+
+    checkAnswer(
+      df.selectExpr("ascii(a)", "ascii(b)"),
+      Row(97, 0))
+  }
+
+  test("string base64/unbase64 function") {
+    val bytes = Array[Byte](1, 2, 3, 4)
+    val df = Seq((bytes, "AQIDBA==")).toDF("a", "b")
+    checkAnswer(
+      df.select(base64("a"), base64($"a"), unbase64("b"), unbase64($"b")),
+      Row("AQIDBA==", "AQIDBA==", bytes, bytes))
+
+    checkAnswer(
+      df.selectExpr("base64(a)", "unbase64(b)"),
+      Row("AQIDBA==", bytes))
+  }
+
+  test("string encode/decode function") {
+    val bytes = Array[Byte](-27, -92, -89, -27, -115, -125, -28, -72, -106, -25, -107, -116)
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
+    val df = Seq(("大千世界", "utf-8", bytes)).toDF("a", "b", "c")
+    checkAnswer(
+      df.select(
+        encode($"a", "utf-8"),
+        encode("a", "utf-8"),
+        decode($"c", "utf-8"),
+        decode("c", "utf-8")),
+      Row(bytes, bytes, "大千世界", "大千世界"))
+
+    checkAnswer(
+      df.selectExpr("encode(a, 'utf-8')", "decode(c, 'utf-8')"),
+      Row(bytes, "大千世界"))
+    // scalastyle:on
+  }
+
+  test("string trim functions") {
+    val df = Seq(("  example  ", "")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(ltrim($"a"), rtrim($"a"), trim($"a")),
+      Row("example  ", "  example", "example"))
+
+    checkAnswer(
+      df.selectExpr("ltrim(a)", "rtrim(a)", "trim(a)"),
+      Row("example  ", "  example", "example"))
+  }
+
+  test("string formatString function") {
+    val df = Seq(("aa%d%s", 123, "cc")).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(formatString($"a", $"b", $"c"), formatString("aa%d%s", "b", "c")),
+      Row("aa123cc", "aa123cc"))
+
+    checkAnswer(
+      df.selectExpr("printf(a, b, c)"),
+      Row("aa123cc"))
+  }
+
+  test("string instr function") {
+    val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(instr($"a", $"b"), instr("a", "b")),
+      Row(1, 1))
+
+    checkAnswer(
+      df.selectExpr("instr(a, b)"),
+      Row(1))
+  }
+
+  test("string locate function") {
+    val df = Seq(("aaads", "aa", "zz", 1)).toDF("a", "b", "c", "d")
+
+    checkAnswer(
+      df.select(
+        locate($"b", $"a"), locate("b", "a"), locate($"b", $"a", 1),
+        locate("b", "a", 1), locate($"b", $"a", $"d"), locate("b", "a", "d")),
+      Row(1, 1, 2, 2, 2, 2))
+
+    checkAnswer(
+      df.selectExpr("locate(b, a)", "locate(b, a, d)"),
+      Row(1, 2))
+  }
+
+  test("string padding functions") {
+    val df = Seq(("hi", 5, "??")).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(
+        lpad($"a", $"b", $"c"), rpad("a", "b", "c"),
+        lpad($"a", 1, $"c"), rpad("a", 1, "c")),
+      Row("???hi", "hi???", "h", "h"))
+
+    checkAnswer(
+      df.selectExpr("lpad(a, b, c)", "rpad(a, b, c)", "lpad(a, 1, c)", "rpad(a, 1, c)"),
+      Row("???hi", "hi???", "h", "h"))
+  }
+
+  test("string repeat function") {
+    val df = Seq(("hi", 2)).toDF("a", "b")
+
+    checkAnswer(
+      df.select(
+        repeat($"a", 2), repeat("a", 2), repeat($"a", $"b"), repeat("a", "b")),
+      Row("hihi", "hihi", "hihi", "hihi"))
+
+    checkAnswer(
+      df.selectExpr("repeat(a, 2)", "repeat(a, b)"),
+      Row("hihi", "hihi"))
+  }
+
+  test("string reverse function") {
+    val df = Seq(("hi", "hhhi")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(reverse($"a"), reverse("b")),
+      Row("ih", "ihhh"))
+
+    checkAnswer(
+      df.selectExpr("reverse(b)"),
+      Row("ihhh"))
+  }
+
+  test("string space function") {
+    val df = Seq((2, 3)).toDF("a", "b")
+
+    checkAnswer(
+      df.select(space($"a"), space("b")),
+      Row("  ", "   "))
+
+    checkAnswer(
+      df.selectExpr("space(b)"),
+      Row("   "))
+  }
+
+  test("string split function") {
+    val df = Seq(("aa2bb3cc", "[1-9]+")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(
+        split($"a", "[1-9]+"),
+        split("a", "[1-9]+")),
+      Row(Seq("aa", "bb", "cc"), Seq("aa", "bb", "cc")))
+
+    checkAnswer(
+      df.selectExpr("split(a, '[1-9]+')"),
+      Row(Seq("aa", "bb", "cc")))
+  }
+
+  test("string / binary length function") {
+    val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123)).toDF("a", "b", "c")
+    checkAnswer(
+      df.select(length($"a"), length("a"), length($"b"), length("b")),
+      Row(3, 3, 4, 4))
+
+    checkAnswer(
+      df.selectExpr("length(a)", "length(b)"),
+      Row(3, 4))
+
+    intercept[AnalysisException] {
+      checkAnswer(
+        df.selectExpr("length(c)"), // int type of the argument is unacceptable
+        Row("5.0000"))
+    }
+  }
+
+  test("number format function") {
+    val tuple =
+      ("aa", 1.asInstanceOf[Byte], 2.asInstanceOf[Short],
+        3.13223f, 4, 5L, 6.48173d, Decimal(7.128381))
+    val df =
+      Seq(tuple)
+        .toDF(
+          "a", // string "aa"
+          "b", // byte    1
+          "c", // short   2
+          "d", // float   3.13223f
+          "e", // integer 4
+          "f", // long    5L
+          "g", // double  6.48173d
+          "h") // decimal 7.128381
+
+    checkAnswer(
+      df.select(
+        format_number($"f", 4),
+        format_number("f", 4)),
+      Row("5.0000", "5.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(b, e)"), // convert the 1st argument to integer
+      Row("1.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(c, e)"), // convert the 1st argument to integer
+      Row("2.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(d, e)"), // convert the 1st argument to double
+      Row("3.1322"))
+
+    checkAnswer(
+      df.selectExpr("format_number(e, e)"), // not convert anything
+      Row("4.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(f, e)"), // not convert anything
+      Row("5.0000"))
+
+    checkAnswer(
+      df.selectExpr("format_number(g, e)"), // not convert anything
+      Row("6.4817"))
+
+    checkAnswer(
+      df.selectExpr("format_number(h, e)"), // not convert anything
+      Row("7.1284"))
+
+    intercept[AnalysisException] {
+      checkAnswer(
+        df.selectExpr("format_number(a, e)"), // string type of the 1st argument is unacceptable
+        Row("5.0000"))
+    }
+
+    intercept[AnalysisException] {
+      checkAnswer(
+        df.selectExpr("format_number(e, g)"), // decimal type of the 2nd argument is unacceptable
+        Row("5.0000"))
+    }
+  }
+}
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 6b8f2f6217a54..299cc599ff8f7 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -256,6 +256,9 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "timestamp_2",
     "timestamp_udf",
 
+    // Hive outputs NULL if any concat input has null. We never output null for concat.
+    "udf_concat",
+
     // Unlike Hive, we do support log base in (0, 1.0], therefore disable this
     "udf7"
   )
@@ -846,7 +849,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_case",
     "udf_ceil",
     "udf_ceiling",
-    "udf_concat",
     "udf_concat_insert1",
     "udf_concat_insert2",
     "udf_concat_ws",
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index e7f9fbb2bc682..9723b6e0834b2 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -21,6 +21,7 @@
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
 
+import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 
 import static org.apache.spark.unsafe.PlatformDependent.*;
@@ -322,7 +323,7 @@ public int indexOf(UTF8String v, int start) {
       }
       i += numBytesForFirstByte(getByte(i));
       c += 1;
-    } while(i < numBytes);
+    } while (i < numBytes);
 
     return -1;
   }
@@ -395,6 +396,39 @@ public UTF8String lpad(int len, UTF8String pad) {
     }
   }
 
+  /**
+   * Concatenates input strings together into a single string. A null input is skipped.
+   * For example, concat("a", null, "c") would yield "ac".
+   */
+  public static UTF8String concat(UTF8String... inputs) {
+    if (inputs == null) {
+      return fromBytes(new byte[0]);
+    }
+
+    // Compute the total length of the result.
+    int totalLength = 0;
+    for (int i = 0; i < inputs.length; i++) {
+      if (inputs[i] != null) {
+        totalLength += inputs[i].numBytes;
+      }
+    }
+
+    // Allocate a new byte array, and copy the inputs one by one into it.
+    final byte[] result = new byte[totalLength];
+    int offset = 0;
+    for (int i = 0; i < inputs.length; i++) {
+      if (inputs[i] != null) {
+        int len = inputs[i].numBytes;
+        PlatformDependent.copyMemory(
+          inputs[i].base, inputs[i].offset,
+          result, PlatformDependent.BYTE_ARRAY_OFFSET + offset,
+          len);
+        offset += len;
+      }
+    }
+    return fromBytes(result);
+  }
+
   @Override
   public String toString() {
     try {
@@ -413,7 +447,7 @@ public UTF8String clone() {
   }
 
   @Override
-  public int compareTo(final UTF8String other) {
+  public int compareTo(@Nonnull final UTF8String other) {
     int len = Math.min(numBytes, other.numBytes);
     // TODO: compare 8 bytes as unsigned long
     for (int i = 0; i < len; i ++) {
@@ -434,7 +468,7 @@ public int compare(final UTF8String other) {
   public boolean equals(final Object other) {
     if (other instanceof UTF8String) {
       UTF8String o = (UTF8String) other;
-      if (numBytes != o.numBytes){
+      if (numBytes != o.numBytes) {
         return false;
       }
       return ByteArrayMethods.arrayEquals(base, offset, o.base, o.offset, numBytes);
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 694bdc29f39d1..0db7522b50c1a 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -86,6 +86,20 @@ public void upperAndLower() {
     testUpperandLower("大千世界 数据砖头", "大千世界 数据砖头");
   }
 
+  @Test
+  public void concatTest() {
+    assertEquals(concat(), fromString(""));
+    assertEquals(concat(null), fromString(""));
+    assertEquals(concat(fromString("")), fromString(""));
+    assertEquals(concat(fromString("ab")), fromString("ab"));
+    assertEquals(concat(fromString("a"), fromString("b")), fromString("ab"));
+    assertEquals(concat(fromString("a"), fromString("b"), fromString("c")), fromString("abc"));
+    assertEquals(concat(fromString("a"), null, fromString("c")), fromString("ac"));
+    assertEquals(concat(fromString("a"), null, null), fromString("a"));
+    assertEquals(concat(null, null, null), fromString(""));
+    assertEquals(concat(fromString("数据"), fromString("砖头")), fromString("数据砖头"));
+  }
+
   @Test
   public void contains() {
     assertTrue(fromString("").contains(fromString("")));

From e16a19a39ed3369dffd375d712066d12add71c9e Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 18 Jul 2015 15:29:38 -0700
Subject: [PATCH 237/549] [SPARK-9174][SQL] Add documentation for all public
 SQLConfs.

Author: Reynold Xin <rxin@databricks.com>

Closes #7500 from rxin/sqlconf and squashes the following commits:

a5726c8 [Reynold Xin] [SPARK-9174][SQL] Add documentation for all public SQLConfs.
---
 .../scala/org/apache/spark/sql/SQLConf.scala  | 144 +++++++-----------
 1 file changed, 53 insertions(+), 91 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 6005d35f015a9..2c2f7c35dfdce 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -26,6 +26,11 @@ import org.apache.parquet.hadoop.ParquetOutputCommitter
 
 import org.apache.spark.sql.catalyst.CatalystConf
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// This file defines the configuration options for Spark SQL.
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
 private[spark] object SQLConf {
 
   private val sqlConfEntries = java.util.Collections.synchronizedMap(
@@ -184,17 +189,20 @@ private[spark] object SQLConf {
   val COMPRESS_CACHED = booleanConf("spark.sql.inMemoryColumnarStorage.compressed",
     defaultValue = Some(true),
     doc = "When set to true Spark SQL will automatically select a compression codec for each " +
-      "column based on statistics of the data.")
+      "column based on statistics of the data.",
+    isPublic = false)
 
   val COLUMN_BATCH_SIZE = intConf("spark.sql.inMemoryColumnarStorage.batchSize",
     defaultValue = Some(10000),
     doc = "Controls the size of batches for columnar caching.  Larger batch sizes can improve " +
-      "memory utilization and compression, but risk OOMs when caching data.")
+      "memory utilization and compression, but risk OOMs when caching data.",
+    isPublic = false)
 
   val IN_MEMORY_PARTITION_PRUNING =
     booleanConf("spark.sql.inMemoryColumnarStorage.partitionPruning",
       defaultValue = Some(false),
-      doc = "<TODO>")
+      doc = "When true, enable partition pruning for in-memory columnar tables.",
+      isPublic = false)
 
   val AUTO_BROADCASTJOIN_THRESHOLD = intConf("spark.sql.autoBroadcastJoinThreshold",
     defaultValue = Some(10 * 1024 * 1024),
@@ -203,29 +211,35 @@ private[spark] object SQLConf {
       "Note that currently statistics are only supported for Hive Metastore tables where the " +
       "command<code>ANALYZE TABLE &lt;tableName&gt; COMPUTE STATISTICS noscan</code> has been run.")
 
-  val DEFAULT_SIZE_IN_BYTES = longConf("spark.sql.defaultSizeInBytes", isPublic = false)
+  val DEFAULT_SIZE_IN_BYTES = longConf(
+    "spark.sql.defaultSizeInBytes",
+    doc = "The default table size used in query planning. By default, it is set to a larger " +
+      "value than `spark.sql.autoBroadcastJoinThreshold` to be more conservative. That is to say " +
+      "by default the optimizer will not choose to broadcast a table unless it knows for sure its" +
+      "size is small enough.",
+    isPublic = false)
 
   val SHUFFLE_PARTITIONS = intConf("spark.sql.shuffle.partitions",
     defaultValue = Some(200),
-    doc = "Configures the number of partitions to use when shuffling data for joins or " +
-      "aggregations.")
+    doc = "The default number of partitions to use when shuffling data for joins or aggregations.")
 
   val CODEGEN_ENABLED = booleanConf("spark.sql.codegen",
     defaultValue = Some(true),
     doc = "When true, code will be dynamically generated at runtime for expression evaluation in" +
-      " a specific query. For some queries with complicated expression this option can lead to " +
-      "significant speed-ups. However, for simple queries this can actually slow down query " +
-      "execution.")
+      " a specific query.")
 
   val UNSAFE_ENABLED = booleanConf("spark.sql.unsafe.enabled",
     defaultValue = Some(false),
-    doc = "<TDDO>")
+    doc = "When true, use the new optimized Tungsten physical execution backend.")
 
-  val DIALECT = stringConf("spark.sql.dialect", defaultValue = Some("sql"), doc = "<TODO>")
+  val DIALECT = stringConf(
+    "spark.sql.dialect",
+    defaultValue = Some("sql"),
+    doc = "The default SQL dialect to use.")
 
   val CASE_SENSITIVE = booleanConf("spark.sql.caseSensitive",
     defaultValue = Some(true),
-    doc = "<TODO>")
+    doc = "Whether the query analyzer should be case sensitive or not.")
 
   val PARQUET_SCHEMA_MERGING_ENABLED = booleanConf("spark.sql.parquet.mergeSchema",
     defaultValue = Some(true),
@@ -273,9 +287,8 @@ private[spark] object SQLConf {
   val PARQUET_FOLLOW_PARQUET_FORMAT_SPEC = booleanConf(
     key = "spark.sql.parquet.followParquetFormatSpec",
     defaultValue = Some(false),
-    doc = "Whether to stick to Parquet format specification when converting Parquet schema to " +
-      "Spark SQL schema and vice versa.  Sticks to the specification if set to true; falls back " +
-      "to compatible mode if set to false.",
+    doc = "Whether to follow Parquet's format specification when converting Parquet schema to " +
+      "Spark SQL schema and vice versa.",
     isPublic = false)
 
   val PARQUET_OUTPUT_COMMITTER_CLASS = stringConf(
@@ -290,7 +303,7 @@ private[spark] object SQLConf {
 
   val ORC_FILTER_PUSHDOWN_ENABLED = booleanConf("spark.sql.orc.filterPushdown",
     defaultValue = Some(false),
-    doc = "<TODO>")
+    doc = "When true, enable filter pushdown for ORC files.")
 
   val HIVE_VERIFY_PARTITION_PATH = booleanConf("spark.sql.hive.verifyPartitionPath",
     defaultValue = Some(true),
@@ -302,7 +315,7 @@ private[spark] object SQLConf {
 
   val BROADCAST_TIMEOUT = intConf("spark.sql.broadcastTimeout",
     defaultValue = Some(5 * 60),
-    doc = "<TODO>")
+    doc = "Timeout in seconds for the broadcast wait time in broadcast joins.")
 
   // Options that control which operators can be chosen by the query planner.  These should be
   // considered hints and may be ignored by future versions of Spark SQL.
@@ -313,7 +326,7 @@ private[spark] object SQLConf {
 
   val SORTMERGE_JOIN = booleanConf("spark.sql.planner.sortMergeJoin",
     defaultValue = Some(false),
-    doc = "<TODO>")
+    doc = "When true, use sort merge join (as opposed to hash join) by default for large joins.")
 
   // This is only used for the thriftserver
   val THRIFTSERVER_POOL = stringConf("spark.sql.thriftserver.scheduler.pool",
@@ -321,16 +334,16 @@ private[spark] object SQLConf {
 
   val THRIFTSERVER_UI_STATEMENT_LIMIT = intConf("spark.sql.thriftserver.ui.retainedStatements",
     defaultValue = Some(200),
-    doc = "<TODO>")
+    doc = "The number of SQL statements kept in the JDBC/ODBC web UI history.")
 
   val THRIFTSERVER_UI_SESSION_LIMIT = intConf("spark.sql.thriftserver.ui.retainedSessions",
     defaultValue = Some(200),
-    doc = "<TODO>")
+    doc = "The number of SQL client sessions kept in the JDBC/ODBC web UI history.")
 
   // This is used to set the default data source
   val DEFAULT_DATA_SOURCE_NAME = stringConf("spark.sql.sources.default",
     defaultValue = Some("org.apache.spark.sql.parquet"),
-    doc = "<TODO>")
+    doc = "The default data source to use in input/output.")
 
   // This is used to control the when we will split a schema's JSON string to multiple pieces
   // in order to fit the JSON string in metastore's table property (by default, the value has
@@ -338,18 +351,20 @@ private[spark] object SQLConf {
   // to its length exceeds the threshold.
   val SCHEMA_STRING_LENGTH_THRESHOLD = intConf("spark.sql.sources.schemaStringLengthThreshold",
     defaultValue = Some(4000),
-    doc = "<TODO>")
+    doc = "The maximum length allowed in a single cell when " +
+      "storing additional schema information in Hive's metastore.",
+    isPublic = false)
 
   // Whether to perform partition discovery when loading external data sources.  Default to true.
   val PARTITION_DISCOVERY_ENABLED = booleanConf("spark.sql.sources.partitionDiscovery.enabled",
     defaultValue = Some(true),
-    doc = "<TODO>")
+    doc = "When true, automtically discover data partitions.")
 
   // Whether to perform partition column type inference. Default to true.
   val PARTITION_COLUMN_TYPE_INFERENCE =
     booleanConf("spark.sql.sources.partitionColumnTypeInference.enabled",
       defaultValue = Some(true),
-      doc = "<TODO>")
+      doc = "When true, automatically infer the data types for partitioned columns.")
 
   // The output committer class used by HadoopFsRelation. The specified class needs to be a
   // subclass of org.apache.hadoop.mapreduce.OutputCommitter.
@@ -363,22 +378,28 @@ private[spark] object SQLConf {
 
   // Whether to perform eager analysis when constructing a dataframe.
   // Set to false when debugging requires the ability to look at invalid query plans.
-  val DATAFRAME_EAGER_ANALYSIS = booleanConf("spark.sql.eagerAnalysis",
+  val DATAFRAME_EAGER_ANALYSIS = booleanConf(
+    "spark.sql.eagerAnalysis",
     defaultValue = Some(true),
-    doc = "<TODO>")
+    doc = "When true, eagerly applies query analysis on DataFrame operations.",
+    isPublic = false)
 
   // Whether to automatically resolve ambiguity in join conditions for self-joins.
   // See SPARK-6231.
-  val DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY =
-    booleanConf("spark.sql.selfJoinAutoResolveAmbiguity", defaultValue = Some(true), doc = "<TODO>")
+  val DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY = booleanConf(
+    "spark.sql.selfJoinAutoResolveAmbiguity",
+    defaultValue = Some(true),
+    isPublic = false)
 
   // Whether to retain group by columns or not in GroupedData.agg.
-  val DATAFRAME_RETAIN_GROUP_COLUMNS = booleanConf("spark.sql.retainGroupColumns",
+  val DATAFRAME_RETAIN_GROUP_COLUMNS = booleanConf(
+    "spark.sql.retainGroupColumns",
     defaultValue = Some(true),
-    doc = "<TODO>")
+    isPublic = false)
 
-  val USE_SQL_SERIALIZER2 = booleanConf("spark.sql.useSerializer2",
-    defaultValue = Some(true), doc = "<TODO>")
+  val USE_SQL_SERIALIZER2 = booleanConf(
+    "spark.sql.useSerializer2",
+    defaultValue = Some(true), isPublic = false)
 
   val USE_JACKSON_STREAMING_API = booleanConf("spark.sql.json.useJacksonStreamingAPI",
     defaultValue = Some(true), doc = "<TODO>")
@@ -422,112 +443,53 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
    */
   private[spark] def dialect: String = getConf(DIALECT)
 
-  /** When true tables cached using the in-memory columnar caching will be compressed. */
   private[spark] def useCompression: Boolean = getConf(COMPRESS_CACHED)
 
-  /** The compression codec for writing to a Parquetfile */
   private[spark] def parquetCompressionCodec: String = getConf(PARQUET_COMPRESSION)
 
   private[spark] def parquetCacheMetadata: Boolean = getConf(PARQUET_CACHE_METADATA)
 
-  /** The number of rows that will be  */
   private[spark] def columnBatchSize: Int = getConf(COLUMN_BATCH_SIZE)
 
-  /** Number of partitions to use for shuffle operators. */
   private[spark] def numShufflePartitions: Int = getConf(SHUFFLE_PARTITIONS)
 
-  /** When true predicates will be passed to the parquet record reader when possible. */
   private[spark] def parquetFilterPushDown: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_ENABLED)
 
-  /** When true uses Parquet implementation based on data source API */
   private[spark] def parquetUseDataSourceApi: Boolean = getConf(PARQUET_USE_DATA_SOURCE_API)
 
   private[spark] def orcFilterPushDown: Boolean = getConf(ORC_FILTER_PUSHDOWN_ENABLED)
 
-  /** When true uses verifyPartitionPath to prune the path which is not exists. */
   private[spark] def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH)
 
-  /** When true the planner will use the external sort, which may spill to disk. */
   private[spark] def externalSortEnabled: Boolean = getConf(EXTERNAL_SORT)
 
-  /**
-   * Sort merge join would sort the two side of join first, and then iterate both sides together
-   * only once to get all matches. Using sort merge join can save a lot of memory usage compared
-   * to HashJoin.
-   */
   private[spark] def sortMergeJoinEnabled: Boolean = getConf(SORTMERGE_JOIN)
 
-  /**
-   * When set to true, Spark SQL will use the Janino at runtime to generate custom bytecode
-   * that evaluates expressions found in queries.  In general this custom code runs much faster
-   * than interpreted evaluation, but there are some start-up costs (5-10ms) due to compilation.
-   */
   private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED)
 
-  /**
-   * caseSensitive analysis true by default
-   */
   def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE)
 
-  /**
-   * When set to true, Spark SQL will use managed memory for certain operations.  This option only
-   * takes effect if codegen is enabled.
-   *
-   * Defaults to false as this feature is currently experimental.
-   */
   private[spark] def unsafeEnabled: Boolean = getConf(UNSAFE_ENABLED)
 
   private[spark] def useSqlSerializer2: Boolean = getConf(USE_SQL_SERIALIZER2)
 
-  /**
-   * Selects between the new (true) and old (false) JSON handlers, to be removed in Spark 1.5.0
-   */
   private[spark] def useJacksonStreamingAPI: Boolean = getConf(USE_JACKSON_STREAMING_API)
 
-  /**
-   * Upper bound on the sizes (in bytes) of the tables qualified for the auto conversion to
-   * a broadcast value during the physical executions of join operations.  Setting this to -1
-   * effectively disables auto conversion.
-   *
-   * Hive setting: hive.auto.convert.join.noconditionaltask.size, whose default value is 10000.
-   */
   private[spark] def autoBroadcastJoinThreshold: Int = getConf(AUTO_BROADCASTJOIN_THRESHOLD)
 
-  /**
-   * The default size in bytes to assign to a logical operator's estimation statistics.  By default,
-   * it is set to a larger value than `autoBroadcastJoinThreshold`, hence any logical operator
-   * without a properly implemented estimation of this statistic will not be incorrectly broadcasted
-   * in joins.
-   */
   private[spark] def defaultSizeInBytes: Long =
     getConf(DEFAULT_SIZE_IN_BYTES, autoBroadcastJoinThreshold + 1L)
 
-  /**
-   * When set to true, we always treat byte arrays in Parquet files as strings.
-   */
   private[spark] def isParquetBinaryAsString: Boolean = getConf(PARQUET_BINARY_AS_STRING)
 
-  /**
-   * When set to true, we always treat INT96Values in Parquet files as timestamp.
-   */
   private[spark] def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP)
 
-  /**
-   * When set to true, sticks to Parquet format spec when converting Parquet schema to Spark SQL
-   * schema and vice versa.  Otherwise, falls back to compatible mode.
-   */
   private[spark] def followParquetFormatSpec: Boolean = getConf(PARQUET_FOLLOW_PARQUET_FORMAT_SPEC)
 
-  /**
-   * When set to true, partition pruning for in-memory columnar tables is enabled.
-   */
   private[spark] def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING)
 
   private[spark] def columnNameOfCorruptRecord: String = getConf(COLUMN_NAME_OF_CORRUPT_RECORD)
 
-  /**
-   * Timeout in seconds for the broadcast wait time in hash join
-   */
   private[spark] def broadcastTimeout: Int = getConf(BROADCAST_TIMEOUT)
 
   private[spark] def defaultDataSourceName: String = getConf(DEFAULT_DATA_SOURCE_NAME)

From 9914b1b2c5d5fe020f54d95f59f03023de2ea78a Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 18 Jul 2015 18:18:19 -0700
Subject: [PATCH 238/549] [SPARK-9150][SQL] Create CodegenFallback and
 Unevaluable trait

It is very hard to track which expressions have code gen implemented or not. This patch removes the default fallback gencode implementation from Expression, and moves that into a new trait called CodegenFallback. Each concrete expression needs to either implement code generation, or mix in CodegenFallback. This makes it very easy to track which expressions have code generation implemented already.

Additionally, this patch creates an Unevaluable trait that can be used to track expressions that don't support evaluation (e.g. Star).

Author: Reynold Xin <rxin@databricks.com>

Closes #7487 from rxin/codegenfallback and squashes the following commits:

14ebf38 [Reynold Xin] Fixed Conv
6c1c882 [Reynold Xin] Fixed Alias.
b42611b [Reynold Xin] [SPARK-9150][SQL] Create a trait to track code generation for expressions.
cb5c066 [Reynold Xin] Removed extra import.
39cbe40 [Reynold Xin] [SPARK-8240][SQL] string function: concat
---
 .../sql/catalyst/analysis/unresolved.scala    | 43 ++++---------
 .../spark/sql/catalyst/expressions/Cast.scala |  7 +--
 .../sql/catalyst/expressions/Expression.scala | 28 +++++----
 .../sql/catalyst/expressions/ScalaUDF.scala   |  4 +-
 .../sql/catalyst/expressions/SortOrder.scala  | 10 +--
 .../sql/catalyst/expressions/aggregates.scala | 10 +--
 .../sql/catalyst/expressions/arithmetic.scala |  5 +-
 .../expressions/codegen/CodegenFallback.scala | 40 ++++++++++++
 .../expressions/complexTypeCreator.scala      | 11 ++--
 .../expressions/datetimeFunctions.scala       |  5 +-
 .../sql/catalyst/expressions/generators.scala |  8 +--
 .../sql/catalyst/expressions/literals.scala   |  7 ++-
 .../spark/sql/catalyst/expressions/math.scala | 61 ++++++++++---------
 .../expressions/namedExpressions.scala        | 15 ++---
 .../sql/catalyst/expressions/predicates.scala |  7 ++-
 .../spark/sql/catalyst/expressions/sets.scala | 12 ++--
 .../expressions/stringOperations.scala        | 48 +++++++++------
 .../expressions/windowExpressions.scala       | 41 ++++---------
 .../plans/physical/partitioning.scala         | 16 +----
 .../analysis/AnalysisErrorSuite.scala         |  4 +-
 .../analysis/HiveTypeCoercionSuite.scala      |  8 +--
 .../sql/catalyst/trees/TreeNodeSuite.scala    |  3 +-
 .../spark/sql/execution/pythonUDFs.scala      |  6 +-
 .../org/apache/spark/sql/hive/hiveUDFs.scala  | 25 ++++----
 24 files changed, 206 insertions(+), 218 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index 4a1a1ed61ebe7..0daee1990a6e0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -17,9 +17,8 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.{errors, trees}
-import org.apache.spark.sql.catalyst.errors.TreeNodeException
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.errors
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.LeafNode
 import org.apache.spark.sql.catalyst.trees.TreeNode
@@ -50,7 +49,7 @@ case class UnresolvedRelation(
 /**
  * Holds the name of an attribute that has yet to be resolved.
  */
-case class UnresolvedAttribute(nameParts: Seq[String]) extends Attribute {
+case class UnresolvedAttribute(nameParts: Seq[String]) extends Attribute with Unevaluable {
 
   def name: String =
     nameParts.map(n => if (n.contains(".")) s"`$n`" else n).mkString(".")
@@ -66,10 +65,6 @@ case class UnresolvedAttribute(nameParts: Seq[String]) extends Attribute {
   override def withQualifiers(newQualifiers: Seq[String]): UnresolvedAttribute = this
   override def withName(newName: String): UnresolvedAttribute = UnresolvedAttribute.quoted(newName)
 
-  // Unresolved attributes are transient at compile time and don't get evaluated during execution.
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
-
   override def toString: String = s"'$name"
 }
 
@@ -78,16 +73,14 @@ object UnresolvedAttribute {
   def quoted(name: String): UnresolvedAttribute = new UnresolvedAttribute(Seq(name))
 }
 
-case class UnresolvedFunction(name: String, children: Seq[Expression]) extends Expression {
+case class UnresolvedFunction(name: String, children: Seq[Expression])
+  extends Expression with Unevaluable {
+
   override def dataType: DataType = throw new UnresolvedException(this, "dataType")
   override def foldable: Boolean = throw new UnresolvedException(this, "foldable")
   override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
   override lazy val resolved = false
 
-  // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
-
   override def toString: String = s"'$name(${children.mkString(",")})"
 }
 
@@ -105,10 +98,6 @@ abstract class Star extends LeafExpression with NamedExpression {
   override def toAttribute: Attribute = throw new UnresolvedException(this, "toAttribute")
   override lazy val resolved = false
 
-  // Star gets expanded at runtime so we never evaluate a Star.
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
-
   def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression]
 }
 
@@ -120,7 +109,7 @@ abstract class Star extends LeafExpression with NamedExpression {
  * @param table an optional table that should be the target of the expansion.  If omitted all
  *              tables' columns are produced.
  */
-case class UnresolvedStar(table: Option[String]) extends Star {
+case class UnresolvedStar(table: Option[String]) extends Star with Unevaluable {
 
   override def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression] = {
     val expandedAttributes: Seq[Attribute] = table match {
@@ -149,7 +138,7 @@ case class UnresolvedStar(table: Option[String]) extends Star {
  * @param names the names to be associated with each output of computing [[child]].
  */
 case class MultiAlias(child: Expression, names: Seq[String])
-  extends UnaryExpression with NamedExpression {
+  extends UnaryExpression with NamedExpression with CodegenFallback {
 
   override def name: String = throw new UnresolvedException(this, "name")
 
@@ -165,9 +154,6 @@ case class MultiAlias(child: Expression, names: Seq[String])
 
   override lazy val resolved = false
 
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
-
   override def toString: String = s"$child AS $names"
 
 }
@@ -178,7 +164,7 @@ case class MultiAlias(child: Expression, names: Seq[String])
  *
  * @param expressions Expressions to expand.
  */
-case class ResolvedStar(expressions: Seq[NamedExpression]) extends Star {
+case class ResolvedStar(expressions: Seq[NamedExpression]) extends Star with Unevaluable {
   override def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression] = expressions
   override def toString: String = expressions.mkString("ResolvedStar(", ", ", ")")
 }
@@ -192,23 +178,21 @@ case class ResolvedStar(expressions: Seq[NamedExpression]) extends Star {
  *                   can be key of Map, index of Array, field name of Struct.
  */
 case class UnresolvedExtractValue(child: Expression, extraction: Expression)
-  extends UnaryExpression {
+  extends UnaryExpression with Unevaluable {
 
   override def dataType: DataType = throw new UnresolvedException(this, "dataType")
   override def foldable: Boolean = throw new UnresolvedException(this, "foldable")
   override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
   override lazy val resolved = false
 
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
-
   override def toString: String = s"$child[$extraction]"
 }
 
 /**
  * Holds the expression that has yet to be aliased.
  */
-case class UnresolvedAlias(child: Expression) extends UnaryExpression with NamedExpression {
+case class UnresolvedAlias(child: Expression)
+  extends UnaryExpression with NamedExpression with Unevaluable {
 
   override def toAttribute: Attribute = throw new UnresolvedException(this, "toAttribute")
   override def qualifiers: Seq[String] = throw new UnresolvedException(this, "qualifiers")
@@ -218,7 +202,4 @@ case class UnresolvedAlias(child: Expression) extends UnaryExpression with Named
   override def name: String = throw new UnresolvedException(this, "name")
 
   override lazy val resolved = false
-
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 692b9fddbb041..3346d3c9f9e61 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -18,12 +18,10 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import java.math.{BigDecimal => JavaBigDecimal}
-import java.sql.{Date, Timestamp}
 
-import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.{Interval, UTF8String}
@@ -106,7 +104,8 @@ object Cast {
 }
 
 /** Cast the child expression to the target data type. */
-case class Cast(child: Expression, dataType: DataType) extends UnaryExpression with Logging {
+case class Cast(child: Expression, dataType: DataType)
+  extends UnaryExpression with CodegenFallback {
 
   override def checkInputDataTypes(): TypeCheckResult = {
     if (Cast.canCast(child.dataType, dataType)) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 0e128d8bdcd96..d0a1aa9a1e912 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -101,19 +101,7 @@ abstract class Expression extends TreeNode[Expression] {
    * @param ev an [[GeneratedExpressionCode]] with unique terms.
    * @return Java source code
    */
-  protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    ctx.references += this
-    val objectTerm = ctx.freshName("obj")
-    s"""
-      /* expression: ${this} */
-      Object $objectTerm = expressions[${ctx.references.size - 1}].eval(i);
-      boolean ${ev.isNull} = $objectTerm == null;
-      ${ctx.javaType(this.dataType)} ${ev.primitive} = ${ctx.defaultValue(this.dataType)};
-      if (!${ev.isNull}) {
-        ${ev.primitive} = (${ctx.boxedType(this.dataType)}) $objectTerm;
-      }
-    """
-  }
+  protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String
 
   /**
    * Returns `true` if this expression and all its children have been resolved to a specific schema
@@ -182,6 +170,20 @@ abstract class Expression extends TreeNode[Expression] {
 }
 
 
+/**
+ * An expression that cannot be evaluated. Some expressions don't live past analysis or optimization
+ * time (e.g. Star). This trait is used by those expressions.
+ */
+trait Unevaluable { self: Expression =>
+
+  override def eval(input: InternalRow = null): Any =
+    throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
+    throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
+}
+
+
 /**
  * A leaf expression, i.e. one without any child expressions.
  */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
index 22687acd68a97..11c7950c0613b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.types.DataType
 
 /**
@@ -29,7 +30,8 @@ case class ScalaUDF(
     function: AnyRef,
     dataType: DataType,
     children: Seq[Expression],
-    inputTypes: Seq[DataType] = Nil) extends Expression with ImplicitCastInputTypes {
+    inputTypes: Seq[DataType] = Nil)
+  extends Expression with ImplicitCastInputTypes with CodegenFallback {
 
   override def nullable: Boolean = true
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
index b8f7068c9e5e5..3f436c0eb893c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
@@ -17,9 +17,6 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.types.DataType
 
 abstract sealed class SortDirection
@@ -30,7 +27,8 @@ case object Descending extends SortDirection
  * An expression that can be used to sort a tuple.  This class extends expression primarily so that
  * transformations over expression will descend into its child.
  */
-case class SortOrder(child: Expression, direction: SortDirection) extends UnaryExpression {
+case class SortOrder(child: Expression, direction: SortDirection)
+  extends UnaryExpression with Unevaluable {
 
   /** Sort order is not foldable because we don't have an eval for it. */
   override def foldable: Boolean = false
@@ -38,9 +36,5 @@ case class SortOrder(child: Expression, direction: SortDirection) extends UnaryE
   override def dataType: DataType = child.dataType
   override def nullable: Boolean = child.nullable
 
-  // SortOrder itself is never evaluated.
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
-
   override def toString: String = s"$child ${if (direction == Ascending) "ASC" else "DESC"}"
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index af9a674ab4958..d705a1286065c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -26,7 +26,8 @@ import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
 
-trait AggregateExpression extends Expression {
+
+trait AggregateExpression extends Expression with Unevaluable {
 
   /**
    * Aggregate expressions should not be foldable.
@@ -38,13 +39,6 @@ trait AggregateExpression extends Expression {
    * of input rows/
    */
   def newInstance(): AggregateFunction
-
-  /**
-   * [[AggregateExpression.eval]] should never be invoked because [[AggregateExpression]]'s are
-   * replaced with a physical aggregate operator at runtime.
-   */
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
 /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index e83650fc8cb0e..05b5ad88fee8f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.Interval
@@ -65,7 +65,8 @@ case class UnaryPositive(child: Expression) extends UnaryExpression with Expects
 /**
  * A function that get the absolute value of the numeric value.
  */
-case class Abs(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Abs(child: Expression)
+  extends UnaryExpression with ExpectsInputTypes with CodegenFallback {
 
   override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala
new file mode 100644
index 0000000000000..bf4f600cb26e5
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+import org.apache.spark.sql.catalyst.expressions.Expression
+
+/**
+ * A trait that can be used to provide a fallback mode for expression code generation.
+ */
+trait CodegenFallback { self: Expression =>
+
+  protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    ctx.references += this
+    val objectTerm = ctx.freshName("obj")
+    s"""
+      /* expression: ${this} */
+      Object $objectTerm = expressions[${ctx.references.size - 1}].eval(i);
+      boolean ${ev.isNull} = $objectTerm == null;
+      ${ctx.javaType(this.dataType)} ${ev.primitive} = ${ctx.defaultValue(this.dataType)};
+      if (!${ev.isNull}) {
+        ${ev.primitive} = (${ctx.boxedType(this.dataType)}) $objectTerm;
+      }
+    """
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index d1e4c458864f1..f9fd04c02aaef 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -19,13 +19,14 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
 /**
  * Returns an Array containing the evaluation of all children expressions.
  */
-case class CreateArray(children: Seq[Expression]) extends Expression {
+case class CreateArray(children: Seq[Expression]) extends Expression with CodegenFallback {
 
   override def foldable: Boolean = children.forall(_.foldable)
 
@@ -51,7 +52,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
  * Returns a Row containing the evaluation of all children expressions.
  * TODO: [[CreateStruct]] does not support codegen.
  */
-case class CreateStruct(children: Seq[Expression]) extends Expression {
+case class CreateStruct(children: Seq[Expression]) extends Expression with CodegenFallback {
 
   override def foldable: Boolean = children.forall(_.foldable)
 
@@ -83,7 +84,7 @@ case class CreateStruct(children: Seq[Expression]) extends Expression {
  *
  * @param children Seq(name1, val1, name2, val2, ...)
  */
-case class CreateNamedStruct(children: Seq[Expression]) extends Expression {
+case class CreateNamedStruct(children: Seq[Expression]) extends Expression with CodegenFallback {
 
   private lazy val (nameExprs, valExprs) =
     children.grouped(2).map { case Seq(name, value) => (name, value) }.toList.unzip
@@ -103,11 +104,11 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression {
 
   override def checkInputDataTypes(): TypeCheckResult = {
     if (children.size % 2 != 0) {
-      TypeCheckResult.TypeCheckFailure("CreateNamedStruct expects an even number of arguments.")
+      TypeCheckResult.TypeCheckFailure(s"$prettyName expects an even number of arguments.")
     } else {
       val invalidNames =
         nameExprs.filterNot(e => e.foldable && e.dataType == StringType && !nullable)
-      if (invalidNames.size != 0) {
+      if (invalidNames.nonEmpty) {
         TypeCheckResult.TypeCheckFailure(
           s"Odd position only allow foldable and not-null StringType expressions, got :" +
             s" ${invalidNames.mkString(",")}")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index dd5ec330a771b..4bed140cffbfa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 
@@ -27,7 +28,7 @@ import org.apache.spark.sql.types._
  *
  * There is no code generation since this expression should get constant folded by the optimizer.
  */
-case class CurrentDate() extends LeafExpression {
+case class CurrentDate() extends LeafExpression with CodegenFallback {
   override def foldable: Boolean = true
   override def nullable: Boolean = false
 
@@ -44,7 +45,7 @@ case class CurrentDate() extends LeafExpression {
  *
  * There is no code generation since this expression should get constant folded by the optimizer.
  */
-case class CurrentTimestamp() extends LeafExpression {
+case class CurrentTimestamp() extends LeafExpression with CodegenFallback {
   override def foldable: Boolean = true
   override def nullable: Boolean = false
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index c58a6d36141c1..2dbcf2830f876 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -20,9 +20,9 @@ package org.apache.spark.sql.catalyst.expressions
 import scala.collection.Map
 
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.{CatalystTypeConverters, trees}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.types._
 
 /**
@@ -73,7 +73,7 @@ case class UserDefinedGenerator(
     elementTypes: Seq[(DataType, Boolean)],
     function: Row => TraversableOnce[InternalRow],
     children: Seq[Expression])
-  extends Generator {
+  extends Generator with CodegenFallback {
 
   @transient private[this] var inputRow: InterpretedProjection = _
   @transient private[this] var convertToScala: (InternalRow) => Row = _
@@ -100,7 +100,7 @@ case class UserDefinedGenerator(
 /**
  * Given an input array produces a sequence of rows for each value in the array.
  */
-case class Explode(child: Expression) extends UnaryExpression with Generator {
+case class Explode(child: Expression) extends UnaryExpression with Generator with CodegenFallback {
 
   override def children: Seq[Expression] = child :: Nil
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index e1fdb29541fa8..f25ac32679587 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -21,7 +21,7 @@ import java.sql.{Date, Timestamp}
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types._
@@ -75,7 +75,8 @@ object IntegerLiteral {
 /**
  * In order to do type checking, use Literal.create() instead of constructor
  */
-case class Literal protected (value: Any, dataType: DataType) extends LeafExpression {
+case class Literal protected (value: Any, dataType: DataType)
+  extends LeafExpression with CodegenFallback {
 
   override def foldable: Boolean = true
   override def nullable: Boolean = value == null
@@ -142,7 +143,7 @@ case class Literal protected (value: Any, dataType: DataType) extends LeafExpres
 
 // TODO: Specialize
 case class MutableLiteral(var value: Any, dataType: DataType, nullable: Boolean = true)
-    extends LeafExpression {
+  extends LeafExpression with CodegenFallback {
 
   def update(expression: Expression, input: InternalRow): Unit = {
     value = expression.eval(input)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index eb5c065a34123..7ce64d29ba59a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import java.{lang => jl}
-import java.util.Arrays
 
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckSuccess, TypeCheckFailure}
@@ -29,11 +28,14 @@ import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * A leaf expression specifically for math constants. Math constants expect no input.
+ *
+ * There is no code generation because they should get constant folded by the optimizer.
+ *
  * @param c The math constant.
  * @param name The short name of the function
  */
 abstract class LeafMathExpression(c: Double, name: String)
-  extends LeafExpression with Serializable {
+  extends LeafExpression with CodegenFallback {
 
   override def dataType: DataType = DoubleType
   override def foldable: Boolean = true
@@ -41,13 +43,6 @@ abstract class LeafMathExpression(c: Double, name: String)
   override def toString: String = s"$name()"
 
   override def eval(input: InternalRow): Any = c
-
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    s"""
-      boolean ${ev.isNull} = false;
-      ${ctx.javaType(dataType)} ${ev.primitive} = java.lang.Math.$name;
-    """
-  }
 }
 
 /**
@@ -130,8 +125,16 @@ abstract class BinaryMathExpression(f: (Double, Double) => Double, name: String)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+/**
+ * Euler's number. Note that there is no code generation because this is only
+ * evaluated by the optimizer during constant folding.
+ */
 case class EulerNumber() extends LeafMathExpression(math.E, "E")
 
+/**
+ * Pi. Note that there is no code generation because this is only
+ * evaluated by the optimizer during constant folding.
+ */
 case class Pi() extends LeafMathExpression(math.Pi, "PI")
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -161,7 +164,7 @@ case class Cosh(child: Expression) extends UnaryMathExpression(math.cosh, "COSH"
  * @param toBaseExpr to which base
  */
 case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expression)
-  extends Expression with ImplicitCastInputTypes{
+  extends Expression with ImplicitCastInputTypes with CodegenFallback {
 
   override def foldable: Boolean = numExpr.foldable && fromBaseExpr.foldable && toBaseExpr.foldable
 
@@ -171,6 +174,8 @@ case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expre
 
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType, IntegerType, IntegerType)
 
+  override def dataType: DataType = StringType
+
   /** Returns the result of evaluating this expression on a given input Row */
   override def eval(input: InternalRow): Any = {
     val num = numExpr.eval(input)
@@ -179,17 +184,13 @@ case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expre
     if (num == null || fromBase == null || toBase == null) {
       null
     } else {
-      conv(num.asInstanceOf[UTF8String].getBytes,
-        fromBase.asInstanceOf[Int], toBase.asInstanceOf[Int])
+      conv(
+        num.asInstanceOf[UTF8String].getBytes,
+        fromBase.asInstanceOf[Int],
+        toBase.asInstanceOf[Int])
     }
   }
 
-  /**
-   * Returns the [[DataType]] of the result of evaluating this expression.  It is
-   * invalid to query the dataType of an unresolved expression (i.e., when `resolved` == false).
-   */
-  override def dataType: DataType = StringType
-
   private val value = new Array[Byte](64)
 
   /**
@@ -208,7 +209,7 @@ case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expre
       // Two's complement => x = uval - 2*MAX - 2
       // => uval = x + 2*MAX + 2
       // Now, use the fact: (a+b)/c = a/c + b/c + (a%c+b%c)/c
-      (x / m + 2 * (Long.MaxValue / m) + 2 / m + (x % m + 2 * (Long.MaxValue % m) + 2 % m) / m)
+      x / m + 2 * (Long.MaxValue / m) + 2 / m + (x % m + 2 * (Long.MaxValue % m) + 2 % m) / m
     }
   }
 
@@ -220,7 +221,7 @@ case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expre
    */
   private def decode(v: Long, radix: Int): Unit = {
     var tmpV = v
-    Arrays.fill(value, 0.asInstanceOf[Byte])
+    java.util.Arrays.fill(value, 0.asInstanceOf[Byte])
     var i = value.length - 1
     while (tmpV != 0) {
       val q = unsignedLongDiv(tmpV, radix)
@@ -254,7 +255,7 @@ case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expre
       v = v * radix + value(i)
       i += 1
     }
-    return v
+    v
   }
 
   /**
@@ -292,16 +293,16 @@ case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expre
    * NB: This logic is borrowed from org.apache.hadoop.hive.ql.ud.UDFConv
    */
   private def conv(n: Array[Byte] , fromBase: Int, toBase: Int ): UTF8String = {
-    if (n == null || fromBase == null || toBase == null || n.isEmpty) {
-      return null
-    }
-
     if (fromBase < Character.MIN_RADIX || fromBase > Character.MAX_RADIX
       || Math.abs(toBase) < Character.MIN_RADIX
       || Math.abs(toBase) > Character.MAX_RADIX) {
       return null
     }
 
+    if (n.length == 0) {
+      return null
+    }
+
     var (negative, first) = if (n(0) == '-') (true, 1) else (false, 0)
 
     // Copy the digits in the right side of the array
@@ -340,7 +341,7 @@ case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expre
       resultStartPos = firstNonZeroPos - 1
       value(resultStartPos) = '-'
     }
-    UTF8String.fromBytes( Arrays.copyOfRange(value, resultStartPos, value.length))
+    UTF8String.fromBytes(java.util.Arrays.copyOfRange(value, resultStartPos, value.length))
   }
 }
 
@@ -495,8 +496,8 @@ object Hex {
  * Otherwise if the number is a STRING, it converts each character into its hex representation
  * and returns the resulting STRING. Negative numbers would be treated as two's complement.
  */
-case class Hex(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
-  // TODO: Create code-gen version.
+case class Hex(child: Expression)
+  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
 
   override def inputTypes: Seq[AbstractDataType] =
     Seq(TypeCollection(LongType, BinaryType, StringType))
@@ -539,8 +540,8 @@ case class Hex(child: Expression) extends UnaryExpression with ImplicitCastInput
  * Performs the inverse operation of HEX.
  * Resulting characters are returned as a byte array.
  */
-case class Unhex(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
-  // TODO: Create code-gen version.
+case class Unhex(child: Expression)
+  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
 
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index c083ac08ded21..6f173b52ad9b9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -19,9 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
-import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
-import org.apache.spark.sql.catalyst.trees
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 
 object NamedExpression {
@@ -122,7 +120,9 @@ case class Alias(child: Expression, name: String)(
 
   override def eval(input: InternalRow): Any = child.eval(input)
 
+  /** Just a simple passthrough for code generation. */
   override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = ""
 
   override def dataType: DataType = child.dataType
   override def nullable: Boolean = child.nullable
@@ -177,7 +177,7 @@ case class AttributeReference(
     override val metadata: Metadata = Metadata.empty)(
     val exprId: ExprId = NamedExpression.newExprId,
     val qualifiers: Seq[String] = Nil)
-  extends Attribute {
+  extends Attribute with Unevaluable {
 
   /**
    * Returns true iff the expression id is the same for both attributes.
@@ -236,10 +236,6 @@ case class AttributeReference(
     }
   }
 
-  // Unresolved attributes are transient at compile time and don't get evaluated during execution.
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
-
   override def toString: String = s"$name#${exprId.id}$typeSuffix"
 }
 
@@ -247,7 +243,7 @@ case class AttributeReference(
  * A place holder used when printing expressions without debugging information such as the
  * expression id or the unresolved indicator.
  */
-case class PrettyAttribute(name: String) extends Attribute {
+case class PrettyAttribute(name: String) extends Attribute with Unevaluable {
 
   override def toString: String = name
 
@@ -259,7 +255,6 @@ case class PrettyAttribute(name: String) extends Attribute {
   override def withName(newName: String): Attribute = throw new UnsupportedOperationException
   override def qualifiers: Seq[String] = throw new UnsupportedOperationException
   override def exprId: ExprId = throw new UnsupportedOperationException
-  override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
   override def nullable: Boolean = throw new UnsupportedOperationException
   override def dataType: DataType = NullType
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index bddd2a9eccfc0..40ec3df224ce1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -19,10 +19,11 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.TypeUtils
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.types._
 
+
 object InterpretedPredicate {
   def create(expression: Expression, inputSchema: Seq[Attribute]): (InternalRow => Boolean) =
     create(BindReferences.bindReference(expression, inputSchema))
@@ -91,7 +92,7 @@ case class Not(child: Expression)
 /**
  * Evaluates to `true` if `list` contains `value`.
  */
-case class In(value: Expression, list: Seq[Expression]) extends Predicate {
+case class In(value: Expression, list: Seq[Expression]) extends Predicate with CodegenFallback {
   override def children: Seq[Expression] = value +: list
 
   override def nullable: Boolean = true // TODO: Figure out correct nullability semantics of IN.
@@ -109,7 +110,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate {
  * static.
  */
 case class InSet(child: Expression, hset: Set[Any])
-  extends UnaryExpression with Predicate {
+  extends UnaryExpression with Predicate with CodegenFallback {
 
   override def nullable: Boolean = true // TODO: Figure out correct nullability semantics of IN.
   override def toString: String = s"$child INSET ${hset.mkString("(", ",", ")")}"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
index 49b2026364cd6..5b0fe8dfe2fc8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/sets.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
 
@@ -52,7 +52,7 @@ private[sql] class OpenHashSetUDT(
 /**
  * Creates a new set of the specified type
  */
-case class NewSet(elementType: DataType) extends LeafExpression {
+case class NewSet(elementType: DataType) extends LeafExpression with CodegenFallback {
 
   override def nullable: Boolean = false
 
@@ -82,7 +82,8 @@ case class NewSet(elementType: DataType) extends LeafExpression {
  * Note: this expression is internal and created only by the GeneratedAggregate,
  * we don't need to do type check for it.
  */
-case class AddItemToSet(item: Expression, set: Expression) extends Expression {
+case class AddItemToSet(item: Expression, set: Expression)
+  extends Expression with CodegenFallback {
 
   override def children: Seq[Expression] = item :: set :: Nil
 
@@ -134,7 +135,8 @@ case class AddItemToSet(item: Expression, set: Expression) extends Expression {
  * Note: this expression is internal and created only by the GeneratedAggregate,
  * we don't need to do type check for it.
  */
-case class CombineSets(left: Expression, right: Expression) extends BinaryExpression {
+case class CombineSets(left: Expression, right: Expression)
+  extends BinaryExpression with CodegenFallback {
 
   override def nullable: Boolean = left.nullable
   override def dataType: DataType = left.dataType
@@ -181,7 +183,7 @@ case class CombineSets(left: Expression, right: Expression) extends BinaryExpres
  * Note: this expression is internal and created only by the GeneratedAggregate,
  * we don't need to do type check for it.
  */
-case class CountSet(child: Expression) extends UnaryExpression {
+case class CountSet(child: Expression) extends UnaryExpression with CodegenFallback {
 
   override def dataType: DataType = LongType
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index b36354eff092a..560b1bc2d889f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -103,7 +103,7 @@ trait StringRegexExpression extends ImplicitCastInputTypes {
  * Simple RegEx pattern matching function
  */
 case class Like(left: Expression, right: Expression)
-  extends BinaryExpression with StringRegexExpression {
+  extends BinaryExpression with StringRegexExpression with CodegenFallback {
 
   // replace the _ with .{1} exactly match 1 time of any character
   // replace the % with .*, match 0 or more times with any character
@@ -133,14 +133,16 @@ case class Like(left: Expression, right: Expression)
   override def toString: String = s"$left LIKE $right"
 }
 
+
 case class RLike(left: Expression, right: Expression)
-  extends BinaryExpression with StringRegexExpression {
+  extends BinaryExpression with StringRegexExpression with CodegenFallback {
 
   override def escape(v: String): String = v
   override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0)
   override def toString: String = s"$left RLIKE $right"
 }
 
+
 trait String2StringExpression extends ImplicitCastInputTypes {
   self: UnaryExpression =>
 
@@ -156,7 +158,8 @@ trait String2StringExpression extends ImplicitCastInputTypes {
 /**
  * A function that converts the characters of a string to uppercase.
  */
-case class Upper(child: Expression) extends UnaryExpression with String2StringExpression {
+case class Upper(child: Expression)
+  extends UnaryExpression with String2StringExpression {
 
   override def convert(v: UTF8String): UTF8String = v.toUpperCase
 
@@ -301,7 +304,7 @@ case class StringInstr(str: Expression, substr: Expression)
  * in given string after position pos.
  */
 case class StringLocate(substr: Expression, str: Expression, start: Expression)
-  extends Expression with ImplicitCastInputTypes {
+  extends Expression with ImplicitCastInputTypes with CodegenFallback {
 
   def this(substr: Expression, str: Expression) = {
     this(substr, str, Literal(0))
@@ -342,7 +345,7 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression)
  * Returns str, left-padded with pad to a length of len.
  */
 case class StringLPad(str: Expression, len: Expression, pad: Expression)
-  extends Expression with ImplicitCastInputTypes {
+  extends Expression with ImplicitCastInputTypes with CodegenFallback {
 
   override def children: Seq[Expression] = str :: len :: pad :: Nil
   override def foldable: Boolean = children.forall(_.foldable)
@@ -380,7 +383,7 @@ case class StringLPad(str: Expression, len: Expression, pad: Expression)
  * Returns str, right-padded with pad to a length of len.
  */
 case class StringRPad(str: Expression, len: Expression, pad: Expression)
-  extends Expression with ImplicitCastInputTypes {
+  extends Expression with ImplicitCastInputTypes with CodegenFallback {
 
   override def children: Seq[Expression] = str :: len :: pad :: Nil
   override def foldable: Boolean = children.forall(_.foldable)
@@ -417,9 +420,9 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression)
 /**
  * Returns the input formatted according do printf-style format strings
  */
-case class StringFormat(children: Expression*) extends Expression {
+case class StringFormat(children: Expression*) extends Expression with CodegenFallback {
 
-  require(children.length >=1, "printf() should take at least 1 argument")
+  require(children.nonEmpty, "printf() should take at least 1 argument")
 
   override def foldable: Boolean = children.forall(_.foldable)
   override def nullable: Boolean = children(0).nullable
@@ -436,7 +439,7 @@ case class StringFormat(children: Expression*) extends Expression {
       val formatter = new java.util.Formatter(sb, Locale.US)
 
       val arglist = args.map(_.eval(input).asInstanceOf[AnyRef])
-      formatter.format(pattern.asInstanceOf[UTF8String].toString(), arglist: _*)
+      formatter.format(pattern.asInstanceOf[UTF8String].toString, arglist: _*)
 
       UTF8String.fromString(sb.toString)
     }
@@ -483,7 +486,8 @@ case class StringReverse(child: Expression) extends UnaryExpression with String2
 /**
  * Returns a n spaces string.
  */
-case class StringSpace(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+case class StringSpace(child: Expression)
+  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
 
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(IntegerType)
@@ -503,7 +507,7 @@ case class StringSpace(child: Expression) extends UnaryExpression with ImplicitC
  * Splits str around pat (pattern is a regular expression).
  */
 case class StringSplit(str: Expression, pattern: Expression)
-  extends BinaryExpression with ImplicitCastInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes with CodegenFallback {
 
   override def left: Expression = str
   override def right: Expression = pattern
@@ -524,7 +528,7 @@ case class StringSplit(str: Expression, pattern: Expression)
  * Defined for String and Binary types.
  */
 case class Substring(str: Expression, pos: Expression, len: Expression)
-  extends Expression with ImplicitCastInputTypes {
+  extends Expression with ImplicitCastInputTypes with CodegenFallback {
 
   def this(str: Expression, pos: Expression) = {
     this(str, pos, Literal(Integer.MAX_VALUE))
@@ -606,8 +610,6 @@ case class Length(child: Expression) extends UnaryExpression with ExpectsInputTy
       case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length")
     }
   }
-
-  override def prettyName: String = "length"
 }
 
 /**
@@ -632,7 +634,9 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
 /**
  * Returns the numeric value of the first character of str.
  */
-case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+case class Ascii(child: Expression)
+  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
+
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
@@ -649,7 +653,9 @@ case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInp
 /**
  * Converts the argument from binary to a base 64 string.
  */
-case class Base64(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+case class Base64(child: Expression)
+  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
+
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
 
@@ -663,7 +669,9 @@ case class Base64(child: Expression) extends UnaryExpression with ImplicitCastIn
 /**
  * Converts the argument from a base 64 string to BINARY.
  */
-case class UnBase64(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+case class UnBase64(child: Expression)
+  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
+
   override def dataType: DataType = BinaryType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
@@ -677,7 +685,7 @@ case class UnBase64(child: Expression) extends UnaryExpression with ImplicitCast
  * If either argument is null, the result will also be null.
  */
 case class Decode(bin: Expression, charset: Expression)
-  extends BinaryExpression with ImplicitCastInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes with CodegenFallback {
 
   override def left: Expression = bin
   override def right: Expression = charset
@@ -696,7 +704,7 @@ case class Decode(bin: Expression, charset: Expression)
  * If either argument is null, the result will also be null.
 */
 case class Encode(value: Expression, charset: Expression)
-  extends BinaryExpression with ImplicitCastInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes with CodegenFallback {
 
   override def left: Expression = value
   override def right: Expression = charset
@@ -715,7 +723,7 @@ case class Encode(value: Expression, charset: Expression)
  * fractional part.
  */
 case class FormatNumber(x: Expression, d: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ExpectsInputTypes with CodegenFallback {
 
   override def left: Expression = x
   override def right: Expression = d
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
index c8aa571df64fc..50bbfd644d302 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
-import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.types.{DataType, NumericType}
 
 /**
@@ -37,7 +36,7 @@ sealed trait WindowSpec
 case class WindowSpecDefinition(
     partitionSpec: Seq[Expression],
     orderSpec: Seq[SortOrder],
-    frameSpecification: WindowFrame) extends Expression with WindowSpec {
+    frameSpecification: WindowFrame) extends Expression with WindowSpec with Unevaluable {
 
   def validate: Option[String] = frameSpecification match {
     case UnspecifiedFrame =>
@@ -75,7 +74,6 @@ case class WindowSpecDefinition(
 
   override def toString: String = simpleString
 
-  override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
   override def nullable: Boolean = true
   override def foldable: Boolean = false
   override def dataType: DataType = throw new UnsupportedOperationException
@@ -274,60 +272,43 @@ trait WindowFunction extends Expression {
 case class UnresolvedWindowFunction(
     name: String,
     children: Seq[Expression])
-  extends Expression with WindowFunction {
+  extends Expression with WindowFunction with Unevaluable {
 
   override def dataType: DataType = throw new UnresolvedException(this, "dataType")
   override def foldable: Boolean = throw new UnresolvedException(this, "foldable")
   override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
   override lazy val resolved = false
 
-  override def init(): Unit =
-    throw new UnresolvedException(this, "init")
-  override def reset(): Unit =
-    throw new UnresolvedException(this, "reset")
+  override def init(): Unit = throw new UnresolvedException(this, "init")
+  override def reset(): Unit = throw new UnresolvedException(this, "reset")
   override def prepareInputParameters(input: InternalRow): AnyRef =
     throw new UnresolvedException(this, "prepareInputParameters")
-  override def update(input: AnyRef): Unit =
-    throw new UnresolvedException(this, "update")
+  override def update(input: AnyRef): Unit = throw new UnresolvedException(this, "update")
   override def batchUpdate(inputs: Array[AnyRef]): Unit =
     throw new UnresolvedException(this, "batchUpdate")
-  override def evaluate(): Unit =
-    throw new UnresolvedException(this, "evaluate")
-  override def get(index: Int): Any =
-    throw new UnresolvedException(this, "get")
-  // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
+  override def evaluate(): Unit = throw new UnresolvedException(this, "evaluate")
+  override def get(index: Int): Any = throw new UnresolvedException(this, "get")
 
   override def toString: String = s"'$name(${children.mkString(",")})"
 
-  override def newInstance(): WindowFunction =
-    throw new UnresolvedException(this, "newInstance")
+  override def newInstance(): WindowFunction = throw new UnresolvedException(this, "newInstance")
 }
 
 case class UnresolvedWindowExpression(
     child: UnresolvedWindowFunction,
-    windowSpec: WindowSpecReference) extends UnaryExpression {
+    windowSpec: WindowSpecReference) extends UnaryExpression with Unevaluable {
 
   override def dataType: DataType = throw new UnresolvedException(this, "dataType")
   override def foldable: Boolean = throw new UnresolvedException(this, "foldable")
   override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
   override lazy val resolved = false
-
-  // Unresolved functions are transient at compile time and don't get evaluated during execution.
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
 case class WindowExpression(
     windowFunction: WindowFunction,
-    windowSpec: WindowSpecDefinition) extends Expression {
-
-  override def children: Seq[Expression] =
-    windowFunction :: windowSpec :: Nil
+    windowSpec: WindowSpecDefinition) extends Expression with Unevaluable {
 
-  override def eval(input: InternalRow): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
+  override def children: Seq[Expression] = windowFunction :: windowSpec :: Nil
 
   override def dataType: DataType = windowFunction.dataType
   override def foldable: Boolean = windowFunction.foldable
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
index 42dead7c28425..2dcfa19fec383 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -17,9 +17,7 @@
 
 package org.apache.spark.sql.catalyst.plans.physical
 
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.expressions.{Expression, SortOrder}
+import org.apache.spark.sql.catalyst.expressions.{Unevaluable, Expression, SortOrder}
 import org.apache.spark.sql.types.{DataType, IntegerType}
 
 /**
@@ -146,8 +144,7 @@ case object BroadcastPartitioning extends Partitioning {
  * in the same partition.
  */
 case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
-  extends Expression
-  with Partitioning {
+  extends Expression with Partitioning with Unevaluable {
 
   override def children: Seq[Expression] = expressions
   override def nullable: Boolean = false
@@ -169,9 +166,6 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
   }
 
   override def keyExpressions: Seq[Expression] = expressions
-
-  override def eval(input: InternalRow = null): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
 
 /**
@@ -187,8 +181,7 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
  * into its child.
  */
 case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int)
-  extends Expression
-  with Partitioning {
+  extends Expression with Partitioning with Unevaluable {
 
   override def children: Seq[SortOrder] = ordering
   override def nullable: Boolean = false
@@ -213,7 +206,4 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int)
   }
 
   override def keyExpressions: Seq[Expression] = ordering.map(_.child)
-
-  override def eval(input: InternalRow): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
index 2147d07e09bd3..dca8c881f21ab 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -31,9 +31,9 @@ import org.apache.spark.sql.types._
 
 case class TestFunction(
     children: Seq[Expression],
-    inputTypes: Seq[AbstractDataType]) extends Expression with ImplicitCastInputTypes {
+    inputTypes: Seq[AbstractDataType])
+  extends Expression with ImplicitCastInputTypes with Unevaluable {
   override def nullable: Boolean = true
-  override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
   override def dataType: DataType = StringType
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index c9b3c69c6de89..f9442bccc4a7a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -363,26 +363,26 @@ class HiveTypeCoercionSuite extends PlanTest {
 object HiveTypeCoercionSuite {
 
   case class AnyTypeUnaryExpression(child: Expression)
-    extends UnaryExpression with ExpectsInputTypes {
+    extends UnaryExpression with ExpectsInputTypes with Unevaluable {
     override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)
     override def dataType: DataType = NullType
   }
 
   case class NumericTypeUnaryExpression(child: Expression)
-    extends UnaryExpression with ExpectsInputTypes {
+    extends UnaryExpression with ExpectsInputTypes with Unevaluable {
     override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)
     override def dataType: DataType = NullType
   }
 
   case class AnyTypeBinaryOperator(left: Expression, right: Expression)
-    extends BinaryOperator {
+    extends BinaryOperator with Unevaluable {
     override def dataType: DataType = NullType
     override def inputType: AbstractDataType = AnyDataType
     override def symbol: String = "anytype"
   }
 
   case class NumericTypeBinaryOperator(left: Expression, right: Expression)
-    extends BinaryOperator {
+    extends BinaryOperator with Unevaluable {
     override def dataType: DataType = NullType
     override def inputType: AbstractDataType = NumericType
     override def symbol: String = "numerictype"
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
index 1bd7d4e5cdf0f..8fff39906b342 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala
@@ -22,9 +22,10 @@ import scala.collection.mutable.ArrayBuffer
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.types.{IntegerType, StringType, NullType}
 
-case class Dummy(optKey: Option[Expression]) extends Expression {
+case class Dummy(optKey: Option[Expression]) extends Expression with CodegenFallback {
   override def children: Seq[Expression] = optKey.toSeq
   override def nullable: Boolean = true
   override def dataType: NullType = NullType
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index 6d6e67dace177..e6e27a87c7151 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -51,15 +51,11 @@ private[spark] case class PythonUDF(
     broadcastVars: JList[Broadcast[PythonBroadcast]],
     accumulator: Accumulator[JList[Array[Byte]]],
     dataType: DataType,
-    children: Seq[Expression]) extends Expression with SparkLogging {
+    children: Seq[Expression]) extends Expression with Unevaluable with SparkLogging {
 
   override def toString: String = s"PythonUDF#$name(${children.mkString(",")})"
 
   override def nullable: Boolean = true
-
-  override def eval(input: InternalRow): Any = {
-    throw new UnsupportedOperationException("PythonUDFs can not be directly evaluated.")
-  }
 }
 
 /**
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
index 0bc8adb16afc0..4d23c7035c03d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
@@ -36,8 +36,8 @@ import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis
 import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder
-import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.hive.HiveShim._
@@ -81,7 +81,7 @@ private[hive] class HiveFunctionRegistry(underlying: analysis.FunctionRegistry)
 }
 
 private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
-  extends Expression with HiveInspectors with Logging {
+  extends Expression with HiveInspectors with CodegenFallback with Logging {
 
   type UDFType = UDF
 
@@ -146,7 +146,7 @@ private[hive] class DeferredObjectAdapter(oi: ObjectInspector)
 }
 
 private[hive] case class HiveGenericUDF(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
-  extends Expression with HiveInspectors with Logging {
+  extends Expression with HiveInspectors with CodegenFallback with Logging {
   type UDFType = GenericUDF
 
   override def deterministic: Boolean = isUDFDeterministic
@@ -166,8 +166,8 @@ private[hive] case class HiveGenericUDF(funcWrapper: HiveFunctionWrapper, childr
 
   @transient
   protected lazy val isUDFDeterministic = {
-    val udfType = function.getClass().getAnnotation(classOf[HiveUDFType])
-    (udfType != null && udfType.deterministic())
+    val udfType = function.getClass.getAnnotation(classOf[HiveUDFType])
+    udfType != null && udfType.deterministic()
   }
 
   override def foldable: Boolean =
@@ -301,7 +301,7 @@ private[hive] case class HiveWindowFunction(
     pivotResult: Boolean,
     isUDAFBridgeRequired: Boolean,
     children: Seq[Expression]) extends WindowFunction
-  with HiveInspectors {
+  with HiveInspectors with Unevaluable {
 
   // Hive window functions are based on GenericUDAFResolver2.
   type UDFType = GenericUDAFResolver2
@@ -330,7 +330,7 @@ private[hive] case class HiveWindowFunction(
     evaluator.init(GenericUDAFEvaluator.Mode.COMPLETE, inputInspectors)
   }
 
-  def dataType: DataType =
+  override def dataType: DataType =
     if (!pivotResult) {
       inspectorToDataType(returnInspector)
     } else {
@@ -344,10 +344,7 @@ private[hive] case class HiveWindowFunction(
       }
     }
 
-  def nullable: Boolean = true
-
-  override def eval(input: InternalRow): Any =
-    throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
+  override def nullable: Boolean = true
 
   @transient
   lazy val inputProjection = new InterpretedProjection(children)
@@ -406,7 +403,7 @@ private[hive] case class HiveWindowFunction(
     s"$nodeName#${funcWrapper.functionClassName}(${children.mkString(",")})"
   }
 
-  override def newInstance: WindowFunction =
+  override def newInstance(): WindowFunction =
     new HiveWindowFunction(funcWrapper, pivotResult, isUDAFBridgeRequired, children)
 }
 
@@ -476,7 +473,7 @@ private[hive] case class HiveUDAF(
 
 /**
  * Converts a Hive Generic User Defined Table Generating Function (UDTF) to a
- * [[catalyst.expressions.Generator Generator]].  Note that the semantics of Generators do not allow
+ * [[Generator]].  Note that the semantics of Generators do not allow
  * Generators to maintain state in between input rows.  Thus UDTFs that rely on partitioning
  * dependent operations like calls to `close()` before producing output will not operate the same as
  * in Hive.  However, in practice this should not affect compatibility for most sane UDTFs
@@ -488,7 +485,7 @@ private[hive] case class HiveUDAF(
 private[hive] case class HiveGenericUDTF(
     funcWrapper: HiveFunctionWrapper,
     children: Seq[Expression])
-  extends Generator with HiveInspectors {
+  extends Generator with HiveInspectors with CodegenFallback {
 
   @transient
   protected lazy val function: GenericUDTF = {

From 45d798c323ffe32bc2eba4dbd271c4572f5a30cf Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 18 Jul 2015 20:27:55 -0700
Subject: [PATCH 239/549] [SPARK-8278] Remove non-streaming JSON reader.

Author: Reynold Xin <rxin@databricks.com>

Closes #7501 from rxin/jsonrdd and squashes the following commits:

767ec55 [Reynold Xin] More Mima
51f456e [Reynold Xin] Mima exclude.
789cb80 [Reynold Xin] Fixed compilation error.
b4cf50d [Reynold Xin] [SPARK-8278] Remove non-streaming JSON reader.
---
 project/MimaExcludes.scala                    |   3 +
 .../apache/spark/sql/DataFrameReader.scala    |  15 +-
 .../scala/org/apache/spark/sql/SQLConf.scala  |   5 -
 .../apache/spark/sql/json/JSONRelation.scala  |  48 +-
 .../org/apache/spark/sql/json/JsonRDD.scala   | 449 ------------------
 .../org/apache/spark/sql/json/JsonSuite.scala |  27 +-
 6 files changed, 29 insertions(+), 518 deletions(-)
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 4e4e810ec36e3..36417f5df9f2d 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -64,6 +64,9 @@ object MimaExcludes {
             excludePackage("org.apache.spark.sql.execution"),
             // Parquet support is considered private.
             excludePackage("org.apache.spark.sql.parquet"),
+            // The old JSON RDD is removed in favor of streaming Jackson
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.json.JsonRDD$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.json.JsonRDD"),
             // local function inside a method
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.sql.SQLContext.org$apache$spark$sql$SQLContext$$needsConversion$1")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 9ad6e21da7bf7..9b23df4843c06 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -27,7 +27,7 @@ import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation}
-import org.apache.spark.sql.json.{JsonRDD, JSONRelation}
+import org.apache.spark.sql.json.JSONRelation
 import org.apache.spark.sql.parquet.ParquetRelation2
 import org.apache.spark.sql.sources.{LogicalRelation, ResolvedDataSource}
 import org.apache.spark.sql.types.StructType
@@ -236,17 +236,8 @@ class DataFrameReader private[sql](sqlContext: SQLContext) {
    */
   def json(jsonRDD: RDD[String]): DataFrame = {
     val samplingRatio = extraOptions.getOrElse("samplingRatio", "1.0").toDouble
-    if (sqlContext.conf.useJacksonStreamingAPI) {
-      sqlContext.baseRelationToDataFrame(
-        new JSONRelation(() => jsonRDD, None, samplingRatio, userSpecifiedSchema)(sqlContext))
-    } else {
-      val columnNameOfCorruptJsonRecord = sqlContext.conf.columnNameOfCorruptRecord
-      val appliedSchema = userSpecifiedSchema.getOrElse(
-        JsonRDD.nullTypeToStringType(
-          JsonRDD.inferSchema(jsonRDD, 1.0, columnNameOfCorruptJsonRecord)))
-      val rowRDD = JsonRDD.jsonStringToRow(jsonRDD, appliedSchema, columnNameOfCorruptJsonRecord)
-      sqlContext.internalCreateDataFrame(rowRDD, appliedSchema)
-    }
+    sqlContext.baseRelationToDataFrame(
+      new JSONRelation(() => jsonRDD, None, samplingRatio, userSpecifiedSchema)(sqlContext))
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 2c2f7c35dfdce..84d3271ceb738 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -401,9 +401,6 @@ private[spark] object SQLConf {
     "spark.sql.useSerializer2",
     defaultValue = Some(true), isPublic = false)
 
-  val USE_JACKSON_STREAMING_API = booleanConf("spark.sql.json.useJacksonStreamingAPI",
-    defaultValue = Some(true), doc = "<TODO>")
-
   object Deprecated {
     val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
   }
@@ -473,8 +470,6 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
 
   private[spark] def useSqlSerializer2: Boolean = getConf(USE_SQL_SERIALIZER2)
 
-  private[spark] def useJacksonStreamingAPI: Boolean = getConf(USE_JACKSON_STREAMING_API)
-
   private[spark] def autoBroadcastJoinThreshold: Int = getConf(AUTO_BROADCASTJOIN_THRESHOLD)
 
   private[spark] def defaultSizeInBytes: Long =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
index 2361d3bf52d2b..25802d054ac00 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
@@ -157,51 +157,27 @@ private[sql] class JSONRelation(
     }
   }
 
-  private val useJacksonStreamingAPI: Boolean = sqlContext.conf.useJacksonStreamingAPI
-
   override val needConversion: Boolean = false
 
   override lazy val schema = userSpecifiedSchema.getOrElse {
-    if (useJacksonStreamingAPI) {
-      InferSchema(
-        baseRDD(),
-        samplingRatio,
-        sqlContext.conf.columnNameOfCorruptRecord)
-    } else {
-      JsonRDD.nullTypeToStringType(
-        JsonRDD.inferSchema(
-          baseRDD(),
-          samplingRatio,
-          sqlContext.conf.columnNameOfCorruptRecord))
-    }
+    InferSchema(
+      baseRDD(),
+      samplingRatio,
+      sqlContext.conf.columnNameOfCorruptRecord)
   }
 
   override def buildScan(): RDD[Row] = {
-    if (useJacksonStreamingAPI) {
-      JacksonParser(
-        baseRDD(),
-        schema,
-        sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
-    } else {
-      JsonRDD.jsonStringToRow(
-        baseRDD(),
-        schema,
-        sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
-    }
+    JacksonParser(
+      baseRDD(),
+      schema,
+      sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
   }
 
   override def buildScan(requiredColumns: Seq[Attribute], filters: Seq[Expression]): RDD[Row] = {
-    if (useJacksonStreamingAPI) {
-      JacksonParser(
-        baseRDD(),
-        StructType.fromAttributes(requiredColumns),
-        sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
-    } else {
-      JsonRDD.jsonStringToRow(
-        baseRDD(),
-        StructType.fromAttributes(requiredColumns),
-        sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
-    }
+    JacksonParser(
+      baseRDD(),
+      StructType.fromAttributes(requiredColumns),
+      sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
   }
 
   override def insert(data: DataFrame, overwrite: Boolean): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
deleted file mode 100644
index b392a51bf7dce..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.json
-
-import scala.collection.Map
-import scala.collection.convert.Wrappers.{JListWrapper, JMapWrapper}
-
-import com.fasterxml.jackson.core.JsonProcessingException
-import com.fasterxml.jackson.databind.ObjectMapper
-
-import org.apache.spark.Logging
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.ScalaReflection
-import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
-
-
-private[sql] object JsonRDD extends Logging {
-
-  private[sql] def jsonStringToRow(
-      json: RDD[String],
-      schema: StructType,
-      columnNameOfCorruptRecords: String): RDD[InternalRow] = {
-    parseJson(json, columnNameOfCorruptRecords).map(parsed => asRow(parsed, schema))
-  }
-
-  private[sql] def inferSchema(
-      json: RDD[String],
-      samplingRatio: Double = 1.0,
-      columnNameOfCorruptRecords: String): StructType = {
-    require(samplingRatio > 0, s"samplingRatio ($samplingRatio) should be greater than 0")
-    val schemaData = if (samplingRatio > 0.99) json else json.sample(false, samplingRatio, 1)
-    val allKeys =
-      if (schemaData.isEmpty()) {
-        Set.empty[(String, DataType)]
-      } else {
-        parseJson(schemaData, columnNameOfCorruptRecords).map(allKeysWithValueTypes).reduce(_ ++ _)
-      }
-    createSchema(allKeys)
-  }
-
-  private def createSchema(allKeys: Set[(String, DataType)]): StructType = {
-    // Resolve type conflicts
-    val resolved = allKeys.groupBy {
-      case (key, dataType) => key
-    }.map {
-      // Now, keys and types are organized in the format of
-      // key -> Set(type1, type2, ...).
-      case (key, typeSet) => {
-        val fieldName = key.substring(1, key.length - 1).split("`.`").toSeq
-        val dataType = typeSet.map {
-          case (_, dataType) => dataType
-        }.reduce((type1: DataType, type2: DataType) => compatibleType(type1, type2))
-
-        (fieldName, dataType)
-      }
-    }
-
-    def makeStruct(values: Seq[Seq[String]], prefix: Seq[String]): StructType = {
-      val (topLevel, structLike) = values.partition(_.size == 1)
-
-      val topLevelFields = topLevel.filter {
-        name => resolved.get(prefix ++ name).get match {
-          case ArrayType(elementType, _) => {
-            def hasInnerStruct(t: DataType): Boolean = t match {
-              case s: StructType => true
-              case ArrayType(t1, _) => hasInnerStruct(t1)
-              case o => false
-            }
-
-            // Check if this array has inner struct.
-            !hasInnerStruct(elementType)
-          }
-          case struct: StructType => false
-          case _ => true
-        }
-      }.map {
-        a => StructField(a.head, resolved.get(prefix ++ a).get, nullable = true)
-      }
-      val topLevelFieldNameSet = topLevelFields.map(_.name)
-
-      val structFields: Seq[StructField] = structLike.groupBy(_(0)).filter {
-        case (name, _) => !topLevelFieldNameSet.contains(name)
-      }.map {
-        case (name, fields) => {
-          val nestedFields = fields.map(_.tail)
-          val structType = makeStruct(nestedFields, prefix :+ name)
-          val dataType = resolved.get(prefix :+ name).get
-          dataType match {
-            case array: ArrayType =>
-              // The pattern of this array is ArrayType(...(ArrayType(StructType))).
-              // Since the inner struct of array is a placeholder (StructType(Nil)),
-              // we need to replace this placeholder with the actual StructType (structType).
-              def getActualArrayType(
-                  innerStruct: StructType,
-                  currentArray: ArrayType): ArrayType = currentArray match {
-                case ArrayType(s: StructType, containsNull) =>
-                  ArrayType(innerStruct, containsNull)
-                case ArrayType(a: ArrayType, containsNull) =>
-                  ArrayType(getActualArrayType(innerStruct, a), containsNull)
-              }
-              Some(StructField(name, getActualArrayType(structType, array), nullable = true))
-            case struct: StructType => Some(StructField(name, structType, nullable = true))
-            // dataType is StringType means that we have resolved type conflicts involving
-            // primitive types and complex types. So, the type of name has been relaxed to
-            // StringType. Also, this field should have already been put in topLevelFields.
-            case StringType => None
-          }
-        }
-      }.flatMap(field => field).toSeq
-
-      StructType((topLevelFields ++ structFields).sortBy(_.name))
-    }
-
-    makeStruct(resolved.keySet.toSeq, Nil)
-  }
-
-  private[sql] def nullTypeToStringType(struct: StructType): StructType = {
-    val fields = struct.fields.map {
-      case StructField(fieldName, dataType, nullable, _) => {
-        val newType = dataType match {
-          case NullType => StringType
-          case ArrayType(NullType, containsNull) => ArrayType(StringType, containsNull)
-          case ArrayType(struct: StructType, containsNull) =>
-            ArrayType(nullTypeToStringType(struct), containsNull)
-          case struct: StructType => nullTypeToStringType(struct)
-          case other: DataType => other
-        }
-        StructField(fieldName, newType, nullable)
-      }
-    }
-
-    StructType(fields)
-  }
-
-  /**
-   * Returns the most general data type for two given data types.
-   */
-  private[json] def compatibleType(t1: DataType, t2: DataType): DataType = {
-    HiveTypeCoercion.findTightestCommonTypeOfTwo(t1, t2) match {
-      case Some(commonType) => commonType
-      case None =>
-        // t1 or t2 is a StructType, ArrayType, or an unexpected type.
-        (t1, t2) match {
-          case (other: DataType, NullType) => other
-          case (NullType, other: DataType) => other
-          case (StructType(fields1), StructType(fields2)) => {
-            val newFields = (fields1 ++ fields2).groupBy(field => field.name).map {
-              case (name, fieldTypes) => {
-                val dataType = fieldTypes.map(field => field.dataType).reduce(
-                  (type1: DataType, type2: DataType) => compatibleType(type1, type2))
-                StructField(name, dataType, true)
-              }
-            }
-            StructType(newFields.toSeq.sortBy(_.name))
-          }
-          case (ArrayType(elementType1, containsNull1), ArrayType(elementType2, containsNull2)) =>
-            ArrayType(compatibleType(elementType1, elementType2), containsNull1 || containsNull2)
-          // TODO: We should use JsonObjectStringType to mark that values of field will be
-          // strings and every string is a Json object.
-          case (_, _) => StringType
-        }
-    }
-  }
-
-  private def typeOfPrimitiveValue: PartialFunction[Any, DataType] = {
-    // For Integer values, use LongType by default.
-    val useLongType: PartialFunction[Any, DataType] = {
-      case value: IntegerType.InternalType => LongType
-    }
-
-    useLongType orElse ScalaReflection.typeOfObject orElse {
-      // Since we do not have a data type backed by BigInteger,
-      // when we see a Java BigInteger, we use DecimalType.
-      case value: java.math.BigInteger => DecimalType.Unlimited
-      // DecimalType's JVMType is scala BigDecimal.
-      case value: java.math.BigDecimal => DecimalType.Unlimited
-      // Unexpected data type.
-      case _ => StringType
-    }
-  }
-
-  /**
-   * Returns the element type of an JSON array. We go through all elements of this array
-   * to detect any possible type conflict. We use [[compatibleType]] to resolve
-   * type conflicts.
-   */
-  private def typeOfArray(l: Seq[Any]): ArrayType = {
-    val elements = l.flatMap(v => Option(v))
-    if (elements.isEmpty) {
-      // If this JSON array is empty, we use NullType as a placeholder.
-      // If this array is not empty in other JSON objects, we can resolve
-      // the type after we have passed through all JSON objects.
-      ArrayType(NullType, containsNull = true)
-    } else {
-      val elementType = elements.map {
-        e => e match {
-          case map: Map[_, _] => StructType(Nil)
-          // We have an array of arrays. If those element arrays do not have the same
-          // element types, we will return ArrayType[StringType].
-          case seq: Seq[_] => typeOfArray(seq)
-          case value => typeOfPrimitiveValue(value)
-        }
-      }.reduce((type1: DataType, type2: DataType) => compatibleType(type1, type2))
-
-      ArrayType(elementType, containsNull = true)
-    }
-  }
-
-  /**
-   * Figures out all key names and data types of values from a parsed JSON object
-   * (in the format of Map[Stirng, Any]). When the value of a key is an JSON object, we
-   * only use a placeholder (StructType(Nil)) to mark that it should be a struct
-   * instead of getting all fields of this struct because a field does not appear
-   * in this JSON object can appear in other JSON objects.
-   */
-  private def allKeysWithValueTypes(m: Map[String, Any]): Set[(String, DataType)] = {
-    val keyValuePairs = m.map {
-      // Quote the key with backticks to handle cases which have dots
-      // in the field name.
-      case (key, value) => (s"`$key`", value)
-    }.toSet
-    keyValuePairs.flatMap {
-      case (key: String, struct: Map[_, _]) => {
-        // The value associated with the key is an JSON object.
-        allKeysWithValueTypes(struct.asInstanceOf[Map[String, Any]]).map {
-          case (k, dataType) => (s"$key.$k", dataType)
-        } ++ Set((key, StructType(Nil)))
-      }
-      case (key: String, array: Seq[_]) => {
-        // The value associated with the key is an array.
-        // Handle inner structs of an array.
-        def buildKeyPathForInnerStructs(v: Any, t: DataType): Seq[(String, DataType)] = t match {
-          case ArrayType(e: StructType, _) => {
-            // The elements of this arrays are structs.
-            v.asInstanceOf[Seq[Map[String, Any]]].flatMap(Option(_)).flatMap {
-              element => allKeysWithValueTypes(element)
-            }.map {
-              case (k, t) => (s"$key.$k", t)
-            }
-          }
-          case ArrayType(t1, _) =>
-            v.asInstanceOf[Seq[Any]].flatMap(Option(_)).flatMap {
-              element => buildKeyPathForInnerStructs(element, t1)
-            }
-          case other => Nil
-        }
-        val elementType = typeOfArray(array)
-        buildKeyPathForInnerStructs(array, elementType) :+ (key, elementType)
-      }
-      // we couldn't tell what the type is if the value is null or empty string
-      case (key: String, value) if value == "" || value == null => (key, NullType) :: Nil
-      case (key: String, value) => (key, typeOfPrimitiveValue(value)) :: Nil
-    }
-  }
-
-  /**
-   * Converts a Java Map/List to a Scala Map/Seq.
-   * We do not use Jackson's scala module at here because
-   * DefaultScalaModule in jackson-module-scala will make
-   * the parsing very slow.
-   */
-  private def scalafy(obj: Any): Any = obj match {
-    case map: java.util.Map[_, _] =>
-      // .map(identity) is used as a workaround of non-serializable Map
-      // generated by .mapValues.
-      // This issue is documented at https://issues.scala-lang.org/browse/SI-7005
-      JMapWrapper(map).mapValues(scalafy).map(identity)
-    case list: java.util.List[_] =>
-      JListWrapper(list).map(scalafy)
-    case atom => atom
-  }
-
-  private def parseJson(
-      json: RDD[String],
-      columnNameOfCorruptRecords: String): RDD[Map[String, Any]] = {
-    // According to [Jackson-72: https://jira.codehaus.org/browse/JACKSON-72],
-    // ObjectMapper will not return BigDecimal when
-    // "DeserializationFeature.USE_BIG_DECIMAL_FOR_FLOATS" is disabled
-    // (see NumberDeserializer.deserialize for the logic).
-    // But, we do not want to enable this feature because it will use BigDecimal
-    // for every float number, which will be slow.
-    // So, right now, we will have Infinity for those BigDecimal number.
-    // TODO: Support BigDecimal.
-    json.mapPartitions(iter => {
-      // When there is a key appearing multiple times (a duplicate key),
-      // the ObjectMapper will take the last value associated with this duplicate key.
-      // For example: for {"key": 1, "key":2}, we will get "key"->2.
-      val mapper = new ObjectMapper()
-      iter.flatMap { record =>
-        try {
-          val parsed = mapper.readValue(record, classOf[Object]) match {
-            case map: java.util.Map[_, _] => scalafy(map).asInstanceOf[Map[String, Any]] :: Nil
-            case list: java.util.List[_] => scalafy(list).asInstanceOf[Seq[Map[String, Any]]]
-            case _ =>
-              sys.error(
-                s"Failed to parse record $record. Please make sure that each line of the file " +
-                "(or each string in the RDD) is a valid JSON object or an array of JSON objects.")
-          }
-
-          parsed
-        } catch {
-          case e: JsonProcessingException =>
-            Map(columnNameOfCorruptRecords -> UTF8String.fromString(record)) :: Nil
-        }
-      }
-    })
-  }
-
-  private def toLong(value: Any): Long = {
-    value match {
-      case value: java.lang.Integer => value.asInstanceOf[Int].toLong
-      case value: java.lang.Long => value.asInstanceOf[Long]
-    }
-  }
-
-  private def toDouble(value: Any): Double = {
-    value match {
-      case value: java.lang.Integer => value.asInstanceOf[Int].toDouble
-      case value: java.lang.Long => value.asInstanceOf[Long].toDouble
-      case value: java.lang.Double => value.asInstanceOf[Double]
-    }
-  }
-
-  private def toDecimal(value: Any): Decimal = {
-    value match {
-      case value: java.lang.Integer => Decimal(value)
-      case value: java.lang.Long => Decimal(value)
-      case value: java.math.BigInteger => Decimal(new java.math.BigDecimal(value))
-      case value: java.lang.Double => Decimal(value)
-      case value: java.math.BigDecimal => Decimal(value)
-    }
-  }
-
-  private def toJsonArrayString(seq: Seq[Any]): String = {
-    val builder = new StringBuilder
-    builder.append("[")
-    var count = 0
-    seq.foreach {
-      element =>
-        if (count > 0) builder.append(",")
-        count += 1
-        builder.append(toString(element))
-    }
-    builder.append("]")
-
-    builder.toString()
-  }
-
-  private def toJsonObjectString(map: Map[String, Any]): String = {
-    val builder = new StringBuilder
-    builder.append("{")
-    var count = 0
-    map.foreach {
-      case (key, value) =>
-        if (count > 0) builder.append(",")
-        count += 1
-        val stringValue = if (value.isInstanceOf[String]) s"""\"$value\"""" else toString(value)
-        builder.append(s"""\"${key}\":${stringValue}""")
-    }
-    builder.append("}")
-
-    builder.toString()
-  }
-
-  private def toString(value: Any): String = {
-    value match {
-      case value: Map[_, _] => toJsonObjectString(value.asInstanceOf[Map[String, Any]])
-      case value: Seq[_] => toJsonArrayString(value)
-      case value => Option(value).map(_.toString).orNull
-    }
-  }
-
-  private def toDate(value: Any): Int = {
-    value match {
-      // only support string as date
-      case value: java.lang.String =>
-        DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(value).getTime)
-      case value: java.sql.Date => DateTimeUtils.fromJavaDate(value)
-    }
-  }
-
-  private def toTimestamp(value: Any): Long = {
-    value match {
-      case value: java.lang.Integer => value.asInstanceOf[Int].toLong * 1000L
-      case value: java.lang.Long => value * 1000L
-      case value: java.lang.String => DateTimeUtils.stringToTime(value).getTime * 1000L
-    }
-  }
-
-  private[json] def enforceCorrectType(value: Any, desiredType: DataType): Any = {
-    if (value == null) {
-      null
-    } else {
-      desiredType match {
-        case StringType => UTF8String.fromString(toString(value))
-        case _ if value == null || value == "" => null // guard the non string type
-        case IntegerType => value.asInstanceOf[IntegerType.InternalType]
-        case LongType => toLong(value)
-        case DoubleType => toDouble(value)
-        case DecimalType() => toDecimal(value)
-        case BooleanType => value.asInstanceOf[BooleanType.InternalType]
-        case NullType => null
-        case ArrayType(elementType, _) =>
-          value.asInstanceOf[Seq[Any]].map(enforceCorrectType(_, elementType))
-        case MapType(StringType, valueType, _) =>
-          val map = value.asInstanceOf[Map[String, Any]]
-          map.map {
-            case (k, v) =>
-              (UTF8String.fromString(k), enforceCorrectType(v, valueType))
-          }.map(identity)
-        case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct)
-        case DateType => toDate(value)
-        case TimestampType => toTimestamp(value)
-      }
-    }
-  }
-
-  private def asRow(json: Map[String, Any], schema: StructType): InternalRow = {
-    // TODO: Reuse the row instead of creating a new one for every record.
-    val row = new GenericMutableRow(schema.fields.length)
-    schema.fields.zipWithIndex.foreach {
-      case (StructField(name, dataType, _, _), i) =>
-        row.update(i, json.get(name).flatMap(v => Option(v)).map(
-          enforceCorrectType(_, dataType)).orNull)
-    }
-
-    row
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 8204a584179bb..3475f9dd6787e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -1079,28 +1079,23 @@ class JsonSuite extends QueryTest with TestJsonData {
   }
 
   test("SPARK-7565 MapType in JsonRDD") {
-    val useStreaming = ctx.conf.useJacksonStreamingAPI
     val oldColumnNameOfCorruptRecord = ctx.conf.columnNameOfCorruptRecord
     ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed")
 
     val schemaWithSimpleMap = StructType(
       StructField("map", MapType(StringType, IntegerType, true), false) :: Nil)
-    try{
-      for (useStreaming <- List(true, false)) {
-        ctx.setConf(SQLConf.USE_JACKSON_STREAMING_API, useStreaming)
-        val temp = Utils.createTempDir().getPath
-
-        val df = ctx.read.schema(schemaWithSimpleMap).json(mapType1)
-        df.write.mode("overwrite").parquet(temp)
-        // order of MapType is not defined
-        assert(ctx.read.parquet(temp).count() == 5)
-
-        val df2 = ctx.read.json(corruptRecords)
-        df2.write.mode("overwrite").parquet(temp)
-        checkAnswer(ctx.read.parquet(temp), df2.collect())
-      }
+    try {
+      val temp = Utils.createTempDir().getPath
+
+      val df = ctx.read.schema(schemaWithSimpleMap).json(mapType1)
+      df.write.mode("overwrite").parquet(temp)
+      // order of MapType is not defined
+      assert(ctx.read.parquet(temp).count() == 5)
+
+      val df2 = ctx.read.json(corruptRecords)
+      df2.write.mode("overwrite").parquet(temp)
+      checkAnswer(ctx.read.parquet(temp), df2.collect())
     } finally {
-      ctx.setConf(SQLConf.USE_JACKSON_STREAMING_API, useStreaming)
       ctx.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, oldColumnNameOfCorruptRecord)
     }
   }

From 6cb6096c016178b9ce5c97592abe529ddb18cef2 Mon Sep 17 00:00:00 2001
From: Forest Fang <forest.fang@outlook.com>
Date: Sat, 18 Jul 2015 21:05:44 -0700
Subject: [PATCH 240/549] [SPARK-8443][SQL] Split GenerateMutableProjection
 Codegen due to JVM Code Size Limits

By grouping projection calls into multiple apply function, we are able to push the number of projections codegen can handle from ~1k to ~60k. I have set the unit test to test against 5k as 60k took 15s for the unit test to complete.

Author: Forest Fang <forest.fang@outlook.com>

Closes #7076 from saurfang/codegen_size_limit and squashes the following commits:

b7a7635 [Forest Fang] [SPARK-8443][SQL] Execute and verify split projections in test
adef95a [Forest Fang] [SPARK-8443][SQL] Use safer factor and rewrite splitting code
1b5aa7e [Forest Fang] [SPARK-8443][SQL] inline execution if one block only
9405680 [Forest Fang] [SPARK-8443][SQL] split projection code by size limit
---
 .../codegen/GenerateMutableProjection.scala   | 39 ++++++++++++++++++-
 .../expressions/CodeGenerationSuite.scala     | 14 ++++++-
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index 71e47d4f9b620..b82bd6814b487 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.sql.catalyst.expressions._
 
+import scala.collection.mutable.ArrayBuffer
+
 // MutableProjection is not accessible in Java
 abstract class BaseMutableProjection extends MutableProjection
 
@@ -45,10 +47,41 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
           else
             ${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)};
         """
-    }.mkString("\n")
+    }
+    // collect projections into blocks as function has 64kb codesize limit in JVM
+    val projectionBlocks = new ArrayBuffer[String]()
+    val blockBuilder = new StringBuilder()
+    for (projection <- projectionCode) {
+      if (blockBuilder.length > 16 * 1000) {
+        projectionBlocks.append(blockBuilder.toString())
+        blockBuilder.clear()
+      }
+      blockBuilder.append(projection)
+    }
+    projectionBlocks.append(blockBuilder.toString())
+
+    val (projectionFuns, projectionCalls) = {
+      // inline execution if codesize limit was not broken
+      if (projectionBlocks.length == 1) {
+        ("", projectionBlocks.head)
+      } else {
+        (
+          projectionBlocks.zipWithIndex.map { case (body, i) =>
+            s"""
+               |private void apply$i(InternalRow i) {
+               |  $body
+               |}
+             """.stripMargin
+          }.mkString,
+          projectionBlocks.indices.map(i => s"apply$i(i);").mkString("\n")
+        )
+      }
+    }
+
     val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
       s"private $javaType $variableName = $initialValue;"
     }.mkString("\n      ")
+
     val code = s"""
       public Object generate($exprType[] expr) {
         return new SpecificProjection(expr);
@@ -75,9 +108,11 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
           return (InternalRow) mutableRow;
         }
 
+        $projectionFuns
+
         public Object apply(Object _i) {
           InternalRow i = (InternalRow) _i;
-          $projectionCode
+          $projectionCalls
 
           return mutableRow;
         }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
index 481b335d15dfd..e05218a23aa73 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
 /**
  * Additional tests for code generation.
  */
-class CodeGenerationSuite extends SparkFunSuite {
+class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("multithreaded eval") {
     import scala.concurrent._
@@ -42,4 +42,16 @@ class CodeGenerationSuite extends SparkFunSuite {
 
     futures.foreach(Await.result(_, 10.seconds))
   }
+
+  test("SPARK-8443: split wide projections into blocks due to JVM code size limit") {
+    val length = 5000
+    val expressions = List.fill(length)(EqualTo(Literal(1), Literal(1)))
+    val plan = GenerateMutableProjection.generate(expressions)()
+    val actual = plan(new GenericMutableRow(length)).toSeq
+    val expected = Seq.fill(length)(true)
+
+    if (!checkResult(actual, expected)) {
+      fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected")
+    }
+  }
 }

From 83b682beec884da76708769414108f4316e620f2 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Sat, 18 Jul 2015 22:48:05 -0700
Subject: [PATCH 241/549] 
 [SPARK-8199][SPARK-8184][SPARK-8183][SPARK-8182][SPARK-8181][SPARK-8180][SPARK-8179][SPARK-8177][SPARK-8178][SPARK-9115][SQL]
 date functions

Jira:
https://issues.apache.org/jira/browse/SPARK-8199
https://issues.apache.org/jira/browse/SPARK-8184
https://issues.apache.org/jira/browse/SPARK-8183
https://issues.apache.org/jira/browse/SPARK-8182
https://issues.apache.org/jira/browse/SPARK-8181
https://issues.apache.org/jira/browse/SPARK-8180
https://issues.apache.org/jira/browse/SPARK-8179
https://issues.apache.org/jira/browse/SPARK-8177
https://issues.apache.org/jira/browse/SPARK-8179
https://issues.apache.org/jira/browse/SPARK-9115

Regarding `day`and `dayofmonth` are both necessary?

~~I am going to add `Quarter` to this PR as well.~~ Done.

~~As soon as the Scala coding is reviewed and discussed, I'll add the python api.~~ Done

Author: Tarek Auel <tarek.auel@googlemail.com>
Author: Tarek Auel <tarek.auel@gmail.com>

Closes #6981 from tarekauel/SPARK-8199 and squashes the following commits:

f7b4c8c [Tarek Auel] [SPARK-8199] fixed bug in tests
bb567b6 [Tarek Auel] [SPARK-8199] fixed test
3e095ba [Tarek Auel] [SPARK-8199] style and timezone fix
256c357 [Tarek Auel] [SPARK-8199] code cleanup
5983dcc [Tarek Auel] [SPARK-8199] whitespace fix
6e0c78f [Tarek Auel] [SPARK-8199] removed setTimeZone in tests, according to cloud-fans comment in #7488
4afc09c [Tarek Auel] [SPARK-8199] concise leap year handling
ea6c110 [Tarek Auel] [SPARK-8199] fix after merging master
70238e0 [Tarek Auel] Merge branch 'master' into SPARK-8199
3c6ae2e [Tarek Auel] [SPARK-8199] removed binary search
fb98ba0 [Tarek Auel] [SPARK-8199] python docstring fix
cdfae27 [Tarek Auel] [SPARK-8199] cleanup & python docstring fix
746b80a [Tarek Auel] [SPARK-8199] build fix
0ad6db8 [Tarek Auel] [SPARK-8199] minor fix
523542d [Tarek Auel] [SPARK-8199] address comments
2259299 [Tarek Auel] [SPARK-8199] day_of_month alias
d01b977 [Tarek Auel] [SPARK-8199] python underscore
56c4a92 [Tarek Auel] [SPARK-8199] update python docu
e223bc0 [Tarek Auel] [SPARK-8199] refactoring
d6aa14e [Tarek Auel] [SPARK-8199] fixed Hive compatibility
b382267 [Tarek Auel] [SPARK-8199] fixed bug in day calculation; removed set TimeZone in HiveCompatibilitySuite for test purposes; removed Hive tests for second and minute, because we can cast '2015-03-18' to a timestamp and extract a minute/second from it
1b2e540 [Tarek Auel] [SPARK-8119] style fix
0852655 [Tarek Auel] [SPARK-8119] changed from ExpectsInputTypes to implicit casts
ec87c69 [Tarek Auel] [SPARK-8119] bug fixing and refactoring
1358cdc [Tarek Auel] Merge remote-tracking branch 'origin/master' into SPARK-8199
740af0e [Tarek Auel] implement date function using a calculation based on days
4fb66da [Tarek Auel] WIP: date functions on calculation only
1a436c9 [Tarek Auel] wip
f775f39 [Tarek Auel] fixed return type
ad17e96 [Tarek Auel] improved implementation
c42b444 [Tarek Auel] Removed merge conflict file
ccb723c [Tarek Auel] [SPARK-8199] style and fixed merge issues
10e4ad1 [Tarek Auel] Merge branch 'master' into date-functions-fast
7d9f0eb [Tarek Auel] [SPARK-8199] git renaming issue
f3e7a9f [Tarek Auel] [SPARK-8199] revert change in DataFrameFunctionsSuite
6f5d95c [Tarek Auel] [SPARK-8199] fixed year interval
d9f8ac3 [Tarek Auel] [SPARK-8199] implement fast track
7bc9d93 [Tarek Auel] Merge branch 'master' into SPARK-8199
5a105d9 [Tarek Auel] [SPARK-8199] rebase after #6985 got merged
eb6760d [Tarek Auel] Merge branch 'master' into SPARK-8199
f120415 [Tarek Auel] improved runtime
a8edebd [Tarek Auel] use Calendar instead of SimpleDateFormat
5fe74e1 [Tarek Auel] fixed python style
3bfac90 [Tarek Auel] fixed style
356df78 [Tarek Auel] rely on cast mechanism of Spark. Simplified implementation
02efc5d [Tarek Auel] removed doubled code
a5ea120 [Tarek Auel] added python api; changed test to be more meaningful
b680db6 [Tarek Auel] added codegeneration to all functions
c739788 [Tarek Auel] added support for quarter SPARK-8178
849fb41 [Tarek Auel] fixed stupid test
638596f [Tarek Auel] improved codegen
4d8049b [Tarek Auel] fixed tests and added type check
5ebb235 [Tarek Auel] resolved naming conflict
d0e2f99 [Tarek Auel] date functions
---
 python/pyspark/sql/functions.py               | 150 +++++++++++
 .../catalyst/analysis/FunctionRegistry.scala  |  14 +-
 .../expressions/datetimeFunctions.scala       | 206 +++++++++++++++
 .../sql/catalyst/util/DateTimeUtils.scala     | 195 +++++++++++++-
 .../expressions/DateFunctionsSuite.scala      | 249 ++++++++++++++++++
 .../catalyst/util/DateTimeUtilsSuite.scala    |  91 +++++--
 .../org/apache/spark/sql/functions.scala      | 176 +++++++++++++
 .../spark/sql/DateExpressionsSuite.scala      | 170 ++++++++++++
 .../execution/HiveCompatibilitySuite.scala    |   9 +-
 9 files changed, 1234 insertions(+), 26 deletions(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/DateExpressionsSuite.scala

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index e0816b3e654bc..0aca3788922aa 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -652,6 +652,156 @@ def ntile(n):
     return Column(sc._jvm.functions.ntile(int(n)))
 
 
+@ignore_unicode_prefix
+@since(1.5)
+def date_format(dateCol, format):
+    """
+    Converts a date/timestamp/string to a value of string in the format specified by the date
+    format given by the second argument.
+
+    A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
+    pattern letters of the Java class `java.text.SimpleDateFormat` can be used.
+
+    NOTE: Use when ever possible specialized functions like `year`. These benefit from a
+    specialized implementation.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
+    >>> df.select(date_format('a', 'MM/dd/yyy').alias('date')).collect()
+    [Row(date=u'04/08/2015')]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.date_format(dateCol, format))
+
+
+@since(1.5)
+def year(col):
+    """
+    Extract the year of a given date as integer.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
+    >>> df.select(year('a').alias('year')).collect()
+    [Row(year=2015)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.year(col))
+
+
+@since(1.5)
+def quarter(col):
+    """
+    Extract the quarter of a given date as integer.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
+    >>> df.select(quarter('a').alias('quarter')).collect()
+    [Row(quarter=2)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.quarter(col))
+
+
+@since(1.5)
+def month(col):
+    """
+    Extract the month of a given date as integer.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
+    >>> df.select(month('a').alias('month')).collect()
+    [Row(month=4)]
+   """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.month(col))
+
+
+@since(1.5)
+def day(col):
+    """
+    Extract the day of the month of a given date as integer.
+
+    >>> sqlContext.createDataFrame([('2015-04-08',)], ['a']).select(day('a').alias('day')).collect()
+    [Row(day=8)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.day(col))
+
+
+@since(1.5)
+def day_of_month(col):
+    """
+    Extract the day of the month of a given date as integer.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
+    >>> df.select(day_of_month('a').alias('day')).collect()
+    [Row(day=8)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.day_of_month(col))
+
+
+@since(1.5)
+def day_in_year(col):
+    """
+    Extract the day of the year of a given date as integer.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
+    >>> df.select(day_in_year('a').alias('day')).collect()
+    [Row(day=98)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.day_in_year(col))
+
+
+@since(1.5)
+def hour(col):
+    """
+    Extract the hours of a given date as integer.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08 13:08:15',)], ['a'])
+    >>> df.select(hour('a').alias('hour')).collect()
+    [Row(hour=13)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.hour(col))
+
+
+@since(1.5)
+def minute(col):
+    """
+    Extract the minutes of a given date as integer.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08 13:08:15',)], ['a'])
+    >>> df.select(minute('a').alias('minute')).collect()
+    [Row(minute=8)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.minute(col))
+
+
+@since(1.5)
+def second(col):
+    """
+    Extract the seconds of a given date as integer.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08 13:08:15',)], ['a'])
+    >>> df.select(second('a').alias('second')).collect()
+    [Row(second=15)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.second(col))
+
+
+@since(1.5)
+def week_of_year(col):
+    """
+    Extract the week number of a given date as integer.
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
+    >>> df.select(week_of_year('a').alias('week')).collect()
+    [Row(week=15)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.week_of_year(col))
+
+
 class UserDefinedFunction(object):
     """
     User defined function in Python
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index d1cda6bc27095..159f7eca7acfe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -181,7 +181,19 @@ object FunctionRegistry {
 
     // datetime functions
     expression[CurrentDate]("current_date"),
-    expression[CurrentTimestamp]("current_timestamp")
+    expression[CurrentTimestamp]("current_timestamp"),
+    expression[DateFormatClass]("date_format"),
+    expression[Day]("day"),
+    expression[DayInYear]("day_in_year"),
+    expression[Day]("day_of_month"),
+    expression[Hour]("hour"),
+    expression[Month]("month"),
+    expression[Minute]("minute"),
+    expression[Quarter]("quarter"),
+    expression[Second]("second"),
+    expression[WeekOfYear]("week_of_year"),
+    expression[Year]("year")
+
   )
 
   val builtin: FunctionRegistry = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index 4bed140cffbfa..f9cbbb8c6bee0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -17,10 +17,16 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.sql.Date
+import java.text.SimpleDateFormat
+import java.util.{Calendar, TimeZone}
+
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * Returns the current date at the start of query evaluation.
@@ -55,3 +61,203 @@ case class CurrentTimestamp() extends LeafExpression with CodegenFallback {
     System.currentTimeMillis() * 1000L
   }
 }
+
+case class Hour(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
+
+  override def dataType: DataType = IntegerType
+
+  override protected def nullSafeEval(timestamp: Any): Any = {
+    DateTimeUtils.getHours(timestamp.asInstanceOf[Long])
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (c) =>
+      s"""$dtu.getHours($c)"""
+    )
+  }
+}
+
+case class Minute(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
+
+  override def dataType: DataType = IntegerType
+
+  override protected def nullSafeEval(timestamp: Any): Any = {
+    DateTimeUtils.getMinutes(timestamp.asInstanceOf[Long])
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (c) =>
+      s"""$dtu.getMinutes($c)"""
+    )
+  }
+}
+
+case class Second(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
+
+  override def dataType: DataType = IntegerType
+
+  override protected def nullSafeEval(timestamp: Any): Any = {
+    DateTimeUtils.getSeconds(timestamp.asInstanceOf[Long])
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (c) =>
+      s"""$dtu.getSeconds($c)"""
+    )
+  }
+}
+
+case class DayInYear(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
+
+  override def dataType: DataType = IntegerType
+
+  override def prettyName: String = "day_in_year"
+
+  override protected def nullSafeEval(date: Any): Any = {
+    DateTimeUtils.getDayInYear(date.asInstanceOf[Int])
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (c) =>
+      s"""$dtu.getDayInYear($c)"""
+    )
+  }
+}
+
+
+case class Year(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
+
+  override def dataType: DataType = IntegerType
+
+  override protected def nullSafeEval(date: Any): Any = {
+    DateTimeUtils.getYear(date.asInstanceOf[Int])
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (c) =>
+      s"""$dtu.getYear($c)"""
+    )
+  }
+}
+
+case class Quarter(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
+
+  override def dataType: DataType = IntegerType
+
+  override protected def nullSafeEval(date: Any): Any = {
+    DateTimeUtils.getQuarter(date.asInstanceOf[Int])
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (c) =>
+      s"""$dtu.getQuarter($c)"""
+    )
+  }
+}
+
+case class Month(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
+
+  override def dataType: DataType = IntegerType
+
+  override protected def nullSafeEval(date: Any): Any = {
+    DateTimeUtils.getMonth(date.asInstanceOf[Int])
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (c) =>
+      s"""$dtu.getMonth($c)"""
+    )
+  }
+}
+
+case class Day(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
+
+  override def dataType: DataType = IntegerType
+
+  override protected def nullSafeEval(date: Any): Any = {
+    DateTimeUtils.getDayOfMonth(date.asInstanceOf[Int])
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (c) =>
+      s"""$dtu.getDayOfMonth($c)"""
+    )
+  }
+}
+
+case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
+
+  override def dataType: DataType = IntegerType
+
+  override def prettyName: String = "week_of_year"
+
+  override protected def nullSafeEval(date: Any): Any = {
+    val c = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
+    c.setFirstDayOfWeek(Calendar.MONDAY)
+    c.setMinimalDaysInFirstWeek(4)
+    c.setTimeInMillis(date.asInstanceOf[Int] * 1000L * 3600L * 24L)
+    c.get(Calendar.WEEK_OF_YEAR)
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
+    nullSafeCodeGen(ctx, ev, (time) => {
+      val cal = classOf[Calendar].getName
+      val c = ctx.freshName("cal")
+      s"""
+        $cal $c = $cal.getInstance(java.util.TimeZone.getTimeZone("UTC"));
+        $c.setFirstDayOfWeek($cal.MONDAY);
+        $c.setMinimalDaysInFirstWeek(4);
+        $c.setTimeInMillis($time * 1000L * 3600L * 24L);
+        ${ev.primitive} = $c.get($cal.WEEK_OF_YEAR);
+      """
+    })
+}
+
+case class DateFormatClass(left: Expression, right: Expression) extends BinaryExpression
+  with ImplicitCastInputTypes {
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringType)
+
+  override def prettyName: String = "date_format"
+
+  override protected def nullSafeEval(timestamp: Any, format: Any): Any = {
+    val sdf = new SimpleDateFormat(format.toString)
+    UTF8String.fromString(sdf.format(new Date(timestamp.asInstanceOf[Long] / 1000)))
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val sdf = classOf[SimpleDateFormat].getName
+    defineCodeGen(ctx, ev, (timestamp, format) => {
+      s"""UTF8String.fromString((new $sdf($format.toString()))
+          .format(new java.sql.Date($timestamp / 1000)))"""
+    })
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 45e45aef1a349..a0da73a995a82 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util
 
 import java.sql.{Date, Timestamp}
 import java.text.{DateFormat, SimpleDateFormat}
-import java.util.{Calendar, TimeZone}
+import java.util.{TimeZone, Calendar}
 
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -39,6 +39,15 @@ object DateTimeUtils {
   final val MICROS_PER_SECOND = 1000L * 1000L
   final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
 
+  // number of days in 400 years
+  final val daysIn400Years: Int = 146097
+  // number of days between 1.1.1970 and 1.1.2001
+  final val to2001 = -11323
+
+  // this is year -17999, calculation: 50 * daysIn400Year
+  final val toYearZero = to2001 + 7304850
+
+  @transient lazy val defaultTimeZone = TimeZone.getDefault
 
   // Java TimeZone has no mention of thread safety. Use thread local instance to be safe.
   private val threadLocalLocalTimeZone = new ThreadLocal[TimeZone] {
@@ -380,4 +389,188 @@ object DateTimeUtils {
     c.set(Calendar.MILLISECOND, 0)
     Some((c.getTimeInMillis / MILLIS_PER_DAY).toInt)
   }
+
+  /**
+   * Returns the hour value of a given timestamp value. The timestamp is expressed in microseconds.
+   */
+  def getHours(timestamp: Long): Int = {
+    val localTs = (timestamp / 1000) + defaultTimeZone.getOffset(timestamp / 1000)
+    ((localTs / 1000 / 3600) % 24).toInt
+  }
+
+  /**
+   * Returns the minute value of a given timestamp value. The timestamp is expressed in
+   * microseconds.
+   */
+  def getMinutes(timestamp: Long): Int = {
+    val localTs = (timestamp / 1000) + defaultTimeZone.getOffset(timestamp / 1000)
+    ((localTs / 1000 / 60) % 60).toInt
+  }
+
+  /**
+   * Returns the second value of a given timestamp value. The timestamp is expressed in
+   * microseconds.
+   */
+  def getSeconds(timestamp: Long): Int = {
+    ((timestamp / 1000 / 1000) % 60).toInt
+  }
+
+  private[this] def isLeapYear(year: Int): Boolean = {
+    (year % 4) == 0 && ((year % 100) != 0 || (year % 400) == 0)
+  }
+
+  /**
+   * Return the number of days since the start of 400 year period.
+   * The second year of a 400 year period (year 1) starts on day 365.
+   */
+  private[this] def yearBoundary(year: Int): Int = {
+    year * 365 + ((year / 4 ) - (year / 100) + (year / 400))
+  }
+
+  /**
+   * Calculates the number of years for the given number of days. This depends
+   * on a 400 year period.
+   * @param days days since the beginning of the 400 year period
+   * @return (number of year, days in year)
+   */
+  private[this] def numYears(days: Int): (Int, Int) = {
+    val year = days / 365
+    val boundary = yearBoundary(year)
+    if (days > boundary) (year, days - boundary) else (year - 1, days - yearBoundary(year - 1))
+  }
+
+  /**
+   * Calculates the year and and the number of the day in the year for the given
+   * number of days. The given days is the number of days since 1.1.1970.
+   *
+   * The calculation uses the fact that the period 1.1.2001 until 31.12.2400 is
+   * equals to the period 1.1.1601 until 31.12.2000.
+   */
+  private[this] def getYearAndDayInYear(daysSince1970: Int): (Int, Int) = {
+    // add the difference (in days) between 1.1.1970 and the artificial year 0 (-17999)
+    val daysNormalized = daysSince1970 + toYearZero
+    val numOfQuarterCenturies = daysNormalized / daysIn400Years
+    val daysInThis400 = daysNormalized % daysIn400Years + 1
+    val (years, dayInYear) = numYears(daysInThis400)
+    val year: Int = (2001 - 20000) + 400 * numOfQuarterCenturies + years
+    (year, dayInYear)
+  }
+
+  /**
+   * Returns the 'day in year' value for the given date. The date is expressed in days
+   * since 1.1.1970.
+   */
+  def getDayInYear(date: Int): Int = {
+    getYearAndDayInYear(date)._2
+  }
+
+  /**
+   * Returns the year value for the given date. The date is expressed in days
+   * since 1.1.1970.
+   */
+  def getYear(date: Int): Int = {
+    getYearAndDayInYear(date)._1
+  }
+
+  /**
+   * Returns the quarter for the given date. The date is expressed in days
+   * since 1.1.1970.
+   */
+  def getQuarter(date: Int): Int = {
+    var (year, dayInYear) = getYearAndDayInYear(date)
+    if (isLeapYear(year)) {
+      dayInYear = dayInYear - 1
+    }
+    if (dayInYear <= 90) {
+      1
+    } else if (dayInYear <= 181) {
+      2
+    } else if (dayInYear <= 273) {
+      3
+    } else {
+      4
+    }
+  }
+
+  /**
+   * Returns the month value for the given date. The date is expressed in days
+   * since 1.1.1970. January is month 1.
+   */
+  def getMonth(date: Int): Int = {
+    var (year, dayInYear) = getYearAndDayInYear(date)
+    if (isLeapYear(year)) {
+      if (dayInYear == 60) {
+        return 2
+      } else if (dayInYear > 60) {
+        dayInYear = dayInYear - 1
+      }
+    }
+
+    if (dayInYear <= 31) {
+      1
+    } else if (dayInYear <= 59) {
+      2
+    } else if (dayInYear <= 90) {
+      3
+    } else if (dayInYear <= 120) {
+      4
+    } else if (dayInYear <= 151) {
+      5
+    } else if (dayInYear <= 181) {
+      6
+    } else if (dayInYear <= 212) {
+      7
+    } else if (dayInYear <= 243) {
+      8
+    } else if (dayInYear <= 273) {
+      9
+    } else if (dayInYear <= 304) {
+      10
+    } else if (dayInYear <= 334) {
+      11
+    } else {
+      12
+    }
+  }
+
+  /**
+   * Returns the 'day of month' value for the given date. The date is expressed in days
+   * since 1.1.1970.
+   */
+  def getDayOfMonth(date: Int): Int = {
+    var (year, dayInYear) = getYearAndDayInYear(date)
+    if (isLeapYear(year)) {
+      if (dayInYear == 60) {
+        return 29
+      } else if (dayInYear > 60) {
+        dayInYear = dayInYear - 1
+      }
+    }
+
+    if (dayInYear <= 31) {
+      dayInYear
+    } else if (dayInYear <= 59) {
+      dayInYear - 31
+    } else if (dayInYear <= 90) {
+      dayInYear - 59
+    } else if (dayInYear <= 120) {
+      dayInYear - 90
+    } else if (dayInYear <= 151) {
+      dayInYear - 120
+    } else if (dayInYear <= 181) {
+      dayInYear - 151
+    } else if (dayInYear <= 212) {
+      dayInYear - 181
+    } else if (dayInYear <= 243) {
+      dayInYear - 212
+    } else if (dayInYear <= 273) {
+      dayInYear - 243
+    } else if (dayInYear <= 304) {
+      dayInYear - 273
+    } else if (dayInYear <= 334) {
+      dayInYear - 304
+    } else {
+      dayInYear - 334
+    }
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
new file mode 100644
index 0000000000000..49d0b0aceac0d
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
@@ -0,0 +1,249 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import java.sql.{Timestamp, Date}
+import java.text.SimpleDateFormat
+import java.util.{TimeZone, Calendar}
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types.{StringType, TimestampType, DateType}
+
+class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
+  val sdfDate = new SimpleDateFormat("yyyy-MM-dd")
+  val d = new Date(sdf.parse("2015-04-08 13:10:15").getTime)
+  val ts = new Timestamp(sdf.parse("2013-11-08 13:10:15").getTime)
+
+  test("Day in Year") {
+    val sdfDay = new SimpleDateFormat("D")
+    (2002 to 2004).foreach { y =>
+      (0 to 11).foreach { m =>
+        (0 to 5).foreach { i =>
+          val c = Calendar.getInstance()
+          c.set(y, m, 28, 0, 0, 0)
+          c.add(Calendar.DATE, i)
+          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+            sdfDay.format(c.getTime).toInt)
+        }
+      }
+    }
+
+    (1998 to 2002).foreach { y =>
+      (0 to 11).foreach { m =>
+        (0 to 5).foreach { i =>
+          val c = Calendar.getInstance()
+          c.set(y, m, 28, 0, 0, 0)
+          c.add(Calendar.DATE, 1)
+          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+            sdfDay.format(c.getTime).toInt)
+        }
+      }
+    }
+
+    (1969 to 1970).foreach { y =>
+      (0 to 11).foreach { m =>
+        (0 to 5).foreach { i =>
+          val c = Calendar.getInstance()
+          c.set(y, m, 28, 0, 0, 0)
+          c.add(Calendar.DATE, 1)
+          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+            sdfDay.format(c.getTime).toInt)
+        }
+      }
+    }
+
+    (2402 to 2404).foreach { y =>
+      (0 to 11).foreach { m =>
+        (0 to 5).foreach { i =>
+          val c = Calendar.getInstance()
+          c.set(y, m, 28, 0, 0, 0)
+          c.add(Calendar.DATE, 1)
+          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+            sdfDay.format(c.getTime).toInt)
+        }
+      }
+    }
+
+    (2398 to 2402).foreach { y =>
+      (0 to 11).foreach { m =>
+        (0 to 5).foreach { i =>
+          val c = Calendar.getInstance()
+          c.set(y, m, 28, 0, 0, 0)
+          c.add(Calendar.DATE, 1)
+          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+            sdfDay.format(c.getTime).toInt)
+        }
+      }
+    }
+  }
+
+  test("Year") {
+    checkEvaluation(Year(Literal.create(null, DateType)), null)
+    checkEvaluation(Year(Cast(Literal(d), DateType)), 2015)
+    checkEvaluation(Year(Cast(Literal(sdfDate.format(d)), DateType)), 2015)
+    checkEvaluation(Year(Cast(Literal(ts), DateType)), 2013)
+
+    val c = Calendar.getInstance()
+    (2000 to 2010).foreach { y =>
+      (0 to 11 by 11).foreach { m =>
+        c.set(y, m, 28)
+        (0 to 5 * 24).foreach { i =>
+          c.add(Calendar.HOUR_OF_DAY, 1)
+          checkEvaluation(Year(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+            c.get(Calendar.YEAR))
+        }
+      }
+    }
+  }
+
+  test("Quarter") {
+    checkEvaluation(Quarter(Literal.create(null, DateType)), null)
+    checkEvaluation(Quarter(Cast(Literal(d), DateType)), 2)
+    checkEvaluation(Quarter(Cast(Literal(sdfDate.format(d)), DateType)), 2)
+    checkEvaluation(Quarter(Cast(Literal(ts), DateType)), 4)
+
+    val c = Calendar.getInstance()
+    (2003 to 2004).foreach { y =>
+      (0 to 11 by 3).foreach { m =>
+        c.set(y, m, 28, 0, 0, 0)
+        (0 to 5 * 24).foreach { i =>
+          c.add(Calendar.HOUR_OF_DAY, 1)
+          checkEvaluation(Quarter(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+            c.get(Calendar.MONTH) / 3 + 1)
+        }
+      }
+    }
+  }
+
+  test("Month") {
+    checkEvaluation(Month(Literal.create(null, DateType)), null)
+    checkEvaluation(Month(Cast(Literal(d), DateType)), 4)
+    checkEvaluation(Month(Cast(Literal(sdfDate.format(d)), DateType)), 4)
+    checkEvaluation(Month(Cast(Literal(ts), DateType)), 11)
+
+    (2003 to 2004).foreach { y =>
+      (0 to 11).foreach { m =>
+        (0 to 5 * 24).foreach { i =>
+          val c = Calendar.getInstance()
+          c.set(y, m, 28, 0, 0, 0)
+          c.add(Calendar.HOUR_OF_DAY, i)
+          checkEvaluation(Month(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+            c.get(Calendar.MONTH) + 1)
+        }
+      }
+    }
+
+    (1999 to 2000).foreach { y =>
+      (0 to 11).foreach { m =>
+        (0 to 5 * 24).foreach { i =>
+          val c = Calendar.getInstance()
+          c.set(y, m, 28, 0, 0, 0)
+          c.add(Calendar.HOUR_OF_DAY, i)
+          checkEvaluation(Month(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+            c.get(Calendar.MONTH) + 1)
+        }
+      }
+    }
+  }
+
+  test("Day") {
+    checkEvaluation(Day(Cast(Literal("2000-02-29"), DateType)), 29)
+    checkEvaluation(Day(Literal.create(null, DateType)), null)
+    checkEvaluation(Day(Cast(Literal(d), DateType)), 8)
+    checkEvaluation(Day(Cast(Literal(sdfDate.format(d)), DateType)), 8)
+    checkEvaluation(Day(Cast(Literal(ts), DateType)), 8)
+
+    (1999 to 2000).foreach { y =>
+      val c = Calendar.getInstance()
+      c.set(y, 0, 1, 0, 0, 0)
+      (0 to 365).foreach { d =>
+        c.add(Calendar.DATE, 1)
+        checkEvaluation(Day(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          c.get(Calendar.DAY_OF_MONTH))
+      }
+    }
+  }
+
+  test("Seconds") {
+    checkEvaluation(Second(Literal.create(null, DateType)), null)
+    checkEvaluation(Second(Cast(Literal(d), TimestampType)), 0)
+    checkEvaluation(Second(Cast(Literal(sdf.format(d)), TimestampType)), 15)
+    checkEvaluation(Second(Literal(ts)), 15)
+
+    val c = Calendar.getInstance()
+    (0 to 60 by 5).foreach { s =>
+      c.set(2015, 18, 3, 3, 5, s)
+      checkEvaluation(Second(Cast(Literal(new Timestamp(c.getTimeInMillis)), TimestampType)),
+        c.get(Calendar.SECOND))
+    }
+  }
+
+  test("WeekOfYear") {
+    checkEvaluation(WeekOfYear(Literal.create(null, DateType)), null)
+    checkEvaluation(WeekOfYear(Cast(Literal(d), DateType)), 15)
+    checkEvaluation(WeekOfYear(Cast(Literal(sdfDate.format(d)), DateType)), 15)
+    checkEvaluation(WeekOfYear(Cast(Literal(ts), DateType)), 45)
+    checkEvaluation(WeekOfYear(Cast(Literal("2011-05-06"), DateType)), 18)
+  }
+
+  test("DateFormat") {
+    checkEvaluation(DateFormatClass(Literal.create(null, TimestampType), Literal("y")), null)
+    checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType),
+      Literal.create(null, StringType)), null)
+    checkEvaluation(DateFormatClass(Cast(Literal(d), TimestampType),
+      Literal("y")), "2015")
+    checkEvaluation(DateFormatClass(Literal(ts), Literal("y")), "2013")
+  }
+
+  test("Hour") {
+    checkEvaluation(Hour(Literal.create(null, DateType)), null)
+    checkEvaluation(Hour(Cast(Literal(d), TimestampType)), 0)
+    checkEvaluation(Hour(Cast(Literal(sdf.format(d)), TimestampType)), 13)
+    checkEvaluation(Hour(Literal(ts)), 13)
+
+    val c = Calendar.getInstance()
+    (0 to 24).foreach { h =>
+      (0 to 60 by 15).foreach { m =>
+        (0 to 60 by 15).foreach { s =>
+          c.set(2015, 18, 3, h, m, s)
+          checkEvaluation(Hour(Cast(Literal(new Timestamp(c.getTimeInMillis)), TimestampType)),
+            c.get(Calendar.HOUR_OF_DAY))
+        }
+      }
+    }
+  }
+
+  test("Minute") {
+    checkEvaluation(Minute(Literal.create(null, DateType)), null)
+    checkEvaluation(Minute(Cast(Literal(d), TimestampType)), 0)
+    checkEvaluation(Minute(Cast(Literal(sdf.format(d)), TimestampType)), 10)
+    checkEvaluation(Minute(Literal(ts)), 10)
+
+    val c = Calendar.getInstance()
+    (0 to 60 by 5).foreach { m =>
+      (0 to 60 by 15).foreach { s =>
+        c.set(2015, 18, 3, 3, m, s)
+        checkEvaluation(Minute(Cast(Literal(new Timestamp(c.getTimeInMillis)), TimestampType)),
+          c.get(Calendar.MINUTE))
+      }
+    }
+  }
+
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index 04c5f09792ac3..fab9eb9cd4c9f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -26,6 +26,11 @@ import org.apache.spark.unsafe.types.UTF8String
 
 class DateTimeUtilsSuite extends SparkFunSuite {
 
+  private[this] def getInUTCDays(timestamp: Long): Int = {
+    val tz = TimeZone.getDefault
+    ((timestamp + tz.getOffset(timestamp)) / DateTimeUtils.MILLIS_PER_DAY).toInt
+  }
+
   test("timestamp and us") {
     val now = new Timestamp(System.currentTimeMillis())
     now.setNanos(1000)
@@ -277,28 +282,6 @@ class DateTimeUtilsSuite extends SparkFunSuite {
     assert(DateTimeUtils.stringToTimestamp(
       UTF8String.fromString("2011-05-06 07:08:09.1000")).get === c.getTimeInMillis * 1000)
 
-    val defaultTimeZone = TimeZone.getDefault
-    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))
-
-    c = Calendar.getInstance()
-    c.set(2015, 2, 8, 2, 0, 0)
-    c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(
-      UTF8String.fromString("2015-3-8 2:0:0")).get === c.getTimeInMillis * 1000)
-    c.add(Calendar.MINUTE, 30)
-    assert(DateTimeUtils.stringToTimestamp(
-      UTF8String.fromString("2015-3-8 3:30:0")).get === c.getTimeInMillis * 1000)
-    assert(DateTimeUtils.stringToTimestamp(
-      UTF8String.fromString("2015-3-8 2:30:0")).get === c.getTimeInMillis * 1000)
-
-    c = Calendar.getInstance()
-    c.set(2015, 10, 1, 1, 59, 0)
-    c.set(Calendar.MILLISECOND, 0)
-    c.add(Calendar.MINUTE, 31)
-    assert(DateTimeUtils.stringToTimestamp(
-      UTF8String.fromString("2015-11-1 2:30:0")).get === c.getTimeInMillis * 1000)
-    TimeZone.setDefault(defaultTimeZone)
-
     assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("238")).isEmpty)
     assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18 123142")).isEmpty)
     assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T123123")).isEmpty)
@@ -314,4 +297,68 @@ class DateTimeUtilsSuite extends SparkFunSuite {
     assert(DateTimeUtils.stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03.17-1:0:0")).isEmpty)
   }
+
+  test("hours") {
+    val c = Calendar.getInstance()
+    c.set(2015, 2, 18, 13, 2, 11)
+    assert(DateTimeUtils.getHours(c.getTimeInMillis * 1000) === 13)
+    c.set(2015, 12, 8, 2, 7, 9)
+    assert(DateTimeUtils.getHours(c.getTimeInMillis * 1000) === 2)
+  }
+
+  test("minutes") {
+    val c = Calendar.getInstance()
+    c.set(2015, 2, 18, 13, 2, 11)
+    assert(DateTimeUtils.getMinutes(c.getTimeInMillis * 1000) === 2)
+    c.set(2015, 2, 8, 2, 7, 9)
+    assert(DateTimeUtils.getMinutes(c.getTimeInMillis * 1000) === 7)
+  }
+
+  test("seconds") {
+    val c = Calendar.getInstance()
+    c.set(2015, 2, 18, 13, 2, 11)
+    assert(DateTimeUtils.getSeconds(c.getTimeInMillis * 1000) === 11)
+    c.set(2015, 2, 8, 2, 7, 9)
+    assert(DateTimeUtils.getSeconds(c.getTimeInMillis * 1000) === 9)
+  }
+
+  test("get day in year") {
+    val c = Calendar.getInstance()
+    c.set(2015, 2, 18, 0, 0, 0)
+    assert(DateTimeUtils.getDayInYear(getInUTCDays(c.getTimeInMillis)) === 77)
+    c.set(2012, 2, 18, 0, 0, 0)
+    assert(DateTimeUtils.getDayInYear(getInUTCDays(c.getTimeInMillis)) === 78)
+  }
+
+  test("get year") {
+    val c = Calendar.getInstance()
+    c.set(2015, 2, 18, 0, 0, 0)
+    assert(DateTimeUtils.getYear(getInUTCDays(c.getTimeInMillis)) === 2015)
+    c.set(2012, 2, 18, 0, 0, 0)
+    assert(DateTimeUtils.getYear(getInUTCDays(c.getTimeInMillis)) === 2012)
+  }
+
+  test("get quarter") {
+    val c = Calendar.getInstance()
+    c.set(2015, 2, 18, 0, 0, 0)
+    assert(DateTimeUtils.getQuarter(getInUTCDays(c.getTimeInMillis)) === 1)
+    c.set(2012, 11, 18, 0, 0, 0)
+    assert(DateTimeUtils.getQuarter(getInUTCDays(c.getTimeInMillis)) === 4)
+  }
+
+  test("get month") {
+    val c = Calendar.getInstance()
+    c.set(2015, 2, 18, 0, 0, 0)
+    assert(DateTimeUtils.getMonth(getInUTCDays(c.getTimeInMillis)) === 3)
+    c.set(2012, 11, 18, 0, 0, 0)
+    assert(DateTimeUtils.getMonth(getInUTCDays(c.getTimeInMillis)) === 12)
+  }
+
+  test("get day of month") {
+    val c = Calendar.getInstance()
+    c.set(2015, 2, 18, 0, 0, 0)
+    assert(DateTimeUtils.getDayOfMonth(getInUTCDays(c.getTimeInMillis)) === 18)
+    c.set(2012, 11, 24, 0, 0, 0)
+    assert(DateTimeUtils.getDayOfMonth(getInUTCDays(c.getTimeInMillis)) === 24)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index c180407389136..cadb25d597d19 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1748,6 +1748,182 @@ object functions {
    */
   def length(columnName: String): Column = length(Column(columnName))
 
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  // DateTime functions
+  //////////////////////////////////////////////////////////////////////////////////////////////
+
+  /**
+   * Converts a date/timestamp/string to a value of string in the format specified by the date
+   * format given by the second argument.
+   *
+   * A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
+   * pattern letters of [[java.text.SimpleDateFormat]] can be used.
+   *
+   * NOTE: Use when ever possible specialized functions like [[year]]. These benefit from a
+   * specialized implementation.
+   *
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def date_format(dateExpr: Column, format: String): Column =
+    DateFormatClass(dateExpr.expr, Literal(format))
+
+  /**
+   * Converts a date/timestamp/string to a value of string in the format specified by the date
+   * format given by the second argument.
+   *
+   * A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
+   * pattern letters of [[java.text.SimpleDateFormat]] can be used.
+   *
+   * NOTE: Use when ever possible specialized functions like [[year]]. These benefit from a
+   * specialized implementation.
+   *
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def date_format(dateColumnName: String, format: String): Column =
+    date_format(Column(dateColumnName), format)
+
+  /**
+   * Extracts the year as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def year(e: Column): Column = Year(e.expr)
+
+  /**
+   * Extracts the year as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def year(columnName: String): Column = year(Column(columnName))
+
+  /**
+   * Extracts the quarter as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def quarter(e: Column): Column = Quarter(e.expr)
+
+  /**
+   * Extracts the quarter as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def quarter(columnName: String): Column = quarter(Column(columnName))
+
+  /**
+   * Extracts the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def month(e: Column): Column = Month(e.expr)
+
+  /**
+   * Extracts the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def month(columnName: String): Column = month(Column(columnName))
+
+  /**
+   * Extracts the day of the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def day(e: Column): Column = Day(e.expr)
+
+  /**
+   * Extracts the day of the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def day(columnName: String): Column = day(Column(columnName))
+
+  /**
+   * Extracts the day of the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def day_of_month(e: Column): Column = Day(e.expr)
+
+  /**
+   * Extracts the day of the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def day_of_month(columnName: String): Column = day_of_month(Column(columnName))
+
+  /**
+   * Extracts the day of the year as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def day_in_year(e: Column): Column = DayInYear(e.expr)
+
+  /**
+   * Extracts the day of the year as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def day_in_year(columnName: String): Column = day_in_year(Column(columnName))
+
+  /**
+   * Extracts the hours as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def hour(e: Column): Column = Hour(e.expr)
+
+  /**
+   * Extracts the hours as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def hour(columnName: String): Column = hour(Column(columnName))
+
+  /**
+   * Extracts the minutes as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def minute(e: Column): Column = Minute(e.expr)
+
+  /**
+   * Extracts the minutes as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def minute(columnName: String): Column = minute(Column(columnName))
+
+  /**
+   * Extracts the seconds as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def second(e: Column): Column = Second(e.expr)
+
+  /**
+   * Extracts the seconds as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def second(columnName: String): Column = second(Column(columnName))
+
+  /**
+   * Extracts the week number as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def week_of_year(e: Column): Column = WeekOfYear(e.expr)
+
+  /**
+   * Extracts the week number as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def week_of_year(columnName: String): Column = week_of_year(Column(columnName))
+
   /**
    * Formats the number X to a format like '#,###,###.##', rounded to d decimal places,
    * and returns the result as a string.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateExpressionsSuite.scala
new file mode 100644
index 0000000000000..d24e3ee1dd8f5
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateExpressionsSuite.scala
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import java.sql.{Timestamp, Date}
+import java.text.SimpleDateFormat
+
+import org.apache.spark.sql.functions._
+
+class DateExpressionsSuite extends QueryTest {
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+
+  import ctx.implicits._
+
+  val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
+  val sdfDate = new SimpleDateFormat("yyyy-MM-dd")
+  val d = new Date(sdf.parse("2015-04-08 13:10:15").getTime)
+  val ts = new Timestamp(sdf.parse("2013-04-08 13:10:15").getTime)
+
+
+  test("date format") {
+    val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(date_format("a", "y"), date_format("b", "y"), date_format("c", "y")),
+      Row("2015", "2015", "2013"))
+
+    checkAnswer(
+      df.selectExpr("date_format(a, 'y')", "date_format(b, 'y')", "date_format(c, 'y')"),
+      Row("2015", "2015", "2013"))
+  }
+
+  test("year") {
+    val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(year("a"), year("b"), year("c")),
+      Row(2015, 2015, 2013))
+
+    checkAnswer(
+      df.selectExpr("year(a)", "year(b)", "year(c)"),
+      Row(2015, 2015, 2013))
+  }
+
+  test("quarter") {
+    val ts = new Timestamp(sdf.parse("2013-11-08 13:10:15").getTime)
+
+    val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(quarter("a"), quarter("b"), quarter("c")),
+      Row(2, 2, 4))
+
+    checkAnswer(
+      df.selectExpr("quarter(a)", "quarter(b)", "quarter(c)"),
+      Row(2, 2, 4))
+  }
+
+  test("month") {
+    val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(month("a"), month("b"), month("c")),
+      Row(4, 4, 4))
+
+    checkAnswer(
+      df.selectExpr("month(a)", "month(b)", "month(c)"),
+      Row(4, 4, 4))
+  }
+
+  test("day") {
+    val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(day("a"), day("b"), day("c")),
+      Row(8, 8, 8))
+
+    checkAnswer(
+      df.selectExpr("day(a)", "day(b)", "day(c)"),
+      Row(8, 8, 8))
+  }
+
+  test("day of month") {
+    val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(day_of_month("a"), day_of_month("b"), day_of_month("c")),
+      Row(8, 8, 8))
+
+    checkAnswer(
+      df.selectExpr("day_of_month(a)", "day_of_month(b)", "day_of_month(c)"),
+      Row(8, 8, 8))
+  }
+
+  test("day in year") {
+    val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(day_in_year("a"), day_in_year("b"), day_in_year("c")),
+      Row(98, 98, 98))
+
+    checkAnswer(
+      df.selectExpr("day_in_year(a)", "day_in_year(b)", "day_in_year(c)"),
+      Row(98, 98, 98))
+  }
+
+  test("hour") {
+    val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(hour("a"), hour("b"), hour("c")),
+      Row(0, 13, 13))
+
+    checkAnswer(
+      df.selectExpr("hour(a)", "hour(b)", "hour(c)"),
+      Row(0, 13, 13))
+  }
+
+  test("minute") {
+    val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(minute("a"), minute("b"), minute("c")),
+      Row(0, 10, 10))
+
+    checkAnswer(
+      df.selectExpr("minute(a)", "minute(b)", "minute(c)"),
+      Row(0, 10, 10))
+  }
+
+  test("second") {
+    val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(second("a"), second("b"), second("c")),
+      Row(0, 15, 15))
+
+    checkAnswer(
+      df.selectExpr("second(a)", "second(b)", "second(c)"),
+      Row(0, 15, 15))
+  }
+
+  test("week of year") {
+    val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(week_of_year("a"), week_of_year("b"), week_of_year("c")),
+      Row(15, 15, 15))
+
+    checkAnswer(
+      df.selectExpr("week_of_year(a)", "week_of_year(b)", "week_of_year(c)"),
+      Row(15, 15, 15))
+  }
+
+}
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 299cc599ff8f7..2689d904d6541 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -115,6 +115,13 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     // This test is totally fine except that it includes wrong queries and expects errors, but error
     // message format in Hive and Spark SQL differ. Should workaround this later.
     "udf_to_unix_timestamp",
+    // we can cast dates likes '2015-03-18' to a timestamp and extract the seconds.
+    // Hive returns null for second('2015-03-18')
+    "udf_second",
+    // we can cast dates likes '2015-03-18' to a timestamp and extract the minutes.
+    // Hive returns null for minute('2015-03-18')
+    "udf_minute",
+
 
     // Cant run without local map/reduce.
     "index_auto_update",
@@ -896,7 +903,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_lpad",
     "udf_ltrim",
     "udf_map",
-    "udf_minute",
     "udf_modulo",
     "udf_month",
     "udf_named_struct",
@@ -923,7 +929,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_round_3",
     "udf_rpad",
     "udf_rtrim",
-    "udf_second",
     "udf_sign",
     "udf_sin",
     "udf_smallint",

From 04c1b49f5eee915ad1159a32bf12836a3b9f2620 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 18 Jul 2015 22:50:34 -0700
Subject: [PATCH 242/549] Fixed test cases.

---
 .../spark/sql/catalyst/expressions/DateFunctionsSuite.scala   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
index 49d0b0aceac0d..f469f42116d21 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
@@ -50,7 +50,7 @@ class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
         (0 to 5).foreach { i =>
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
-          c.add(Calendar.DATE, 1)
+          c.add(Calendar.DATE, i)
           checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
             sdfDay.format(c.getTime).toInt)
         }
@@ -62,7 +62,7 @@ class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
         (0 to 5).foreach { i =>
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
-          c.add(Calendar.DATE, 1)
+          c.add(Calendar.DATE, i)
           checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
             sdfDay.format(c.getTime).toInt)
         }

From a9a0d0cebf8ab3c539723488e5945794ebfd6104 Mon Sep 17 00:00:00 2001
From: Herman van Hovell <hvanhovell@questtec.nl>
Date: Sat, 18 Jul 2015 23:44:38 -0700
Subject: [PATCH 243/549] [SPARK-8638] [SQL] Window Function Performance
 Improvements

## Description
Performance improvements for Spark Window functions. This PR will also serve as the basis for moving away from Hive UDAFs to Spark UDAFs. See JIRA tickets SPARK-8638 and SPARK-7712 for more information.

## Improvements
* Much better performance (10x) in running cases (e.g. BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) and UNBOUDED FOLLOWING cases. The current implementation in spark uses a sliding window approach in these cases. This means that an aggregate is maintained for every row, so space usage is N (N being the number of rows). This also means that all these aggregates all need to be updated separately, this takes N*(N-1)/2 updates. The running case differs from the Sliding case because we are only adding data to an aggregate function (no reset is required), we only need to maintain one aggregate (like in the UNBOUNDED PRECEDING AND UNBOUNDED case), update the aggregate for each row, and get the aggregate value after each update. This is what the new implementation does. This approach only uses 1 buffer, and only requires N updates; I am currently working on data with window sizes of 500-1000 doing running sums and this saves a lot of time. The CURRENT ROW AND UNBOUNDED FOLLOWING case also uses this approach and the fact that aggregate operations are communitative, there is one twist though it will process the input buffer in reverse.
* Fewer comparisons in the sliding case. The current implementation determines frame boundaries for every input row. The new implementation makes more use of the fact that the window is sorted, maintains the boundaries, and only moves them when the current row order changes. This is a minor improvement.
* A single Window node is able to process all types of Frames for the same Partitioning/Ordering. This saves a little time/memory spent buffering and managing partitions. This will be enabled in a follow-up PR.
* A lot of the staging code is moved from the execution phase to the initialization phase. Minor performance improvement, and improves readability of the execution code.

## Benchmarking
I have done a small benchmark using [on time performance](http://www.transtats.bts.gov) data of the month april. I have used the origin as a partioning key, as a result there is quite some variation in window sizes. The code for the benchmark can be found in the JIRA ticket. These are the results per Frame type:

Frame | Master | SPARK-8638
----- | ------ | ----------
Entire Frame | 2 s | 1 s
Sliding | 18 s | 1 s
Growing | 14 s | 0.9 s
Shrinking | 13 s | 1 s

Author: Herman van Hovell <hvanhovell@questtec.nl>

Closes #7057 from hvanhovell/SPARK-8638 and squashes the following commits:

3bfdc49 [Herman van Hovell] Fixed Perfomance Regression for Shrinking Window Frames (+Rebase)
2eb3b33 [Herman van Hovell] Corrected reverse range frame processing.
2cd2d5b [Herman van Hovell] Corrected reverse range frame processing.
b0654d7 [Herman van Hovell] Tests for exotic frame specifications.
e75b76e [Herman van Hovell] More docs, added support for reverse sliding range frames, and some reorganization of code.
1fdb558 [Herman van Hovell] Changed Data In HiveDataFrameWindowSuite.
ac2f682 [Herman van Hovell] Added a few more comments.
1938312 [Herman van Hovell] Added Documentation to the createBoundOrdering methods.
bb020e6 [Herman van Hovell] Major overhaul of Window operator.
---
 .../expressions/windowExpressions.scala       |   12 +
 .../apache/spark/sql/execution/Window.scala   | 1072 +++++++++++------
 .../sql/hive/HiveDataFrameWindowSuite.scala   |    6 +-
 .../sql/hive/execution/WindowSuite.scala      |   79 ++
 4 files changed, 765 insertions(+), 404 deletions(-)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
index 50bbfd644d302..09ec0e333aa44 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala
@@ -316,3 +316,15 @@ case class WindowExpression(
 
   override def toString: String = s"$windowFunction $windowSpec"
 }
+
+/**
+ * Extractor for making working with frame boundaries easier.
+ */
+object FrameBoundaryExtractor {
+  def unapply(boundary: FrameBoundary): Option[Int] = boundary match {
+    case CurrentRow => Some(0)
+    case ValuePreceding(offset) => Some(-offset)
+    case ValueFollowing(offset) => Some(offset)
+    case _ => None
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
index 6e127e548a120..a054f52b8b489 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
@@ -19,18 +19,64 @@ package org.apache.spark.sql.execution
 
 import java.util
 
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning}
+import org.apache.spark.sql.catalyst.plans.physical._
+import org.apache.spark.sql.types.IntegerType
+import org.apache.spark.rdd.RDD
 import org.apache.spark.util.collection.CompactBuffer
+import scala.collection.mutable
 
 /**
  * :: DeveloperApi ::
- * For every row, evaluates `windowExpression` containing Window Functions and attaches
- * the results with other regular expressions (presented by `projectList`).
- * Evert operator handles a single Window Specification, `windowSpec`.
+ * This class calculates and outputs (windowed) aggregates over the rows in a single (sorted)
+ * partition. The aggregates are calculated for each row in the group. Special processing
+ * instructions, frames, are used to calculate these aggregates. Frames are processed in the order
+ * specified in the window specification (the ORDER BY ... clause). There are four different frame
+ * types:
+ * - Entire partition: The frame is the entire partition, i.e.
+ *   UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING. For this case, window function will take all
+ *   rows as inputs and be evaluated once.
+ * - Growing frame: We only add new rows into the frame, i.e. UNBOUNDED PRECEDING AND ....
+ *   Every time we move to a new row to process, we add some rows to the frame. We do not remove
+ *   rows from this frame.
+ * - Shrinking frame: We only remove rows from the frame, i.e. ... AND UNBOUNDED FOLLOWING.
+ *   Every time we move to a new row to process, we remove some rows from the frame. We do not add
+ *   rows to this frame.
+ * - Moving frame: Every time we move to a new row to process, we remove some rows from the frame
+ *   and we add some rows to the frame. Examples are:
+ *     1 PRECEDING AND CURRENT ROW and 1 FOLLOWING AND 2 FOLLOWING.
+ *
+ * Different frame boundaries can be used in Growing, Shrinking and Moving frames. A frame
+ * boundary can be either Row or Range based:
+ * - Row Based: A row based boundary is based on the position of the row within the partition.
+ *   An offset indicates the number of rows above or below the current row, the frame for the
+ *   current row starts or ends. For instance, given a row based sliding frame with a lower bound
+ *   offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from
+ *   index 4 to index 6.
+ * - Range based: A range based boundary is based on the actual value of the ORDER BY
+ *   expression(s). An offset is used to alter the value of the ORDER BY expression, for
+ *   instance if the current order by expression has a value of 10 and the lower bound offset
+ *   is -3, the resulting lower bound for the current row will be 10 - 3 = 7. This however puts a
+ *   number of constraints on the ORDER BY expressions: there can be only one expression and this
+ *   expression must have a numerical data type. An exception can be made when the offset is 0,
+ *   because no value modification is needed, in this case multiple and non-numeric ORDER BY
+ *   expression are allowed.
+ *
+ * This is quite an expensive operator because every row for a single group must be in the same
+ * partition and partitions must be sorted according to the grouping and sort order. The operator
+ * requires the planner to take care of the partitioning and sorting.
+ *
+ * The operator is semi-blocking. The window functions and aggregates are calculated one group at
+ * a time, the result will only be made available after the processing for the entire group has
+ * finished. The operator is able to process different frame configurations at the same time. This
+ * is done by delegating the actual frame processing (i.e. calculation of the window functions) to
+ * specialized classes, see [[WindowFunctionFrame]], which take care of their own frame type:
+ * Entire Partition, Sliding, Growing & Shrinking. Boundary evaluation is also delegated to a pair
+ * of specialized classes: [[RowBoundOrdering]] & [[RangeBoundOrdering]].
  */
+@DeveloperApi
 case class Window(
     projectList: Seq[Attribute],
     windowExpression: Seq[NamedExpression],
@@ -38,443 +84,667 @@ case class Window(
     child: SparkPlan)
   extends UnaryNode {
 
-  override def output: Seq[Attribute] =
-    (projectList ++ windowExpression).map(_.toAttribute)
+  override def output: Seq[Attribute] = projectList ++ windowExpression.map(_.toAttribute)
 
-  override def requiredChildDistribution: Seq[Distribution] =
+  override def requiredChildDistribution: Seq[Distribution] = {
     if (windowSpec.partitionSpec.isEmpty) {
-      // This operator will be very expensive.
+      // Only show warning when the number of bytes is larger than 100 MB?
+      logWarning("No Partition Defined for Window operation! Moving all data to a single "
+        + "partition, this can cause serious performance degradation.")
       AllTuples :: Nil
-    } else {
-      ClusteredDistribution(windowSpec.partitionSpec) :: Nil
-    }
-
-  // Since window functions are adding columns to the input rows, the child's outputPartitioning
-  // is preserved.
-  override def outputPartitioning: Partitioning = child.outputPartitioning
-
-  override def requiredChildOrdering: Seq[Seq[SortOrder]] = {
-    // The required child ordering has two parts.
-    // The first part is the expressions in the partition specification.
-    // We add these expressions to the required ordering to make sure input rows are grouped
-    // based on the partition specification. So, we only need to process a single partition
-    // at a time.
-    // The second part is the expressions specified in the ORDER BY cluase.
-    // Basically, we first use sort to group rows based on partition specifications and then sort
-    // Rows in a group based on the order specification.
-    (windowSpec.partitionSpec.map(SortOrder(_, Ascending)) ++ windowSpec.orderSpec) :: Nil
+    } else ClusteredDistribution(windowSpec.partitionSpec) :: Nil
   }
 
-  // Since window functions basically add columns to input rows, this operator
-  // will not change the ordering of input rows.
+  override def requiredChildOrdering: Seq[Seq[SortOrder]] =
+    Seq(windowSpec.partitionSpec.map(SortOrder(_, Ascending)) ++ windowSpec.orderSpec)
+
   override def outputOrdering: Seq[SortOrder] = child.outputOrdering
 
-  case class ComputedWindow(
-    unbound: WindowExpression,
-    windowFunction: WindowFunction,
-    resultAttribute: AttributeReference)
-
-  // A list of window functions that need to be computed for each group.
-  private[this] val computedWindowExpressions = windowExpression.flatMap { window =>
-    window.collect {
-      case w: WindowExpression =>
-        ComputedWindow(
-          w,
-          BindReferences.bindReference(w.windowFunction, child.output),
-          AttributeReference(s"windowResult:$w", w.dataType, w.nullable)())
+  /**
+   * Create a bound ordering object for a given frame type and offset. A bound ordering object is
+   * used to determine which input row lies within the frame boundaries of an output row.
+   *
+   * This method uses Code Generation. It can only be used on the executor side.
+   *
+   * @param frameType to evaluate. This can either be Row or Range based.
+   * @param offset with respect to the row.
+   * @return a bound ordering object.
+   */
+  private[this] def createBoundOrdering(frameType: FrameType, offset: Int): BoundOrdering = {
+    frameType match {
+      case RangeFrame =>
+        val (exprs, current, bound) = if (offset == 0) {
+          // Use the entire order expression when the offset is 0.
+          val exprs = windowSpec.orderSpec.map(_.child)
+          val projection = newMutableProjection(exprs, child.output)
+          (windowSpec.orderSpec, projection(), projection())
+        }
+        else if (windowSpec.orderSpec.size == 1) {
+          // Use only the first order expression when the offset is non-null.
+          val sortExpr = windowSpec.orderSpec.head
+          val expr = sortExpr.child
+          // Create the projection which returns the current 'value'.
+          val current = newMutableProjection(expr :: Nil, child.output)()
+          // Flip the sign of the offset when processing the order is descending
+          val boundOffset = if (sortExpr.direction == Descending) -offset
+          else offset
+          // Create the projection which returns the current 'value' modified by adding the offset.
+          val boundExpr = Add(expr, Cast(Literal.create(boundOffset, IntegerType), expr.dataType))
+          val bound = newMutableProjection(boundExpr :: Nil, child.output)()
+          (sortExpr :: Nil, current, bound)
+        }
+        else {
+          sys.error("Non-Zero range offsets are not supported for windows " +
+            "with multiple order expressions.")
+        }
+        // Construct the ordering. This is used to compare the result of current value projection
+        // to the result of bound value projection. This is done manually because we want to use
+        // Code Generation (if it is enabled).
+        val (sortExprs, schema) = exprs.map { case e =>
+          val ref = AttributeReference("ordExpr", e.dataType, e.nullable)()
+          (SortOrder(ref, e.direction), ref)
+        }.unzip
+        val ordering = newOrdering(sortExprs, schema)
+        RangeBoundOrdering(ordering, current, bound)
+      case RowFrame => RowBoundOrdering(offset)
     }
-  }.toArray
+  }
 
-  private[this] val windowFrame =
-    windowSpec.frameSpecification.asInstanceOf[SpecifiedWindowFrame]
+  /**
+   * Create a frame processor.
+   *
+   * This method uses Code Generation. It can only be used on the executor side.
+   *
+   * @param frame boundaries.
+   * @param functions to process in the frame.
+   * @param ordinal at which the processor starts writing to the output.
+   * @return a frame processor.
+   */
+  private[this] def createFrameProcessor(
+      frame: WindowFrame,
+      functions: Array[WindowFunction],
+      ordinal: Int): WindowFunctionFrame = frame match {
+    // Growing Frame.
+    case SpecifiedWindowFrame(frameType, UnboundedPreceding, FrameBoundaryExtractor(high)) =>
+      val uBoundOrdering = createBoundOrdering(frameType, high)
+      new UnboundedPrecedingWindowFunctionFrame(ordinal, functions, uBoundOrdering)
+
+    // Shrinking Frame.
+    case SpecifiedWindowFrame(frameType, FrameBoundaryExtractor(low), UnboundedFollowing) =>
+      val lBoundOrdering = createBoundOrdering(frameType, low)
+      new UnboundedFollowingWindowFunctionFrame(ordinal, functions, lBoundOrdering)
+
+    // Moving Frame.
+    case SpecifiedWindowFrame(frameType,
+        FrameBoundaryExtractor(low), FrameBoundaryExtractor(high)) =>
+      val lBoundOrdering = createBoundOrdering(frameType, low)
+      val uBoundOrdering = createBoundOrdering(frameType, high)
+      new SlidingWindowFunctionFrame(ordinal, functions, lBoundOrdering, uBoundOrdering)
+
+    // Entire Partition Frame.
+    case SpecifiedWindowFrame(_, UnboundedPreceding, UnboundedFollowing) =>
+      new UnboundedWindowFunctionFrame(ordinal, functions)
+
+    // Error
+    case fr =>
+      sys.error(s"Unsupported Frame $fr for functions: $functions")
+  }
 
-  // Create window functions.
-  private[this] def windowFunctions(): Array[WindowFunction] = {
-    val functions = new Array[WindowFunction](computedWindowExpressions.length)
-    var i = 0
-    while (i < computedWindowExpressions.length) {
-      functions(i) = computedWindowExpressions(i).windowFunction.newInstance()
-      functions(i).init()
-      i += 1
+  /**
+   * Create the resulting projection.
+   *
+   * This method uses Code Generation. It can only be used on the executor side.
+   *
+   * @param expressions unbound ordered function expressions.
+   * @return the final resulting projection.
+   */
+  private[this] def createResultProjection(
+      expressions: Seq[Expression]): MutableProjection = {
+    val unboundToAttr = expressions.map {
+      e => (e, AttributeReference("windowResult", e.dataType, e.nullable)())
     }
-    functions
+    val unboundToAttrMap = unboundToAttr.toMap
+    val patchedWindowExpression = windowExpression.map(_.transform(unboundToAttrMap))
+    newMutableProjection(
+      projectList ++ patchedWindowExpression,
+      child.output ++ unboundToAttr.map(_._2))()
   }
 
-  // The schema of the result of all window function evaluations
-  private[this] val computedSchema = computedWindowExpressions.map(_.resultAttribute)
-
-  private[this] val computedResultMap =
-    computedWindowExpressions.map { w => w.unbound -> w.resultAttribute }.toMap
+  protected override def doExecute(): RDD[InternalRow] = {
+    // Prepare processing.
+    // Group the window expression by their processing frame.
+    val windowExprs = windowExpression.flatMap {
+      _.collect {
+        case e: WindowExpression => e
+      }
+    }
 
-  private[this] val windowExpressionResult = windowExpression.map { window =>
-    window.transform {
-      case w: WindowExpression if computedResultMap.contains(w) => computedResultMap(w)
+    // Create Frame processor factories and order the unbound window expressions by the frame they
+    // are processed in; this is the order in which their results will be written to window
+    // function result buffer.
+    val framedWindowExprs = windowExprs.groupBy(_.windowSpec.frameSpecification)
+    val factories = Array.ofDim[() => WindowFunctionFrame](framedWindowExprs.size)
+    val unboundExpressions = mutable.Buffer.empty[Expression]
+    framedWindowExprs.zipWithIndex.foreach {
+      case ((frame, unboundFrameExpressions), index) =>
+        // Track the ordinal.
+        val ordinal = unboundExpressions.size
+
+        // Track the unbound expressions
+        unboundExpressions ++= unboundFrameExpressions
+
+        // Bind the expressions.
+        val functions = unboundFrameExpressions.map { e =>
+          BindReferences.bindReference(e.windowFunction, child.output)
+        }.toArray
+
+        // Create the frame processor factory.
+        factories(index) = () => createFrameProcessor(frame, functions, ordinal)
     }
-  }
 
-  protected override def doExecute(): RDD[InternalRow] = {
-    child.execute().mapPartitions { iter =>
+    // Start processing.
+    child.execute().mapPartitions { stream =>
       new Iterator[InternalRow] {
 
-        // Although input rows are grouped based on windowSpec.partitionSpec, we need to
-        // know when we have a new partition.
-        // This is to manually construct an ordering that can be used to compare rows.
-        // TODO: We may want to have a newOrdering that takes BoundReferences.
-        // So, we can take advantave of code gen.
-        private val partitionOrdering: Ordering[InternalRow] =
-          RowOrdering.forSchema(windowSpec.partitionSpec.map(_.dataType))
-
-        // This is used to project expressions for the partition specification.
-        protected val partitionGenerator =
-          newMutableProjection(windowSpec.partitionSpec, child.output)()
-
-        // This is ued to project expressions for the order specification.
-        protected val rowOrderGenerator =
-          newMutableProjection(windowSpec.orderSpec.map(_.child), child.output)()
-
-        // The position of next output row in the inputRowBuffer.
-        var rowPosition: Int = 0
-        // The number of buffered rows in the inputRowBuffer (the size of the current partition).
-        var partitionSize: Int = 0
-        // The buffer used to buffer rows in a partition.
-        var inputRowBuffer: CompactBuffer[InternalRow] = _
-        // The partition key of the current partition.
-        var currentPartitionKey: InternalRow = _
-        // The partition key of next partition.
-        var nextPartitionKey: InternalRow = _
-        // The first row of next partition.
-        var firstRowInNextPartition: InternalRow = _
-        // Indicates if this partition is the last one in the iter.
-        var lastPartition: Boolean = false
-
-        def createBoundaryEvaluator(): () => Unit = {
-          def findPhysicalBoundary(
-              boundary: FrameBoundary): () => Int = boundary match {
-            case UnboundedPreceding => () => 0
-            case UnboundedFollowing => () => partitionSize - 1
-            case CurrentRow => () => rowPosition
-            case ValuePreceding(value) =>
-              () =>
-                val newPosition = rowPosition - value
-                if (newPosition > 0) newPosition else 0
-            case ValueFollowing(value) =>
-              () =>
-                val newPosition = rowPosition + value
-                if (newPosition < partitionSize) newPosition else partitionSize - 1
+        // Get all relevant projections.
+        val result = createResultProjection(unboundExpressions)
+        val grouping = newProjection(windowSpec.partitionSpec, child.output)
+
+        // Manage the stream and the grouping.
+        var nextRow: InternalRow = EmptyRow
+        var nextGroup: InternalRow = EmptyRow
+        var nextRowAvailable: Boolean = false
+        private[this] def fetchNextRow() {
+          nextRowAvailable = stream.hasNext
+          if (nextRowAvailable) {
+            nextRow = stream.next()
+            nextGroup = grouping(nextRow)
+          } else {
+            nextRow = EmptyRow
+            nextGroup = EmptyRow
           }
-
-          def findLogicalBoundary(
-              boundary: FrameBoundary,
-              searchDirection: Int,
-              evaluator: Expression,
-              joinedRow: JoinedRow): () => Int = boundary match {
-            case UnboundedPreceding => () => 0
-            case UnboundedFollowing => () => partitionSize - 1
-            case other =>
-              () => {
-                // CurrentRow, ValuePreceding, or ValueFollowing.
-                var newPosition = rowPosition + searchDirection
-                var stopSearch = false
-                // rowOrderGenerator is a mutable projection.
-                // We need to make a copy of the returned by rowOrderGenerator since we will
-                // compare searched row with this currentOrderByValue.
-                val currentOrderByValue = rowOrderGenerator(inputRowBuffer(rowPosition)).copy()
-                while (newPosition >= 0 && newPosition < partitionSize && !stopSearch) {
-                  val r = rowOrderGenerator(inputRowBuffer(newPosition))
-                  stopSearch =
-                    !(evaluator.eval(joinedRow(currentOrderByValue, r)).asInstanceOf[Boolean])
-                  if (!stopSearch) {
-                    newPosition += searchDirection
-                  }
-                }
-                newPosition -= searchDirection
-
-                if (newPosition < 0) {
-                  0
-                } else if (newPosition >= partitionSize) {
-                  partitionSize - 1
-                } else {
-                  newPosition
-                }
-              }
+        }
+        fetchNextRow()
+
+        // Manage the current partition.
+        var rows: CompactBuffer[InternalRow] = _
+        val frames: Array[WindowFunctionFrame] = factories.map(_())
+        val numFrames = frames.length
+        private[this] def fetchNextPartition() {
+          // Collect all the rows in the current partition.
+          val currentGroup = nextGroup
+          rows = new CompactBuffer
+          while (nextRowAvailable && nextGroup == currentGroup) {
+            rows += nextRow.copy()
+            fetchNextRow()
           }
 
-          windowFrame.frameType match {
-            case RowFrame =>
-              val findStart = findPhysicalBoundary(windowFrame.frameStart)
-              val findEnd = findPhysicalBoundary(windowFrame.frameEnd)
-              () => {
-                frameStart = findStart()
-                frameEnd = findEnd()
-              }
-            case RangeFrame =>
-              val joinedRowForBoundaryEvaluation: JoinedRow = new JoinedRow()
-              val orderByExpr = windowSpec.orderSpec.head
-              val currentRowExpr =
-                BoundReference(0, orderByExpr.dataType, orderByExpr.nullable)
-              val examedRowExpr =
-                BoundReference(1, orderByExpr.dataType, orderByExpr.nullable)
-              val differenceExpr = Abs(Subtract(currentRowExpr, examedRowExpr))
-
-              val frameStartEvaluator = windowFrame.frameStart match {
-                case CurrentRow => EqualTo(currentRowExpr, examedRowExpr)
-                case ValuePreceding(value) =>
-                  LessThanOrEqual(differenceExpr, Cast(Literal(value), orderByExpr.dataType))
-                case ValueFollowing(value) =>
-                  GreaterThanOrEqual(differenceExpr, Cast(Literal(value), orderByExpr.dataType))
-                case o => Literal(true) // This is just a dummy expression, we will not use it.
-              }
-
-              val frameEndEvaluator = windowFrame.frameEnd match {
-                case CurrentRow => EqualTo(currentRowExpr, examedRowExpr)
-                case ValuePreceding(value) =>
-                  GreaterThanOrEqual(differenceExpr, Cast(Literal(value), orderByExpr.dataType))
-                case ValueFollowing(value) =>
-                  LessThanOrEqual(differenceExpr, Cast(Literal(value), orderByExpr.dataType))
-                case o => Literal(true) // This is just a dummy expression, we will not use it.
-              }
-
-              val findStart =
-                findLogicalBoundary(
-                  boundary = windowFrame.frameStart,
-                  searchDirection = -1,
-                  evaluator = frameStartEvaluator,
-                  joinedRow = joinedRowForBoundaryEvaluation)
-              val findEnd =
-                findLogicalBoundary(
-                  boundary = windowFrame.frameEnd,
-                  searchDirection = 1,
-                  evaluator = frameEndEvaluator,
-                  joinedRow = joinedRowForBoundaryEvaluation)
-              () => {
-                frameStart = findStart()
-                frameEnd = findEnd()
-              }
+          // Setup the frames.
+          var i = 0
+          while (i < numFrames) {
+            frames(i).prepare(rows)
+            i += 1
           }
+
+          // Setup iteration
+          rowIndex = 0
+          rowsSize = rows.size
         }
 
-        val boundaryEvaluator = createBoundaryEvaluator()
-        // Indicates if we the specified window frame requires us to maintain a sliding frame
-        // (e.g. RANGES BETWEEN 1 PRECEDING AND CURRENT ROW) or the window frame
-        // is the entire partition (e.g. ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING).
-        val requireUpdateFrame: Boolean = {
-          def requireUpdateBoundary(boundary: FrameBoundary): Boolean = boundary match {
-            case UnboundedPreceding => false
-            case UnboundedFollowing => false
-            case _ => true
-          }
+        // Iteration
+        var rowIndex = 0
+        var rowsSize = 0
+        override final def hasNext: Boolean = rowIndex < rowsSize || nextRowAvailable
 
-          requireUpdateBoundary(windowFrame.frameStart) ||
-            requireUpdateBoundary(windowFrame.frameEnd)
-        }
-        // The start position of the current frame in the partition.
-        var frameStart: Int = 0
-        // The end position of the current frame in the partition.
-        var frameEnd: Int = -1
-        // Window functions.
-        val functions: Array[WindowFunction] = windowFunctions()
-        // Buffers used to store input parameters for window functions. Because we may need to
-        // maintain a sliding frame, we use this buffer to avoid evaluate the parameters from
-        // the same row multiple times.
-        val windowFunctionParameterBuffers: Array[util.LinkedList[AnyRef]] =
-          functions.map(_ => new util.LinkedList[AnyRef]())
-
-        // The projection used to generate the final result rows of this operator.
-        private[this] val resultProjection =
-          newMutableProjection(
-            projectList ++ windowExpressionResult,
-            projectList ++ computedSchema)()
-
-        // The row used to hold results of window functions.
-        private[this] val windowExpressionResultRow =
-          new GenericMutableRow(computedSchema.length)
-
-        private[this] val joinedRow = new JoinedRow6
-
-        // Initialize this iterator.
-        initialize()
-
-        private def initialize(): Unit = {
-          if (iter.hasNext) {
-            val currentRow = iter.next().copy()
-            // partitionGenerator is a mutable projection. Since we need to track nextPartitionKey,
-            // we are making a copy of the returned partitionKey at here.
-            nextPartitionKey = partitionGenerator(currentRow).copy()
-            firstRowInNextPartition = currentRow
+        val join = new JoinedRow6
+        val windowFunctionResult = new GenericMutableRow(unboundExpressions.size)
+        override final def next(): InternalRow = {
+          // Load the next partition if we need to.
+          if (rowIndex >= rowsSize && nextRowAvailable) {
             fetchNextPartition()
-          } else {
-            // The iter is an empty one. So, we set all of the following variables
-            // to make sure hasNext will return false.
-            lastPartition = true
-            rowPosition = 0
-            partitionSize = 0
           }
-        }
-
-        // Indicates if we will have new output row.
-        override final def hasNext: Boolean = {
-          !lastPartition || (rowPosition < partitionSize)
-        }
 
-        override final def next(): InternalRow = {
-          if (hasNext) {
-            if (rowPosition == partitionSize) {
-              // All rows of this buffer have been consumed.
-              // We will move to next partition.
-              fetchNextPartition()
-            }
-            // Get the input row for the current output row.
-            val inputRow = inputRowBuffer(rowPosition)
-            // Get all results of the window functions for this output row.
+          if (rowIndex < rowsSize) {
+            // Get the results for the window frames.
             var i = 0
-            while (i < functions.length) {
-              windowExpressionResultRow.update(i, functions(i).get(rowPosition))
+            while (i < numFrames) {
+              frames(i).write(windowFunctionResult)
               i += 1
             }
 
-            // Construct the output row.
-            val outputRow = resultProjection(joinedRow(inputRow, windowExpressionResultRow))
-            // We will move to the next one.
-            rowPosition += 1
-            if (requireUpdateFrame && rowPosition < partitionSize) {
-              // If we need to maintain a sliding frame and
-              // we will still work on this partition when next is called next time, do the update.
-              updateFrame()
-            }
+            // 'Merge' the input row with the window function result
+            join(rows(rowIndex), windowFunctionResult)
+            rowIndex += 1
 
-            // Return the output row.
-            outputRow
-          } else {
-            // no more result
-            throw new NoSuchElementException
-          }
+            // Return the projection.
+            result(join)
+          } else throw new NoSuchElementException
         }
+      }
+    }
+  }
+}
 
-        // Fetch the next partition.
-        private def fetchNextPartition(): Unit = {
-          // Create a new buffer for input rows.
-          inputRowBuffer = new CompactBuffer[InternalRow]()
-          // We already have the first row for this partition
-          // (recorded in firstRowInNextPartition). Add it back.
-          inputRowBuffer += firstRowInNextPartition
-          // Set the current partition key.
-          currentPartitionKey = nextPartitionKey
-          // Now, we will start to find all rows belonging to this partition.
-          // Create a variable to track if we see the next partition.
-          var findNextPartition = false
-          // The search will stop when we see the next partition or there is no
-          // input row left in the iter.
-          while (iter.hasNext && !findNextPartition) {
-            // Make a copy of the input row since we will put it in the buffer.
-            val currentRow = iter.next().copy()
-            // Get the partition key based on the partition specification.
-            // For the below compare method, we do not need to make a copy of partitionKey.
-            val partitionKey = partitionGenerator(currentRow)
-            // Check if the current row belongs the current input row.
-            val comparing = partitionOrdering.compare(currentPartitionKey, partitionKey)
-            if (comparing == 0) {
-              // This row is still in the current partition.
-              inputRowBuffer += currentRow
-            } else {
-              // The current input row is in a different partition.
-              findNextPartition = true
-              // partitionGenerator is a mutable projection.
-              // Since we need to track nextPartitionKey and we determine that it should be set
-              // as partitionKey, we are making a copy of the partitionKey at here.
-              nextPartitionKey = partitionKey.copy()
-              firstRowInNextPartition = currentRow
-            }
-          }
+/**
+ * Function for comparing boundary values.
+ */
+private[execution] abstract class BoundOrdering {
+  def compare(input: Seq[InternalRow], inputIndex: Int, outputIndex: Int): Int
+}
 
-          // We have not seen a new partition. It means that there is no new row in the
-          // iter. The current partition is the last partition of the iter.
-          if (!findNextPartition) {
-            lastPartition = true
-          }
+/**
+ * Compare the input index to the bound of the output index.
+ */
+private[execution] final case class RowBoundOrdering(offset: Int) extends BoundOrdering {
+  override def compare(input: Seq[InternalRow], inputIndex: Int, outputIndex: Int): Int =
+    inputIndex - (outputIndex + offset)
+}
 
-          // We have got all rows for the current partition.
-          // Set rowPosition to 0 (the next output row will be based on the first
-          // input row of this partition).
-          rowPosition = 0
-          // The size of this partition.
-          partitionSize = inputRowBuffer.size
-          // Reset all parameter buffers of window functions.
-          var i = 0
-          while (i < windowFunctionParameterBuffers.length) {
-            windowFunctionParameterBuffers(i).clear()
-            i += 1
-          }
-          frameStart = 0
-          frameEnd = -1
-          // Create the first window frame for this partition.
-          // If we do not need to maintain a sliding frame, this frame will
-          // have the entire partition.
-          updateFrame()
-        }
+/**
+ * Compare the value of the input index to the value bound of the output index.
+ */
+private[execution] final case class RangeBoundOrdering(
+    ordering: Ordering[InternalRow],
+    current: Projection,
+    bound: Projection) extends BoundOrdering {
+  override def compare(input: Seq[InternalRow], inputIndex: Int, outputIndex: Int): Int =
+    ordering.compare(current(input(inputIndex)), bound(input(outputIndex)))
+}
 
-        /** The function used to maintain the sliding frame. */
-        private def updateFrame(): Unit = {
-          // Based on the difference between the new frame and old frame,
-          // updates the buffers holding input parameters of window functions.
-          // We will start to prepare input parameters starting from the row
-          // indicated by offset in the input row buffer.
-          def updateWindowFunctionParameterBuffers(
-              numToRemove: Int,
-              numToAdd: Int,
-              offset: Int): Unit = {
-            // First, remove unneeded entries from the head of every buffer.
-            var i = 0
-            while (i < numToRemove) {
-              var j = 0
-              while (j < windowFunctionParameterBuffers.length) {
-                windowFunctionParameterBuffers(j).remove()
-                j += 1
-              }
-              i += 1
-            }
-            // Then, add needed entries to the tail of every buffer.
-            i = 0
-            while (i < numToAdd) {
-              var j = 0
-              while (j < windowFunctionParameterBuffers.length) {
-                // Ask the function to prepare the input parameters.
-                val parameters = functions(j).prepareInputParameters(inputRowBuffer(i + offset))
-                windowFunctionParameterBuffers(j).add(parameters)
-                j += 1
-              }
-              i += 1
-            }
-          }
+/**
+ * A window function calculates the results of a number of window functions for a window frame.
+ * Before use a frame must be prepared by passing it all the rows in the current partition. After
+ * preparation the update method can be called to fill the output rows.
+ *
+ * TODO How to improve performance? A few thoughts:
+ * - Window functions are expensive due to its distribution and ordering requirements.
+ * Unfortunately it is up to the Spark engine to solve this. Improvements in the form of project
+ * Tungsten are on the way.
+ * - The window frame processing bit can be improved though. But before we start doing that we
+ * need to see how much of the time and resources are spent on partitioning and ordering, and
+ * how much time and resources are spent processing the partitions. There are a couple ways to
+ * improve on the current situation:
+ * - Reduce memory footprint by performing streaming calculations. This can only be done when
+ * there are no Unbound/Unbounded Following calculations present.
+ * - Use Tungsten style memory usage.
+ * - Use code generation in general, and use the approach to aggregation taken in the
+ *   GeneratedAggregate class in specific.
+ *
+ * @param ordinal of the first column written by this frame.
+ * @param functions to calculate the row values with.
+ */
+private[execution] abstract class WindowFunctionFrame(
+    ordinal: Int,
+    functions: Array[WindowFunction]) {
+
+  // Make sure functions are initialized.
+  functions.foreach(_.init())
+
+  /** Number of columns the window function frame is managing */
+  val numColumns = functions.length
+
+  /**
+   * Create a fresh thread safe copy of the frame.
+   *
+   * @return the copied frame.
+   */
+  def copy: WindowFunctionFrame
+
+  /**
+   * Create new instances of the functions.
+   *
+   * @return an array containing copies of the current window functions.
+   */
+  protected final def copyFunctions: Array[WindowFunction] = functions.map(_.newInstance())
+
+  /**
+   * Prepare the frame for calculating the results for a partition.
+   *
+   * @param rows to calculate the frame results for.
+   */
+  def prepare(rows: CompactBuffer[InternalRow]): Unit
+
+  /**
+   * Write the result for the current row to the given target row.
+   *
+   * @param target row to write the result for the current row to.
+   */
+  def write(target: GenericMutableRow): Unit
+
+  /** Reset the current window functions. */
+  protected final def reset(): Unit = {
+    var i = 0
+    while (i < numColumns) {
+      functions(i).reset()
+      i += 1
+    }
+  }
 
-          // Record the current frame start point and end point before
-          // we update them.
-          val previousFrameStart = frameStart
-          val previousFrameEnd = frameEnd
-          boundaryEvaluator()
-          updateWindowFunctionParameterBuffers(
-            frameStart - previousFrameStart,
-            frameEnd - previousFrameEnd,
-            previousFrameEnd + 1)
-          // Evaluate the current frame.
-          evaluateCurrentFrame()
-        }
+  /** Prepare an input row for processing. */
+  protected final def prepare(input: InternalRow): Array[AnyRef] = {
+    val prepared = new Array[AnyRef](numColumns)
+    var i = 0
+    while (i < numColumns) {
+      prepared(i) = functions(i).prepareInputParameters(input)
+      i += 1
+    }
+    prepared
+  }
 
-        /** Evaluate the current window frame. */
-        private def evaluateCurrentFrame(): Unit = {
-          var i = 0
-          while (i < functions.length) {
-            // Reset the state of the window function.
-            functions(i).reset()
-            // Get all buffered input parameters based on rows of this window frame.
-            val inputParameters = windowFunctionParameterBuffers(i).toArray()
-            // Send these input parameters to the window function.
-            functions(i).batchUpdate(inputParameters)
-            // Ask the function to evaluate based on this window frame.
-            functions(i).evaluate()
-            i += 1
-          }
-        }
+  /** Evaluate a prepared buffer (iterator). */
+  protected final def evaluatePrepared(iterator: java.util.Iterator[Array[AnyRef]]): Unit = {
+    reset()
+    while (iterator.hasNext) {
+      val prepared = iterator.next()
+      var i = 0
+      while (i < numColumns) {
+        functions(i).update(prepared(i))
+        i += 1
       }
     }
+    evaluate()
   }
+
+  /** Evaluate a prepared buffer (array). */
+  protected final def evaluatePrepared(prepared: Array[Array[AnyRef]],
+      fromIndex: Int, toIndex: Int): Unit = {
+    var i = 0
+    while (i < numColumns) {
+      val function = functions(i)
+      function.reset()
+      var j = fromIndex
+      while (j < toIndex) {
+        function.update(prepared(j)(i))
+        j += 1
+      }
+      function.evaluate()
+      i += 1
+    }
+  }
+
+  /** Update an array of window functions. */
+  protected final def update(input: InternalRow): Unit = {
+    var i = 0
+    while (i < numColumns) {
+      val aggregate = functions(i)
+      val preparedInput = aggregate.prepareInputParameters(input)
+      aggregate.update(preparedInput)
+      i += 1
+    }
+  }
+
+  /** Evaluate the window functions. */
+  protected final def evaluate(): Unit = {
+    var i = 0
+    while (i < numColumns) {
+      functions(i).evaluate()
+      i += 1
+    }
+  }
+
+  /** Fill a target row with the current window function results. */
+  protected final def fill(target: GenericMutableRow, rowIndex: Int): Unit = {
+    var i = 0
+    while (i < numColumns) {
+      target.update(ordinal + i, functions(i).get(rowIndex))
+      i += 1
+    }
+  }
+}
+
+/**
+ * The sliding window frame calculates frames with the following SQL form:
+ * ... BETWEEN 1 PRECEDING AND 1 FOLLOWING
+ *
+ * @param ordinal of the first column written by this frame.
+ * @param functions to calculate the row values with.
+ * @param lbound comparator used to identify the lower bound of an output row.
+ * @param ubound comparator used to identify the upper bound of an output row.
+ */
+private[execution] final class SlidingWindowFunctionFrame(
+    ordinal: Int,
+    functions: Array[WindowFunction],
+    lbound: BoundOrdering,
+    ubound: BoundOrdering) extends WindowFunctionFrame(ordinal, functions) {
+
+  /** Rows of the partition currently being processed. */
+  private[this] var input: CompactBuffer[InternalRow] = null
+
+  /** Index of the first input row with a value greater than the upper bound of the current
+    * output row. */
+  private[this] var inputHighIndex = 0
+
+  /** Index of the first input row with a value equal to or greater than the lower bound of the
+    * current output row. */
+  private[this] var inputLowIndex = 0
+
+  /** Buffer used for storing prepared input for the window functions. */
+  private[this] val buffer = new util.ArrayDeque[Array[AnyRef]]
+
+  /** Index of the row we are currently writing. */
+  private[this] var outputIndex = 0
+
+  /** Prepare the frame for calculating a new partition. Reset all variables. */
+  override def prepare(rows: CompactBuffer[InternalRow]): Unit = {
+    input = rows
+    inputHighIndex = 0
+    inputLowIndex = 0
+    outputIndex = 0
+    buffer.clear()
+  }
+
+  /** Write the frame columns for the current row to the given target row. */
+  override def write(target: GenericMutableRow): Unit = {
+    var bufferUpdated = outputIndex == 0
+
+    // Add all rows to the buffer for which the input row value is equal to or less than
+    // the output row upper bound.
+    while (inputHighIndex < input.size &&
+        ubound.compare(input, inputHighIndex, outputIndex) <= 0) {
+      buffer.offer(prepare(input(inputHighIndex)))
+      inputHighIndex += 1
+      bufferUpdated = true
+    }
+
+    // Drop all rows from the buffer for which the input row value is smaller than
+    // the output row lower bound.
+    while (inputLowIndex < inputHighIndex &&
+        lbound.compare(input, inputLowIndex, outputIndex) < 0) {
+      buffer.pop()
+      inputLowIndex += 1
+      bufferUpdated = true
+    }
+
+    // Only recalculate and update when the buffer changes.
+    if (bufferUpdated) {
+      evaluatePrepared(buffer.iterator())
+      fill(target, outputIndex)
+    }
+
+    // Move to the next row.
+    outputIndex += 1
+  }
+
+  /** Copy the frame. */
+  override def copy: SlidingWindowFunctionFrame =
+    new SlidingWindowFunctionFrame(ordinal, copyFunctions, lbound, ubound)
+}
+
+/**
+ * The unbounded window frame calculates frames with the following SQL forms:
+ * ... (No Frame Definition)
+ * ... BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+ *
+ * Its results are  the same for each and every row in the partition. This class can be seen as a
+ * special case of a sliding window, but is optimized for the unbound case.
+ *
+ * @param ordinal of the first column written by this frame.
+ * @param functions to calculate the row values with.
+ */
+private[execution] final class UnboundedWindowFunctionFrame(
+    ordinal: Int,
+    functions: Array[WindowFunction]) extends WindowFunctionFrame(ordinal, functions) {
+
+  /** Index of the row we are currently writing. */
+  private[this] var outputIndex = 0
+
+  /** Prepare the frame for calculating a new partition. Process all rows eagerly. */
+  override def prepare(rows: CompactBuffer[InternalRow]): Unit = {
+    reset()
+    outputIndex = 0
+    val iterator = rows.iterator
+    while (iterator.hasNext) {
+      update(iterator.next())
+    }
+    evaluate()
+  }
+
+  /** Write the frame columns for the current row to the given target row. */
+  override def write(target: GenericMutableRow): Unit = {
+    fill(target, outputIndex)
+    outputIndex += 1
+  }
+
+  /** Copy the frame. */
+  override def copy: UnboundedWindowFunctionFrame =
+    new UnboundedWindowFunctionFrame(ordinal, copyFunctions)
+}
+
+/**
+ * The UnboundPreceding window frame calculates frames with the following SQL form:
+ * ... BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+ *
+ * There is only an upper bound. Very common use cases are for instance running sums or counts
+ * (row_number). Technically this is a special case of a sliding window. However a sliding window
+ * has to maintain a buffer, and it must do a full evaluation everytime the buffer changes. This
+ * is not the case when there is no lower bound, given the additive nature of most aggregates
+ * streaming updates and partial evaluation suffice and no buffering is needed.
+ *
+ * @param ordinal of the first column written by this frame.
+ * @param functions to calculate the row values with.
+ * @param ubound comparator used to identify the upper bound of an output row.
+ */
+private[execution] final class UnboundedPrecedingWindowFunctionFrame(
+    ordinal: Int,
+    functions: Array[WindowFunction],
+    ubound: BoundOrdering) extends WindowFunctionFrame(ordinal, functions) {
+
+  /** Rows of the partition currently being processed. */
+  private[this] var input: CompactBuffer[InternalRow] = null
+
+  /** Index of the first input row with a value greater than the upper bound of the current
+    * output row. */
+  private[this] var inputIndex = 0
+
+  /** Index of the row we are currently writing. */
+  private[this] var outputIndex = 0
+
+  /** Prepare the frame for calculating a new partition. */
+  override def prepare(rows: CompactBuffer[InternalRow]): Unit = {
+    reset()
+    input = rows
+    inputIndex = 0
+    outputIndex = 0
+  }
+
+  /** Write the frame columns for the current row to the given target row. */
+  override def write(target: GenericMutableRow): Unit = {
+    var bufferUpdated = outputIndex == 0
+
+    // Add all rows to the aggregates for which the input row value is equal to or less than
+    // the output row upper bound.
+    while (inputIndex < input.size && ubound.compare(input, inputIndex, outputIndex) <= 0) {
+      update(input(inputIndex))
+      inputIndex += 1
+      bufferUpdated = true
+    }
+
+    // Only recalculate and update when the buffer changes.
+    if (bufferUpdated) {
+      evaluate()
+      fill(target, outputIndex)
+    }
+
+    // Move to the next row.
+    outputIndex += 1
+  }
+
+  /** Copy the frame. */
+  override def copy: UnboundedPrecedingWindowFunctionFrame =
+    new UnboundedPrecedingWindowFunctionFrame(ordinal, copyFunctions, ubound)
+}
+
+/**
+ * The UnboundFollowing window frame calculates frames with the following SQL form:
+ * ... BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
+ *
+ * There is only an upper bound. This is a slightly modified version of the sliding window. The
+ * sliding window operator has to check if both upper and the lower bound change when a new row
+ * gets processed, where as the unbounded following only has to check the lower bound.
+ *
+ * This is a very expensive operator to use, O(n * (n - 1) /2), because we need to maintain a
+ * buffer and must do full recalculation after each row. Reverse iteration would be possible, if
+ * the communitativity of the used window functions can be guaranteed.
+ *
+ * @param ordinal of the first column written by this frame.
+ * @param functions to calculate the row values with.
+ * @param lbound comparator used to identify the lower bound of an output row.
+ */
+private[execution] final class UnboundedFollowingWindowFunctionFrame(
+    ordinal: Int,
+    functions: Array[WindowFunction],
+    lbound: BoundOrdering) extends WindowFunctionFrame(ordinal, functions) {
+
+  /** Buffer used for storing prepared input for the window functions. */
+  private[this] var buffer: Array[Array[AnyRef]] = _
+
+  /** Rows of the partition currently being processed. */
+  private[this] var input: CompactBuffer[InternalRow] = null
+
+  /** Index of the first input row with a value equal to or greater than the lower bound of the
+    * current output row. */
+  private[this] var inputIndex = 0
+
+  /** Index of the row we are currently writing. */
+  private[this] var outputIndex = 0
+
+  /** Prepare the frame for calculating a new partition. */
+  override def prepare(rows: CompactBuffer[InternalRow]): Unit = {
+    input = rows
+    inputIndex = 0
+    outputIndex = 0
+    val size = input.size
+    buffer = Array.ofDim(size)
+    var i = 0
+    while (i < size) {
+      buffer(i) = prepare(input(i))
+      i += 1
+    }
+    evaluatePrepared(buffer, 0, buffer.length)
+  }
+
+  /** Write the frame columns for the current row to the given target row. */
+  override def write(target: GenericMutableRow): Unit = {
+    var bufferUpdated = outputIndex == 0
+
+    // Drop all rows from the buffer for which the input row value is smaller than
+    // the output row lower bound.
+    while (inputIndex < input.size && lbound.compare(input, inputIndex, outputIndex) < 0) {
+      inputIndex += 1
+      bufferUpdated = true
+    }
+
+    // Only recalculate and update when the buffer changes.
+    if (bufferUpdated) {
+      evaluatePrepared(buffer, inputIndex, buffer.length)
+      fill(target, outputIndex)
+    }
+
+    // Move to the next row.
+    outputIndex += 1
+  }
+
+  /** Copy the frame. */
+  override def copy: UnboundedFollowingWindowFunctionFrame =
+    new UnboundedFollowingWindowFunctionFrame(ordinal, copyFunctions, lbound)
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
index efb3f2545db84..15b5f418f0a8c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
@@ -183,13 +183,13 @@ class HiveDataFrameWindowSuite extends QueryTest {
   }
 
   test("aggregation and range betweens with unbounded") {
-    val df = Seq((1, "1"), (2, "2"), (2, "2"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value")
+    val df = Seq((5, "1"), (5, "2"), (4, "2"), (6, "2"), (3, "1"), (2, "2")).toDF("key", "value")
     df.registerTempTable("window_table")
     checkAnswer(
       df.select(
         $"key",
         last("value").over(
-          Window.partitionBy($"value").orderBy($"key").rangeBetween(1, Long.MaxValue))
+          Window.partitionBy($"value").orderBy($"key").rangeBetween(-2, -1))
           .equalTo("2")
           .as("last_v"),
         avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(Long.MinValue, 1))
@@ -203,7 +203,7 @@ class HiveDataFrameWindowSuite extends QueryTest {
         """SELECT
           | key,
           | last_value(value) OVER
-          |   (PARTITION BY value ORDER BY key RANGE 1 preceding) == "2",
+          |   (PARTITION BY value ORDER BY key RANGE BETWEEN 2 preceding and 1 preceding) == "2",
           | avg(key) OVER
           |   (PARTITION BY value ORDER BY key RANGE BETWEEN unbounded preceding and 1 following),
           | avg(key) OVER
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowSuite.scala
new file mode 100644
index 0000000000000..a089d0d165195
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowSuite.scala
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution
+
+import org.apache.spark.sql.{Row, QueryTest}
+import org.apache.spark.sql.expressions.Window
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+/**
+ * Window expressions are tested extensively by the following test suites:
+ * [[org.apache.spark.sql.hive.HiveDataFrameWindowSuite]]
+ * [[org.apache.spark.sql.hive.execution.HiveWindowFunctionQueryWithoutCodeGenSuite]]
+ * [[org.apache.spark.sql.hive.execution.HiveWindowFunctionQueryFileWithoutCodeGenSuite]]
+ * However these suites do not cover all possible (i.e. more exotic) settings. This suite fill
+ * this gap.
+ *
+ * TODO Move this class to the sql/core project when we move to Native Spark UDAFs.
+ */
+class WindowSuite extends QueryTest {
+
+  test("reverse sliding range frame") {
+    val df = Seq(
+      (1, "Thin", "Cell Phone", 6000),
+      (2, "Normal", "Tablet", 1500),
+      (3, "Mini", "Tablet", 5500),
+      (4, "Ultra thin", "Cell Phone", 5500),
+      (5, "Very thin", "Cell Phone", 6000),
+      (6, "Big", "Tablet", 2500),
+      (7, "Bendable", "Cell Phone", 3000),
+      (8, "Foldable", "Cell Phone", 3000),
+      (9, "Pro", "Tablet", 4500),
+      (10, "Pro2", "Tablet", 6500)).
+      toDF("id", "product", "category", "revenue")
+    val window = Window.
+      partitionBy($"category").
+      orderBy($"revenue".desc).
+      rangeBetween(-2000L, 1000L)
+    checkAnswer(
+      df.select(
+        $"id",
+        avg($"revenue").over(window).cast("int")),
+      Row(1, 5833) :: Row(2, 2000) :: Row(3, 5500) ::
+      Row(4, 5833) :: Row(5, 5833) :: Row(6, 2833) ::
+      Row(7, 3000) :: Row(8, 3000) :: Row(9, 5500) ::
+      Row(10, 6000) :: Nil)
+  }
+
+  // This is here to illustrate the fact that reverse order also reverses offsets.
+  test("reverse unbounded range frame") {
+    val df = Seq(1, 2, 4, 3, 2, 1).
+      map(Tuple1.apply).
+      toDF("value")
+    val window = Window.orderBy($"value".desc)
+    checkAnswer(
+      df.select(
+        $"value",
+         sum($"value").over(window.rangeBetween(Long.MinValue, 1)),
+         sum($"value").over(window.rangeBetween(1, Long.MaxValue))),
+      Row(1, 13, null) :: Row(2, 13, 2) :: Row(4, 7, 9) ::
+        Row(3, 11, 6) :: Row(2, 13, 2) :: Row(1, 13, null) :: Nil)
+
+  }
+}

From 89d135851d928f9d7dcebe785c1b3b6a4d8dfc87 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 18 Jul 2015 23:47:40 -0700
Subject: [PATCH 244/549] Closes #6775 since it is subsumbed by other patches.


From 9b644c41306cac53185ce0d2de4cb72127ada932 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Sun, 19 Jul 2015 00:32:56 -0700
Subject: [PATCH 245/549] [SPARK-9166][SQL][PYSPARK] Capture and hide
 IllegalArgumentException in Python API

JIRA: https://issues.apache.org/jira/browse/SPARK-9166

Simply capture and hide `IllegalArgumentException` in Python API.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7497 from viirya/hide_illegalargument and squashes the following commits:

8324dce [Liang-Chi Hsieh] Fix python style.
9ace67d [Liang-Chi Hsieh] Also check exception message.
8b2ce5c [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into hide_illegalargument
7be016a [Liang-Chi Hsieh] Capture and hide IllegalArgumentException in Python.
---
 python/pyspark/sql/tests.py | 11 +++++++++--
 python/pyspark/sql/utils.py |  8 ++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 241eac45cfe36..86706e2dc41a3 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -45,9 +45,9 @@
 from pyspark.sql.types import *
 from pyspark.sql.types import UserDefinedType, _infer_type
 from pyspark.tests import ReusedPySparkTestCase
-from pyspark.sql.functions import UserDefinedFunction
+from pyspark.sql.functions import UserDefinedFunction, sha2
 from pyspark.sql.window import Window
-from pyspark.sql.utils import AnalysisException
+from pyspark.sql.utils import AnalysisException, IllegalArgumentException
 
 
 class UTC(datetime.tzinfo):
@@ -894,6 +894,13 @@ def test_capture_analysis_exception(self):
         # RuntimeException should not be captured
         self.assertRaises(py4j.protocol.Py4JJavaError, lambda: self.sqlCtx.sql("abc"))
 
+    def test_capture_illegalargument_exception(self):
+        self.assertRaisesRegexp(IllegalArgumentException, "Setting negative mapred.reduce.tasks",
+                                lambda: self.sqlCtx.sql("SET mapred.reduce.tasks=-1"))
+        df = self.sqlCtx.createDataFrame([(1, 2)], ["a", "b"])
+        self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values",
+                                lambda: df.select(sha2(df.a, 1024)).collect())
+
 
 class HiveContextSQLTests(ReusedPySparkTestCase):
 
diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py
index cc5b2c088b7cc..0f795ca35b38a 100644
--- a/python/pyspark/sql/utils.py
+++ b/python/pyspark/sql/utils.py
@@ -24,6 +24,12 @@ class AnalysisException(Exception):
     """
 
 
+class IllegalArgumentException(Exception):
+    """
+    Passed an illegal or inappropriate argument.
+    """
+
+
 def capture_sql_exception(f):
     def deco(*a, **kw):
         try:
@@ -32,6 +38,8 @@ def deco(*a, **kw):
             s = e.java_exception.toString()
             if s.startswith('org.apache.spark.sql.AnalysisException: '):
                 raise AnalysisException(s.split(': ', 1)[1])
+            if s.startswith('java.lang.IllegalArgumentException: '):
+                raise IllegalArgumentException(s.split(': ', 1)[1])
             raise
     return deco
 

From 344d1567e5ac28b3ab8f83f18d2fa9d98acef152 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carl=20Anders=20D=C3=BCvel?= <c.a.duevel@gmail.com>
Date: Sun, 19 Jul 2015 09:14:55 +0100
Subject: [PATCH 246/549] [SPARK-9094] [PARENT] Increased io.dropwizard.metrics
 from 3.1.0 to 3.1.2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are running Spark 1.4.0 in production and ran into problems because after a network hiccup (which happens often in our current environment) no more metrics were reported to graphite leaving us blindfolded about the current state of our spark applications. [This problem](https://github.com/dropwizard/metrics/commit/70559816f1fc3a0a0122b5263d5478ff07396991) was fixed in the current version of the metrics library. We run spark with this change  in production now and have seen no problems. We also had a look at the commit history since 3.1.0 and did not detect any potentially  incompatible changes but many fixes which could potentially help other users as well.

Author: Carl Anders Düvel <c.a.duevel@gmail.com>

Closes #7493 from hackbert/bump-metrics-lib-version and squashes the following commits:

6677565 [Carl Anders Düvel] [SPARK-9094] [PARENT] Increased io.dropwizard.metrics from 3.1.0 to 3.1.2 in order to get this fix https://github.com/dropwizard/metrics/commit/70559816f1fc3a0a0122b5263d5478ff07396991
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index c5c655834bdeb..2de0c35fbd51a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -144,7 +144,7 @@
     <chill.version>0.5.0</chill.version>
     <ivy.version>2.4.0</ivy.version>
     <oro.version>2.0.8</oro.version>
-    <codahale.metrics.version>3.1.0</codahale.metrics.version>
+    <codahale.metrics.version>3.1.2</codahale.metrics.version>
     <avro.version>1.7.7</avro.version>
     <avro.mapred.classifier>hadoop2</avro.mapred.classifier>
     <jets3t.version>0.7.1</jets3t.version>

From a53d13f7aa5d44c706e5510f57399a32c7558b80 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Sun, 19 Jul 2015 01:16:01 -0700
Subject: [PATCH 247/549] [SPARK-8199][SQL] follow up; revert change in test

rxin / davies

Sorry for that unnecessary change. And thanks again for all your support!

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7505 from tarekauel/SPARK-8199-FollowUp and squashes the following commits:

d09321c [Tarek Auel] [SPARK-8199] follow up; revert change in test
c17397f [Tarek Auel] [SPARK-8199] follow up; revert change in test
67acfe6 [Tarek Auel] [SPARK-8199] follow up; revert change in test
---
 .../spark/sql/catalyst/expressions/DateFunctionsSuite.scala   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
index f469f42116d21..a0991ec998311 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
@@ -74,7 +74,7 @@ class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
         (0 to 5).foreach { i =>
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
-          c.add(Calendar.DATE, 1)
+          c.add(Calendar.DATE, i)
           checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
             sdfDay.format(c.getTime).toInt)
         }
@@ -86,7 +86,7 @@ class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
         (0 to 5).foreach { i =>
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
-          c.add(Calendar.DATE, 1)
+          c.add(Calendar.DATE, i)
           checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
             sdfDay.format(c.getTime).toInt)
         }

From 3427937ea2a4ed19142bd3d66707864879417d61 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 19 Jul 2015 01:17:22 -0700
Subject: [PATCH 248/549] [SQL] Make date/time functions more consistent with
 other database systems.

This pull request fixes some of the problems in #6981.

- Added date functions to `__all__` so they get exposed
- Rename day_of_month -> dayofmonth
- Rename day_in_year -> dayofyear
- Rename week_of_year -> weekofyear
- Removed "day" from Scala/Python API since it is ambiguous. Only leaving the alias in SQL.

Author: Reynold Xin <rxin@databricks.com>

This patch had conflicts when merged, resolved by
Committer: Reynold Xin <rxin@databricks.com>

Closes #7506 from rxin/datetime and squashes the following commits:

0cb24d9 [Reynold Xin] Export all functions in Python.
e44a4a0 [Reynold Xin] Removed day function from Scala and Python.
9c08fdc [Reynold Xin] [SQL] Make date/time functions more consistent with other database systems.
---
 python/pyspark/sql/functions.py               |  35 +-
 .../catalyst/analysis/FunctionRegistry.scala  |   8 +-
 .../expressions/datetimeFunctions.scala       |  13 +-
 .../sql/catalyst/util/DateTimeUtils.scala     |   4 +-
 ...Suite.scala => DateExpressionsSuite.scala} |  26 +-
 .../org/apache/spark/sql/functions.scala      | 338 +++++++++---------
 .../apache/spark/sql/DataFrameDateSuite.scala |  56 ---
 ...nsSuite.scala => DateFunctionsSuite.scala} |  61 ++--
 8 files changed, 239 insertions(+), 302 deletions(-)
 rename sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/{DateFunctionsSuite.scala => DateExpressionsSuite.scala} (91%)
 delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/DataFrameDateSuite.scala
 rename sql/core/src/test/scala/org/apache/spark/sql/{DateExpressionsSuite.scala => DateFunctionsSuite.scala} (74%)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 0aca3788922aa..fd5a3ba8adab3 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -55,6 +55,11 @@
 
 __all__ += ['lag', 'lead', 'ntile']
 
+__all__ += [
+    'date_format',
+    'year', 'quarter', 'month', 'hour', 'minute', 'second',
+    'dayofmonth', 'dayofyear', 'weekofyear']
+
 
 def _create_function(name, doc=""):
     """ Create a function for aggregator by name"""
@@ -713,41 +718,29 @@ def month(col):
 
 
 @since(1.5)
-def day(col):
-    """
-    Extract the day of the month of a given date as integer.
-
-    >>> sqlContext.createDataFrame([('2015-04-08',)], ['a']).select(day('a').alias('day')).collect()
-    [Row(day=8)]
-    """
-    sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.day(col))
-
-
-@since(1.5)
-def day_of_month(col):
+def dayofmonth(col):
     """
     Extract the day of the month of a given date as integer.
 
     >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(day_of_month('a').alias('day')).collect()
+    >>> df.select(dayofmonth('a').alias('day')).collect()
     [Row(day=8)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.day_of_month(col))
+    return Column(sc._jvm.functions.dayofmonth(col))
 
 
 @since(1.5)
-def day_in_year(col):
+def dayofyear(col):
     """
     Extract the day of the year of a given date as integer.
 
     >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(day_in_year('a').alias('day')).collect()
+    >>> df.select(dayofyear('a').alias('day')).collect()
     [Row(day=98)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.day_in_year(col))
+    return Column(sc._jvm.functions.dayofyear(col))
 
 
 @since(1.5)
@@ -790,16 +783,16 @@ def second(col):
 
 
 @since(1.5)
-def week_of_year(col):
+def weekofyear(col):
     """
     Extract the week number of a given date as integer.
 
     >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(week_of_year('a').alias('week')).collect()
+    >>> df.select(weekofyear('a').alias('week')).collect()
     [Row(week=15)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.week_of_year(col))
+    return Column(sc._jvm.functions.weekofyear(col))
 
 
 class UserDefinedFunction(object):
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 159f7eca7acfe..4b256adcc60c6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -183,15 +183,15 @@ object FunctionRegistry {
     expression[CurrentDate]("current_date"),
     expression[CurrentTimestamp]("current_timestamp"),
     expression[DateFormatClass]("date_format"),
-    expression[Day]("day"),
-    expression[DayInYear]("day_in_year"),
-    expression[Day]("day_of_month"),
+    expression[DayOfMonth]("day"),
+    expression[DayOfYear]("dayofyear"),
+    expression[DayOfMonth]("dayofmonth"),
     expression[Hour]("hour"),
     expression[Month]("month"),
     expression[Minute]("minute"),
     expression[Quarter]("quarter"),
     expression[Second]("second"),
-    expression[WeekOfYear]("week_of_year"),
+    expression[WeekOfYear]("weekofyear"),
     expression[Year]("year")
 
   )
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index f9cbbb8c6bee0..802445509285d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -116,14 +116,12 @@ case class Second(child: Expression) extends UnaryExpression with ImplicitCastIn
   }
 }
 
-case class DayInYear(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+case class DayOfYear(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
 
   override def dataType: DataType = IntegerType
 
-  override def prettyName: String = "day_in_year"
-
   override protected def nullSafeEval(date: Any): Any = {
     DateTimeUtils.getDayInYear(date.asInstanceOf[Int])
   }
@@ -149,7 +147,7 @@ case class Year(child: Expression) extends UnaryExpression with ImplicitCastInpu
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, (c) =>
+    defineCodeGen(ctx, ev, c =>
       s"""$dtu.getYear($c)"""
     )
   }
@@ -191,7 +189,7 @@ case class Month(child: Expression) extends UnaryExpression with ImplicitCastInp
   }
 }
 
-case class Day(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+case class DayOfMonth(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
 
@@ -215,8 +213,6 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa
 
   override def dataType: DataType = IntegerType
 
-  override def prettyName: String = "week_of_year"
-
   override protected def nullSafeEval(date: Any): Any = {
     val c = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
     c.setFirstDayOfWeek(Calendar.MONDAY)
@@ -225,7 +221,7 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa
     c.get(Calendar.WEEK_OF_YEAR)
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     nullSafeCodeGen(ctx, ev, (time) => {
       val cal = classOf[Calendar].getName
       val c = ctx.freshName("cal")
@@ -237,6 +233,7 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa
         ${ev.primitive} = $c.get($cal.WEEK_OF_YEAR);
       """
     })
+  }
 }
 
 case class DateFormatClass(left: Expression, right: Expression) extends BinaryExpression
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index a0da73a995a82..07412e73b6a5b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -31,14 +31,14 @@ import org.apache.spark.unsafe.types.UTF8String
  * precision.
  */
 object DateTimeUtils {
-  final val MILLIS_PER_DAY = SECONDS_PER_DAY * 1000L
-
   // see http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian
   final val JULIAN_DAY_OF_EPOCH = 2440587  // and .5
   final val SECONDS_PER_DAY = 60 * 60 * 24L
   final val MICROS_PER_SECOND = 1000L * 1000L
   final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
 
+  final val MILLIS_PER_DAY = SECONDS_PER_DAY * 1000L
+
   // number of days in 400 years
   final val daysIn400Years: Int = 146097
   // number of days between 1.1.1970 and 1.1.2001
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
similarity index 91%
rename from sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
rename to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index a0991ec998311..f01589c58ea86 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -19,19 +19,19 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.sql.{Timestamp, Date}
 import java.text.SimpleDateFormat
-import java.util.{TimeZone, Calendar}
+import java.util.Calendar
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types.{StringType, TimestampType, DateType}
 
-class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
   val sdfDate = new SimpleDateFormat("yyyy-MM-dd")
   val d = new Date(sdf.parse("2015-04-08 13:10:15").getTime)
   val ts = new Timestamp(sdf.parse("2013-11-08 13:10:15").getTime)
 
-  test("Day in Year") {
+  test("DayOfYear") {
     val sdfDay = new SimpleDateFormat("D")
     (2002 to 2004).foreach { y =>
       (0 to 11).foreach { m =>
@@ -39,7 +39,7 @@ class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
             sdfDay.format(c.getTime).toInt)
         }
       }
@@ -51,7 +51,7 @@ class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
             sdfDay.format(c.getTime).toInt)
         }
       }
@@ -63,7 +63,7 @@ class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
             sdfDay.format(c.getTime).toInt)
         }
       }
@@ -163,19 +163,19 @@ class DateFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     }
   }
 
-  test("Day") {
-    checkEvaluation(Day(Cast(Literal("2000-02-29"), DateType)), 29)
-    checkEvaluation(Day(Literal.create(null, DateType)), null)
-    checkEvaluation(Day(Cast(Literal(d), DateType)), 8)
-    checkEvaluation(Day(Cast(Literal(sdfDate.format(d)), DateType)), 8)
-    checkEvaluation(Day(Cast(Literal(ts), DateType)), 8)
+  test("Day / DayOfMonth") {
+    checkEvaluation(DayOfMonth(Cast(Literal("2000-02-29"), DateType)), 29)
+    checkEvaluation(DayOfMonth(Literal.create(null, DateType)), null)
+    checkEvaluation(DayOfMonth(Cast(Literal(d), DateType)), 8)
+    checkEvaluation(DayOfMonth(Cast(Literal(sdfDate.format(d)), DateType)), 8)
+    checkEvaluation(DayOfMonth(Cast(Literal(ts), DateType)), 8)
 
     (1999 to 2000).foreach { y =>
       val c = Calendar.getInstance()
       c.set(y, 0, 1, 0, 0, 0)
       (0 to 365).foreach { d =>
         c.add(Calendar.DATE, 1)
-        checkEvaluation(Day(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+        checkEvaluation(DayOfMonth(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
           c.get(Calendar.DAY_OF_MONTH))
       }
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index cadb25d597d19..f67c89437bb4a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1748,182 +1748,6 @@ object functions {
    */
   def length(columnName: String): Column = length(Column(columnName))
 
-  //////////////////////////////////////////////////////////////////////////////////////////////
-  // DateTime functions
-  //////////////////////////////////////////////////////////////////////////////////////////////
-
-  /**
-   * Converts a date/timestamp/string to a value of string in the format specified by the date
-   * format given by the second argument.
-   *
-   * A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
-   * pattern letters of [[java.text.SimpleDateFormat]] can be used.
-   *
-   * NOTE: Use when ever possible specialized functions like [[year]]. These benefit from a
-   * specialized implementation.
-   *
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def date_format(dateExpr: Column, format: String): Column =
-    DateFormatClass(dateExpr.expr, Literal(format))
-
-  /**
-   * Converts a date/timestamp/string to a value of string in the format specified by the date
-   * format given by the second argument.
-   *
-   * A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
-   * pattern letters of [[java.text.SimpleDateFormat]] can be used.
-   *
-   * NOTE: Use when ever possible specialized functions like [[year]]. These benefit from a
-   * specialized implementation.
-   *
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def date_format(dateColumnName: String, format: String): Column =
-    date_format(Column(dateColumnName), format)
-
-  /**
-   * Extracts the year as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def year(e: Column): Column = Year(e.expr)
-
-  /**
-   * Extracts the year as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def year(columnName: String): Column = year(Column(columnName))
-
-  /**
-   * Extracts the quarter as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def quarter(e: Column): Column = Quarter(e.expr)
-
-  /**
-   * Extracts the quarter as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def quarter(columnName: String): Column = quarter(Column(columnName))
-
-  /**
-   * Extracts the month as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def month(e: Column): Column = Month(e.expr)
-
-  /**
-   * Extracts the month as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def month(columnName: String): Column = month(Column(columnName))
-
-  /**
-   * Extracts the day of the month as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def day(e: Column): Column = Day(e.expr)
-
-  /**
-   * Extracts the day of the month as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def day(columnName: String): Column = day(Column(columnName))
-
-  /**
-   * Extracts the day of the month as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def day_of_month(e: Column): Column = Day(e.expr)
-
-  /**
-   * Extracts the day of the month as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def day_of_month(columnName: String): Column = day_of_month(Column(columnName))
-
-  /**
-   * Extracts the day of the year as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def day_in_year(e: Column): Column = DayInYear(e.expr)
-
-  /**
-   * Extracts the day of the year as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def day_in_year(columnName: String): Column = day_in_year(Column(columnName))
-
-  /**
-   * Extracts the hours as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def hour(e: Column): Column = Hour(e.expr)
-
-  /**
-   * Extracts the hours as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def hour(columnName: String): Column = hour(Column(columnName))
-
-  /**
-   * Extracts the minutes as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def minute(e: Column): Column = Minute(e.expr)
-
-  /**
-   * Extracts the minutes as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def minute(columnName: String): Column = minute(Column(columnName))
-
-  /**
-   * Extracts the seconds as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def second(e: Column): Column = Second(e.expr)
-
-  /**
-   * Extracts the seconds as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def second(columnName: String): Column = second(Column(columnName))
-
-  /**
-   * Extracts the week number as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def week_of_year(e: Column): Column = WeekOfYear(e.expr)
-
-  /**
-   * Extracts the week number as an integer from a given date/timestamp/string.
-   * @group datetime_funcs
-   * @since 1.5.0
-   */
-  def week_of_year(columnName: String): Column = week_of_year(Column(columnName))
-
   /**
    * Formats the number X to a format like '#,###,###.##', rounded to d decimal places,
    * and returns the result as a string.
@@ -2409,6 +2233,168 @@ object functions {
     StringSpace(n.expr)
   }
 
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  // DateTime functions
+  //////////////////////////////////////////////////////////////////////////////////////////////
+
+  /**
+   * Converts a date/timestamp/string to a value of string in the format specified by the date
+   * format given by the second argument.
+   *
+   * A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
+   * pattern letters of [[java.text.SimpleDateFormat]] can be used.
+   *
+   * NOTE: Use when ever possible specialized functions like [[year]]. These benefit from a
+   * specialized implementation.
+   *
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def date_format(dateExpr: Column, format: String): Column =
+    DateFormatClass(dateExpr.expr, Literal(format))
+
+  /**
+   * Converts a date/timestamp/string to a value of string in the format specified by the date
+   * format given by the second argument.
+   *
+   * A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
+   * pattern letters of [[java.text.SimpleDateFormat]] can be used.
+   *
+   * NOTE: Use when ever possible specialized functions like [[year]]. These benefit from a
+   * specialized implementation.
+   *
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def date_format(dateColumnName: String, format: String): Column =
+    date_format(Column(dateColumnName), format)
+
+  /**
+   * Extracts the year as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def year(e: Column): Column = Year(e.expr)
+
+  /**
+   * Extracts the year as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def year(columnName: String): Column = year(Column(columnName))
+
+  /**
+   * Extracts the quarter as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def quarter(e: Column): Column = Quarter(e.expr)
+
+  /**
+   * Extracts the quarter as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def quarter(columnName: String): Column = quarter(Column(columnName))
+
+  /**
+   * Extracts the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def month(e: Column): Column = Month(e.expr)
+
+  /**
+   * Extracts the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def month(columnName: String): Column = month(Column(columnName))
+
+  /**
+   * Extracts the day of the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def dayofmonth(e: Column): Column = DayOfMonth(e.expr)
+
+  /**
+   * Extracts the day of the month as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def dayofmonth(columnName: String): Column = dayofmonth(Column(columnName))
+
+  /**
+   * Extracts the day of the year as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def dayofyear(e: Column): Column = DayOfYear(e.expr)
+
+  /**
+   * Extracts the day of the year as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def dayofyear(columnName: String): Column = dayofyear(Column(columnName))
+
+  /**
+   * Extracts the hours as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def hour(e: Column): Column = Hour(e.expr)
+
+  /**
+   * Extracts the hours as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def hour(columnName: String): Column = hour(Column(columnName))
+
+  /**
+   * Extracts the minutes as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def minute(e: Column): Column = Minute(e.expr)
+
+  /**
+   * Extracts the minutes as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def minute(columnName: String): Column = minute(Column(columnName))
+
+  /**
+   * Extracts the seconds as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def second(e: Column): Column = Second(e.expr)
+
+  /**
+   * Extracts the seconds as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def second(columnName: String): Column = second(Column(columnName))
+
+  /**
+   * Extracts the week number as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def weekofyear(e: Column): Column = WeekOfYear(e.expr)
+
+  /**
+   * Extracts the week number as an integer from a given date/timestamp/string.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def weekofyear(columnName: String): Column = weekofyear(Column(columnName))
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   //////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameDateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameDateSuite.scala
deleted file mode 100644
index a4719a38de1d4..0000000000000
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameDateSuite.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql
-
-import java.sql.{Date, Timestamp}
-
-class DataFrameDateTimeSuite extends QueryTest {
-
-  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
-  import ctx.implicits._
-
-  test("timestamp comparison with date strings") {
-    val df = Seq(
-      (1, Timestamp.valueOf("2015-01-01 00:00:00")),
-      (2, Timestamp.valueOf("2014-01-01 00:00:00"))).toDF("i", "t")
-
-    checkAnswer(
-      df.select("t").filter($"t" <= "2014-06-01"),
-      Row(Timestamp.valueOf("2014-01-01 00:00:00")) :: Nil)
-
-
-    checkAnswer(
-      df.select("t").filter($"t" >= "2014-06-01"),
-      Row(Timestamp.valueOf("2015-01-01 00:00:00")) :: Nil)
-  }
-
-  test("date comparison with date strings") {
-    val df = Seq(
-      (1, Date.valueOf("2015-01-01")),
-      (2, Date.valueOf("2014-01-01"))).toDF("i", "t")
-
-    checkAnswer(
-      df.select("t").filter($"t" <= "2014-06-01"),
-      Row(Date.valueOf("2014-01-01")) :: Nil)
-
-
-    checkAnswer(
-      df.select("t").filter($"t" >= "2015"),
-      Row(Date.valueOf("2015-01-01")) :: Nil)
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
similarity index 74%
rename from sql/core/src/test/scala/org/apache/spark/sql/DateExpressionsSuite.scala
rename to sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index d24e3ee1dd8f5..9e80ae86920d9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -22,7 +22,7 @@ import java.text.SimpleDateFormat
 
 import org.apache.spark.sql.functions._
 
-class DateExpressionsSuite extends QueryTest {
+class DateFunctionsSuite extends QueryTest {
   private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
 
   import ctx.implicits._
@@ -32,6 +32,35 @@ class DateExpressionsSuite extends QueryTest {
   val d = new Date(sdf.parse("2015-04-08 13:10:15").getTime)
   val ts = new Timestamp(sdf.parse("2013-04-08 13:10:15").getTime)
 
+  test("timestamp comparison with date strings") {
+    val df = Seq(
+      (1, Timestamp.valueOf("2015-01-01 00:00:00")),
+      (2, Timestamp.valueOf("2014-01-01 00:00:00"))).toDF("i", "t")
+
+    checkAnswer(
+      df.select("t").filter($"t" <= "2014-06-01"),
+      Row(Timestamp.valueOf("2014-01-01 00:00:00")) :: Nil)
+
+
+    checkAnswer(
+      df.select("t").filter($"t" >= "2014-06-01"),
+      Row(Timestamp.valueOf("2015-01-01 00:00:00")) :: Nil)
+  }
+
+  test("date comparison with date strings") {
+    val df = Seq(
+      (1, Date.valueOf("2015-01-01")),
+      (2, Date.valueOf("2014-01-01"))).toDF("i", "t")
+
+    checkAnswer(
+      df.select("t").filter($"t" <= "2014-06-01"),
+      Row(Date.valueOf("2014-01-01")) :: Nil)
+
+
+    checkAnswer(
+      df.select("t").filter($"t" >= "2015"),
+      Row(Date.valueOf("2015-01-01")) :: Nil)
+  }
 
   test("date format") {
     val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c")
@@ -83,39 +112,27 @@ class DateExpressionsSuite extends QueryTest {
       Row(4, 4, 4))
   }
 
-  test("day") {
-    val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
-
-    checkAnswer(
-      df.select(day("a"), day("b"), day("c")),
-      Row(8, 8, 8))
-
-    checkAnswer(
-      df.selectExpr("day(a)", "day(b)", "day(c)"),
-      Row(8, 8, 8))
-  }
-
-  test("day of month") {
+  test("dayofmonth") {
     val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
 
     checkAnswer(
-      df.select(day_of_month("a"), day_of_month("b"), day_of_month("c")),
+      df.select(dayofmonth("a"), dayofmonth("b"), dayofmonth("c")),
       Row(8, 8, 8))
 
     checkAnswer(
-      df.selectExpr("day_of_month(a)", "day_of_month(b)", "day_of_month(c)"),
+      df.selectExpr("day(a)", "day(b)", "dayofmonth(c)"),
       Row(8, 8, 8))
   }
 
-  test("day in year") {
+  test("dayofyear") {
     val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
 
     checkAnswer(
-      df.select(day_in_year("a"), day_in_year("b"), day_in_year("c")),
+      df.select(dayofyear("a"), dayofyear("b"), dayofyear("c")),
       Row(98, 98, 98))
 
     checkAnswer(
-      df.selectExpr("day_in_year(a)", "day_in_year(b)", "day_in_year(c)"),
+      df.selectExpr("dayofyear(a)", "dayofyear(b)", "dayofyear(c)"),
       Row(98, 98, 98))
   }
 
@@ -155,15 +172,15 @@ class DateExpressionsSuite extends QueryTest {
       Row(0, 15, 15))
   }
 
-  test("week of year") {
+  test("weekofyear") {
     val df = Seq((d, sdfDate.format(d), ts)).toDF("a", "b", "c")
 
     checkAnswer(
-      df.select(week_of_year("a"), week_of_year("b"), week_of_year("c")),
+      df.select(weekofyear("a"), weekofyear("b"), weekofyear("c")),
       Row(15, 15, 15))
 
     checkAnswer(
-      df.selectExpr("week_of_year(a)", "week_of_year(b)", "week_of_year(c)"),
+      df.selectExpr("weekofyear(a)", "weekofyear(b)", "weekofyear(c)"),
       Row(15, 15, 15))
   }
 

From bc24289f5d54e4ff61cd75a5941338c9d946ff73 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sun, 19 Jul 2015 17:37:25 +0800
Subject: [PATCH 249/549] [SPARK-9179] [BUILD] Allows committers to specify
 primary author of the PR to be merged

It's a common case that some contributor contributes an initial version of a feature/bugfix, and later on some other people (mostly committers) fork and add more improvements. When merging these PRs, we probably want to specify the original author as the primary author. Currently we can only do this by running

```
$ git commit --amend --author="name <email>"
```

manually right before the merge script pushes to Apache Git repo. It would be nice if the script accepts user specified primary author information.

Author: Cheng Lian <lian@databricks.com>

Closes #7508 from liancheng/spark-9179 and squashes the following commits:

218d88e [Cheng Lian] Allows committers to specify primary author of the PR to be merged
---
 dev/merge_spark_pr.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
index 4a17d48d8171d..d586a57481aa1 100755
--- a/dev/merge_spark_pr.py
+++ b/dev/merge_spark_pr.py
@@ -130,7 +130,10 @@ def merge_pr(pr_num, target_ref, title, body, pr_repo_desc):
                              '--pretty=format:%an <%ae>']).split("\n")
     distinct_authors = sorted(set(commit_authors),
                               key=lambda x: commit_authors.count(x), reverse=True)
-    primary_author = distinct_authors[0]
+    primary_author = raw_input(
+        "Enter primary author in the format of \"name <email>\" [%s]: " %
+        distinct_authors[0])
+
     commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name,
                       '--pretty=format:%h [%an] %s']).split("\n\n")
 
@@ -281,7 +284,7 @@ def get_version_json(version_str):
     resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0]
     resolution = filter(lambda r: r.raw['name'] == "Fixed", asf_jira.resolutions())[0]
     asf_jira.transition_issue(
-        jira_id, resolve["id"], fixVersions = jira_fix_versions, 
+        jira_id, resolve["id"], fixVersions = jira_fix_versions,
         comment = comment, resolution = {'id': resolution.raw['id']})
 
     print "Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions)
@@ -300,7 +303,7 @@ def standardize_jira_ref(text):
     """
     Standardize the [SPARK-XXXXX] [MODULE] prefix
     Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK XXX [MLLIB]: Issue" to "[SPARK-XXX] [MLLIB] Issue"
-    
+
     >>> standardize_jira_ref("[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful")
     '[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful'
     >>> standardize_jira_ref("[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests")
@@ -322,11 +325,11 @@ def standardize_jira_ref(text):
     """
     jira_refs = []
     components = []
-    
+
     # If the string is compliant, no need to process any further
     if (re.search(r'^\[SPARK-[0-9]{3,6}\] (\[[A-Z0-9_\s,]+\] )+\S+', text)):
         return text
-    
+
     # Extract JIRA ref(s):
     pattern = re.compile(r'(SPARK[-\s]*[0-9]{3,6})+', re.IGNORECASE)
     for ref in pattern.findall(text):
@@ -348,18 +351,18 @@ def standardize_jira_ref(text):
 
     # Assemble full text (JIRA ref(s), module(s), remaining text)
     clean_text = ' '.join(jira_refs).strip() + " " + ' '.join(components).strip() + " " + text.strip()
-    
+
     # Replace multiple spaces with a single space, e.g. if no jira refs and/or components were included
     clean_text = re.sub(r'\s+', ' ', clean_text.strip())
-    
+
     return clean_text
 
 def main():
     global original_head
-    
+
     os.chdir(SPARK_HOME)
     original_head = run_cmd("git rev-parse HEAD")[:8]
-    
+
     branches = get_json("%s/branches" % GITHUB_API_BASE)
     branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches])
     # Assumes branch names can be sorted lexicographically
@@ -448,5 +451,5 @@ def main():
     (failure_count, test_count) = doctest.testmod()
     if failure_count:
         exit(-1)
-    
+
     main()

From 34ed82bb44c4519819695ddc760e6c9a98bc2e40 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sun, 19 Jul 2015 18:58:19 +0800
Subject: [PATCH 250/549] [HOTFIX] [SQL] Fixes compilation error introduced by
 PR #7506

PR #7506 breaks master build because of compilation error. Note that #7506 itself looks good, but it seems that `git merge` did something stupid.

Author: Cheng Lian <lian@databricks.com>

Closes #7510 from liancheng/hotfix-for-pr-7506 and squashes the following commits:

7ea7e89 [Cheng Lian] Fixes compilation error
---
 .../spark/sql/catalyst/expressions/DateExpressionsSuite.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index f01589c58ea86..f724bab4d8839 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -75,7 +75,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
             sdfDay.format(c.getTime).toInt)
         }
       }
@@ -87,7 +87,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayInYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
             sdfDay.format(c.getTime).toInt)
         }
       }

From a803ac3e060d181c7b34d9501c9350e5f215ba85 Mon Sep 17 00:00:00 2001
From: Nicholas Hwang <moogling@gmail.com>
Date: Sun, 19 Jul 2015 10:30:28 -0700
Subject: [PATCH 251/549] [SPARK-9021] [PYSPARK] Change RDD.aggregate() to do
 reduce(mapPartitions()) instead of mapPartitions.fold()

I'm relatively new to Spark and functional programming, so forgive me if this pull request is just a result of my misunderstanding of how Spark should be used.

Currently, if one happens to use a mutable object as `zeroValue` for `RDD.aggregate()`, possibly unexpected behavior can occur.

This is because pyspark's current implementation of `RDD.aggregate()` does not serialize or make a copy of `zeroValue` before handing it off to `RDD.mapPartitions(...).fold(...)`. This results in a single reference to `zeroValue` being used for both `RDD.mapPartitions()` and `RDD.fold()` on each partition. This can result in strange accumulator values being fed into each partition's call to `RDD.fold()`, as the `zeroValue` may have been changed in-place during the `RDD.mapPartitions()` call.

As an illustrative example, submit the following to `spark-submit`:
```
from pyspark import SparkConf, SparkContext
import collections

def updateCounter(acc, val):
    print 'update acc:', acc
    print 'update val:', val
    acc[val] += 1
    return acc

def comboCounter(acc1, acc2):
    print 'combo acc1:', acc1
    print 'combo acc2:', acc2
    acc1.update(acc2)
    return acc1

def main():
    conf = SparkConf().setMaster("local").setAppName("Aggregate with Counter")
    sc = SparkContext(conf = conf)

    print '======= AGGREGATING with ONE PARTITION ======='
    print sc.parallelize(range(1,10), 1).aggregate(collections.Counter(), updateCounter, comboCounter)

    print '======= AGGREGATING with TWO PARTITIONS ======='
    print sc.parallelize(range(1,10), 2).aggregate(collections.Counter(), updateCounter, comboCounter)

if __name__ == "__main__":
    main()
```

One probably expects this to output the following:
```
Counter({1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1})
```

But it instead outputs this (regardless of the number of partitions):
```
Counter({1: 2, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2, 8: 2, 9: 2})
```

This is because (I believe) `zeroValue` gets passed correctly to each partition, but after `RDD.mapPartitions()` completes, the `zeroValue` object has been updated and is then passed to `RDD.fold()`, which results in all items being double-counted within each partition before being finally reduced at the calling node.

I realize that this type of calculation is typically done by `RDD.mapPartitions(...).reduceByKey(...)`, but hopefully this illustrates some potentially confusing behavior. I also noticed that other `RDD` methods use this `deepcopy` approach to creating unique copies of `zeroValue` (i.e., `RDD.aggregateByKey()` and `RDD.foldByKey()`), and that the Scala implementations do seem to serialize the `zeroValue` object appropriately to prevent this type of behavior.

Author: Nicholas Hwang <moogling@gmail.com>

Closes #7378 from njhwang/master and squashes the following commits:

659bb27 [Nicholas Hwang] Fixed RDD.aggregate() to perform a reduce operation on collected mapPartitions results, similar to how fold currently is implemented. This prevents an initial combOp being performed on each partition with zeroValue (which leads to unexpected behavior if zeroValue is a mutable object) before being combOp'ed with other partition results.
8d8d694 [Nicholas Hwang] Changed dict construction to be compatible with Python 2.6 (cannot use list comprehensions to make dicts)
56eb2ab [Nicholas Hwang] Fixed whitespace after colon to conform with PEP8
391de4a [Nicholas Hwang] Removed used of collections.Counter from RDD tests for Python 2.6 compatibility; used defaultdict(int) instead. Merged treeAggregate test with mutable zero value into aggregate test to reduce code duplication.
2fa4e4b [Nicholas Hwang] Merge branch 'master' of https://github.com/njhwang/spark
ba528bd [Nicholas Hwang] Updated comments regarding protection of zeroValue from mutation in RDD.aggregate(). Added regression tests for aggregate(), fold(), aggregateByKey(), foldByKey(), and treeAggregate(), all with both 1 and 2 partition RDDs. Confirmed that aggregate() is the only problematic implementation as of commit 257236c3e17906098f801cbc2059e7a9054e8cab. Also replaced some parallelizations of ranges with xranges, per the documentation's recommendations of preferring xrange over range.
7820391 [Nicholas Hwang] Updated comments regarding protection of zeroValue from mutation in RDD.aggregate(). Added regression tests for aggregate(), fold(), aggregateByKey(), foldByKey(), and treeAggregate(), all with both 1 and 2 partition RDDs. Confirmed that aggregate() is the only problematic implementation as of commit 257236c3e17906098f801cbc2059e7a9054e8cab.
90d1544 [Nicholas Hwang] Made sure RDD.aggregate() makes a deepcopy of zeroValue for all partitions; this ensures that the mapPartitions call works with unique copies of zeroValue in each partition, and prevents a single reference to zeroValue being used for both map and fold calls on each partition (resulting in possibly unexpected behavior).
---
 python/pyspark/rdd.py   |  10 ++-
 python/pyspark/tests.py | 141 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 137 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 3218bed5c74fc..7e788148d981c 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -862,6 +862,9 @@ def func(iterator):
             for obj in iterator:
                 acc = op(obj, acc)
             yield acc
+        # collecting result of mapPartitions here ensures that the copy of
+        # zeroValue provided to each partition is unique from the one provided
+        # to the final reduce call
         vals = self.mapPartitions(func).collect()
         return reduce(op, vals, zeroValue)
 
@@ -891,8 +894,11 @@ def func(iterator):
             for obj in iterator:
                 acc = seqOp(acc, obj)
             yield acc
-
-        return self.mapPartitions(func).fold(zeroValue, combOp)
+        # collecting result of mapPartitions here ensures that the copy of
+        # zeroValue provided to each partition is unique from the one provided
+        # to the final reduce call
+        vals = self.mapPartitions(func).collect()
+        return reduce(combOp, vals, zeroValue)
 
     def treeAggregate(self, zeroValue, seqOp, combOp, depth=2):
         """
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 21225016805bc..5be9937cb04b2 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -529,10 +529,127 @@ def test_deleting_input_files(self):
 
     def test_sampling_default_seed(self):
         # Test for SPARK-3995 (default seed setting)
-        data = self.sc.parallelize(range(1000), 1)
+        data = self.sc.parallelize(xrange(1000), 1)
         subset = data.takeSample(False, 10)
         self.assertEqual(len(subset), 10)
 
+    def test_aggregate_mutable_zero_value(self):
+        # Test for SPARK-9021; uses aggregate and treeAggregate to build dict
+        # representing a counter of ints
+        # NOTE: dict is used instead of collections.Counter for Python 2.6
+        # compatibility
+        from collections import defaultdict
+
+        # Show that single or multiple partitions work
+        data1 = self.sc.range(10, numSlices=1)
+        data2 = self.sc.range(10, numSlices=2)
+
+        def seqOp(x, y):
+            x[y] += 1
+            return x
+
+        def comboOp(x, y):
+            for key, val in y.items():
+                x[key] += val
+            return x
+
+        counts1 = data1.aggregate(defaultdict(int), seqOp, comboOp)
+        counts2 = data2.aggregate(defaultdict(int), seqOp, comboOp)
+        counts3 = data1.treeAggregate(defaultdict(int), seqOp, comboOp, 2)
+        counts4 = data2.treeAggregate(defaultdict(int), seqOp, comboOp, 2)
+
+        ground_truth = defaultdict(int, dict((i, 1) for i in range(10)))
+        self.assertEqual(counts1, ground_truth)
+        self.assertEqual(counts2, ground_truth)
+        self.assertEqual(counts3, ground_truth)
+        self.assertEqual(counts4, ground_truth)
+
+    def test_aggregate_by_key_mutable_zero_value(self):
+        # Test for SPARK-9021; uses aggregateByKey to make a pair RDD that
+        # contains lists of all values for each key in the original RDD
+
+        # list(range(...)) for Python 3.x compatibility (can't use * operator
+        # on a range object)
+        # list(zip(...)) for Python 3.x compatibility (want to parallelize a
+        # collection, not a zip object)
+        tuples = list(zip(list(range(10))*2, [1]*20))
+        # Show that single or multiple partitions work
+        data1 = self.sc.parallelize(tuples, 1)
+        data2 = self.sc.parallelize(tuples, 2)
+
+        def seqOp(x, y):
+            x.append(y)
+            return x
+
+        def comboOp(x, y):
+            x.extend(y)
+            return x
+
+        values1 = data1.aggregateByKey([], seqOp, comboOp).collect()
+        values2 = data2.aggregateByKey([], seqOp, comboOp).collect()
+        # Sort lists to ensure clean comparison with ground_truth
+        values1.sort()
+        values2.sort()
+
+        ground_truth = [(i, [1]*2) for i in range(10)]
+        self.assertEqual(values1, ground_truth)
+        self.assertEqual(values2, ground_truth)
+
+    def test_fold_mutable_zero_value(self):
+        # Test for SPARK-9021; uses fold to merge an RDD of dict counters into
+        # a single dict
+        # NOTE: dict is used instead of collections.Counter for Python 2.6
+        # compatibility
+        from collections import defaultdict
+
+        counts1 = defaultdict(int, dict((i, 1) for i in range(10)))
+        counts2 = defaultdict(int, dict((i, 1) for i in range(3, 8)))
+        counts3 = defaultdict(int, dict((i, 1) for i in range(4, 7)))
+        counts4 = defaultdict(int, dict((i, 1) for i in range(5, 6)))
+        all_counts = [counts1, counts2, counts3, counts4]
+        # Show that single or multiple partitions work
+        data1 = self.sc.parallelize(all_counts, 1)
+        data2 = self.sc.parallelize(all_counts, 2)
+
+        def comboOp(x, y):
+            for key, val in y.items():
+                x[key] += val
+            return x
+
+        fold1 = data1.fold(defaultdict(int), comboOp)
+        fold2 = data2.fold(defaultdict(int), comboOp)
+
+        ground_truth = defaultdict(int)
+        for counts in all_counts:
+            for key, val in counts.items():
+                ground_truth[key] += val
+        self.assertEqual(fold1, ground_truth)
+        self.assertEqual(fold2, ground_truth)
+
+    def test_fold_by_key_mutable_zero_value(self):
+        # Test for SPARK-9021; uses foldByKey to make a pair RDD that contains
+        # lists of all values for each key in the original RDD
+
+        tuples = [(i, range(i)) for i in range(10)]*2
+        # Show that single or multiple partitions work
+        data1 = self.sc.parallelize(tuples, 1)
+        data2 = self.sc.parallelize(tuples, 2)
+
+        def comboOp(x, y):
+            x.extend(y)
+            return x
+
+        values1 = data1.foldByKey([], comboOp).collect()
+        values2 = data2.foldByKey([], comboOp).collect()
+        # Sort lists to ensure clean comparison with ground_truth
+        values1.sort()
+        values2.sort()
+
+        # list(range(...)) for Python 3.x compatibility
+        ground_truth = [(i, list(range(i))*2) for i in range(10)]
+        self.assertEqual(values1, ground_truth)
+        self.assertEqual(values2, ground_truth)
+
     def test_aggregate_by_key(self):
         data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)
 
@@ -624,8 +741,8 @@ def test_zip_with_different_serializers(self):
 
     def test_zip_with_different_object_sizes(self):
         # regress test for SPARK-5973
-        a = self.sc.parallelize(range(10000)).map(lambda i: '*' * i)
-        b = self.sc.parallelize(range(10000, 20000)).map(lambda i: '*' * i)
+        a = self.sc.parallelize(xrange(10000)).map(lambda i: '*' * i)
+        b = self.sc.parallelize(xrange(10000, 20000)).map(lambda i: '*' * i)
         self.assertEqual(10000, a.zip(b).count())
 
     def test_zip_with_different_number_of_items(self):
@@ -647,7 +764,7 @@ def test_zip_with_different_number_of_items(self):
             self.assertRaises(Exception, lambda: a.zip(b).count())
 
     def test_count_approx_distinct(self):
-        rdd = self.sc.parallelize(range(1000))
+        rdd = self.sc.parallelize(xrange(1000))
         self.assertTrue(950 < rdd.countApproxDistinct(0.03) < 1050)
         self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.03) < 1050)
         self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.03) < 1050)
@@ -777,7 +894,7 @@ def test_distinct(self):
     def test_external_group_by_key(self):
         self.sc._conf.set("spark.python.worker.memory", "1m")
         N = 200001
-        kv = self.sc.parallelize(range(N)).map(lambda x: (x % 3, x))
+        kv = self.sc.parallelize(xrange(N)).map(lambda x: (x % 3, x))
         gkv = kv.groupByKey().cache()
         self.assertEqual(3, gkv.count())
         filtered = gkv.filter(lambda kv: kv[0] == 1)
@@ -871,7 +988,7 @@ def test_narrow_dependency_in_join(self):
 
     # Regression test for SPARK-6294
     def test_take_on_jrdd(self):
-        rdd = self.sc.parallelize(range(1 << 20)).map(lambda x: str(x))
+        rdd = self.sc.parallelize(xrange(1 << 20)).map(lambda x: str(x))
         rdd._jrdd.first()
 
     def test_sortByKey_uses_all_partitions_not_only_first_and_last(self):
@@ -1517,13 +1634,13 @@ def run():
             self.fail("daemon had been killed")
 
         # run a normal job
-        rdd = self.sc.parallelize(range(100), 1)
+        rdd = self.sc.parallelize(xrange(100), 1)
         self.assertEqual(100, rdd.map(str).count())
 
     def test_after_exception(self):
         def raise_exception(_):
             raise Exception()
-        rdd = self.sc.parallelize(range(100), 1)
+        rdd = self.sc.parallelize(xrange(100), 1)
         with QuietTest(self.sc):
             self.assertRaises(Exception, lambda: rdd.foreach(raise_exception))
         self.assertEqual(100, rdd.map(str).count())
@@ -1539,22 +1656,22 @@ def test_after_jvm_exception(self):
         with QuietTest(self.sc):
             self.assertRaises(Exception, lambda: filtered_data.count())
 
-        rdd = self.sc.parallelize(range(100), 1)
+        rdd = self.sc.parallelize(xrange(100), 1)
         self.assertEqual(100, rdd.map(str).count())
 
     def test_accumulator_when_reuse_worker(self):
         from pyspark.accumulators import INT_ACCUMULATOR_PARAM
         acc1 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM)
-        self.sc.parallelize(range(100), 20).foreach(lambda x: acc1.add(x))
+        self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc1.add(x))
         self.assertEqual(sum(range(100)), acc1.value)
 
         acc2 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM)
-        self.sc.parallelize(range(100), 20).foreach(lambda x: acc2.add(x))
+        self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc2.add(x))
         self.assertEqual(sum(range(100)), acc2.value)
         self.assertEqual(sum(range(100)), acc1.value)
 
     def test_reuse_worker_after_take(self):
-        rdd = self.sc.parallelize(range(100000), 1)
+        rdd = self.sc.parallelize(xrange(100000), 1)
         self.assertEqual(0, rdd.first())
 
         def count():

From 7a81245345f2d6124423161786bb0d9f1c278ab8 Mon Sep 17 00:00:00 2001
From: Herman van Hovell <hvanhovell@questtec.nl>
Date: Sun, 19 Jul 2015 16:29:50 -0700
Subject: [PATCH 252/549] [SPARK-8638] [SQL] Window Function Performance
 Improvements - Cleanup

This PR contains a few clean-ups that are a part of SPARK-8638: a few style issues got fixed, and a few tests were moved.

Git commit message is wrong BTW :(...

Author: Herman van Hovell <hvanhovell@questtec.nl>

Closes #7513 from hvanhovell/SPARK-8638-cleanup and squashes the following commits:

4e69d08 [Herman van Hovell] Fixed Perfomance Regression for Shrinking Window Frames (+Rebase)
---
 .../apache/spark/sql/execution/Window.scala   | 14 ++--
 .../sql/hive/HiveDataFrameWindowSuite.scala   | 43 ++++++++++
 .../sql/hive/execution/WindowSuite.scala      | 79 -------------------
 3 files changed, 51 insertions(+), 85 deletions(-)
 delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
index a054f52b8b489..de04132eb1104 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
@@ -118,22 +118,24 @@ case class Window(
           val exprs = windowSpec.orderSpec.map(_.child)
           val projection = newMutableProjection(exprs, child.output)
           (windowSpec.orderSpec, projection(), projection())
-        }
-        else if (windowSpec.orderSpec.size == 1) {
+        } else if (windowSpec.orderSpec.size == 1) {
           // Use only the first order expression when the offset is non-null.
           val sortExpr = windowSpec.orderSpec.head
           val expr = sortExpr.child
           // Create the projection which returns the current 'value'.
           val current = newMutableProjection(expr :: Nil, child.output)()
           // Flip the sign of the offset when processing the order is descending
-          val boundOffset = if (sortExpr.direction == Descending) -offset
-          else offset
+          val boundOffset =
+            if (sortExpr.direction == Descending) {
+              -offset
+            } else {
+              offset
+            }
           // Create the projection which returns the current 'value' modified by adding the offset.
           val boundExpr = Add(expr, Cast(Literal.create(boundOffset, IntegerType), expr.dataType))
           val bound = newMutableProjection(boundExpr :: Nil, child.output)()
           (sortExpr :: Nil, current, bound)
-        }
-        else {
+        } else {
           sys.error("Non-Zero range offsets are not supported for windows " +
             "with multiple order expressions.")
         }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
index 15b5f418f0a8c..c177cbdd991cf 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameWindowSuite.scala
@@ -212,4 +212,47 @@ class HiveDataFrameWindowSuite extends QueryTest {
           |   (PARTITION BY value ORDER BY key RANGE BETWEEN 1 preceding and current row)
           | FROM window_table""".stripMargin).collect())
   }
+
+  test("reverse sliding range frame") {
+    val df = Seq(
+      (1, "Thin", "Cell Phone", 6000),
+      (2, "Normal", "Tablet", 1500),
+      (3, "Mini", "Tablet", 5500),
+      (4, "Ultra thin", "Cell Phone", 5500),
+      (5, "Very thin", "Cell Phone", 6000),
+      (6, "Big", "Tablet", 2500),
+      (7, "Bendable", "Cell Phone", 3000),
+      (8, "Foldable", "Cell Phone", 3000),
+      (9, "Pro", "Tablet", 4500),
+      (10, "Pro2", "Tablet", 6500)).
+      toDF("id", "product", "category", "revenue")
+    val window = Window.
+      partitionBy($"category").
+      orderBy($"revenue".desc).
+      rangeBetween(-2000L, 1000L)
+    checkAnswer(
+      df.select(
+        $"id",
+        avg($"revenue").over(window).cast("int")),
+      Row(1, 5833) :: Row(2, 2000) :: Row(3, 5500) ::
+        Row(4, 5833) :: Row(5, 5833) :: Row(6, 2833) ::
+        Row(7, 3000) :: Row(8, 3000) :: Row(9, 5500) ::
+        Row(10, 6000) :: Nil)
+  }
+
+  // This is here to illustrate the fact that reverse order also reverses offsets.
+  test("reverse unbounded range frame") {
+    val df = Seq(1, 2, 4, 3, 2, 1).
+      map(Tuple1.apply).
+      toDF("value")
+    val window = Window.orderBy($"value".desc)
+    checkAnswer(
+      df.select(
+        $"value",
+        sum($"value").over(window.rangeBetween(Long.MinValue, 1)),
+        sum($"value").over(window.rangeBetween(1, Long.MaxValue))),
+      Row(1, 13, null) :: Row(2, 13, 2) :: Row(4, 7, 9) ::
+        Row(3, 11, 6) :: Row(2, 13, 2) :: Row(1, 13, null) :: Nil)
+
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowSuite.scala
deleted file mode 100644
index a089d0d165195..0000000000000
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowSuite.scala
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.hive.execution
-
-import org.apache.spark.sql.{Row, QueryTest}
-import org.apache.spark.sql.expressions.Window
-import org.apache.spark.sql.functions._
-import org.apache.spark.sql.hive.test.TestHive.implicits._
-
-/**
- * Window expressions are tested extensively by the following test suites:
- * [[org.apache.spark.sql.hive.HiveDataFrameWindowSuite]]
- * [[org.apache.spark.sql.hive.execution.HiveWindowFunctionQueryWithoutCodeGenSuite]]
- * [[org.apache.spark.sql.hive.execution.HiveWindowFunctionQueryFileWithoutCodeGenSuite]]
- * However these suites do not cover all possible (i.e. more exotic) settings. This suite fill
- * this gap.
- *
- * TODO Move this class to the sql/core project when we move to Native Spark UDAFs.
- */
-class WindowSuite extends QueryTest {
-
-  test("reverse sliding range frame") {
-    val df = Seq(
-      (1, "Thin", "Cell Phone", 6000),
-      (2, "Normal", "Tablet", 1500),
-      (3, "Mini", "Tablet", 5500),
-      (4, "Ultra thin", "Cell Phone", 5500),
-      (5, "Very thin", "Cell Phone", 6000),
-      (6, "Big", "Tablet", 2500),
-      (7, "Bendable", "Cell Phone", 3000),
-      (8, "Foldable", "Cell Phone", 3000),
-      (9, "Pro", "Tablet", 4500),
-      (10, "Pro2", "Tablet", 6500)).
-      toDF("id", "product", "category", "revenue")
-    val window = Window.
-      partitionBy($"category").
-      orderBy($"revenue".desc).
-      rangeBetween(-2000L, 1000L)
-    checkAnswer(
-      df.select(
-        $"id",
-        avg($"revenue").over(window).cast("int")),
-      Row(1, 5833) :: Row(2, 2000) :: Row(3, 5500) ::
-      Row(4, 5833) :: Row(5, 5833) :: Row(6, 2833) ::
-      Row(7, 3000) :: Row(8, 3000) :: Row(9, 5500) ::
-      Row(10, 6000) :: Nil)
-  }
-
-  // This is here to illustrate the fact that reverse order also reverses offsets.
-  test("reverse unbounded range frame") {
-    val df = Seq(1, 2, 4, 3, 2, 1).
-      map(Tuple1.apply).
-      toDF("value")
-    val window = Window.orderBy($"value".desc)
-    checkAnswer(
-      df.select(
-        $"value",
-         sum($"value").over(window.rangeBetween(Long.MinValue, 1)),
-         sum($"value").over(window.rangeBetween(1, Long.MaxValue))),
-      Row(1, 13, null) :: Row(2, 13, 2) :: Row(4, 7, 9) ::
-        Row(3, 11, 6) :: Row(2, 13, 2) :: Row(1, 13, null) :: Nil)
-
-  }
-}

From 163e3f1df94f6b7d3dadb46a87dbb3a2bade3f95 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 19 Jul 2015 16:48:47 -0700
Subject: [PATCH 253/549] [SPARK-8241][SQL] string function: concat_ws.

I also changed the semantics of concat w.r.t. null back to the same behavior as Hive.
That is to say, concat now returns null if any input is null.

Author: Reynold Xin <rxin@databricks.com>

Closes #7504 from rxin/concat_ws and squashes the following commits:

83fd950 [Reynold Xin] Fixed type casting.
3ae85f7 [Reynold Xin] Write null better.
cdc7be6 [Reynold Xin] Added code generation for pure string mode.
a61c4e4 [Reynold Xin] Updated comments.
2d51406 [Reynold Xin] [SPARK-8241][SQL] string function: concat_ws.
---
 .../catalyst/analysis/FunctionRegistry.scala  | 11 ++-
 .../expressions/stringOperations.scala        | 72 ++++++++++++++++---
 .../org/apache/spark/sql/types/DataType.scala |  2 +-
 .../analysis/HiveTypeCoercionSuite.scala      | 11 ++-
 .../expressions/StringExpressionsSuite.scala  | 31 +++++++-
 .../org/apache/spark/sql/functions.scala      | 24 +++++++
 .../spark/sql/StringFunctionsSuite.scala      | 19 +++--
 .../execution/HiveCompatibilitySuite.scala    |  4 +-
 .../apache/spark/unsafe/types/UTF8String.java | 58 +++++++++++++--
 .../spark/unsafe/types/UTF8StringSuite.java   | 62 ++++++++++++----
 10 files changed, 256 insertions(+), 38 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 4b256adcc60c6..71e87b98d86fc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -153,6 +153,7 @@ object FunctionRegistry {
     expression[Ascii]("ascii"),
     expression[Base64]("base64"),
     expression[Concat]("concat"),
+    expression[ConcatWs]("concat_ws"),
     expression[Encode]("encode"),
     expression[Decode]("decode"),
     expression[FormatNumber]("format_number"),
@@ -211,7 +212,10 @@ object FunctionRegistry {
     val builder = (expressions: Seq[Expression]) => {
       if (varargCtor.isDefined) {
         // If there is an apply method that accepts Seq[Expression], use that one.
-        varargCtor.get.newInstance(expressions).asInstanceOf[Expression]
+        Try(varargCtor.get.newInstance(expressions).asInstanceOf[Expression]) match {
+          case Success(e) => e
+          case Failure(e) => throw new AnalysisException(e.getMessage)
+        }
       } else {
         // Otherwise, find an ctor method that matches the number of arguments, and use that.
         val params = Seq.fill(expressions.size)(classOf[Expression])
@@ -221,7 +225,10 @@ object FunctionRegistry {
           case Failure(e) =>
             throw new AnalysisException(s"Invalid number of arguments for function $name")
         }
-        f.newInstance(expressions : _*).asInstanceOf[Expression]
+        Try(f.newInstance(expressions : _*).asInstanceOf[Expression]) match {
+          case Success(e) => e
+          case Failure(e) => throw new AnalysisException(e.getMessage)
+        }
       }
     }
     (name, builder)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 560b1bc2d889f..5f8ac716f79a1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -34,19 +34,14 @@ import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * An expression that concatenates multiple input strings into a single string.
- * Input expressions that are evaluated to nulls are skipped.
- *
- * For example, `concat("a", null, "b")` is evaluated to `"ab"`.
- *
- * Note that this is different from Hive since Hive outputs null if any input is null.
- * We never output null.
+ * If any input is null, concat returns null.
  */
 case class Concat(children: Seq[Expression]) extends Expression with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringType)
   override def dataType: DataType = StringType
 
-  override def nullable: Boolean = false
+  override def nullable: Boolean = children.exists(_.nullable)
   override def foldable: Boolean = children.forall(_.foldable)
 
   override def eval(input: InternalRow): Any = {
@@ -56,15 +51,76 @@ case class Concat(children: Seq[Expression]) extends Expression with ImplicitCas
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val evals = children.map(_.gen(ctx))
-    val inputs = evals.map { eval => s"${eval.isNull} ? null : ${eval.primitive}" }.mkString(", ")
+    val inputs = evals.map { eval =>
+      s"${eval.isNull} ? (UTF8String)null : ${eval.primitive}"
+    }.mkString(", ")
     evals.map(_.code).mkString("\n") + s"""
       boolean ${ev.isNull} = false;
       UTF8String ${ev.primitive} = UTF8String.concat($inputs);
+      if (${ev.primitive} == null) {
+        ${ev.isNull} = true;
+      }
     """
   }
 }
 
 
+/**
+ * An expression that concatenates multiple input strings or array of strings into a single string,
+ * using a given separator (the first child).
+ *
+ * Returns null if the separator is null. Otherwise, concat_ws skips all null values.
+ */
+case class ConcatWs(children: Seq[Expression])
+  extends Expression with ImplicitCastInputTypes with CodegenFallback {
+
+  require(children.nonEmpty, s"$prettyName requires at least one argument.")
+
+  override def prettyName: String = "concat_ws"
+
+  /** The 1st child (separator) is str, and rest are either str or array of str. */
+  override def inputTypes: Seq[AbstractDataType] = {
+    val arrayOrStr = TypeCollection(ArrayType(StringType), StringType)
+    StringType +: Seq.fill(children.size - 1)(arrayOrStr)
+  }
+
+  override def dataType: DataType = StringType
+
+  override def nullable: Boolean = children.head.nullable
+  override def foldable: Boolean = children.forall(_.foldable)
+
+  override def eval(input: InternalRow): Any = {
+    val flatInputs = children.flatMap { child =>
+      child.eval(input) match {
+        case s: UTF8String => Iterator(s)
+        case arr: Seq[_] => arr.asInstanceOf[Seq[UTF8String]]
+        case null => Iterator(null.asInstanceOf[UTF8String])
+      }
+    }
+    UTF8String.concatWs(flatInputs.head, flatInputs.tail : _*)
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    if (children.forall(_.dataType == StringType)) {
+      // All children are strings. In that case we can construct a fixed size array.
+      val evals = children.map(_.gen(ctx))
+
+      val inputs = evals.map { eval =>
+        s"${eval.isNull} ? (UTF8String)null : ${eval.primitive}"
+      }.mkString(", ")
+
+      evals.map(_.code).mkString("\n") + s"""
+        UTF8String ${ev.primitive} = UTF8String.concatWs($inputs);
+        boolean ${ev.isNull} = ${ev.primitive} == null;
+      """
+    } else {
+      // Contains a mix of strings and array<string>s. Fall back to interpreted mode for now.
+      super.genCode(ctx, ev)
+    }
+  }
+}
+
+
 trait StringRegexExpression extends ImplicitCastInputTypes {
   self: BinaryExpression =>
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 2d133eea19fe0..e98fd2583b931 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -79,7 +79,7 @@ abstract class DataType extends AbstractDataType {
 
   override private[sql] def defaultConcreteType: DataType = this
 
-  override private[sql] def acceptsType(other: DataType): Boolean = this == other
+  override private[sql] def acceptsType(other: DataType): Boolean = sameType(other)
 }
 
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index f9442bccc4a7a..7ee2333a81dfe 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -37,7 +37,6 @@ class HiveTypeCoercionSuite extends PlanTest {
     shouldCast(NullType, IntegerType, IntegerType)
     shouldCast(NullType, DecimalType, DecimalType.Unlimited)
 
-    // TODO: write the entire implicit cast table out for test cases.
     shouldCast(ByteType, IntegerType, IntegerType)
     shouldCast(IntegerType, IntegerType, IntegerType)
     shouldCast(IntegerType, LongType, LongType)
@@ -86,6 +85,16 @@ class HiveTypeCoercionSuite extends PlanTest {
       DecimalType.Unlimited, DecimalType(10, 2)).foreach { tpe =>
       shouldCast(tpe, NumericType, tpe)
     }
+
+    shouldCast(
+      ArrayType(StringType, false),
+      TypeCollection(ArrayType(StringType), StringType),
+      ArrayType(StringType, false))
+
+    shouldCast(
+      ArrayType(StringType, true),
+      TypeCollection(ArrayType(StringType), StringType),
+      ArrayType(StringType, true))
   }
 
   test("ineligible implicit type cast") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 0ed567a90dd1f..96f433be8b065 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -26,7 +26,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("concat") {
     def testConcat(inputs: String*): Unit = {
-      val expected = inputs.filter(_ != null).mkString
+      val expected = if (inputs.contains(null)) null else inputs.mkString
       checkEvaluation(Concat(inputs.map(Literal.create(_, StringType))), expected, EmptyRow)
     }
 
@@ -46,6 +46,35 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // scalastyle:on
   }
 
+  test("concat_ws") {
+    def testConcatWs(expected: String, sep: String, inputs: Any*): Unit = {
+      val inputExprs = inputs.map {
+        case s: Seq[_] => Literal.create(s, ArrayType(StringType))
+        case null => Literal.create(null, StringType)
+        case s: String => Literal.create(s, StringType)
+      }
+      val sepExpr = Literal.create(sep, StringType)
+      checkEvaluation(ConcatWs(sepExpr +: inputExprs), expected, EmptyRow)
+    }
+
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
+    testConcatWs(null, null)
+    testConcatWs(null, null, "a", "b")
+    testConcatWs("", "")
+    testConcatWs("ab", "哈哈", "ab")
+    testConcatWs("a哈哈b", "哈哈", "a", "b")
+    testConcatWs("a哈哈b", "哈哈", "a", null, "b")
+    testConcatWs("a哈哈b哈哈c", "哈哈", null, "a", null, "b", "c")
+
+    testConcatWs("ab", "哈哈", Seq("ab"))
+    testConcatWs("a哈哈b", "哈哈", Seq("a", "b"))
+    testConcatWs("a哈哈b哈哈c哈哈d", "哈哈", Seq("a", null, "b"), null, "c", Seq(null, "d"))
+    testConcatWs("a哈哈b哈哈c", "哈哈", Seq("a", null, "b"), null, "c", Seq.empty[String])
+    testConcatWs("a哈哈b哈哈c", "哈哈", Seq("a", null, "b"), null, "c", Seq[String](null))
+    // scalastyle:on
+  }
+
   test("StringComparison") {
     val row = create_row("abc", null)
     val c1 = 'a.string.at(0)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index f67c89437bb4a..b5140dca0487f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1732,6 +1732,30 @@ object functions {
     concat((columnName +: columnNames).map(Column.apply): _*)
   }
 
+  /**
+   * Concatenates input strings together into a single string, using the given separator.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def concat_ws(sep: String, exprs: Column*): Column = {
+    ConcatWs(Literal.create(sep, StringType) +: exprs.map(_.expr))
+  }
+
+  /**
+   * Concatenates input strings together into a single string, using the given separator.
+   *
+   * This is the variant of concat_ws that takes in the column names.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def concat_ws(sep: String, columnName: String, columnNames: String*): Column = {
+    concat_ws(sep, (columnName +: columnNames).map(Column.apply) : _*)
+  }
+
   /**
    * Computes the length of a given string / binary value.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 4eff33ed45042..fe4de8d8b855f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -30,14 +30,25 @@ class StringFunctionsSuite extends QueryTest {
     val df = Seq[(String, String, String)](("a", "b", null)).toDF("a", "b", "c")
 
     checkAnswer(
-      df.select(concat($"a", $"b", $"c")),
-      Row("ab"))
+      df.select(concat($"a", $"b"), concat($"a", $"b", $"c")),
+      Row("ab", null))
 
     checkAnswer(
-      df.selectExpr("concat(a, b, c)"),
-      Row("ab"))
+      df.selectExpr("concat(a, b)", "concat(a, b, c)"),
+      Row("ab", null))
   }
 
+  test("string concat_ws") {
+    val df = Seq[(String, String, String)](("a", "b", null)).toDF("a", "b", "c")
+
+    checkAnswer(
+      df.select(concat_ws("||", $"a", $"b", $"c")),
+      Row("a||b"))
+
+    checkAnswer(
+      df.selectExpr("concat_ws('||', a, b, c)"),
+      Row("a||b"))
+  }
 
   test("string Levenshtein distance") {
     val df = Seq(("kitten", "sitting"), ("frog", "fog")).toDF("l", "r")
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 2689d904d6541..b12b3838e615c 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -263,9 +263,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "timestamp_2",
     "timestamp_udf",
 
-    // Hive outputs NULL if any concat input has null. We never output null for concat.
-    "udf_concat",
-
     // Unlike Hive, we do support log base in (0, 1.0], therefore disable this
     "udf7"
   )
@@ -856,6 +853,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_case",
     "udf_ceil",
     "udf_ceiling",
+    "udf_concat",
     "udf_concat_insert1",
     "udf_concat_insert2",
     "udf_concat_ws",
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 9723b6e0834b2..3eecd657e6ef9 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -397,26 +397,62 @@ public UTF8String lpad(int len, UTF8String pad) {
   }
 
   /**
-   * Concatenates input strings together into a single string. A null input is skipped.
-   * For example, concat("a", null, "c") would yield "ac".
+   * Concatenates input strings together into a single string. Returns null if any input is null.
    */
   public static UTF8String concat(UTF8String... inputs) {
-    if (inputs == null) {
-      return fromBytes(new byte[0]);
-    }
-
     // Compute the total length of the result.
     int totalLength = 0;
     for (int i = 0; i < inputs.length; i++) {
       if (inputs[i] != null) {
         totalLength += inputs[i].numBytes;
+      } else {
+        return null;
       }
     }
 
     // Allocate a new byte array, and copy the inputs one by one into it.
     final byte[] result = new byte[totalLength];
     int offset = 0;
+    for (int i = 0; i < inputs.length; i++) {
+      int len = inputs[i].numBytes;
+      PlatformDependent.copyMemory(
+        inputs[i].base, inputs[i].offset,
+        result, PlatformDependent.BYTE_ARRAY_OFFSET + offset,
+        len);
+      offset += len;
+    }
+    return fromBytes(result);
+  }
+
+  /**
+   * Concatenates input strings together into a single string using the separator.
+   * A null input is skipped. For example, concat(",", "a", null, "c") would yield "a,c".
+   */
+  public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
+    if (separator == null) {
+      return null;
+    }
+
+    int numInputBytes = 0;  // total number of bytes from the inputs
+    int numInputs = 0;      // number of non-null inputs
     for (int i = 0; i < inputs.length; i++) {
+      if (inputs[i] != null) {
+        numInputBytes += inputs[i].numBytes;
+        numInputs++;
+      }
+    }
+
+    if (numInputs == 0) {
+      // Return an empty string if there is no input, or all the inputs are null.
+      return fromBytes(new byte[0]);
+    }
+
+    // Allocate a new byte array, and copy the inputs one by one into it.
+    // The size of the new array is the size of all inputs, plus the separators.
+    final byte[] result = new byte[numInputBytes + (numInputs - 1) * separator.numBytes];
+    int offset = 0;
+
+    for (int i = 0, j = 0; i < inputs.length; i++) {
       if (inputs[i] != null) {
         int len = inputs[i].numBytes;
         PlatformDependent.copyMemory(
@@ -424,6 +460,16 @@ public static UTF8String concat(UTF8String... inputs) {
           result, PlatformDependent.BYTE_ARRAY_OFFSET + offset,
           len);
         offset += len;
+
+        j++;
+        // Add separator if this is not the last input.
+        if (j < numInputs) {
+          PlatformDependent.copyMemory(
+            separator.base, separator.offset,
+            result, PlatformDependent.BYTE_ARRAY_OFFSET + offset,
+            separator.numBytes);
+          offset += separator.numBytes;
+        }
       }
     }
     return fromBytes(result);
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 0db7522b50c1a..7d0c49e2fb84c 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -88,16 +88,50 @@ public void upperAndLower() {
 
   @Test
   public void concatTest() {
-    assertEquals(concat(), fromString(""));
-    assertEquals(concat(null), fromString(""));
-    assertEquals(concat(fromString("")), fromString(""));
-    assertEquals(concat(fromString("ab")), fromString("ab"));
-    assertEquals(concat(fromString("a"), fromString("b")), fromString("ab"));
-    assertEquals(concat(fromString("a"), fromString("b"), fromString("c")), fromString("abc"));
-    assertEquals(concat(fromString("a"), null, fromString("c")), fromString("ac"));
-    assertEquals(concat(fromString("a"), null, null), fromString("a"));
-    assertEquals(concat(null, null, null), fromString(""));
-    assertEquals(concat(fromString("数据"), fromString("砖头")), fromString("数据砖头"));
+    assertEquals(fromString(""), concat());
+    assertEquals(null, concat((UTF8String) null));
+    assertEquals(fromString(""), concat(fromString("")));
+    assertEquals(fromString("ab"), concat(fromString("ab")));
+    assertEquals(fromString("ab"), concat(fromString("a"), fromString("b")));
+    assertEquals(fromString("abc"), concat(fromString("a"), fromString("b"), fromString("c")));
+    assertEquals(null, concat(fromString("a"), null, fromString("c")));
+    assertEquals(null, concat(fromString("a"), null, null));
+    assertEquals(null, concat(null, null, null));
+    assertEquals(fromString("数据砖头"), concat(fromString("数据"), fromString("砖头")));
+  }
+
+  @Test
+  public void concatWsTest() {
+    // Returns null if the separator is null
+    assertEquals(null, concatWs(null, (UTF8String)null));
+    assertEquals(null, concatWs(null, fromString("a")));
+
+    // If separator is null, concatWs should skip all null inputs and never return null.
+    UTF8String sep = fromString("哈哈");
+    assertEquals(
+      fromString(""),
+      concatWs(sep, fromString("")));
+    assertEquals(
+      fromString("ab"),
+      concatWs(sep, fromString("ab")));
+    assertEquals(
+      fromString("a哈哈b"),
+      concatWs(sep, fromString("a"), fromString("b")));
+    assertEquals(
+      fromString("a哈哈b哈哈c"),
+      concatWs(sep, fromString("a"), fromString("b"), fromString("c")));
+    assertEquals(
+      fromString("a哈哈c"),
+      concatWs(sep, fromString("a"), null, fromString("c")));
+    assertEquals(
+      fromString("a"),
+      concatWs(sep, fromString("a"), null, null));
+    assertEquals(
+      fromString(""),
+      concatWs(sep, null, null, null));
+    assertEquals(
+      fromString("数据哈哈砖头"),
+      concatWs(sep, fromString("数据"), fromString("砖头")));
   }
 
   @Test
@@ -215,14 +249,18 @@ public void pad() {
     assertEquals(fromString("??数据砖头"), fromString("数据砖头").lpad(6, fromString("????")));
     assertEquals(fromString("孙行数据砖头"), fromString("数据砖头").lpad(6, fromString("孙行者")));
     assertEquals(fromString("孙行者数据砖头"), fromString("数据砖头").lpad(7, fromString("孙行者")));
-    assertEquals(fromString("孙行者孙行者孙行数据砖头"), fromString("数据砖头").lpad(12, fromString("孙行者")));
+    assertEquals(
+      fromString("孙行者孙行者孙行数据砖头"),
+      fromString("数据砖头").lpad(12, fromString("孙行者")));
 
     assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, fromString("????")));
     assertEquals(fromString("数据砖头?"), fromString("数据砖头").rpad(5, fromString("????")));
     assertEquals(fromString("数据砖头??"), fromString("数据砖头").rpad(6, fromString("????")));
     assertEquals(fromString("数据砖头孙行"), fromString("数据砖头").rpad(6, fromString("孙行者")));
     assertEquals(fromString("数据砖头孙行者"), fromString("数据砖头").rpad(7, fromString("孙行者")));
-    assertEquals(fromString("数据砖头孙行者孙行者孙行"), fromString("数据砖头").rpad(12, fromString("孙行者")));
+    assertEquals(
+      fromString("数据砖头孙行者孙行者孙行"),
+      fromString("数据砖头").rpad(12, fromString("孙行者")));
   }
   
   @Test

From 93eb2acfb287807355ba5d77989d239fdd6e2c30 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Sun, 19 Jul 2015 20:34:30 -0700
Subject: [PATCH 254/549] [SPARK-9030] [STREAMING] [HOTFIX] Make sure that no
 attempts to create Kinesis streams is made when no enabled

Problem: Even when the environment variable to enable tests are disabled, the `beforeAll` of the KinesisStreamSuite attempted to find AWS credentials to create Kinesis stream, and failed.

Solution: Made sure all accesses to KinesisTestUtils, that created streams, is under `testOrIgnore`

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #7519 from tdas/kinesis-tests and squashes the following commits:

64d6d06 [Tathagata Das] Removed empty lines.
7c18473 [Tathagata Das] Putting all access to KinesisTestUtils inside testOrIgnore
---
 .../kinesis/KinesisStreamSuite.scala          | 57 +++++++++----------
 1 file changed, 26 insertions(+), 31 deletions(-)

diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
index d3dd541fe4371..50f71413abf37 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
@@ -33,8 +33,6 @@ import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 class KinesisStreamSuite extends SparkFunSuite with KinesisSuiteHelper
   with Eventually with BeforeAndAfter with BeforeAndAfterAll {
 
-  private val kinesisTestUtils = new KinesisTestUtils()
-
   // This is the name that KCL uses to save metadata to DynamoDB
   private val kinesisAppName = s"KinesisStreamSuite-${math.abs(Random.nextLong())}"
 
@@ -42,7 +40,6 @@ class KinesisStreamSuite extends SparkFunSuite with KinesisSuiteHelper
   private var sc: SparkContext = _
 
   override def beforeAll(): Unit = {
-    kinesisTestUtils.createStream()
     val conf = new SparkConf()
       .setMaster("local[4]")
       .setAppName("KinesisStreamSuite") // Setting Spark app name to Kinesis app name
@@ -53,15 +50,6 @@ class KinesisStreamSuite extends SparkFunSuite with KinesisSuiteHelper
     sc.stop()
     // Delete the Kinesis stream as well as the DynamoDB table generated by
     // Kinesis Client Library when consuming the stream
-    kinesisTestUtils.deleteStream()
-    kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
-  }
-
-  before {
-    // Delete the DynamoDB table generated by Kinesis Client Library when
-    // consuming from the stream, so that each unit test can start from
-    // scratch without prior history of data consumption
-    kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
   }
 
   after {
@@ -96,25 +84,32 @@ class KinesisStreamSuite extends SparkFunSuite with KinesisSuiteHelper
    * and you have to set the system environment variable RUN_KINESIS_TESTS=1 .
    */
   testOrIgnore("basic operation") {
-    ssc = new StreamingContext(sc, Seconds(1))
-    val aWSCredentials = KinesisTestUtils.getAWSCredentials()
-    val stream = KinesisUtils.createStream(ssc, kinesisAppName, kinesisTestUtils.streamName,
-      kinesisTestUtils.endpointUrl, kinesisTestUtils.regionName, InitialPositionInStream.LATEST,
-      Seconds(10), StorageLevel.MEMORY_ONLY,
-      aWSCredentials.getAWSAccessKeyId, aWSCredentials.getAWSSecretKey)
-
-    val collected = new mutable.HashSet[Int] with mutable.SynchronizedSet[Int]
-    stream.map { bytes => new String(bytes).toInt }.foreachRDD { rdd =>
-      collected ++= rdd.collect()
-      logInfo("Collected = " + rdd.collect().toSeq.mkString(", "))
-    }
-    ssc.start()
-
-    val testData = 1 to 10
-    eventually(timeout(120 seconds), interval(10 second)) {
-      kinesisTestUtils.pushData(testData)
-      assert(collected === testData.toSet, "\nData received does not match data sent")
+    val kinesisTestUtils = new KinesisTestUtils()
+    try {
+      kinesisTestUtils.createStream()
+      ssc = new StreamingContext(sc, Seconds(1))
+      val aWSCredentials = KinesisTestUtils.getAWSCredentials()
+      val stream = KinesisUtils.createStream(ssc, kinesisAppName, kinesisTestUtils.streamName,
+        kinesisTestUtils.endpointUrl, kinesisTestUtils.regionName, InitialPositionInStream.LATEST,
+        Seconds(10), StorageLevel.MEMORY_ONLY,
+        aWSCredentials.getAWSAccessKeyId, aWSCredentials.getAWSSecretKey)
+
+      val collected = new mutable.HashSet[Int] with mutable.SynchronizedSet[Int]
+      stream.map { bytes => new String(bytes).toInt }.foreachRDD { rdd =>
+        collected ++= rdd.collect()
+        logInfo("Collected = " + rdd.collect().toSeq.mkString(", "))
+      }
+      ssc.start()
+
+      val testData = 1 to 10
+      eventually(timeout(120 seconds), interval(10 second)) {
+        kinesisTestUtils.pushData(testData)
+        assert(collected === testData.toSet, "\nData received does not match data sent")
+      }
+      ssc.stop()
+    } finally {
+      kinesisTestUtils.deleteStream()
+      kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
     }
-    ssc.stop()
   }
 }

From d743bec645fd2a65bd488d2d660b3aa2135b4da6 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Sun, 19 Jul 2015 20:53:18 -0700
Subject: [PATCH 255/549] [SPARK-9172][SQL] Make DecimalPrecision support for
 Intersect and Except

JIRA: https://issues.apache.org/jira/browse/SPARK-9172

Simply make `DecimalPrecision` support for `Intersect` and `Except` in addition to `Union`.

Besides, add unit test for `DecimalPrecision` as well.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7511 from viirya/more_decimalprecieion and squashes the following commits:

4d29d10 [Liang-Chi Hsieh] Fix code comment.
9fb0d49 [Liang-Chi Hsieh] Make DecimalPrecision support for Intersect and Except.
---
 .../catalyst/analysis/HiveTypeCoercion.scala  | 86 +++++++++++--------
 .../analysis/HiveTypeCoercionSuite.scala      | 55 ++++++++++++
 2 files changed, 104 insertions(+), 37 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index ff20835e82ba7..e214545726249 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -335,7 +335,7 @@ object HiveTypeCoercion {
    * - INT gets turned into DECIMAL(10, 0)
    * - LONG gets turned into DECIMAL(20, 0)
    * - FLOAT and DOUBLE
-   *   1. Union operation:
+   *   1. Union, Intersect and Except operations:
    *      FLOAT gets turned into DECIMAL(7, 7), DOUBLE gets turned into DECIMAL(15, 15) (this is the
    *      same as Hive)
    *   2. Other operation:
@@ -362,47 +362,59 @@ object HiveTypeCoercion {
       DoubleType -> DecimalType(15, 15)
     )
 
-    def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-      // fix decimal precision for union
-      case u @ Union(left, right) if u.childrenResolved && !u.resolved =>
-        val castedInput = left.output.zip(right.output).map {
-          case (lhs, rhs) if lhs.dataType != rhs.dataType =>
-            (lhs.dataType, rhs.dataType) match {
-              case (DecimalType.Fixed(p1, s1), DecimalType.Fixed(p2, s2)) =>
-                // Union decimals with precision/scale p1/s2 and p2/s2  will be promoted to
-                // DecimalType(max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2))
-                val fixedType = DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2), max(s1, s2))
-                (Alias(Cast(lhs, fixedType), lhs.name)(), Alias(Cast(rhs, fixedType), rhs.name)())
-              case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) =>
-                (Alias(Cast(lhs, intTypeToFixed(t)), lhs.name)(), rhs)
-              case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) =>
-                (lhs, Alias(Cast(rhs, intTypeToFixed(t)), rhs.name)())
-              case (t, DecimalType.Fixed(p, s)) if floatTypeToFixed.contains(t) =>
-                (Alias(Cast(lhs, floatTypeToFixed(t)), lhs.name)(), rhs)
-              case (DecimalType.Fixed(p, s), t) if floatTypeToFixed.contains(t) =>
-                (lhs, Alias(Cast(rhs, floatTypeToFixed(t)), rhs.name)())
-              case _ => (lhs, rhs)
-            }
-          case other => other
-        }
+    private def castDecimalPrecision(
+        left: LogicalPlan,
+        right: LogicalPlan): (LogicalPlan, LogicalPlan) = {
+      val castedInput = left.output.zip(right.output).map {
+        case (lhs, rhs) if lhs.dataType != rhs.dataType =>
+          (lhs.dataType, rhs.dataType) match {
+            case (DecimalType.Fixed(p1, s1), DecimalType.Fixed(p2, s2)) =>
+              // Decimals with precision/scale p1/s2 and p2/s2  will be promoted to
+              // DecimalType(max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2))
+              val fixedType = DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2), max(s1, s2))
+              (Alias(Cast(lhs, fixedType), lhs.name)(), Alias(Cast(rhs, fixedType), rhs.name)())
+            case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) =>
+              (Alias(Cast(lhs, intTypeToFixed(t)), lhs.name)(), rhs)
+            case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) =>
+              (lhs, Alias(Cast(rhs, intTypeToFixed(t)), rhs.name)())
+            case (t, DecimalType.Fixed(p, s)) if floatTypeToFixed.contains(t) =>
+              (Alias(Cast(lhs, floatTypeToFixed(t)), lhs.name)(), rhs)
+            case (DecimalType.Fixed(p, s), t) if floatTypeToFixed.contains(t) =>
+              (lhs, Alias(Cast(rhs, floatTypeToFixed(t)), rhs.name)())
+            case _ => (lhs, rhs)
+          }
+        case other => other
+      }
 
-        val (castedLeft, castedRight) = castedInput.unzip
+      val (castedLeft, castedRight) = castedInput.unzip
 
-        val newLeft =
-          if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) {
-            Project(castedLeft, left)
-          } else {
-            left
-          }
+      val newLeft =
+        if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) {
+          Project(castedLeft, left)
+        } else {
+          left
+        }
 
-        val newRight =
-          if (castedRight.map(_.dataType) != right.output.map(_.dataType)) {
-            Project(castedRight, right)
-          } else {
-            right
-          }
+      val newRight =
+        if (castedRight.map(_.dataType) != right.output.map(_.dataType)) {
+          Project(castedRight, right)
+        } else {
+          right
+        }
+      (newLeft, newRight)
+    }
 
+    def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+      // fix decimal precision for union, intersect and except
+      case u @ Union(left, right) if u.childrenResolved && !u.resolved =>
+        val (newLeft, newRight) = castDecimalPrecision(left, right)
         Union(newLeft, newRight)
+      case i @ Intersect(left, right) if i.childrenResolved && !i.resolved =>
+        val (newLeft, newRight) = castDecimalPrecision(left, right)
+        Intersect(newLeft, newRight)
+      case e @ Except(left, right) if e.childrenResolved && !e.resolved =>
+        val (newLeft, newRight) = castDecimalPrecision(left, right)
+        Except(newLeft, newRight)
 
       // fix decimal precision for expressions
       case q => q.transformExpressions {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 7ee2333a81dfe..835220c563f41 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -346,6 +346,61 @@ class HiveTypeCoercionSuite extends PlanTest {
     checkOutput(r3.right, expectedTypes)
   }
 
+  test("Transform Decimal precision/scale for union except and intersect") {
+    def checkOutput(logical: LogicalPlan, expectTypes: Seq[DataType]): Unit = {
+      logical.output.zip(expectTypes).foreach { case (attr, dt) =>
+        assert(attr.dataType === dt)
+      }
+    }
+
+    val dp = HiveTypeCoercion.DecimalPrecision
+
+    val left1 = LocalRelation(
+      AttributeReference("l", DecimalType(10, 8))())
+    val right1 = LocalRelation(
+      AttributeReference("r", DecimalType(5, 5))())
+    val expectedType1 = Seq(DecimalType(math.max(8, 5) + math.max(10 - 8, 5 - 5), math.max(8, 5)))
+
+    val r1 = dp(Union(left1, right1)).asInstanceOf[Union]
+    val r2 = dp(Except(left1, right1)).asInstanceOf[Except]
+    val r3 = dp(Intersect(left1, right1)).asInstanceOf[Intersect]
+
+    checkOutput(r1.left, expectedType1)
+    checkOutput(r1.right, expectedType1)
+    checkOutput(r2.left, expectedType1)
+    checkOutput(r2.right, expectedType1)
+    checkOutput(r3.left, expectedType1)
+    checkOutput(r3.right, expectedType1)
+
+    val plan1 = LocalRelation(
+      AttributeReference("l", DecimalType(10, 10))())
+
+    val rightTypes = Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType)
+    val expectedTypes = Seq(DecimalType(3, 0), DecimalType(5, 0), DecimalType(10, 0),
+      DecimalType(20, 0), DecimalType(7, 7), DecimalType(15, 15))
+
+    rightTypes.zip(expectedTypes).map { case (rType, expectedType) =>
+      val plan2 = LocalRelation(
+        AttributeReference("r", rType)())
+
+      val r1 = dp(Union(plan1, plan2)).asInstanceOf[Union]
+      val r2 = dp(Except(plan1, plan2)).asInstanceOf[Except]
+      val r3 = dp(Intersect(plan1, plan2)).asInstanceOf[Intersect]
+
+      checkOutput(r1.right, Seq(expectedType))
+      checkOutput(r2.right, Seq(expectedType))
+      checkOutput(r3.right, Seq(expectedType))
+
+      val r4 = dp(Union(plan2, plan1)).asInstanceOf[Union]
+      val r5 = dp(Except(plan2, plan1)).asInstanceOf[Except]
+      val r6 = dp(Intersect(plan2, plan1)).asInstanceOf[Intersect]
+
+      checkOutput(r4.left, Seq(expectedType))
+      checkOutput(r5.left, Seq(expectedType))
+      checkOutput(r6.left, Seq(expectedType))
+    }
+  }
+
   /**
    * There are rules that need to not fire before child expressions get resolved.
    * We use this test to make sure those rules do not fire early.

From 930253e0766a7585347edfb73ed11b1bf78143fe Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sun, 19 Jul 2015 22:42:44 -0700
Subject: [PATCH 256/549] [SPARK-9185][SQL] improve code gen for mutable states
 to support complex initialization

Sometimes we need more than one step to initialize the mutable states in code gen like https://github.com/apache/spark/pull/7516

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7521 from cloud-fan/init and squashes the following commits:

2106445 [Wenchen Fan] improve code gen for mutable states
---
 .../expressions/codegen/CodeGenerator.scala   | 20 +++++++++++++++++--
 .../codegen/GenerateMutableProjection.scala   | 11 ++++------
 .../codegen/GenerateOrdering.scala            |  8 +++-----
 .../codegen/GeneratePredicate.scala           |  6 ++----
 .../codegen/GenerateProjection.scala          |  9 +++------
 .../codegen/GenerateUnsafeProjection.scala    | 11 ++++------
 .../sql/catalyst/expressions/random.scala     |  8 ++++----
 .../MonotonicallyIncreasingID.scala           |  4 ++--
 .../expressions/SparkPartitionID.scala        |  3 ++-
 9 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 7c388bc346306..b2468b6a181d7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -60,13 +60,19 @@ class CodeGenContext {
   /**
    * Holding expressions' mutable states like `MonotonicallyIncreasingID.count` as a
    * 3-tuple: java type, variable name, code to init it.
+   * As an example, ("int", "count", "count = 0;") will produce code:
+   * {{{
+   *   private int count;
+   *   count = 0;
+   * }}}
+   *
    * They will be kept as member variables in generated classes like `SpecificProjection`.
    */
   val mutableStates: mutable.ArrayBuffer[(String, String, String)] =
     mutable.ArrayBuffer.empty[(String, String, String)]
 
-  def addMutableState(javaType: String, variableName: String, initialValue: String): Unit = {
-    mutableStates += ((javaType, variableName, initialValue))
+  def addMutableState(javaType: String, variableName: String, initialCode: String): Unit = {
+    mutableStates += ((javaType, variableName, initialCode))
   }
 
   final val intervalType: String = classOf[Interval].getName
@@ -234,6 +240,16 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
   protected val mutableRowType: String = classOf[MutableRow].getName
   protected val genericMutableRowType: String = classOf[GenericMutableRow].getName
 
+  protected def declareMutableStates(ctx: CodeGenContext) = {
+    ctx.mutableStates.map { case (javaType, variableName, _) =>
+      s"private $javaType $variableName;"
+    }.mkString("\n      ")
+  }
+
+  protected def initMutableStates(ctx: CodeGenContext) = {
+    ctx.mutableStates.map(_._3).mkString("\n        ")
+  }
+
   /**
    * Generates a class for a given input expression.  Called when there is not cached code
    * already available.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index b82bd6814b487..03b4b3c216f49 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -78,10 +78,6 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
       }
     }
 
-    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
-      s"private $javaType $variableName = $initialValue;"
-    }.mkString("\n      ")
-
     val code = s"""
       public Object generate($exprType[] expr) {
         return new SpecificProjection(expr);
@@ -89,13 +85,14 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
 
       class SpecificProjection extends ${classOf[BaseMutableProjection].getName} {
 
-        private $exprType[] expressions = null;
-        private $mutableRowType mutableRow = null;
-        $mutableStates
+        private $exprType[] expressions;
+        private $mutableRowType mutableRow;
+        ${declareMutableStates(ctx)}
 
         public SpecificProjection($exprType[] expr) {
           expressions = expr;
           mutableRow = new $genericMutableRowType(${expressions.size});
+          ${initMutableStates(ctx)}
         }
 
         public ${classOf[BaseMutableProjection].getName} target($mutableRowType row) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
index 856ff9f1f96f8..2e6f9e204d813 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
@@ -84,9 +84,6 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalR
           }
       """
     }.mkString("\n")
-    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
-      s"private $javaType $variableName = $initialValue;"
-    }.mkString("\n      ")
     val code = s"""
       public SpecificOrdering generate($exprType[] expr) {
         return new SpecificOrdering(expr);
@@ -94,11 +91,12 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalR
 
       class SpecificOrdering extends ${classOf[BaseOrdering].getName} {
 
-        private $exprType[] expressions = null;
-        $mutableStates
+        private $exprType[] expressions;
+        ${declareMutableStates(ctx)}
 
         public SpecificOrdering($exprType[] expr) {
           expressions = expr;
+          ${initMutableStates(ctx)}
         }
 
         @Override
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
index 9e5a745d512e9..1dda5992c3654 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
@@ -40,9 +40,6 @@ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Bool
   protected def create(predicate: Expression): ((InternalRow) => Boolean) = {
     val ctx = newCodeGenContext()
     val eval = predicate.gen(ctx)
-    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
-      s"private $javaType $variableName = $initialValue;"
-    }.mkString("\n      ")
     val code = s"""
       public SpecificPredicate generate($exprType[] expr) {
         return new SpecificPredicate(expr);
@@ -50,9 +47,10 @@ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Bool
 
       class SpecificPredicate extends ${classOf[Predicate].getName} {
         private final $exprType[] expressions;
-        $mutableStates
+        ${declareMutableStates(ctx)}
         public SpecificPredicate($exprType[] expr) {
           expressions = expr;
+          ${initMutableStates(ctx)}
         }
 
         @Override
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 8f9fcbf810554..405d6b0e3bc76 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -151,21 +151,18 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
         s"""if (!nullBits[$i]) arr[$i] = c$i;"""
     }.mkString("\n      ")
 
-    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
-      s"private $javaType $variableName = $initialValue;"
-    }.mkString("\n      ")
-
     val code = s"""
     public SpecificProjection generate($exprType[] expr) {
       return new SpecificProjection(expr);
     }
 
     class SpecificProjection extends ${classOf[BaseProjection].getName} {
-      private $exprType[] expressions = null;
-      $mutableStates
+      private $exprType[] expressions;
+      ${declareMutableStates(ctx)}
 
       public SpecificProjection($exprType[] expr) {
         expressions = expr;
+        ${initMutableStates(ctx)}
       }
 
       @Override
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index a81d545a8ec63..3a8e8302b24fd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -74,10 +74,6 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           }"""
     }.mkString("\n          ")
 
-    val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
-      s"private $javaType $variableName = $initialValue;"
-    }.mkString("\n      ")
-
     val code = s"""
     private $exprType[] expressions;
 
@@ -90,10 +86,11 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
 
       private UnsafeRow target = new UnsafeRow();
       private byte[] buffer = new byte[64];
+      ${declareMutableStates(ctx)}
 
-      $mutableStates
-
-      public SpecificProjection() {}
+      public SpecificProjection() {
+        ${initMutableStates(ctx)}
+      }
 
       // Scala.Function1 need this
       public Object apply(Object row) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index 65093dc72264b..822898e561cd6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -60,9 +60,9 @@ case class Rand(seed: Long) extends RDG(seed) {
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val rngTerm = ctx.freshName("rng")
-    val className = classOf[XORShiftRandom].getCanonicalName
+    val className = classOf[XORShiftRandom].getName
     ctx.addMutableState(className, rngTerm,
-      s"new $className($seed + org.apache.spark.TaskContext.getPartitionId())")
+      s"$rngTerm = new $className($seed + org.apache.spark.TaskContext.getPartitionId());")
     ev.isNull = "false"
     s"""
       final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextDouble();
@@ -83,9 +83,9 @@ case class Randn(seed: Long) extends RDG(seed) {
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val rngTerm = ctx.freshName("rng")
-    val className = classOf[XORShiftRandom].getCanonicalName
+    val className = classOf[XORShiftRandom].getName
     ctx.addMutableState(className, rngTerm,
-      s"new $className($seed + org.apache.spark.TaskContext.getPartitionId())")
+      s"$rngTerm = new $className($seed + org.apache.spark.TaskContext.getPartitionId());")
     ev.isNull = "false"
     s"""
       final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextGaussian();
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
index fec403fe2d348..4d8ed089731aa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
@@ -58,9 +58,9 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val countTerm = ctx.freshName("count")
     val partitionMaskTerm = ctx.freshName("partitionMask")
-    ctx.addMutableState(ctx.JAVA_LONG, countTerm, "0L")
+    ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;")
     ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm,
-      "((long) org.apache.spark.TaskContext.getPartitionId()) << 33")
+      s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;")
 
     ev.isNull = "false"
     s"""
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
index 7c790c549a5d8..43ffc9cc84738 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
@@ -41,7 +41,8 @@ private[sql] case object SparkPartitionID extends LeafExpression {
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val idTerm = ctx.freshName("partitionId")
-    ctx.addMutableState(ctx.JAVA_INT, idTerm, "org.apache.spark.TaskContext.getPartitionId()")
+    ctx.addMutableState(ctx.JAVA_INT, idTerm,
+      s"$idTerm = org.apache.spark.TaskContext.getPartitionId();")
     ev.isNull = "false"
     s"final ${ctx.javaType(dataType)} ${ev.primitive} = $idTerm;"
   }

From 5bdf16da90cf20f20e6fbb258ffd0888bf45e357 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 19 Jul 2015 22:45:56 -0700
Subject: [PATCH 257/549] Code review feedback for the previous patch.

---
 .../sql/catalyst/expressions/codegen/CodeGenerator.scala  | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index b2468b6a181d7..10f411ff7451a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -63,16 +63,20 @@ class CodeGenContext {
    * As an example, ("int", "count", "count = 0;") will produce code:
    * {{{
    *   private int count;
+   * }}}
+   * as a member variable, and add
+   * {{{
    *   count = 0;
    * }}}
+   * to the constructor.
    *
    * They will be kept as member variables in generated classes like `SpecificProjection`.
    */
   val mutableStates: mutable.ArrayBuffer[(String, String, String)] =
     mutable.ArrayBuffer.empty[(String, String, String)]
 
-  def addMutableState(javaType: String, variableName: String, initialCode: String): Unit = {
-    mutableStates += ((javaType, variableName, initialCode))
+  def addMutableState(javaType: String, variableName: String, initCode: String): Unit = {
+    mutableStates += ((javaType, variableName, initCode))
   }
 
   final val intervalType: String = classOf[Interval].getName

From 972d8900a1e2430d172968b11fdea14b289d7d4d Mon Sep 17 00:00:00 2001
From: Jacky Li <lee.unreal@gmail.com>
Date: Sun, 19 Jul 2015 23:19:17 -0700
Subject: [PATCH 258/549] [SQL][DOC] Minor document fix in
 HadoopFsRelationProvider

Catch this while reading the code

Author: Jacky Li <lee.unreal@gmail.com>
Author: Jacky Li <jackylk@users.noreply.github.com>

Closes #7524 from jackylk/patch-11 and squashes the following commits:

b679011 [Jacky Li] fix doc
e10e211 [Jacky Li] [SQL] Minor document fix in HadoopFsRelationProvider
---
 .../main/scala/org/apache/spark/sql/sources/interfaces.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index b13c5313b25c9..5d7cc2ff55af1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -75,7 +75,7 @@ trait RelationProvider {
  * A new instance of this class with be instantiated each time a DDL call is made.
  *
  * The difference between a [[RelationProvider]] and a [[SchemaRelationProvider]] is that
- * users need to provide a schema when using a SchemaRelationProvider.
+ * users need to provide a schema when using a [[SchemaRelationProvider]].
  * A relation provider can inherits both [[RelationProvider]] and [[SchemaRelationProvider]]
  * if it can support both schema inference and user-specified schemas.
  *
@@ -111,7 +111,7 @@ trait SchemaRelationProvider {
  *
  * The difference between a [[RelationProvider]] and a [[HadoopFsRelationProvider]] is
  * that users need to provide a schema and a (possibly empty) list of partition columns when
- * using a SchemaRelationProvider. A relation provider can inherits both [[RelationProvider]],
+ * using a [[HadoopFsRelationProvider]]. A relation provider can inherits both [[RelationProvider]],
  * and [[HadoopFsRelationProvider]] if it can support schema inference, user-specified
  * schemas, and accessing partitioned relations.
  *

From 79ec07290d0b4d16f1643af83824d926304c8f46 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Sun, 19 Jul 2015 23:41:28 -0700
Subject: [PATCH 259/549] [SPARK-9023] [SQL] Efficiency improvements for
 UnsafeRows in Exchange

This pull request aims to improve the performance of SQL's Exchange operator when shuffling UnsafeRows.  It also makes several general efficiency improvements to Exchange.

Key changes:

- When performing hash partitioning, the old Exchange projected the partitioning columns into a new row then passed a `(partitioningColumRow: InternalRow, row: InternalRow)` pair into the shuffle. This is very inefficient because it ends up redundantly serializing the partitioning columns only to immediately discard them after the shuffle.  After this patch's changes, Exchange now shuffles `(partitionId: Int, row: InternalRow)` pairs.  This still isn't optimal, since we're still shuffling extra data that we don't need, but it's significantly more efficient than the old implementation; in the future, we may be able to further optimize this once we implement a new shuffle write interface that accepts non-key-value-pair inputs.
- Exchange's `compute()` method has been significantly simplified; the new code has less duplication and thus is easier to understand.
- When the Exchange's input operator produces UnsafeRows, Exchange will use a specialized `UnsafeRowSerializer` to serialize these rows.  This serializer is significantly more efficient since it simply copies the UnsafeRow's underlying bytes.  Note that this approach does not work for UnsafeRows that use the ObjectPool mechanism; I did not add support for this because we are planning to remove ObjectPool in the next few weeks.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7456 from JoshRosen/unsafe-exchange and squashes the following commits:

7e75259 [Josh Rosen] Fix cast in SparkSqlSerializer2Suite
0082515 [Josh Rosen] Some additional comments + small cleanup to remove an unused parameter
a27cfc1 [Josh Rosen] Add missing newline
741973c [Josh Rosen] Add simple test of UnsafeRow shuffling in Exchange.
359c6a4 [Josh Rosen] Remove println() and add comments
93904e7 [Josh Rosen] Merge remote-tracking branch 'origin/master' into unsafe-exchange
8dd3ff2 [Josh Rosen] Exchange outputs UnsafeRows when its child outputs them
dd9c66d [Josh Rosen] Fix for copying logic
035af21 [Josh Rosen] Add logic for choosing when to use UnsafeRowSerializer
7876f31 [Josh Rosen] Merge remote-tracking branch 'origin/master' into unsafe-shuffle
cbea80b [Josh Rosen] Add UnsafeRowSerializer
0f2ac86 [Josh Rosen] Import ordering
3ca8515 [Josh Rosen] Big code simplification in Exchange
3526868 [Josh Rosen] Iniitial cut at removing shuffle on KV pairs
---
 .../apache/spark/sql/execution/Exchange.scala | 132 ++++++----------
 .../spark/sql/execution/ShuffledRowRDD.scala  |  80 ++++++++++
 .../sql/execution/SparkSqlSerializer2.scala   |  43 ++----
 .../sql/execution/UnsafeRowSerializer.scala   | 142 ++++++++++++++++++
 .../spark/sql/execution/basicOperators.scala  |   5 +-
 .../spark/sql/execution/ExchangeSuite.scala   |  32 ++++
 .../execution/SparkSqlSerializer2Suite.scala  |   4 +-
 .../execution/UnsafeRowSerializerSuite.scala  |  76 ++++++++++
 8 files changed, 398 insertions(+), 116 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index feea4f239c04d..2750053594f99 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.rdd.{RDD, ShuffledRDD}
+import org.apache.spark.rdd.RDD
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.shuffle.hash.HashShuffleManager
 import org.apache.spark.shuffle.sort.SortShuffleManager
@@ -29,7 +29,6 @@ import org.apache.spark.sql.catalyst.errors.attachTree
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.types.DataType
 import org.apache.spark.util.MutablePair
 import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkEnv}
 
@@ -44,6 +43,12 @@ case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends Una
 
   override def output: Seq[Attribute] = child.output
 
+  override def outputsUnsafeRows: Boolean = child.outputsUnsafeRows
+
+  override def canProcessSafeRows: Boolean = true
+
+  override def canProcessUnsafeRows: Boolean = true
+
   /**
    * Determines whether records must be defensively copied before being sent to the shuffle.
    * Several of Spark's shuffle components will buffer deserialized Java objects in memory. The
@@ -112,109 +117,70 @@ case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends Una
 
   @transient private lazy val sparkConf = child.sqlContext.sparkContext.getConf
 
-  private def getSerializer(
-      keySchema: Array[DataType],
-      valueSchema: Array[DataType],
-      numPartitions: Int): Serializer = {
+  private val serializer: Serializer = {
+    val rowDataTypes = child.output.map(_.dataType).toArray
     // It is true when there is no field that needs to be write out.
     // For now, we will not use SparkSqlSerializer2 when noField is true.
-    val noField =
-      (keySchema == null || keySchema.length == 0) &&
-      (valueSchema == null || valueSchema.length == 0)
+    val noField = rowDataTypes == null || rowDataTypes.length == 0
 
     val useSqlSerializer2 =
         child.sqlContext.conf.useSqlSerializer2 &&   // SparkSqlSerializer2 is enabled.
-        SparkSqlSerializer2.support(keySchema) &&    // The schema of key is supported.
-        SparkSqlSerializer2.support(valueSchema) &&  // The schema of value is supported.
+        SparkSqlSerializer2.support(rowDataTypes) &&  // The schema of row is supported.
         !noField
 
-    val serializer = if (useSqlSerializer2) {
+    if (child.outputsUnsafeRows) {
+      logInfo("Using UnsafeRowSerializer.")
+      new UnsafeRowSerializer(child.output.size)
+    } else if (useSqlSerializer2) {
       logInfo("Using SparkSqlSerializer2.")
-      new SparkSqlSerializer2(keySchema, valueSchema)
+      new SparkSqlSerializer2(rowDataTypes)
     } else {
       logInfo("Using SparkSqlSerializer.")
       new SparkSqlSerializer(sparkConf)
     }
-
-    serializer
   }
 
   protected override def doExecute(): RDD[InternalRow] = attachTree(this , "execute") {
-    newPartitioning match {
-      case HashPartitioning(expressions, numPartitions) =>
-        val keySchema = expressions.map(_.dataType).toArray
-        val valueSchema = child.output.map(_.dataType).toArray
-        val serializer = getSerializer(keySchema, valueSchema, numPartitions)
-        val part = new HashPartitioner(numPartitions)
-
-        val rdd = if (needToCopyObjectsBeforeShuffle(part, serializer)) {
-          child.execute().mapPartitions { iter =>
-            val hashExpressions = newMutableProjection(expressions, child.output)()
-            iter.map(r => (hashExpressions(r).copy(), r.copy()))
-          }
-        } else {
-          child.execute().mapPartitions { iter =>
-            val hashExpressions = newMutableProjection(expressions, child.output)()
-            val mutablePair = new MutablePair[InternalRow, InternalRow]()
-            iter.map(r => mutablePair.update(hashExpressions(r), r))
-          }
-        }
-        val shuffled = new ShuffledRDD[InternalRow, InternalRow, InternalRow](rdd, part)
-        shuffled.setSerializer(serializer)
-        shuffled.map(_._2)
-
+    val rdd = child.execute()
+    val part: Partitioner = newPartitioning match {
+      case HashPartitioning(expressions, numPartitions) => new HashPartitioner(numPartitions)
       case RangePartitioning(sortingExpressions, numPartitions) =>
-        val keySchema = child.output.map(_.dataType).toArray
-        val serializer = getSerializer(keySchema, null, numPartitions)
-
-        val childRdd = child.execute()
-        val part: Partitioner = {
-          // Internally, RangePartitioner runs a job on the RDD that samples keys to compute
-          // partition bounds. To get accurate samples, we need to copy the mutable keys.
-          val rddForSampling = childRdd.mapPartitions { iter =>
-            val mutablePair = new MutablePair[InternalRow, Null]()
-            iter.map(row => mutablePair.update(row.copy(), null))
-          }
-          // TODO: RangePartitioner should take an Ordering.
-          implicit val ordering = new RowOrdering(sortingExpressions, child.output)
-          new RangePartitioner(numPartitions, rddForSampling, ascending = true)
-        }
-
-        val rdd = if (needToCopyObjectsBeforeShuffle(part, serializer)) {
-          childRdd.mapPartitions { iter => iter.map(row => (row.copy(), null))}
-        } else {
-          childRdd.mapPartitions { iter =>
-            val mutablePair = new MutablePair[InternalRow, Null]()
-            iter.map(row => mutablePair.update(row, null))
-          }
+        // Internally, RangePartitioner runs a job on the RDD that samples keys to compute
+        // partition bounds. To get accurate samples, we need to copy the mutable keys.
+        val rddForSampling = rdd.mapPartitions { iter =>
+          val mutablePair = new MutablePair[InternalRow, Null]()
+          iter.map(row => mutablePair.update(row.copy(), null))
         }
-
-        val shuffled = new ShuffledRDD[InternalRow, Null, Null](rdd, part)
-        shuffled.setSerializer(serializer)
-        shuffled.map(_._1)
-
+        implicit val ordering = new RowOrdering(sortingExpressions, child.output)
+        new RangePartitioner(numPartitions, rddForSampling, ascending = true)
       case SinglePartition =>
-        val valueSchema = child.output.map(_.dataType).toArray
-        val serializer = getSerializer(null, valueSchema, numPartitions = 1)
-        val partitioner = new HashPartitioner(1)
-
-        val rdd = if (needToCopyObjectsBeforeShuffle(partitioner, serializer)) {
-          child.execute().mapPartitions {
-            iter => iter.map(r => (null, r.copy()))
-          }
-        } else {
-          child.execute().mapPartitions { iter =>
-            val mutablePair = new MutablePair[Null, InternalRow]()
-            iter.map(r => mutablePair.update(null, r))
-          }
+        new Partitioner {
+          override def numPartitions: Int = 1
+          override def getPartition(key: Any): Int = 0
         }
-        val shuffled = new ShuffledRDD[Null, InternalRow, InternalRow](rdd, partitioner)
-        shuffled.setSerializer(serializer)
-        shuffled.map(_._2)
-
       case _ => sys.error(s"Exchange not implemented for $newPartitioning")
       // TODO: Handle BroadcastPartitioning.
     }
+    def getPartitionKeyExtractor(): InternalRow => InternalRow = newPartitioning match {
+      case HashPartitioning(expressions, _) => newMutableProjection(expressions, child.output)()
+      case RangePartitioning(_, _) | SinglePartition => identity
+      case _ => sys.error(s"Exchange not implemented for $newPartitioning")
+    }
+    val rddWithPartitionIds: RDD[Product2[Int, InternalRow]] = {
+      if (needToCopyObjectsBeforeShuffle(part, serializer)) {
+        rdd.mapPartitions { iter =>
+          val getPartitionKey = getPartitionKeyExtractor()
+          iter.map { row => (part.getPartition(getPartitionKey(row)), row.copy()) }
+        }
+      } else {
+        rdd.mapPartitions { iter =>
+          val getPartitionKey = getPartitionKeyExtractor()
+          val mutablePair = new MutablePair[Int, InternalRow]()
+          iter.map { row => mutablePair.update(part.getPartition(getPartitionKey(row)), row) }
+        }
+      }
+    }
+    new ShuffledRowRDD(rddWithPartitionIds, serializer, part.numPartitions)
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala
new file mode 100644
index 0000000000000..88f5b13c8f248
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.serializer.Serializer
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.types.DataType
+
+private class ShuffledRowRDDPartition(val idx: Int) extends Partition {
+  override val index: Int = idx
+  override def hashCode(): Int = idx
+}
+
+/**
+ * A dummy partitioner for use with records whose partition ids have been pre-computed (i.e. for
+ * use on RDDs of (Int, Row) pairs where the Int is a partition id in the expected range).
+ */
+private class PartitionIdPassthrough(override val numPartitions: Int) extends Partitioner {
+  override def getPartition(key: Any): Int = key.asInstanceOf[Int]
+}
+
+/**
+ * This is a specialized version of [[org.apache.spark.rdd.ShuffledRDD]] that is optimized for
+ * shuffling rows instead of Java key-value pairs. Note that something like this should eventually
+ * be implemented in Spark core, but that is blocked by some more general refactorings to shuffle
+ * interfaces / internals.
+ *
+ * @param prev the RDD being shuffled. Elements of this RDD are (partitionId, Row) pairs.
+ *             Partition ids should be in the range [0, numPartitions - 1].
+ * @param serializer the serializer used during the shuffle.
+ * @param numPartitions the number of post-shuffle partitions.
+ */
+class ShuffledRowRDD(
+    @transient var prev: RDD[Product2[Int, InternalRow]],
+    serializer: Serializer,
+    numPartitions: Int)
+  extends RDD[InternalRow](prev.context, Nil) {
+
+  private val part: Partitioner = new PartitionIdPassthrough(numPartitions)
+
+  override def getDependencies: Seq[Dependency[_]] = {
+    List(new ShuffleDependency[Int, InternalRow, InternalRow](prev, part, Some(serializer)))
+  }
+
+  override val partitioner = Some(part)
+
+  override def getPartitions: Array[Partition] = {
+    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRowRDDPartition(i))
+  }
+
+  override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {
+    val dep = dependencies.head.asInstanceOf[ShuffleDependency[Int, InternalRow, InternalRow]]
+    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
+      .read()
+      .asInstanceOf[Iterator[Product2[Int, InternalRow]]]
+      .map(_._2)
+  }
+
+  override def clearDependencies() {
+    super.clearDependencies()
+    prev = null
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
index 6ed822dc70d68..c87e2064a8f33 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
@@ -45,14 +45,12 @@ import org.apache.spark.unsafe.types.UTF8String
  *     the comment of the `serializer` method in [[Exchange]] for more information on it.
  */
 private[sql] class Serializer2SerializationStream(
-    keySchema: Array[DataType],
-    valueSchema: Array[DataType],
+    rowSchema: Array[DataType],
     out: OutputStream)
   extends SerializationStream with Logging {
 
   private val rowOut = new DataOutputStream(new BufferedOutputStream(out))
-  private val writeKeyFunc = SparkSqlSerializer2.createSerializationFunction(keySchema, rowOut)
-  private val writeValueFunc = SparkSqlSerializer2.createSerializationFunction(valueSchema, rowOut)
+  private val writeRowFunc = SparkSqlSerializer2.createSerializationFunction(rowSchema, rowOut)
 
   override def writeObject[T: ClassTag](t: T): SerializationStream = {
     val kv = t.asInstanceOf[Product2[Row, Row]]
@@ -63,12 +61,12 @@ private[sql] class Serializer2SerializationStream(
   }
 
   override def writeKey[T: ClassTag](t: T): SerializationStream = {
-    writeKeyFunc(t.asInstanceOf[Row])
+    // No-op.
     this
   }
 
   override def writeValue[T: ClassTag](t: T): SerializationStream = {
-    writeValueFunc(t.asInstanceOf[Row])
+    writeRowFunc(t.asInstanceOf[Row])
     this
   }
 
@@ -85,8 +83,7 @@ private[sql] class Serializer2SerializationStream(
  * The corresponding deserialization stream for [[Serializer2SerializationStream]].
  */
 private[sql] class Serializer2DeserializationStream(
-    keySchema: Array[DataType],
-    valueSchema: Array[DataType],
+    rowSchema: Array[DataType],
     in: InputStream)
   extends DeserializationStream with Logging  {
 
@@ -103,22 +100,20 @@ private[sql] class Serializer2DeserializationStream(
   }
 
   // Functions used to return rows for key and value.
-  private val getKey = rowGenerator(keySchema)
-  private val getValue = rowGenerator(valueSchema)
+  private val getRow = rowGenerator(rowSchema)
   // Functions used to read a serialized row from the InputStream and deserialize it.
-  private val readKeyFunc = SparkSqlSerializer2.createDeserializationFunction(keySchema, rowIn)
-  private val readValueFunc = SparkSqlSerializer2.createDeserializationFunction(valueSchema, rowIn)
+  private val readRowFunc = SparkSqlSerializer2.createDeserializationFunction(rowSchema, rowIn)
 
   override def readObject[T: ClassTag](): T = {
-    (readKeyFunc(getKey()), readValueFunc(getValue())).asInstanceOf[T]
+    readValue()
   }
 
   override def readKey[T: ClassTag](): T = {
-    readKeyFunc(getKey()).asInstanceOf[T]
+    null.asInstanceOf[T] // intentionally left blank.
   }
 
   override def readValue[T: ClassTag](): T = {
-    readValueFunc(getValue()).asInstanceOf[T]
+    readRowFunc(getRow()).asInstanceOf[T]
   }
 
   override def close(): Unit = {
@@ -127,8 +122,7 @@ private[sql] class Serializer2DeserializationStream(
 }
 
 private[sql] class SparkSqlSerializer2Instance(
-    keySchema: Array[DataType],
-    valueSchema: Array[DataType])
+    rowSchema: Array[DataType])
   extends SerializerInstance {
 
   def serialize[T: ClassTag](t: T): ByteBuffer =
@@ -141,30 +135,25 @@ private[sql] class SparkSqlSerializer2Instance(
     throw new UnsupportedOperationException("Not supported.")
 
   def serializeStream(s: OutputStream): SerializationStream = {
-    new Serializer2SerializationStream(keySchema, valueSchema, s)
+    new Serializer2SerializationStream(rowSchema, s)
   }
 
   def deserializeStream(s: InputStream): DeserializationStream = {
-    new Serializer2DeserializationStream(keySchema, valueSchema, s)
+    new Serializer2DeserializationStream(rowSchema, s)
   }
 }
 
 /**
  * SparkSqlSerializer2 is a special serializer that creates serialization function and
  * deserialization function based on the schema of data. It assumes that values passed in
- * are key/value pairs and values returned from it are also key/value pairs.
- * The schema of keys is represented by `keySchema` and that of values is represented by
- * `valueSchema`.
+ * are Rows.
  */
-private[sql] class SparkSqlSerializer2(
-    keySchema: Array[DataType],
-    valueSchema: Array[DataType])
+private[sql] class SparkSqlSerializer2(rowSchema: Array[DataType])
   extends Serializer
   with Logging
   with Serializable{
 
-  def newInstance(): SerializerInstance =
-    new SparkSqlSerializer2Instance(keySchema, valueSchema)
+  def newInstance(): SerializerInstance = new SparkSqlSerializer2Instance(rowSchema)
 
   override def supportsRelocationOfSerializedObjects: Boolean = {
     // SparkSqlSerializer2 is stateless and writes no stream headers
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
new file mode 100644
index 0000000000000..19503ed00056c
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import java.io.{DataInputStream, DataOutputStream, OutputStream, InputStream}
+import java.nio.ByteBuffer
+
+import scala.reflect.ClassTag
+
+import com.google.common.io.ByteStreams
+
+import org.apache.spark.serializer.{SerializationStream, DeserializationStream, SerializerInstance, Serializer}
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.unsafe.PlatformDependent
+
+/**
+ * Serializer for serializing [[UnsafeRow]]s during shuffle. Since UnsafeRows are already stored as
+ * bytes, this serializer simply copies those bytes to the underlying output stream. When
+ * deserializing a stream of rows, instances of this serializer mutate and return a single UnsafeRow
+ * instance that is backed by an on-heap byte array.
+ *
+ * Note that this serializer implements only the [[Serializer]] methods that are used during
+ * shuffle, so certain [[SerializerInstance]] methods will throw UnsupportedOperationException.
+ *
+ * This serializer does not support UnsafeRows that use
+ * [[org.apache.spark.sql.catalyst.util.ObjectPool]].
+ *
+ * @param numFields the number of fields in the row being serialized.
+ */
+private[sql] class UnsafeRowSerializer(numFields: Int) extends Serializer with Serializable {
+  override def newInstance(): SerializerInstance = new UnsafeRowSerializerInstance(numFields)
+  override private[spark] def supportsRelocationOfSerializedObjects: Boolean = true
+}
+
+private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInstance {
+
+  private[this] val EOF: Int = -1
+
+  override def serializeStream(out: OutputStream): SerializationStream = new SerializationStream {
+    private[this] var writeBuffer: Array[Byte] = new Array[Byte](4096)
+    private[this] val dOut: DataOutputStream = new DataOutputStream(out)
+
+    override def writeValue[T: ClassTag](value: T): SerializationStream = {
+      val row = value.asInstanceOf[UnsafeRow]
+      assert(row.getPool == null, "UnsafeRowSerializer does not support ObjectPool")
+      dOut.writeInt(row.getSizeInBytes)
+      var dataRemaining: Int = row.getSizeInBytes
+      val baseObject = row.getBaseObject
+      var rowReadPosition: Long = row.getBaseOffset
+      while (dataRemaining > 0) {
+        val toTransfer: Int = Math.min(writeBuffer.length, dataRemaining)
+        PlatformDependent.copyMemory(
+          baseObject,
+          rowReadPosition,
+          writeBuffer,
+          PlatformDependent.BYTE_ARRAY_OFFSET,
+          toTransfer)
+        out.write(writeBuffer, 0, toTransfer)
+        rowReadPosition += toTransfer
+        dataRemaining -= toTransfer
+      }
+      this
+    }
+    override def writeKey[T: ClassTag](key: T): SerializationStream = {
+      assert(key.isInstanceOf[Int])
+      this
+    }
+    override def writeAll[T: ClassTag](iter: Iterator[T]): SerializationStream =
+      throw new UnsupportedOperationException
+    override def writeObject[T: ClassTag](t: T): SerializationStream =
+      throw new UnsupportedOperationException
+    override def flush(): Unit = dOut.flush()
+    override def close(): Unit = {
+      writeBuffer = null
+      dOut.writeInt(EOF)
+      dOut.close()
+    }
+  }
+
+  override def deserializeStream(in: InputStream): DeserializationStream = {
+    new DeserializationStream {
+      private[this] val dIn: DataInputStream = new DataInputStream(in)
+      private[this] var rowBuffer: Array[Byte] = new Array[Byte](1024)
+      private[this] var row: UnsafeRow = new UnsafeRow()
+      private[this] var rowTuple: (Int, UnsafeRow) = (0, row)
+
+      override def asKeyValueIterator: Iterator[(Int, UnsafeRow)] = {
+        new Iterator[(Int, UnsafeRow)] {
+          private[this] var rowSize: Int = dIn.readInt()
+
+          override def hasNext: Boolean = rowSize != EOF
+
+          override def next(): (Int, UnsafeRow) = {
+            if (rowBuffer.length < rowSize) {
+              rowBuffer = new Array[Byte](rowSize)
+            }
+            ByteStreams.readFully(in, rowBuffer, 0, rowSize)
+            row.pointTo(rowBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, rowSize, null)
+            rowSize = dIn.readInt() // read the next row's size
+            if (rowSize == EOF) { // We are returning the last row in this stream
+              val _rowTuple = rowTuple
+              // Null these out so that the byte array can be garbage collected once the entire
+              // iterator has been consumed
+              row = null
+              rowBuffer = null
+              rowTuple = null
+              _rowTuple
+            } else {
+              rowTuple
+            }
+          }
+        }
+      }
+      override def asIterator: Iterator[Any] = throw new UnsupportedOperationException
+      override def readKey[T: ClassTag](): T = throw new UnsupportedOperationException
+      override def readValue[T: ClassTag](): T = throw new UnsupportedOperationException
+      override def readObject[T: ClassTag](): T = throw new UnsupportedOperationException
+      override def close(): Unit = dIn.close()
+    }
+  }
+
+  override def serialize[T: ClassTag](t: T): ByteBuffer = throw new UnsupportedOperationException
+  override def deserialize[T: ClassTag](bytes: ByteBuffer): T =
+    throw new UnsupportedOperationException
+  override def deserialize[T: ClassTag](bytes: ByteBuffer, loader: ClassLoader): T =
+    throw new UnsupportedOperationException
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index 82bef269b069f..fdd7ad59aba50 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -56,11 +56,8 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends
 case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
 
-  @transient lazy val conditionEvaluator: (InternalRow) => Boolean =
-    newPredicate(condition, child.output)
-
   protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter =>
-    iter.filter(conditionEvaluator)
+    iter.filter(newPredicate(condition, child.output))
   }
 
   override def outputOrdering: Seq[SortOrder] = child.outputOrdering
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala
new file mode 100644
index 0000000000000..79e903c2bbd40
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeSuite.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.plans.physical.SinglePartition
+
+class ExchangeSuite extends SparkPlanTest {
+  test("shuffling UnsafeRows in exchange") {
+    val input = (1 to 1000).map(Tuple1.apply)
+    checkAnswer(
+      input.toDF(),
+      plan => ConvertToSafe(Exchange(SinglePartition, ConvertToUnsafe(plan))),
+      input.map(Row.fromTuple)
+    )
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
index 71f6b26bcd01a..4a53fadd7e099 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
@@ -132,8 +132,8 @@ abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll
       expectedSerializerClass: Class[T]): Unit = {
     executedPlan.foreach {
       case exchange: Exchange =>
-        val shuffledRDD = exchange.execute().firstParent.asInstanceOf[ShuffledRDD[_, _, _]]
-        val dependency = shuffledRDD.getDependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
+        val shuffledRDD = exchange.execute()
+        val dependency = shuffledRDD.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
         val serializerNotSetMessage =
           s"Expected $expectedSerializerClass as the serializer of Exchange. " +
           s"However, the serializer was not set."
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala
new file mode 100644
index 0000000000000..bd788ec8c14b1
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
+import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeRowConverter}
+import org.apache.spark.sql.catalyst.util.ObjectPool
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.PlatformDependent
+
+class UnsafeRowSerializerSuite extends SparkFunSuite {
+
+  private def toUnsafeRow(
+      row: Row,
+      schema: Array[DataType],
+      objPool: ObjectPool = null): UnsafeRow = {
+    val internalRow = CatalystTypeConverters.convertToCatalyst(row).asInstanceOf[InternalRow]
+    val rowConverter = new UnsafeRowConverter(schema)
+    val rowSizeInBytes = rowConverter.getSizeRequirement(internalRow)
+    val byteArray = new Array[Byte](rowSizeInBytes)
+    rowConverter.writeRow(
+      internalRow, byteArray, PlatformDependent.BYTE_ARRAY_OFFSET, rowSizeInBytes, objPool)
+    val unsafeRow = new UnsafeRow()
+    unsafeRow.pointTo(
+      byteArray, PlatformDependent.BYTE_ARRAY_OFFSET, row.length, rowSizeInBytes, objPool)
+    unsafeRow
+  }
+
+  test("toUnsafeRow() test helper method") {
+    val row = Row("Hello", 123)
+    val unsafeRow = toUnsafeRow(row, Array(StringType, IntegerType))
+    assert(row.getString(0) === unsafeRow.get(0).toString)
+    assert(row.getInt(1) === unsafeRow.getInt(1))
+  }
+
+  test("basic row serialization") {
+    val rows = Seq(Row("Hello", 1), Row("World", 2))
+    val unsafeRows = rows.map(row => toUnsafeRow(row, Array(StringType, IntegerType)))
+    val serializer = new UnsafeRowSerializer(numFields = 2).newInstance()
+    val baos = new ByteArrayOutputStream()
+    val serializerStream = serializer.serializeStream(baos)
+    for (unsafeRow <- unsafeRows) {
+      serializerStream.writeKey(0)
+      serializerStream.writeValue(unsafeRow)
+    }
+    serializerStream.close()
+    val deserializerIter = serializer.deserializeStream(
+      new ByteArrayInputStream(baos.toByteArray)).asKeyValueIterator
+    for (expectedRow <- unsafeRows) {
+      val actualRow = deserializerIter.next().asInstanceOf[(Integer, UnsafeRow)]._2
+      assert(expectedRow.getSizeInBytes === actualRow.getSizeInBytes)
+      assert(expectedRow.getString(0) === actualRow.getString(0))
+      assert(expectedRow.getInt(1) === actualRow.getInt(1))
+    }
+    assert(!deserializerIter.hasNext)
+  }
+}

From 3f7de7db4cf7c5e2824cb91087c5e9d4beb0f738 Mon Sep 17 00:00:00 2001
From: George Dittmar <georgedittmar@gmail.com>
Date: Mon, 20 Jul 2015 08:55:37 -0700
Subject: [PATCH 260/549] [SPARK-7422] [MLLIB] Add argmax to Vector,
 SparseVector

Modifying Vector, DenseVector, and SparseVector to implement argmax functionality. This work is to set the stage for changes to be done in Spark-7423.

Author: George Dittmar <georgedittmar@gmail.com>
Author: George <dittmar@Georges-MacBook-Pro.local>
Author: dittmarg <george.dittmar@webtrends.com>
Author: Xiangrui Meng <meng@databricks.com>

Closes #6112 from GeorgeDittmar/SPARK-7422 and squashes the following commits:

3e0a939 [George Dittmar] Merge pull request #1 from mengxr/SPARK-7422
127dec5 [Xiangrui Meng] update argmax impl
2ea6a55 [George Dittmar] Added MimaExcludes for Vectors.argmax
98058f4 [George Dittmar] Merge branch 'master' of github.com:apache/spark into SPARK-7422
5fd9380 [George Dittmar] fixing style check error
42341fb [George Dittmar] refactoring arg max check to better handle zero values
b22af46 [George Dittmar] Fixing spaces between commas in unit test
f2eba2f [George Dittmar] Cleaning up unit tests to be fewer lines
aa330e3 [George Dittmar] Fixing some last if else spacing issues
ac53c55 [George Dittmar] changing dense vector argmax unit test to be one line call vs 2
d5b5423 [George Dittmar] Fixing code style and updating if logic on when to check for zero values
ee1a85a [George Dittmar] Cleaning up unit tests a bit and modifying a few cases
3ee8711 [George Dittmar] Fixing corner case issue with zeros in the active values of the sparse vector. Updated unit tests
b1f059f [George Dittmar] Added comment before we start arg max calculation. Updated unit tests to cover corner cases
f21dcce [George Dittmar] commit
af17981 [dittmarg] Initial work fixing bug that was made clear in pr
eeda560 [George] Fixing SparseVector argmax function to ignore zero values while doing the calculation.
4526acc [George] Merge branch 'master' of github.com:apache/spark into SPARK-7422
df9538a [George] Added argmax to sparse vector and added unit test
3cffed4 [George] Adding unit tests for argmax functions for Dense and Sparse vectors
04677af [George] initial work on adding argmax to Vector and SparseVector
---
 .../apache/spark/mllib/linalg/Vectors.scala   | 57 +++++++++++++++++--
 .../spark/mllib/linalg/VectorsSuite.scala     | 39 +++++++++++++
 project/MimaExcludes.scala                    |  4 ++
 3 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index e048b01d92462..9067b3ba9a7bb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -150,6 +150,12 @@ sealed trait Vector extends Serializable {
       toDense
     }
   }
+
+  /**
+   * Find the index of a maximal element.  Returns the first maximal element in case of a tie.
+   * Returns -1 if vector has length 0.
+   */
+  def argmax: Int
 }
 
 /**
@@ -588,11 +594,7 @@ class DenseVector(val values: Array[Double]) extends Vector {
     new SparseVector(size, ii, vv)
   }
 
-  /**
-   * Find the index of a maximal element.  Returns the first maximal element in case of a tie.
-   * Returns -1 if vector has length 0.
-   */
-  private[spark] def argmax: Int = {
+  override def argmax: Int = {
     if (size == 0) {
       -1
     } else {
@@ -717,6 +719,51 @@ class SparseVector(
       new SparseVector(size, ii, vv)
     }
   }
+
+  override def argmax: Int = {
+    if (size == 0) {
+      -1
+    } else {
+      // Find the max active entry.
+      var maxIdx = indices(0)
+      var maxValue = values(0)
+      var maxJ = 0
+      var j = 1
+      val na = numActives
+      while (j < na) {
+        val v = values(j)
+        if (v > maxValue) {
+          maxValue = v
+          maxIdx = indices(j)
+          maxJ = j
+        }
+        j += 1
+      }
+
+      // If the max active entry is nonpositive and there exists inactive ones, find the first zero.
+      if (maxValue <= 0.0 && na < size) {
+        if (maxValue == 0.0) {
+          // If there exists an inactive entry before maxIdx, find it and return its index.
+          if (maxJ < maxIdx) {
+            var k = 0
+            while (k < maxJ && indices(k) == k) {
+              k += 1
+            }
+            maxIdx = k
+          }
+        } else {
+          // If the max active value is negative, find and return the first inactive index.
+          var k = 0
+          while (k < na && indices(k) == k) {
+            k += 1
+          }
+          maxIdx = k
+        }
+      }
+
+      maxIdx
+    }
+  }
 }
 
 object SparseVector {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index 178d95a7b94ec..03be4119bdaca 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -62,11 +62,50 @@ class VectorsSuite extends SparkFunSuite with Logging {
     assert(vec.toArray.eq(arr))
   }
 
+  test("dense argmax") {
+    val vec = Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]
+    assert(vec.argmax === -1)
+
+    val vec2 = Vectors.dense(arr).asInstanceOf[DenseVector]
+    assert(vec2.argmax === 3)
+
+    val vec3 = Vectors.dense(Array(-1.0, 0.0, -2.0, 1.0)).asInstanceOf[DenseVector]
+    assert(vec3.argmax === 3)
+  }
+
   test("sparse to array") {
     val vec = Vectors.sparse(n, indices, values).asInstanceOf[SparseVector]
     assert(vec.toArray === arr)
   }
 
+  test("sparse argmax") {
+    val vec = Vectors.sparse(0, Array.empty[Int], Array.empty[Double]).asInstanceOf[SparseVector]
+    assert(vec.argmax === -1)
+
+    val vec2 = Vectors.sparse(n, indices, values).asInstanceOf[SparseVector]
+    assert(vec2.argmax === 3)
+
+    val vec3 = Vectors.sparse(5, Array(2, 3, 4), Array(1.0, 0.0, -.7))
+    assert(vec3.argmax === 2)
+
+    // check for case that sparse vector is created with
+    // only negative values {0.0, 0.0,-1.0, -0.7, 0.0}
+    val vec4 = Vectors.sparse(5, Array(2, 3), Array(-1.0, -.7))
+    assert(vec4.argmax === 0)
+
+    val vec5 = Vectors.sparse(11, Array(0, 3, 10), Array(-1.0, -.7, 0.0))
+    assert(vec5.argmax === 1)
+
+    val vec6 = Vectors.sparse(11, Array(0, 1, 2), Array(-1.0, -.7, 0.0))
+    assert(vec6.argmax === 2)
+
+    val vec7 = Vectors.sparse(5, Array(0, 1, 3), Array(-1.0, 0.0, -.7))
+    assert(vec7.argmax === 1)
+
+    val vec8 = Vectors.sparse(5, Array(1, 2), Array(0.0, -1.0))
+    assert(vec8.argmax === 0)
+  }
+
   test("vector equals") {
     val dv1 = Vectors.dense(arr.clone())
     val dv2 = Vectors.dense(arr.clone())
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 36417f5df9f2d..dd852547492aa 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -98,6 +98,10 @@ object MimaExcludes {
               "org.apache.spark.api.r.StringRRDD.this"),
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.api.r.BaseRRDD.this")
+          ) ++ Seq(
+            // SPARK-7422 add argmax for sparse vectors
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.mllib.linalg.Vector.argmax")
           )
 
         case v if v.startsWith("1.4") =>

From d0b4e93f7e92ea59058cc457a5586a4d9a596d71 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Mon, 20 Jul 2015 09:00:01 -0700
Subject: [PATCH 261/549] [SPARK-8996] [MLLIB] [PYSPARK] Python API for
 Kolmogorov-Smirnov Test

Python API for the KS-test

Statistics.kolmogorovSmirnovTest(data, distName, *params)
I'm not quite sure how to support the callable function since it is not serializable.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7430 from MechCoder/spark-8996 and squashes the following commits:

2dd009d [MechCoder] minor
021d233 [MechCoder] Remove one wrapper and other minor stuff
49d07ab [MechCoder] [SPARK-8996] [MLlib] Python API for Kolmogorov-Smirnov Test
---
 .../mllib/api/python/PythonMLLibAPI.scala     | 14 +++-
 python/pyspark/mllib/stat/_statistics.py      | 67 ++++++++++++++++++-
 python/pyspark/mllib/stat/test.py             | 37 ++++++----
 python/pyspark/mllib/tests.py                 | 19 ++++++
 4 files changed, 123 insertions(+), 14 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index c58a64001d9a0..fda8d5a0b048f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -43,7 +43,7 @@ import org.apache.spark.mllib.recommendation._
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.stat.correlation.CorrelationNames
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
-import org.apache.spark.mllib.stat.test.ChiSqTestResult
+import org.apache.spark.mllib.stat.test.{ChiSqTestResult, KolmogorovSmirnovTestResult}
 import org.apache.spark.mllib.stat.{
   KernelDensity, MultivariateStatisticalSummary, Statistics}
 import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy, Strategy}
@@ -1093,6 +1093,18 @@ private[python] class PythonMLLibAPI extends Serializable {
     LinearDataGenerator.generateLinearRDD(
       sc, nexamples, nfeatures, eps, nparts, intercept)
   }
+
+  /**
+   * Java stub for Statistics.kolmogorovSmirnovTest()
+   */
+  def kolmogorovSmirnovTest(
+      data: JavaRDD[Double],
+      distName: String,
+      params: JList[Double]): KolmogorovSmirnovTestResult = {
+    val paramsSeq = params.asScala.toSeq
+    Statistics.kolmogorovSmirnovTest(data, distName, paramsSeq: _*)
+  }
+
 }
 
 /**
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index b475be4b4d953..36c8f48a4a882 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -15,11 +15,15 @@
 # limitations under the License.
 #
 
+import sys
+if sys.version >= '3':
+    basestring = str
+
 from pyspark.rdd import RDD, ignore_unicode_prefix
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
 from pyspark.mllib.linalg import Matrix, _convert_to_vector
 from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.stat.test import ChiSqTestResult
+from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult
 
 
 __all__ = ['MultivariateStatisticalSummary', 'Statistics']
@@ -238,6 +242,67 @@ def chiSqTest(observed, expected=None):
             jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
         return ChiSqTestResult(jmodel)
 
+    @staticmethod
+    @ignore_unicode_prefix
+    def kolmogorovSmirnovTest(data, distName="norm", *params):
+        """
+        .. note:: Experimental
+
+        Performs the Kolmogorov-Smirnov (KS) test for data sampled from
+        a continuous distribution. It tests the null hypothesis that
+        the data is generated from a particular distribution.
+
+        The given data is sorted and the Empirical Cumulative
+        Distribution Function (ECDF) is calculated
+        which for a given point is the number of points having a CDF
+        value lesser than it divided by the total number of points.
+
+        Since the data is sorted, this is a step function
+        that rises by (1 / length of data) for every ordered point.
+
+        The KS statistic gives us the maximum distance between the
+        ECDF and the CDF. Intuitively if this statistic is large, the
+        probabilty that the null hypothesis is true becomes small.
+        For specific details of the implementation, please have a look
+        at the Scala documentation.
+
+        :param data: RDD, samples from the data
+        :param distName: string, currently only "norm" is supported.
+                         (Normal distribution) to calculate the
+                         theoretical distribution of the data.
+        :param params: additional values which need to be provided for
+                       a certain distribution.
+                       If not provided, the default values are used.
+        :return: KolmogorovSmirnovTestResult object containing the test
+                 statistic, degrees of freedom, p-value,
+                 the method used, and the null hypothesis.
+
+        >>> kstest = Statistics.kolmogorovSmirnovTest
+        >>> data = sc.parallelize([-1.0, 0.0, 1.0])
+        >>> ksmodel = kstest(data, "norm")
+        >>> print(round(ksmodel.pValue, 3))
+        1.0
+        >>> print(round(ksmodel.statistic, 3))
+        0.175
+        >>> ksmodel.nullHypothesis
+        u'Sample follows theoretical distribution'
+
+        >>> data = sc.parallelize([2.0, 3.0, 4.0])
+        >>> ksmodel = kstest(data, "norm", 3.0, 1.0)
+        >>> print(round(ksmodel.pValue, 3))
+        1.0
+        >>> print(round(ksmodel.statistic, 3))
+        0.175
+        """
+        if not isinstance(data, RDD):
+            raise TypeError("data should be an RDD, got %s." % type(data))
+        if not isinstance(distName, basestring):
+            raise TypeError("distName should be a string, got %s." % type(distName))
+
+        params = [float(param) for param in params]
+        return KolmogorovSmirnovTestResult(
+            callMLlibFunc("kolmogorovSmirnovTest", data, distName, params))
+
 
 def _test():
     import doctest
diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py
index 762506e952b43..0abe104049ff9 100644
--- a/python/pyspark/mllib/stat/test.py
+++ b/python/pyspark/mllib/stat/test.py
@@ -15,24 +15,16 @@
 # limitations under the License.
 #
 
-from pyspark.mllib.common import JavaModelWrapper
+from pyspark.mllib.common import inherit_doc, JavaModelWrapper
 
 
-__all__ = ["ChiSqTestResult"]
+__all__ = ["ChiSqTestResult", "KolmogorovSmirnovTestResult"]
 
 
-class ChiSqTestResult(JavaModelWrapper):
+class TestResult(JavaModelWrapper):
     """
-    .. note:: Experimental
-
-    Object containing the test results for the chi-squared hypothesis test.
+    Base class for all test results.
     """
-    @property
-    def method(self):
-        """
-        Name of the test method
-        """
-        return self._java_model.method()
 
     @property
     def pValue(self):
@@ -67,3 +59,24 @@ def nullHypothesis(self):
 
     def __str__(self):
         return self._java_model.toString()
+
+
+@inherit_doc
+class ChiSqTestResult(TestResult):
+    """
+    Contains test results for the chi-squared hypothesis test.
+    """
+
+    @property
+    def method(self):
+        """
+        Name of the test method
+        """
+        return self._java_model.method()
+
+
+@inherit_doc
+class KolmogorovSmirnovTestResult(TestResult):
+    """
+    Contains test results for the Kolmogorov-Smirnov test.
+    """
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index f2eab5b18f077..3f5a02af12e39 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -869,6 +869,25 @@ def test_right_number_of_results(self):
         self.assertIsNotNone(chi[1000])
 
 
+class KolmogorovSmirnovTest(MLlibTestCase):
+
+    def test_R_implementation_equivalence(self):
+        data = self.sc.parallelize([
+            1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
+            -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
+            -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
+            -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
+            0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
+        ])
+        model = Statistics.kolmogorovSmirnovTest(data, "norm")
+        self.assertAlmostEqual(model.statistic, 0.189, 3)
+        self.assertAlmostEqual(model.pValue, 0.422, 3)
+
+        model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
+        self.assertAlmostEqual(model.statistic, 0.189, 3)
+        self.assertAlmostEqual(model.pValue, 0.422, 3)
+
+
 class SerDeTest(MLlibTestCase):
     def test_to_java_object_rdd(self):  # SPARK-6660
         data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)

From 5112b7f58b9b8031ff79b9184dafe12b71ba1f79 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 09:35:45 -0700
Subject: [PATCH 262/549] [SPARK-9153][SQL] codegen StringLPad/StringRPad

Jira: https://issues.apache.org/jira/browse/SPARK-9153

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7527 from tarekauel/SPARK-9153 and squashes the following commits:

3840c6b [Tarek Auel] [SPARK-9153] removed codegen fallback
92b6a5d [Tarek Auel] [SPARK-9153] codegen lpad/rpad
---
 .../expressions/stringOperations.scala        | 54 ++++++++++++++++++-
 .../expressions/StringExpressionsSuite.scala  |  6 +++
 2 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 5f8ac716f79a1..6608036f01318 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -401,7 +401,7 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression)
  * Returns str, left-padded with pad to a length of len.
  */
 case class StringLPad(str: Expression, len: Expression, pad: Expression)
-  extends Expression with ImplicitCastInputTypes with CodegenFallback {
+  extends Expression with ImplicitCastInputTypes {
 
   override def children: Seq[Expression] = str :: len :: pad :: Nil
   override def foldable: Boolean = children.forall(_.foldable)
@@ -432,6 +432,31 @@ case class StringLPad(str: Expression, len: Expression, pad: Expression)
     }
   }
 
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val lenGen = len.gen(ctx)
+    val strGen = str.gen(ctx)
+    val padGen = pad.gen(ctx)
+
+    s"""
+      ${lenGen.code}
+      boolean ${ev.isNull} = ${lenGen.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${strGen.code}
+        if (!${strGen.isNull}) {
+          ${padGen.code}
+          if (!${padGen.isNull}) {
+            ${ev.primitive} = ${strGen.primitive}.lpad(${lenGen.primitive}, ${padGen.primitive});
+          } else {
+            ${ev.isNull} = true;
+          }
+        } else {
+          ${ev.isNull} = true;
+        }
+      }
+     """
+  }
+
   override def prettyName: String = "lpad"
 }
 
@@ -439,7 +464,7 @@ case class StringLPad(str: Expression, len: Expression, pad: Expression)
  * Returns str, right-padded with pad to a length of len.
  */
 case class StringRPad(str: Expression, len: Expression, pad: Expression)
-  extends Expression with ImplicitCastInputTypes with CodegenFallback {
+  extends Expression with ImplicitCastInputTypes {
 
   override def children: Seq[Expression] = str :: len :: pad :: Nil
   override def foldable: Boolean = children.forall(_.foldable)
@@ -470,6 +495,31 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression)
     }
   }
 
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val lenGen = len.gen(ctx)
+    val strGen = str.gen(ctx)
+    val padGen = pad.gen(ctx)
+
+    s"""
+      ${lenGen.code}
+      boolean ${ev.isNull} = ${lenGen.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${strGen.code}
+        if (!${strGen.isNull}) {
+          ${padGen.code}
+          if (!${padGen.isNull}) {
+            ${ev.primitive} = ${strGen.primitive}.rpad(${lenGen.primitive}, ${padGen.primitive});
+          } else {
+            ${ev.isNull} = true;
+          }
+        } else {
+          ${ev.isNull} = true;
+        }
+      }
+     """
+  }
+
   override def prettyName: String = "rpad"
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 96f433be8b065..d5731229df3bb 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -413,18 +413,24 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val row1 = create_row("hi", 5, "??")
     val row2 = create_row("hi", 1, "?")
     val row3 = create_row(null, 1, "?")
+    val row4 = create_row("hi", null, "?")
+    val row5 = create_row("hi", 1, null)
 
     checkEvaluation(StringLPad(Literal("hi"), Literal(5), Literal("??")), "???hi", row1)
     checkEvaluation(StringLPad(Literal("hi"), Literal(1), Literal("??")), "h", row1)
     checkEvaluation(StringLPad(s1, s2, s3), "???hi", row1)
     checkEvaluation(StringLPad(s1, s2, s3), "h", row2)
     checkEvaluation(StringLPad(s1, s2, s3), null, row3)
+    checkEvaluation(StringLPad(s1, s2, s3), null, row4)
+    checkEvaluation(StringLPad(s1, s2, s3), null, row5)
 
     checkEvaluation(StringRPad(Literal("hi"), Literal(5), Literal("??")), "hi???", row1)
     checkEvaluation(StringRPad(Literal("hi"), Literal(1), Literal("??")), "h", row1)
     checkEvaluation(StringRPad(s1, s2, s3), "hi???", row1)
     checkEvaluation(StringRPad(s1, s2, s3), "h", row2)
     checkEvaluation(StringRPad(s1, s2, s3), null, row3)
+    checkEvaluation(StringRPad(s1, s2, s3), null, row4)
+    checkEvaluation(StringRPad(s1, s2, s3), null, row5)
   }
 
   test("REPEAT") {

From a15ecd057a6226e5cf83ca05c46748624a1cfc8c Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 09:41:25 -0700
Subject: [PATCH 263/549] [SPARK-9177][SQL] Reuse of calendar object in
 WeekOfYear

https://issues.apache.org/jira/browse/SPARK-9177

rxin Are we sure that this is thread safe? chenghao-intel explained in another PR that every partition (if I remember correctly) uses one expression instance. This instance isn't used by multiple threads, is it? If not, we are fine.

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7516 from tarekauel/SPARK-9177 and squashes the following commits:

0c1313a [Tarek Auel] [SPARK-9177] utilize more powerful addMutableState
6e2f03f [Tarek Auel] Merge branch 'master' into SPARK-9177
a69ec92 [Tarek Auel] [SPARK-9177] address comment
6cfb180 [Tarek Auel] [SPARK-9177] calendar as lazy transient val
ff97b09 [Tarek Auel] [SPARK-9177] Reuse calendar object in interpreted code and codegen
---
 .../catalyst/expressions/datetimeFunctions.scala  | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index 802445509285d..9e55f0546e123 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -213,10 +213,14 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa
 
   override def dataType: DataType = IntegerType
 
-  override protected def nullSafeEval(date: Any): Any = {
+  @transient private lazy val c = {
     val c = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
     c.setFirstDayOfWeek(Calendar.MONDAY)
     c.setMinimalDaysInFirstWeek(4)
+    c
+  }
+
+  override protected def nullSafeEval(date: Any): Any = {
     c.setTimeInMillis(date.asInstanceOf[Int] * 1000L * 3600L * 24L)
     c.get(Calendar.WEEK_OF_YEAR)
   }
@@ -225,10 +229,13 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa
     nullSafeCodeGen(ctx, ev, (time) => {
       val cal = classOf[Calendar].getName
       val c = ctx.freshName("cal")
+      ctx.addMutableState(cal, c,
+        s"""
+          $c = $cal.getInstance(java.util.TimeZone.getTimeZone("UTC"));
+          $c.setFirstDayOfWeek($cal.MONDAY);
+          $c.setMinimalDaysInFirstWeek(4);
+         """)
       s"""
-        $cal $c = $cal.getInstance(java.util.TimeZone.getTimeZone("UTC"));
-        $c.setFirstDayOfWeek($cal.MONDAY);
-        $c.setMinimalDaysInFirstWeek(4);
         $c.setTimeInMillis($time * 1000L * 3600L * 24L);
         ${ev.primitive} = $c.get($cal.WEEK_OF_YEAR);
       """

From 04db58ae30d2f73af45b7e6813f97be62dc92095 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 20 Jul 2015 09:42:18 -0700
Subject: [PATCH 264/549] [SPARK-9186][SQL] make deterministic describing the
 tree rather than the expression

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7525 from cloud-fan/deterministic and squashes the following commits:

4189bfa [Wenchen Fan] make deterministic describing the tree rather than the expression
---
 .../spark/sql/catalyst/expressions/Expression.scala  | 12 +++++++++++-
 .../spark/sql/catalyst/expressions/random.scala      | 10 +++++-----
 .../spark/sql/catalyst/optimizer/Optimizer.scala     |  4 ++--
 .../expressions/MonotonicallyIncreasingID.scala      |  6 ++----
 .../sql/execution/expressions/SparkPartitionID.scala |  6 ++----
 5 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index d0a1aa9a1e912..da599b8963340 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -65,10 +65,12 @@ abstract class Expression extends TreeNode[Expression] {
    * Note that this means that an expression should be considered as non-deterministic if:
    * - if it relies on some mutable internal state, or
    * - if it relies on some implicit input that is not part of the children expression list.
+   * - if it has non-deterministic child or children.
    *
    * An example would be `SparkPartitionID` that relies on the partition id returned by TaskContext.
+   * By default leaf expressions are deterministic as Nil.forall(_.deterministic) returns true.
    */
-  def deterministic: Boolean = true
+  def deterministic: Boolean = children.forall(_.deterministic)
 
   def nullable: Boolean
 
@@ -183,6 +185,14 @@ trait Unevaluable { self: Expression =>
     throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
 }
 
+/**
+ * An expression that is nondeterministic.
+ */
+trait Nondeterministic { self: Expression =>
+
+  override def deterministic: Boolean = false
+}
+
 
 /**
  * A leaf expression, i.e. one without any child expressions.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index 822898e561cd6..aef24a5486466 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -32,7 +32,9 @@ import org.apache.spark.util.random.XORShiftRandom
  *
  * Since this expression is stateful, it cannot be a case object.
  */
-abstract class RDG(seed: Long) extends LeafExpression with Serializable {
+abstract class RDG extends LeafExpression with Nondeterministic {
+
+  protected def seed: Long
 
   /**
    * Record ID within each partition. By being transient, the Random Number Generator is
@@ -40,15 +42,13 @@ abstract class RDG(seed: Long) extends LeafExpression with Serializable {
    */
   @transient protected lazy val rng = new XORShiftRandom(seed + TaskContext.getPartitionId)
 
-  override def deterministic: Boolean = false
-
   override def nullable: Boolean = false
 
   override def dataType: DataType = DoubleType
 }
 
 /** Generate a random column with i.i.d. uniformly distributed values in [0, 1). */
-case class Rand(seed: Long) extends RDG(seed) {
+case class Rand(seed: Long) extends RDG {
   override def eval(input: InternalRow): Double = rng.nextDouble()
 
   def this() = this(Utils.random.nextLong())
@@ -71,7 +71,7 @@ case class Rand(seed: Long) extends RDG(seed) {
 }
 
 /** Generate a random column with i.i.d. gaussian random distribution. */
-case class Randn(seed: Long) extends RDG(seed) {
+case class Randn(seed: Long) extends RDG {
   override def eval(input: InternalRow): Double = rng.nextGaussian()
 
   def this() = this(Utils.random.nextLong())
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 0f28a0d2c8fff..fafdae07c92f0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -216,9 +216,9 @@ object ProjectCollapsing extends Rule[LogicalPlan] {
 
       // We only collapse these two Projects if their overlapped expressions are all
       // deterministic.
-      val hasNondeterministic = projectList1.flatMap(_.collect {
+      val hasNondeterministic = projectList1.exists(_.collect {
         case a: Attribute if aliasMap.contains(a) => aliasMap(a).child
-      }).exists(_.find(!_.deterministic).isDefined)
+      }.exists(!_.deterministic))
 
       if (hasNondeterministic) {
         p
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
index 4d8ed089731aa..2645eb1854bce 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.expressions
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.LeafExpression
+import org.apache.spark.sql.catalyst.expressions.{Nondeterministic, LeafExpression}
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types.{LongType, DataType}
 
@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.{LongType, DataType}
  *
  * Since this expression is stateful, it cannot be a case object.
  */
-private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
+private[sql] case class MonotonicallyIncreasingID() extends LeafExpression with Nondeterministic {
 
   /**
    * Record ID within each partition. By being transient, count's value is reset to 0 every time
@@ -43,8 +43,6 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
 
   @transient private lazy val partitionMask = TaskContext.getPartitionId().toLong << 33
 
-  override def deterministic: Boolean = false
-
   override def nullable: Boolean = false
 
   override def dataType: DataType = LongType
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
index 43ffc9cc84738..53ddd47e3e0c1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.expressions
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.LeafExpression
+import org.apache.spark.sql.catalyst.expressions.{Nondeterministic, LeafExpression}
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types.{IntegerType, DataType}
 
@@ -27,9 +27,7 @@ import org.apache.spark.sql.types.{IntegerType, DataType}
 /**
  * Expression that returns the current partition id of the Spark task.
  */
-private[sql] case object SparkPartitionID extends LeafExpression {
-
-  override def deterministic: Boolean = false
+private[sql] case object SparkPartitionID extends LeafExpression with Nondeterministic {
 
   override def nullable: Boolean = false
 

From c6fe9b4a179eecce69a813501dd0f4a22ff5dd5b Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 20 Jul 2015 09:43:25 -0700
Subject: [PATCH 265/549] [SQL] Remove space from DataFrame Scala/Java API.

I don't think this function is useful at all in Scala/Java, since users can easily compute n * space easily.

Author: Reynold Xin <rxin@databricks.com>

Closes #7530 from rxin/remove-space and squashes the following commits:

c147873 [Reynold Xin] [SQL] Remove space from DataFrame Scala/Java API.
---
 .../org/apache/spark/sql/functions.scala      | 20 -------------------
 .../spark/sql/StringFunctionsSuite.scala      |  4 ----
 2 files changed, 24 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index b5140dca0487f..41b25d1836481 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2237,26 +2237,6 @@ object functions {
     StringReverse(str.expr)
   }
 
-  /**
-   * Make a n spaces of string.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def space(n: String): Column = {
-    space(Column(n))
-  }
-
-  /**
-   * Make a n spaces of string.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def space(n: Column): Column = {
-    StringSpace(n.expr)
-  }
-
   //////////////////////////////////////////////////////////////////////////////////////////////
   // DateTime functions
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index fe4de8d8b855f..413f3858d6764 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -190,10 +190,6 @@ class StringFunctionsSuite extends QueryTest {
   test("string space function") {
     val df = Seq((2, 3)).toDF("a", "b")
 
-    checkAnswer(
-      df.select(space($"a"), space("b")),
-      Row("  ", "   "))
-
     checkAnswer(
       df.selectExpr("space(b)"),
       Row("   "))

From 80e2568b25780a7094199239da8ad6cfb6efc9f7 Mon Sep 17 00:00:00 2001
From: Imran Rashid <irashid@cloudera.com>
Date: Mon, 20 Jul 2015 10:28:32 -0700
Subject: [PATCH 266/549] [SPARK-8103][core] DAGScheduler should not submit
 multiple concurrent attempts for a stage

https://issues.apache.org/jira/browse/SPARK-8103

cc kayousterhout (thanks for the extra test case)

Author: Imran Rashid <irashid@cloudera.com>
Author: Kay Ousterhout <kayousterhout@gmail.com>
Author: Imran Rashid <squito@users.noreply.github.com>

Closes #6750 from squito/SPARK-8103 and squashes the following commits:

fb3acfc [Imran Rashid] fix log msg
e01b7aa [Imran Rashid] fix some comments, style
584acd4 [Imran Rashid] simplify going from taskId to taskSetMgr
e43ac25 [Imran Rashid] Merge branch 'master' into SPARK-8103
6bc23af [Imran Rashid] update log msg
4470fa1 [Imran Rashid] rename
c04707e [Imran Rashid] style
88b61cc [Imran Rashid] add tests to make sure that TaskSchedulerImpl schedules correctly with zombie attempts
d7f1ef2 [Imran Rashid] get rid of activeTaskSets
a21c8b5 [Imran Rashid] Merge branch 'master' into SPARK-8103
906d626 [Imran Rashid] fix merge
109900e [Imran Rashid] Merge branch 'master' into SPARK-8103
c0d4d90 [Imran Rashid] Revert "Index active task sets by stage Id rather than by task set id"
f025154 [Imran Rashid] Merge pull request #2 from kayousterhout/imran_SPARK-8103
baf46e1 [Kay Ousterhout] Index active task sets by stage Id rather than by task set id
19685bb [Imran Rashid] switch to using latestInfo.attemptId, and add comments
a5f7c8c [Imran Rashid] remove comment for reviewers
227b40d [Imran Rashid] style
517b6e5 [Imran Rashid] get rid of SparkIllegalStateException
b2faef5 [Imran Rashid] faster check for conflicting task sets
6542b42 [Imran Rashid] remove extra stageAttemptId
ada7726 [Imran Rashid] reviewer feedback
d8eb202 [Imran Rashid] Merge branch 'master' into SPARK-8103
46bc26a [Imran Rashid] more cleanup of debug garbage
cb245da [Imran Rashid] finally found the issue ... clean up debug stuff
8c29707 [Imran Rashid] Merge branch 'master' into SPARK-8103
89a59b6 [Imran Rashid] more printlns ...
9601b47 [Imran Rashid] more debug printlns
ecb4e7d [Imran Rashid] debugging printlns
b6bc248 [Imran Rashid] style
55f4a94 [Imran Rashid] get rid of more random test case since kays tests are clearer
7021d28 [Imran Rashid] update test since listenerBus.waitUntilEmpty now throws an exception instead of returning a boolean
883fe49 [Kay Ousterhout] Unit tests for concurrent stages issue
6e14683 [Imran Rashid] unit test just to make sure we fail fast on concurrent attempts
06a0af6 [Imran Rashid] ignore for jenkins
c443def [Imran Rashid] better fix and simpler test case
28d70aa [Imran Rashid] wip on getting a better test case ...
a9bf31f [Imran Rashid] wip
---
 .../apache/spark/scheduler/DAGScheduler.scala |  78 +++++-----
 .../apache/spark/scheduler/ResultTask.scala   |   3 +-
 .../spark/scheduler/ShuffleMapTask.scala      |   5 +-
 .../org/apache/spark/scheduler/Task.scala     |   5 +-
 .../spark/scheduler/TaskSchedulerImpl.scala   |  99 +++++++-----
 .../org/apache/spark/scheduler/TaskSet.scala  |   4 +-
 .../CoarseGrainedSchedulerBackend.scala       |   5 +-
 .../spark/scheduler/DAGSchedulerSuite.scala   | 141 ++++++++++++++++++
 .../org/apache/spark/scheduler/FakeTask.scala |   8 +-
 .../scheduler/NotSerializableFakeTask.scala   |   2 +-
 .../spark/scheduler/TaskContextSuite.scala    |   4 +-
 .../scheduler/TaskSchedulerImplSuite.scala    | 113 +++++++++++++-
 .../spark/scheduler/TaskSetManagerSuite.scala |   2 +-
 13 files changed, 383 insertions(+), 86 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index dd55cd8054332..71a219a4f3414 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -857,7 +857,6 @@ class DAGScheduler(
     // Get our pending tasks and remember them in our pendingTasks entry
     stage.pendingTasks.clear()
 
-
     // First figure out the indexes of partition ids to compute.
     val partitionsToCompute: Seq[Int] = {
       stage match {
@@ -918,7 +917,7 @@ class DAGScheduler(
           partitionsToCompute.map { id =>
             val locs = getPreferredLocs(stage.rdd, id)
             val part = stage.rdd.partitions(id)
-            new ShuffleMapTask(stage.id, taskBinary, part, locs)
+            new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, taskBinary, part, locs)
           }
 
         case stage: ResultStage =>
@@ -927,7 +926,7 @@ class DAGScheduler(
             val p: Int = job.partitions(id)
             val part = stage.rdd.partitions(p)
             val locs = getPreferredLocs(stage.rdd, p)
-            new ResultTask(stage.id, taskBinary, part, locs, id)
+            new ResultTask(stage.id, stage.latestInfo.attemptId, taskBinary, part, locs, id)
           }
       }
     } catch {
@@ -1069,10 +1068,11 @@ class DAGScheduler(
             val execId = status.location.executorId
             logDebug("ShuffleMapTask finished on " + execId)
             if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) {
-              logInfo("Ignoring possibly bogus ShuffleMapTask completion from " + execId)
+              logInfo(s"Ignoring possibly bogus $smt completion from executor $execId")
             } else {
               shuffleStage.addOutputLoc(smt.partitionId, status)
             }
+
             if (runningStages.contains(shuffleStage) && shuffleStage.pendingTasks.isEmpty) {
               markStageAsFinished(shuffleStage)
               logInfo("looking for newly runnable stages")
@@ -1132,38 +1132,48 @@ class DAGScheduler(
         val failedStage = stageIdToStage(task.stageId)
         val mapStage = shuffleToMapStage(shuffleId)
 
-        // It is likely that we receive multiple FetchFailed for a single stage (because we have
-        // multiple tasks running concurrently on different executors). In that case, it is possible
-        // the fetch failure has already been handled by the scheduler.
-        if (runningStages.contains(failedStage)) {
-          logInfo(s"Marking $failedStage (${failedStage.name}) as failed " +
-            s"due to a fetch failure from $mapStage (${mapStage.name})")
-          markStageAsFinished(failedStage, Some(failureMessage))
-        }
+        if (failedStage.latestInfo.attemptId != task.stageAttemptId) {
+          logInfo(s"Ignoring fetch failure from $task as it's from $failedStage attempt" +
+            s" ${task.stageAttemptId} and there is a more recent attempt for that stage " +
+            s"(attempt ID ${failedStage.latestInfo.attemptId}) running")
+        } else {
 
-        if (disallowStageRetryForTest) {
-          abortStage(failedStage, "Fetch failure will not retry stage due to testing config")
-        } else if (failedStages.isEmpty) {
-          // Don't schedule an event to resubmit failed stages if failed isn't empty, because
-          // in that case the event will already have been scheduled.
-          // TODO: Cancel running tasks in the stage
-          logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " +
-            s"$failedStage (${failedStage.name}) due to fetch failure")
-          messageScheduler.schedule(new Runnable {
-            override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages)
-          }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS)
-        }
-        failedStages += failedStage
-        failedStages += mapStage
-        // Mark the map whose fetch failed as broken in the map stage
-        if (mapId != -1) {
-          mapStage.removeOutputLoc(mapId, bmAddress)
-          mapOutputTracker.unregisterMapOutput(shuffleId, mapId, bmAddress)
-        }
+          // It is likely that we receive multiple FetchFailed for a single stage (because we have
+          // multiple tasks running concurrently on different executors). In that case, it is
+          // possible the fetch failure has already been handled by the scheduler.
+          if (runningStages.contains(failedStage)) {
+            logInfo(s"Marking $failedStage (${failedStage.name}) as failed " +
+              s"due to a fetch failure from $mapStage (${mapStage.name})")
+            markStageAsFinished(failedStage, Some(failureMessage))
+          } else {
+            logDebug(s"Received fetch failure from $task, but its from $failedStage which is no " +
+              s"longer running")
+          }
+
+          if (disallowStageRetryForTest) {
+            abortStage(failedStage, "Fetch failure will not retry stage due to testing config")
+          } else if (failedStages.isEmpty) {
+            // Don't schedule an event to resubmit failed stages if failed isn't empty, because
+            // in that case the event will already have been scheduled.
+            // TODO: Cancel running tasks in the stage
+            logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " +
+              s"$failedStage (${failedStage.name}) due to fetch failure")
+            messageScheduler.schedule(new Runnable {
+              override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages)
+            }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS)
+          }
+          failedStages += failedStage
+          failedStages += mapStage
+          // Mark the map whose fetch failed as broken in the map stage
+          if (mapId != -1) {
+            mapStage.removeOutputLoc(mapId, bmAddress)
+            mapOutputTracker.unregisterMapOutput(shuffleId, mapId, bmAddress)
+          }
 
-        // TODO: mark the executor as failed only if there were lots of fetch failures on it
-        if (bmAddress != null) {
-          handleExecutorLost(bmAddress.executorId, fetchFailed = true, Some(task.epoch))
+          // TODO: mark the executor as failed only if there were lots of fetch failures on it
+          if (bmAddress != null) {
+            handleExecutorLost(bmAddress.executorId, fetchFailed = true, Some(task.epoch))
+          }
         }
 
       case commitDenied: TaskCommitDenied =>
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala
index c9a124113961f..9c2606e278c54 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala
@@ -41,11 +41,12 @@ import org.apache.spark.rdd.RDD
  */
 private[spark] class ResultTask[T, U](
     stageId: Int,
+    stageAttemptId: Int,
     taskBinary: Broadcast[Array[Byte]],
     partition: Partition,
     @transient locs: Seq[TaskLocation],
     val outputId: Int)
-  extends Task[U](stageId, partition.index) with Serializable {
+  extends Task[U](stageId, stageAttemptId, partition.index) with Serializable {
 
   @transient private[this] val preferredLocs: Seq[TaskLocation] = {
     if (locs == null) Nil else locs.toSet.toSeq
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala
index bd3dd23dfe1ac..14c8c00961487 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala
@@ -40,14 +40,15 @@ import org.apache.spark.shuffle.ShuffleWriter
  */
 private[spark] class ShuffleMapTask(
     stageId: Int,
+    stageAttemptId: Int,
     taskBinary: Broadcast[Array[Byte]],
     partition: Partition,
     @transient private var locs: Seq[TaskLocation])
-  extends Task[MapStatus](stageId, partition.index) with Logging {
+  extends Task[MapStatus](stageId, stageAttemptId, partition.index) with Logging {
 
   /** A constructor used only in test suites. This does not require passing in an RDD. */
   def this(partitionId: Int) {
-    this(0, null, new Partition { override def index: Int = 0 }, null)
+    this(0, 0, null, new Partition { override def index: Int = 0 }, null)
   }
 
   @transient private val preferredLocs: Seq[TaskLocation] = {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
index 6a86f9d4b8530..76a19aeac4679 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -43,7 +43,10 @@ import org.apache.spark.util.Utils
  * @param stageId id of the stage this task belongs to
  * @param partitionId index of the number in the RDD
  */
-private[spark] abstract class Task[T](val stageId: Int, var partitionId: Int) extends Serializable {
+private[spark] abstract class Task[T](
+    val stageId: Int,
+    val stageAttemptId: Int,
+    var partitionId: Int) extends Serializable {
 
   /**
    * The key of the Map is the accumulator id and the value of the Map is the latest accumulator
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
index ed3dde0fc3055..1705e7f962de2 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
@@ -75,9 +75,9 @@ private[spark] class TaskSchedulerImpl(
 
   // TaskSetManagers are not thread safe, so any access to one should be synchronized
   // on this class.
-  val activeTaskSets = new HashMap[String, TaskSetManager]
+  private val taskSetsByStageIdAndAttempt = new HashMap[Int, HashMap[Int, TaskSetManager]]
 
-  val taskIdToTaskSetId = new HashMap[Long, String]
+  private[scheduler] val taskIdToTaskSetManager = new HashMap[Long, TaskSetManager]
   val taskIdToExecutorId = new HashMap[Long, String]
 
   @volatile private var hasReceivedTask = false
@@ -162,7 +162,17 @@ private[spark] class TaskSchedulerImpl(
     logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
     this.synchronized {
       val manager = createTaskSetManager(taskSet, maxTaskFailures)
-      activeTaskSets(taskSet.id) = manager
+      val stage = taskSet.stageId
+      val stageTaskSets =
+        taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
+      stageTaskSets(taskSet.stageAttemptId) = manager
+      val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
+        ts.taskSet != taskSet && !ts.isZombie
+      }
+      if (conflictingTaskSet) {
+        throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
+          s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
+      }
       schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
 
       if (!isLocal && !hasReceivedTask) {
@@ -192,19 +202,21 @@ private[spark] class TaskSchedulerImpl(
 
   override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = synchronized {
     logInfo("Cancelling stage " + stageId)
-    activeTaskSets.find(_._2.stageId == stageId).foreach { case (_, tsm) =>
-      // There are two possible cases here:
-      // 1. The task set manager has been created and some tasks have been scheduled.
-      //    In this case, send a kill signal to the executors to kill the task and then abort
-      //    the stage.
-      // 2. The task set manager has been created but no tasks has been scheduled. In this case,
-      //    simply abort the stage.
-      tsm.runningTasksSet.foreach { tid =>
-        val execId = taskIdToExecutorId(tid)
-        backend.killTask(tid, execId, interruptThread)
+    taskSetsByStageIdAndAttempt.get(stageId).foreach { attempts =>
+      attempts.foreach { case (_, tsm) =>
+        // There are two possible cases here:
+        // 1. The task set manager has been created and some tasks have been scheduled.
+        //    In this case, send a kill signal to the executors to kill the task and then abort
+        //    the stage.
+        // 2. The task set manager has been created but no tasks has been scheduled. In this case,
+        //    simply abort the stage.
+        tsm.runningTasksSet.foreach { tid =>
+          val execId = taskIdToExecutorId(tid)
+          backend.killTask(tid, execId, interruptThread)
+        }
+        tsm.abort("Stage %s cancelled".format(stageId))
+        logInfo("Stage %d was cancelled".format(stageId))
       }
-      tsm.abort("Stage %s cancelled".format(stageId))
-      logInfo("Stage %d was cancelled".format(stageId))
     }
   }
 
@@ -214,7 +226,12 @@ private[spark] class TaskSchedulerImpl(
    * cleaned up.
    */
   def taskSetFinished(manager: TaskSetManager): Unit = synchronized {
-    activeTaskSets -= manager.taskSet.id
+    taskSetsByStageIdAndAttempt.get(manager.taskSet.stageId).foreach { taskSetsForStage =>
+      taskSetsForStage -= manager.taskSet.stageAttemptId
+      if (taskSetsForStage.isEmpty) {
+        taskSetsByStageIdAndAttempt -= manager.taskSet.stageId
+      }
+    }
     manager.parent.removeSchedulable(manager)
     logInfo("Removed TaskSet %s, whose tasks have all completed, from pool %s"
       .format(manager.taskSet.id, manager.parent.name))
@@ -235,7 +252,7 @@ private[spark] class TaskSchedulerImpl(
           for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
             tasks(i) += task
             val tid = task.taskId
-            taskIdToTaskSetId(tid) = taskSet.taskSet.id
+            taskIdToTaskSetManager(tid) = taskSet
             taskIdToExecutorId(tid) = execId
             executorsByHost(host) += execId
             availableCpus(i) -= CPUS_PER_TASK
@@ -319,26 +336,24 @@ private[spark] class TaskSchedulerImpl(
             failedExecutor = Some(execId)
           }
         }
-        taskIdToTaskSetId.get(tid) match {
-          case Some(taskSetId) =>
+        taskIdToTaskSetManager.get(tid) match {
+          case Some(taskSet) =>
             if (TaskState.isFinished(state)) {
-              taskIdToTaskSetId.remove(tid)
+              taskIdToTaskSetManager.remove(tid)
               taskIdToExecutorId.remove(tid)
             }
-            activeTaskSets.get(taskSetId).foreach { taskSet =>
-              if (state == TaskState.FINISHED) {
-                taskSet.removeRunningTask(tid)
-                taskResultGetter.enqueueSuccessfulTask(taskSet, tid, serializedData)
-              } else if (Set(TaskState.FAILED, TaskState.KILLED, TaskState.LOST).contains(state)) {
-                taskSet.removeRunningTask(tid)
-                taskResultGetter.enqueueFailedTask(taskSet, tid, state, serializedData)
-              }
+            if (state == TaskState.FINISHED) {
+              taskSet.removeRunningTask(tid)
+              taskResultGetter.enqueueSuccessfulTask(taskSet, tid, serializedData)
+            } else if (Set(TaskState.FAILED, TaskState.KILLED, TaskState.LOST).contains(state)) {
+              taskSet.removeRunningTask(tid)
+              taskResultGetter.enqueueFailedTask(taskSet, tid, state, serializedData)
             }
           case None =>
             logError(
               ("Ignoring update with state %s for TID %s because its task set is gone (this is " +
-               "likely the result of receiving duplicate task finished status updates)")
-              .format(state, tid))
+                "likely the result of receiving duplicate task finished status updates)")
+                .format(state, tid))
         }
       } catch {
         case e: Exception => logError("Exception in statusUpdate", e)
@@ -363,9 +378,9 @@ private[spark] class TaskSchedulerImpl(
 
     val metricsWithStageIds: Array[(Long, Int, Int, TaskMetrics)] = synchronized {
       taskMetrics.flatMap { case (id, metrics) =>
-        taskIdToTaskSetId.get(id)
-          .flatMap(activeTaskSets.get)
-          .map(taskSetMgr => (id, taskSetMgr.stageId, taskSetMgr.taskSet.attempt, metrics))
+        taskIdToTaskSetManager.get(id).map { taskSetMgr =>
+          (id, taskSetMgr.stageId, taskSetMgr.taskSet.stageAttemptId, metrics)
+        }
       }
     }
     dagScheduler.executorHeartbeatReceived(execId, metricsWithStageIds, blockManagerId)
@@ -397,9 +412,12 @@ private[spark] class TaskSchedulerImpl(
 
   def error(message: String) {
     synchronized {
-      if (activeTaskSets.nonEmpty) {
+      if (taskSetsByStageIdAndAttempt.nonEmpty) {
         // Have each task set throw a SparkException with the error
-        for ((taskSetId, manager) <- activeTaskSets) {
+        for {
+          attempts <- taskSetsByStageIdAndAttempt.values
+          manager <- attempts.values
+        } {
           try {
             manager.abort(message)
           } catch {
@@ -520,6 +538,17 @@ private[spark] class TaskSchedulerImpl(
 
   override def applicationAttemptId(): Option[String] = backend.applicationAttemptId()
 
+  private[scheduler] def taskSetManagerForAttempt(
+      stageId: Int,
+      stageAttemptId: Int): Option[TaskSetManager] = {
+    for {
+      attempts <- taskSetsByStageIdAndAttempt.get(stageId)
+      manager <- attempts.get(stageAttemptId)
+    } yield {
+      manager
+    }
+  }
+
 }
 
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala
index c3ad325156f53..be8526ba9b94f 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSet.scala
@@ -26,10 +26,10 @@ import java.util.Properties
 private[spark] class TaskSet(
     val tasks: Array[Task[_]],
     val stageId: Int,
-    val attempt: Int,
+    val stageAttemptId: Int,
     val priority: Int,
     val properties: Properties) {
-    val id: String = stageId + "." + attempt
+    val id: String = stageId + "." + stageAttemptId
 
   override def toString: String = "TaskSet " + id
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index 0e3215d6e9ec8..f14c603ac6891 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -191,15 +191,14 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
       for (task <- tasks.flatten) {
         val serializedTask = ser.serialize(task)
         if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
-          val taskSetId = scheduler.taskIdToTaskSetId(task.taskId)
-          scheduler.activeTaskSets.get(taskSetId).foreach { taskSet =>
+          scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
             try {
               var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
                 "spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
                 "spark.akka.frameSize or using broadcast variables for large values."
               msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
                 AkkaUtils.reservedSizeBytes)
-              taskSet.abort(msg)
+              taskSetMgr.abort(msg)
             } catch {
               case e: Exception => logError("Exception in error callback", e)
             }
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index 4f2b0fa162b72..86728cb2b62af 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -101,9 +101,15 @@ class DAGSchedulerSuite
   /** Length of time to wait while draining listener events. */
   val WAIT_TIMEOUT_MILLIS = 10000
   val sparkListener = new SparkListener() {
+    val submittedStageInfos = new HashSet[StageInfo]
     val successfulStages = new HashSet[Int]
     val failedStages = new ArrayBuffer[Int]
     val stageByOrderOfExecution = new ArrayBuffer[Int]
+
+    override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted) {
+      submittedStageInfos += stageSubmitted.stageInfo
+    }
+
     override def onStageCompleted(stageCompleted: SparkListenerStageCompleted) {
       val stageInfo = stageCompleted.stageInfo
       stageByOrderOfExecution += stageInfo.stageId
@@ -150,6 +156,7 @@ class DAGSchedulerSuite
     // Enable local execution for this test
     val conf = new SparkConf().set("spark.localExecution.enabled", "true")
     sc = new SparkContext("local", "DAGSchedulerSuite", conf)
+    sparkListener.submittedStageInfos.clear()
     sparkListener.successfulStages.clear()
     sparkListener.failedStages.clear()
     failure = null
@@ -547,6 +554,140 @@ class DAGSchedulerSuite
     assert(sparkListener.failedStages.size == 1)
   }
 
+  /**
+   * This tests the case where another FetchFailed comes in while the map stage is getting
+   * re-run.
+   */
+  test("late fetch failures don't cause multiple concurrent attempts for the same map stage") {
+    val shuffleMapRdd = new MyRDD(sc, 2, Nil)
+    val shuffleDep = new ShuffleDependency(shuffleMapRdd, null)
+    val shuffleId = shuffleDep.shuffleId
+    val reduceRdd = new MyRDD(sc, 2, List(shuffleDep))
+    submit(reduceRdd, Array(0, 1))
+
+    val mapStageId = 0
+    def countSubmittedMapStageAttempts(): Int = {
+      sparkListener.submittedStageInfos.count(_.stageId == mapStageId)
+    }
+
+    // The map stage should have been submitted.
+    assert(countSubmittedMapStageAttempts() === 1)
+
+    complete(taskSets(0), Seq(
+      (Success, makeMapStatus("hostA", 2)),
+      (Success, makeMapStatus("hostB", 2))))
+    // The MapOutputTracker should know about both map output locations.
+    assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1.host) ===
+      Array("hostA", "hostB"))
+    assert(mapOutputTracker.getServerStatuses(shuffleId, 1).map(_._1.host) ===
+      Array("hostA", "hostB"))
+
+    // The first result task fails, with a fetch failure for the output from the first mapper.
+    runEvent(CompletionEvent(
+      taskSets(1).tasks(0),
+      FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"),
+      null,
+      Map[Long, Any](),
+      createFakeTaskInfo(),
+      null))
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+    assert(sparkListener.failedStages.contains(1))
+
+    // Trigger resubmission of the failed map stage.
+    runEvent(ResubmitFailedStages)
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+
+    // Another attempt for the map stage should have been submitted, resulting in 2 total attempts.
+    assert(countSubmittedMapStageAttempts() === 2)
+
+    // The second ResultTask fails, with a fetch failure for the output from the second mapper.
+    runEvent(CompletionEvent(
+      taskSets(1).tasks(1),
+      FetchFailed(makeBlockManagerId("hostB"), shuffleId, 1, 1, "ignored"),
+      null,
+      Map[Long, Any](),
+      createFakeTaskInfo(),
+      null))
+
+    // Another ResubmitFailedStages event should not result in another attempt for the map
+    // stage being run concurrently.
+    // NOTE: the actual ResubmitFailedStages may get called at any time during this, but it
+    // shouldn't effect anything -- our calling it just makes *SURE* it gets called between the
+    // desired event and our check.
+    runEvent(ResubmitFailedStages)
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+    assert(countSubmittedMapStageAttempts() === 2)
+
+  }
+
+  /**
+    * This tests the case where a late FetchFailed comes in after the map stage has finished getting
+    * retried and a new reduce stage starts running.
+    */
+  test("extremely late fetch failures don't cause multiple concurrent attempts for " +
+      "the same stage") {
+    val shuffleMapRdd = new MyRDD(sc, 2, Nil)
+    val shuffleDep = new ShuffleDependency(shuffleMapRdd, null)
+    val shuffleId = shuffleDep.shuffleId
+    val reduceRdd = new MyRDD(sc, 2, List(shuffleDep))
+    submit(reduceRdd, Array(0, 1))
+
+    def countSubmittedReduceStageAttempts(): Int = {
+      sparkListener.submittedStageInfos.count(_.stageId == 1)
+    }
+    def countSubmittedMapStageAttempts(): Int = {
+      sparkListener.submittedStageInfos.count(_.stageId == 0)
+    }
+
+    // The map stage should have been submitted.
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+    assert(countSubmittedMapStageAttempts() === 1)
+
+    // Complete the map stage.
+    complete(taskSets(0), Seq(
+      (Success, makeMapStatus("hostA", 2)),
+      (Success, makeMapStatus("hostB", 2))))
+
+    // The reduce stage should have been submitted.
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+    assert(countSubmittedReduceStageAttempts() === 1)
+
+    // The first result task fails, with a fetch failure for the output from the first mapper.
+    runEvent(CompletionEvent(
+      taskSets(1).tasks(0),
+      FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"),
+      null,
+      Map[Long, Any](),
+      createFakeTaskInfo(),
+      null))
+
+    // Trigger resubmission of the failed map stage and finish the re-started map task.
+    runEvent(ResubmitFailedStages)
+    complete(taskSets(2), Seq((Success, makeMapStatus("hostA", 1))))
+
+    // Because the map stage finished, another attempt for the reduce stage should have been
+    // submitted, resulting in 2 total attempts for each the map and the reduce stage.
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+    assert(countSubmittedMapStageAttempts() === 2)
+    assert(countSubmittedReduceStageAttempts() === 2)
+
+    // A late FetchFailed arrives from the second task in the original reduce stage.
+    runEvent(CompletionEvent(
+      taskSets(1).tasks(1),
+      FetchFailed(makeBlockManagerId("hostB"), shuffleId, 1, 1, "ignored"),
+      null,
+      Map[Long, Any](),
+      createFakeTaskInfo(),
+      null))
+
+    // Running ResubmitFailedStages shouldn't result in any more attempts for the map stage, because
+    // the FetchFailed should have been ignored
+    runEvent(ResubmitFailedStages)
+
+    // The FetchFailed from the original reduce stage should be ignored.
+    assert(countSubmittedMapStageAttempts() === 2)
+  }
+
   test("ignore late map task completions") {
     val shuffleMapRdd = new MyRDD(sc, 2, Nil)
     val shuffleDep = new ShuffleDependency(shuffleMapRdd, null)
diff --git a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala
index 0a7cb69416a08..b3ca150195a5f 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala
@@ -19,7 +19,7 @@ package org.apache.spark.scheduler
 
 import org.apache.spark.TaskContext
 
-class FakeTask(stageId: Int, prefLocs: Seq[TaskLocation] = Nil) extends Task[Int](stageId, 0) {
+class FakeTask(stageId: Int, prefLocs: Seq[TaskLocation] = Nil) extends Task[Int](stageId, 0, 0) {
   override def runTask(context: TaskContext): Int = 0
 
   override def preferredLocations: Seq[TaskLocation] = prefLocs
@@ -31,12 +31,16 @@ object FakeTask {
    * locations for each task (given as varargs) if this sequence is not empty.
    */
   def createTaskSet(numTasks: Int, prefLocs: Seq[TaskLocation]*): TaskSet = {
+    createTaskSet(numTasks, 0, prefLocs: _*)
+  }
+
+  def createTaskSet(numTasks: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = {
     if (prefLocs.size != 0 && prefLocs.size != numTasks) {
       throw new IllegalArgumentException("Wrong number of task locations")
     }
     val tasks = Array.tabulate[Task[_]](numTasks) { i =>
       new FakeTask(i, if (prefLocs.size != 0) prefLocs(i) else Nil)
     }
-    new TaskSet(tasks, 0, 0, 0, null)
+    new TaskSet(tasks, 0, stageAttemptId, 0, null)
   }
 }
diff --git a/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala b/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala
index 9b92f8de56759..383855caefa2f 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala
@@ -25,7 +25,7 @@ import org.apache.spark.TaskContext
  * A Task implementation that fails to serialize.
  */
 private[spark] class NotSerializableFakeTask(myId: Int, stageId: Int)
-  extends Task[Array[Byte]](stageId, 0) {
+  extends Task[Array[Byte]](stageId, 0, 0) {
 
   override def runTask(context: TaskContext): Array[Byte] = Array.empty[Byte]
   override def preferredLocations: Seq[TaskLocation] = Seq[TaskLocation]()
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
index 7c1adc1aef1b6..b9b0eccb0d834 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
@@ -41,8 +41,8 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark
     }
     val closureSerializer = SparkEnv.get.closureSerializer.newInstance()
     val func = (c: TaskContext, i: Iterator[String]) => i.next()
-    val task = new ResultTask[String, String](
-      0, sc.broadcast(closureSerializer.serialize((rdd, func)).array), rdd.partitions(0), Seq(), 0)
+    val task = new ResultTask[String, String](0, 0,
+      sc.broadcast(closureSerializer.serialize((rdd, func)).array), rdd.partitions(0), Seq(), 0)
     intercept[RuntimeException] {
       task.run(0, 0)
     }
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
index a6d5232feb8de..c2edd4c317d6e 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
@@ -33,7 +33,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with L
     val taskScheduler = new TaskSchedulerImpl(sc)
     taskScheduler.initialize(new FakeSchedulerBackend)
     // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks.
-    val dagScheduler = new DAGScheduler(sc, taskScheduler) {
+    new DAGScheduler(sc, taskScheduler) {
       override def taskStarted(task: Task[_], taskInfo: TaskInfo) {}
       override def executorAdded(execId: String, host: String) {}
     }
@@ -67,7 +67,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with L
     val taskScheduler = new TaskSchedulerImpl(sc)
     taskScheduler.initialize(new FakeSchedulerBackend)
     // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks.
-    val dagScheduler = new DAGScheduler(sc, taskScheduler) {
+    new DAGScheduler(sc, taskScheduler) {
       override def taskStarted(task: Task[_], taskInfo: TaskInfo) {}
       override def executorAdded(execId: String, host: String) {}
     }
@@ -128,4 +128,113 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with L
     assert(taskDescriptions.map(_.executorId) === Seq("executor0"))
   }
 
+  test("refuse to schedule concurrent attempts for the same stage (SPARK-8103)") {
+    sc = new SparkContext("local", "TaskSchedulerImplSuite")
+    val taskScheduler = new TaskSchedulerImpl(sc)
+    taskScheduler.initialize(new FakeSchedulerBackend)
+    // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks.
+    val dagScheduler = new DAGScheduler(sc, taskScheduler) {
+      override def taskStarted(task: Task[_], taskInfo: TaskInfo) {}
+      override def executorAdded(execId: String, host: String) {}
+    }
+    taskScheduler.setDAGScheduler(dagScheduler)
+    val attempt1 = FakeTask.createTaskSet(1, 0)
+    val attempt2 = FakeTask.createTaskSet(1, 1)
+    taskScheduler.submitTasks(attempt1)
+    intercept[IllegalStateException] { taskScheduler.submitTasks(attempt2) }
+
+    // OK to submit multiple if previous attempts are all zombie
+    taskScheduler.taskSetManagerForAttempt(attempt1.stageId, attempt1.stageAttemptId)
+      .get.isZombie = true
+    taskScheduler.submitTasks(attempt2)
+    val attempt3 = FakeTask.createTaskSet(1, 2)
+    intercept[IllegalStateException] { taskScheduler.submitTasks(attempt3) }
+    taskScheduler.taskSetManagerForAttempt(attempt2.stageId, attempt2.stageAttemptId)
+      .get.isZombie = true
+    taskScheduler.submitTasks(attempt3)
+  }
+
+  test("don't schedule more tasks after a taskset is zombie") {
+    sc = new SparkContext("local", "TaskSchedulerImplSuite")
+    val taskScheduler = new TaskSchedulerImpl(sc)
+    taskScheduler.initialize(new FakeSchedulerBackend)
+    // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks.
+    new DAGScheduler(sc, taskScheduler) {
+      override def taskStarted(task: Task[_], taskInfo: TaskInfo) {}
+      override def executorAdded(execId: String, host: String) {}
+    }
+
+    val numFreeCores = 1
+    val workerOffers = Seq(new WorkerOffer("executor0", "host0", numFreeCores))
+    val attempt1 = FakeTask.createTaskSet(10)
+
+    // submit attempt 1, offer some resources, some tasks get scheduled
+    taskScheduler.submitTasks(attempt1)
+    val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten
+    assert(1 === taskDescriptions.length)
+
+    // now mark attempt 1 as a zombie
+    taskScheduler.taskSetManagerForAttempt(attempt1.stageId, attempt1.stageAttemptId)
+      .get.isZombie = true
+
+    // don't schedule anything on another resource offer
+    val taskDescriptions2 = taskScheduler.resourceOffers(workerOffers).flatten
+    assert(0 === taskDescriptions2.length)
+
+    // if we schedule another attempt for the same stage, it should get scheduled
+    val attempt2 = FakeTask.createTaskSet(10, 1)
+
+    // submit attempt 2, offer some resources, some tasks get scheduled
+    taskScheduler.submitTasks(attempt2)
+    val taskDescriptions3 = taskScheduler.resourceOffers(workerOffers).flatten
+    assert(1 === taskDescriptions3.length)
+    val mgr = taskScheduler.taskIdToTaskSetManager.get(taskDescriptions3(0).taskId).get
+    assert(mgr.taskSet.stageAttemptId === 1)
+  }
+
+  test("if a zombie attempt finishes, continue scheduling tasks for non-zombie attempts") {
+    sc = new SparkContext("local", "TaskSchedulerImplSuite")
+    val taskScheduler = new TaskSchedulerImpl(sc)
+    taskScheduler.initialize(new FakeSchedulerBackend)
+    // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks.
+    new DAGScheduler(sc, taskScheduler) {
+      override def taskStarted(task: Task[_], taskInfo: TaskInfo) {}
+      override def executorAdded(execId: String, host: String) {}
+    }
+
+    val numFreeCores = 10
+    val workerOffers = Seq(new WorkerOffer("executor0", "host0", numFreeCores))
+    val attempt1 = FakeTask.createTaskSet(10)
+
+    // submit attempt 1, offer some resources, some tasks get scheduled
+    taskScheduler.submitTasks(attempt1)
+    val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten
+    assert(10 === taskDescriptions.length)
+
+    // now mark attempt 1 as a zombie
+    val mgr1 = taskScheduler.taskSetManagerForAttempt(attempt1.stageId, attempt1.stageAttemptId).get
+    mgr1.isZombie = true
+
+    // don't schedule anything on another resource offer
+    val taskDescriptions2 = taskScheduler.resourceOffers(workerOffers).flatten
+    assert(0 === taskDescriptions2.length)
+
+    // submit attempt 2
+    val attempt2 = FakeTask.createTaskSet(10, 1)
+    taskScheduler.submitTasks(attempt2)
+
+    // attempt 1 finished (this can happen even if it was marked zombie earlier -- all tasks were
+    // already submitted, and then they finish)
+    taskScheduler.taskSetFinished(mgr1)
+
+    // now with another resource offer, we should still schedule all the tasks in attempt2
+    val taskDescriptions3 = taskScheduler.resourceOffers(workerOffers).flatten
+    assert(10 === taskDescriptions3.length)
+
+    taskDescriptions3.foreach { task =>
+      val mgr = taskScheduler.taskIdToTaskSetManager.get(task.taskId).get
+      assert(mgr.taskSet.stageAttemptId === 1)
+    }
+  }
+
 }
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
index cdae0d83d01dc..3abb99c4b2b54 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
@@ -136,7 +136,7 @@ class FakeTaskScheduler(sc: SparkContext, liveExecutors: (String, String)* /* ex
 /**
  * A Task implementation that results in a large serialized task.
  */
-class LargeTask(stageId: Int) extends Task[Array[Byte]](stageId, 0) {
+class LargeTask(stageId: Int) extends Task[Array[Byte]](stageId, 0, 0) {
   val randomBuffer = new Array[Byte](TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024)
   val random = new Random(0)
   random.nextBytes(randomBuffer)

From 02181fb6d14833448fb5c501045655213d3cf340 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Bu=C5=9Bkiewicz?= <mateusz.buskiewicz@getbase.com>
Date: Mon, 20 Jul 2015 12:00:48 -0700
Subject: [PATCH 267/549] [SPARK-9101] [PySpark] Add missing NullType
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

JIRA: https://issues.apache.org/jira/browse/SPARK-9101

Author: Mateusz Buśkiewicz <mateusz.buskiewicz@getbase.com>

Closes #7499 from sixers/spark-9101 and squashes the following commits:

dd75aa6 [Mateusz Buśkiewicz] [SPARK-9101] [PySpark] Test for selecting null literal
97e3f2f [Mateusz Buśkiewicz] [SPARK-9101] [PySpark] Add missing NullType to _atomic_types in pyspark.sql.types
---
 python/pyspark/sql/tests.py | 4 ++++
 python/pyspark/sql/types.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 86706e2dc41a3..7a55d801e48e6 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -333,6 +333,10 @@ def test_infer_nested_schema(self):
         df = self.sqlCtx.inferSchema(rdd)
         self.assertEquals(Row(field1=1, field2=u'row1'), df.first())
 
+    def test_select_null_literal(self):
+        df = self.sqlCtx.sql("select null as col")
+        self.assertEquals(Row(col=None), df.first())
+
     def test_apply_schema(self):
         from datetime import date, datetime
         rdd = self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index f75791fad1612..10ad89ea14a8d 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -642,7 +642,7 @@ def __eq__(self, other):
 
 
 _atomic_types = [StringType, BinaryType, BooleanType, DecimalType, FloatType, DoubleType,
-                 ByteType, ShortType, IntegerType, LongType, DateType, TimestampType]
+                 ByteType, ShortType, IntegerType, LongType, DateType, TimestampType, NullType]
 _all_atomic_types = dict((t.typeName(), t) for t in _atomic_types)
 _all_complex_types = dict((v.typeName(), v)
                           for v in [ArrayType, MapType, StructType])

From 9f913c4fd6f0f223fd378e453d5b9a87beda1ac4 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 20 Jul 2015 12:14:47 -0700
Subject: [PATCH 268/549] [SPARK-9114] [SQL] [PySpark] convert returned object
 from UDF into internal type

This PR also remove the duplicated code between registerFunction and UserDefinedFunction.

cc JoshRosen

Author: Davies Liu <davies@databricks.com>

Closes #7450 from davies/fix_return_type and squashes the following commits:

e80bf9f [Davies Liu] remove debugging code
f94b1f6 [Davies Liu] fix mima
8f9c58b [Davies Liu] convert returned object from UDF into internal type
---
 project/MimaExcludes.scala                    |  4 +-
 python/pyspark/sql/context.py                 | 16 ++-----
 python/pyspark/sql/functions.py               | 15 ++++---
 python/pyspark/sql/tests.py                   |  4 +-
 .../apache/spark/sql/UDFRegistration.scala    | 44 ++++---------------
 .../spark/sql/UserDefinedFunction.scala       | 10 +++--
 6 files changed, 32 insertions(+), 61 deletions(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index dd852547492aa..a2595ff6c22f4 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -69,7 +69,9 @@ object MimaExcludes {
             ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.json.JsonRDD"),
             // local function inside a method
             ProblemFilters.exclude[MissingMethodProblem](
-              "org.apache.spark.sql.SQLContext.org$apache$spark$sql$SQLContext$$needsConversion$1")
+              "org.apache.spark.sql.SQLContext.org$apache$spark$sql$SQLContext$$needsConversion$1"),
+            ProblemFilters.exclude[MissingMethodProblem](
+              "org.apache.spark.sql.UDFRegistration.org$apache$spark$sql$UDFRegistration$$builder$24")
           ) ++ Seq(
             // SPARK-8479 Add numNonzeros and numActives to Matrix.
             ProblemFilters.exclude[MissingMethodProblem](
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index c93a15badae29..abb6522dde7b0 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -34,6 +34,7 @@
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.readwriter import DataFrameReader
 from pyspark.sql.utils import install_exception_handler
+from pyspark.sql.functions import UserDefinedFunction
 
 try:
     import pandas
@@ -191,19 +192,8 @@ def registerFunction(self, name, f, returnType=StringType()):
         >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
         [Row(_c0=4)]
         """
-        func = lambda _, it: map(lambda x: f(*x), it)
-        ser = AutoBatchedSerializer(PickleSerializer())
-        command = (func, None, ser, ser)
-        pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(self._sc, command, self)
-        self._ssql_ctx.udf().registerPython(name,
-                                            bytearray(pickled_cmd),
-                                            env,
-                                            includes,
-                                            self._sc.pythonExec,
-                                            self._sc.pythonVer,
-                                            bvars,
-                                            self._sc._javaAccumulator,
-                                            returnType.json())
+        udf = UserDefinedFunction(f, returnType, name)
+        self._ssql_ctx.udf().registerPython(name, udf._judf)
 
     def _inferSchemaFromList(self, data):
         """
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index fd5a3ba8adab3..031745a1c4d3b 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -801,23 +801,24 @@ class UserDefinedFunction(object):
 
     .. versionadded:: 1.3
     """
-    def __init__(self, func, returnType):
+    def __init__(self, func, returnType, name=None):
         self.func = func
         self.returnType = returnType
         self._broadcast = None
-        self._judf = self._create_judf()
+        self._judf = self._create_judf(name)
 
-    def _create_judf(self):
-        f = self.func  # put it in closure `func`
-        func = lambda _, it: map(lambda x: f(*x), it)
+    def _create_judf(self, name):
+        f, returnType = self.func, self.returnType  # put them in closure `func`
+        func = lambda _, it: map(lambda x: returnType.toInternal(f(*x)), it)
         ser = AutoBatchedSerializer(PickleSerializer())
         command = (func, None, ser, ser)
         sc = SparkContext._active_spark_context
         pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
         ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
         jdt = ssql_ctx.parseDataType(self.returnType.json())
-        fname = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
-        judf = sc._jvm.UserDefinedPythonFunction(fname, bytearray(pickled_command), env, includes,
+        if name is None:
+            name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
+        judf = sc._jvm.UserDefinedPythonFunction(name, bytearray(pickled_command), env, includes,
                                                  sc.pythonExec, sc.pythonVer, broadcast_vars,
                                                  sc._javaAccumulator, jdt)
         return judf
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 7a55d801e48e6..ea821f486f13a 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -417,12 +417,14 @@ def test_apply_schema_with_udt(self):
         self.assertEquals(point, ExamplePoint(1.0, 2.0))
 
     def test_udf_with_udt(self):
-        from pyspark.sql.tests import ExamplePoint
+        from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
         row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
         df = self.sc.parallelize([row]).toDF()
         self.assertEqual(1.0, df.map(lambda r: r.point.x).first())
         udf = UserDefinedFunction(lambda p: p.y, DoubleType())
         self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
+        udf2 = UserDefinedFunction(lambda p: ExamplePoint(p.x + 1, p.y + 1), ExamplePointUDT())
+        self.assertEqual(ExamplePoint(2.0, 3.0), df.select(udf2(df.point)).first()[0])
 
     def test_parquet_with_udt(self):
         from pyspark.sql.tests import ExamplePoint
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
index d35d37d017198..7cd7421a518c9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
@@ -22,13 +22,10 @@ import java.util.{List => JList, Map => JMap}
 import scala.reflect.runtime.universe.TypeTag
 import scala.util.Try
 
-import org.apache.spark.{Accumulator, Logging}
-import org.apache.spark.api.python.PythonBroadcast
-import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.Logging
 import org.apache.spark.sql.api.java._
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF}
-import org.apache.spark.sql.execution.PythonUDF
 import org.apache.spark.sql.types.DataType
 
 /**
@@ -40,44 +37,19 @@ class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging {
 
   private val functionRegistry = sqlContext.functionRegistry
 
-  protected[sql] def registerPython(
-      name: String,
-      command: Array[Byte],
-      envVars: JMap[String, String],
-      pythonIncludes: JList[String],
-      pythonExec: String,
-      pythonVer: String,
-      broadcastVars: JList[Broadcast[PythonBroadcast]],
-      accumulator: Accumulator[JList[Array[Byte]]],
-      stringDataType: String): Unit = {
+  protected[sql] def registerPython(name: String, udf: UserDefinedPythonFunction): Unit = {
     log.debug(
       s"""
         | Registering new PythonUDF:
         | name: $name
-        | command: ${command.toSeq}
-        | envVars: $envVars
-        | pythonIncludes: $pythonIncludes
-        | pythonExec: $pythonExec
-        | dataType: $stringDataType
+        | command: ${udf.command.toSeq}
+        | envVars: ${udf.envVars}
+        | pythonIncludes: ${udf.pythonIncludes}
+        | pythonExec: ${udf.pythonExec}
+        | dataType: ${udf.dataType}
       """.stripMargin)
 
-
-    val dataType = sqlContext.parseDataType(stringDataType)
-
-    def builder(e: Seq[Expression]): PythonUDF =
-      PythonUDF(
-        name,
-        command,
-        envVars,
-        pythonIncludes,
-        pythonExec,
-        pythonVer,
-        broadcastVars,
-        accumulator,
-        dataType,
-        e)
-
-    functionRegistry.registerFunction(name, builder)
+    functionRegistry.registerFunction(name, udf.builder)
   }
 
   // scalastyle:off
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala
index b14e00ab9b163..0f8cd280b5acb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala
@@ -23,7 +23,7 @@ import org.apache.spark.Accumulator
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.python.PythonBroadcast
 import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.sql.catalyst.expressions.ScalaUDF
+import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF}
 import org.apache.spark.sql.execution.PythonUDF
 import org.apache.spark.sql.types.DataType
 
@@ -66,10 +66,14 @@ private[sql] case class UserDefinedPythonFunction(
     accumulator: Accumulator[JList[Array[Byte]]],
     dataType: DataType) {
 
+  def builder(e: Seq[Expression]): PythonUDF = {
+    PythonUDF(name, command, envVars, pythonIncludes, pythonExec, pythonVer, broadcastVars,
+      accumulator, dataType, e)
+  }
+
   /** Returns a [[Column]] that will evaluate to calling this UDF with the given input. */
   def apply(exprs: Column*): Column = {
-    val udf = PythonUDF(name, command, envVars, pythonIncludes, pythonExec, pythonVer,
-      broadcastVars, accumulator, dataType, exprs.map(_.expr))
+    val udf = builder(exprs.map(_.expr))
     Column(udf)
   }
 }

From dde0e12f32e3a0448d8308ec78ad59cbb2c55d23 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Mon, 20 Jul 2015 15:12:06 -0700
Subject: [PATCH 269/549] [SPARK-6910] [SQL] Support for pushing predicates
 down to metastore for partition pruning

This PR forks PR #7421 authored by piaozhexiu and adds [a workaround] [1] for fixing the occasional test failures occurred in PR #7421. Please refer to these [two] [2] [comments] [3] for details.

[1]: https://github.com/liancheng/spark/commit/536ac41a7e6b2abeb1f6ec1a6491bbf09ed3e591
[2]: https://github.com/apache/spark/pull/7421#issuecomment-122527391
[3]: https://github.com/apache/spark/pull/7421#issuecomment-122528059

Author: Cheolsoo Park <cheolsoop@netflix.com>
Author: Cheng Lian <lian@databricks.com>
Author: Michael Armbrust <michael@databricks.com>

Closes #7492 from liancheng/pr-7421-workaround and squashes the following commits:

5599cc4 [Cheolsoo Park] Predicate pushdown to hive metastore
536ac41 [Cheng Lian] Sets hive.metastore.integral.jdo.pushdown to true to workaround test failures caused by in #7421
---
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 58 +++++++-------
 .../org/apache/spark/sql/hive/HiveShim.scala  |  1 +
 .../spark/sql/hive/HiveStrategies.scala       |  4 +-
 .../sql/hive/client/ClientInterface.scala     | 11 ++-
 .../spark/sql/hive/client/ClientWrapper.scala | 21 ++---
 .../spark/sql/hive/client/HiveShim.scala      | 71 ++++++++++++++++-
 .../sql/hive/execution/HiveTableScan.scala    |  7 +-
 .../apache/spark/sql/hive/test/TestHive.scala |  5 +-
 .../spark/sql/hive/client/FiltersSuite.scala  | 78 +++++++++++++++++++
 .../spark/sql/hive/client/VersionsSuite.scala |  8 ++
 .../sql/hive/execution/PruningSuite.scala     |  2 +-
 11 files changed, 221 insertions(+), 45 deletions(-)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 6589bc6ea2921..b15261b7914dd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -301,7 +301,9 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     val result = if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
       val partitionColumnDataTypes = partitionSchema.map(_.dataType)
-      val partitions = metastoreRelation.hiveQlPartitions.map { p =>
+      // We're converting the entire table into ParquetRelation, so predicates to Hive metastore
+      // are empty.
+      val partitions = metastoreRelation.getHiveQlPartitions().map { p =>
         val location = p.getLocation
         val values = InternalRow.fromSeq(p.getValues.zip(partitionColumnDataTypes).map {
           case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null)
@@ -642,32 +644,6 @@ private[hive] case class MetastoreRelation
     new Table(tTable)
   }
 
-  @transient val hiveQlPartitions: Seq[Partition] = table.getAllPartitions.map { p =>
-    val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
-    tPartition.setDbName(databaseName)
-    tPartition.setTableName(tableName)
-    tPartition.setValues(p.values)
-
-    val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor()
-    tPartition.setSd(sd)
-    sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)))
-
-    sd.setLocation(p.storage.location)
-    sd.setInputFormat(p.storage.inputFormat)
-    sd.setOutputFormat(p.storage.outputFormat)
-
-    val serdeInfo = new org.apache.hadoop.hive.metastore.api.SerDeInfo
-    sd.setSerdeInfo(serdeInfo)
-    serdeInfo.setSerializationLib(p.storage.serde)
-
-    val serdeParameters = new java.util.HashMap[String, String]()
-    serdeInfo.setParameters(serdeParameters)
-    table.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
-    p.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
-
-    new Partition(hiveQlTable, tPartition)
-  }
-
   @transient override lazy val statistics: Statistics = Statistics(
     sizeInBytes = {
       val totalSize = hiveQlTable.getParameters.get(StatsSetupConst.TOTAL_SIZE)
@@ -688,6 +664,34 @@ private[hive] case class MetastoreRelation
     }
   )
 
+  def getHiveQlPartitions(predicates: Seq[Expression] = Nil): Seq[Partition] = {
+    table.getPartitions(predicates).map { p =>
+      val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
+      tPartition.setDbName(databaseName)
+      tPartition.setTableName(tableName)
+      tPartition.setValues(p.values)
+
+      val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor()
+      tPartition.setSd(sd)
+      sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)))
+
+      sd.setLocation(p.storage.location)
+      sd.setInputFormat(p.storage.inputFormat)
+      sd.setOutputFormat(p.storage.outputFormat)
+
+      val serdeInfo = new org.apache.hadoop.hive.metastore.api.SerDeInfo
+      sd.setSerdeInfo(serdeInfo)
+      serdeInfo.setSerializationLib(p.storage.serde)
+
+      val serdeParameters = new java.util.HashMap[String, String]()
+      serdeInfo.setParameters(serdeParameters)
+      table.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
+      p.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) }
+
+      new Partition(hiveQlTable, tPartition)
+    }
+  }
+
   /** Only compare database and tablename, not alias. */
   override def sameResult(plan: LogicalPlan): Boolean = {
     plan match {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
index d08c594151654..a357bb39ca7fd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
@@ -27,6 +27,7 @@ import scala.reflect.ClassTag
 
 import com.esotericsoftware.kryo.Kryo
 import com.esotericsoftware.kryo.io.{Input, Output}
+
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.hive.ql.exec.{UDF, Utilities}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index ed359620a5f7f..9638a8201e190 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -125,7 +125,7 @@ private[hive] trait HiveStrategies {
                 InterpretedPredicate.create(castedPredicate)
               }
 
-            val partitions = relation.hiveQlPartitions.filter { part =>
+            val partitions = relation.getHiveQlPartitions(pruningPredicates).filter { part =>
               val partitionValues = part.getValues
               var i = 0
               while (i < partitionValues.size()) {
@@ -213,7 +213,7 @@ private[hive] trait HiveStrategies {
           projectList,
           otherPredicates,
           identity[Seq[Expression]],
-          HiveTableScan(_, relation, pruningPredicates.reduceLeftOption(And))(hiveContext)) :: Nil
+          HiveTableScan(_, relation, pruningPredicates)(hiveContext)) :: Nil
       case _ =>
         Nil
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
index 0a1d761a52f88..1656587d14835 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
@@ -21,6 +21,7 @@ import java.io.PrintStream
 import java.util.{Map => JMap}
 
 import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchTableException}
+import org.apache.spark.sql.catalyst.expressions.Expression
 
 private[hive] case class HiveDatabase(
     name: String,
@@ -71,7 +72,12 @@ private[hive] case class HiveTable(
 
   def isPartitioned: Boolean = partitionColumns.nonEmpty
 
-  def getAllPartitions: Seq[HivePartition] = client.getAllPartitions(this)
+  def getPartitions(predicates: Seq[Expression]): Seq[HivePartition] = {
+    predicates match {
+      case Nil => client.getAllPartitions(this)
+      case _ => client.getPartitionsByFilter(this, predicates)
+    }
+  }
 
   // Hive does not support backticks when passing names to the client.
   def qualifiedName: String = s"$database.$name"
@@ -132,6 +138,9 @@ private[hive] trait ClientInterface {
   /** Returns all partitions for the given table. */
   def getAllPartitions(hTable: HiveTable): Seq[HivePartition]
 
+  /** Returns partitions filtered by predicates for the given table. */
+  def getPartitionsByFilter(hTable: HiveTable, predicates: Seq[Expression]): Seq[HivePartition]
+
   /** Loads a static partition into an existing table. */
   def loadPartition(
       loadPath: String,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index 53f457ad4f3cc..8adda54754230 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -17,25 +17,21 @@
 
 package org.apache.spark.sql.hive.client
 
-import java.io.{BufferedReader, InputStreamReader, File, PrintStream}
-import java.net.URI
-import java.util.{ArrayList => JArrayList, Map => JMap, List => JList, Set => JSet}
+import java.io.{File, PrintStream}
+import java.util.{Map => JMap}
 import javax.annotation.concurrent.GuardedBy
 
 import scala.collection.JavaConversions._
 import scala.language.reflectiveCalls
 
 import org.apache.hadoop.fs.Path
-import org.apache.hadoop.hive.metastore.api.Database
 import org.apache.hadoop.hive.conf.HiveConf
+import org.apache.hadoop.hive.metastore.api.{Database, FieldSchema}
 import org.apache.hadoop.hive.metastore.{TableType => HTableType}
-import org.apache.hadoop.hive.metastore.api
-import org.apache.hadoop.hive.metastore.api.FieldSchema
-import org.apache.hadoop.hive.ql.metadata
 import org.apache.hadoop.hive.ql.metadata.Hive
-import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.hadoop.hive.ql.processors._
-import org.apache.hadoop.hive.ql.Driver
+import org.apache.hadoop.hive.ql.session.SessionState
+import org.apache.hadoop.hive.ql.{Driver, metadata}
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions.Expression
@@ -316,6 +312,13 @@ private[hive] class ClientWrapper(
     shim.getAllPartitions(client, qlTable).map(toHivePartition)
   }
 
+  override def getPartitionsByFilter(
+      hTable: HiveTable,
+      predicates: Seq[Expression]): Seq[HivePartition] = withHiveState {
+    val qlTable = toQlTable(hTable)
+    shim.getPartitionsByFilter(client, qlTable, predicates).map(toHivePartition)
+  }
+
   override def listTables(dbName: String): Seq[String] = withHiveState {
     client.getAllTables(dbName)
   }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index 1fa9d278e2a57..956997e5f9dce 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -31,6 +31,11 @@ import org.apache.hadoop.hive.ql.Driver
 import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table}
 import org.apache.hadoop.hive.ql.processors.{CommandProcessor, CommandProcessorFactory}
 import org.apache.hadoop.hive.ql.session.SessionState
+import org.apache.hadoop.hive.serde.serdeConstants
+
+import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.{StringType, IntegralType}
 
 /**
  * A shim that defines the interface between ClientWrapper and the underlying Hive library used to
@@ -61,6 +66,8 @@ private[client] sealed abstract class Shim {
 
   def getAllPartitions(hive: Hive, table: Table): Seq[Partition]
 
+  def getPartitionsByFilter(hive: Hive, table: Table, predicates: Seq[Expression]): Seq[Partition]
+
   def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor
 
   def getDriverResults(driver: Driver): Seq[String]
@@ -109,7 +116,7 @@ private[client] sealed abstract class Shim {
 
 }
 
-private[client] class Shim_v0_12 extends Shim {
+private[client] class Shim_v0_12 extends Shim with Logging {
 
   private lazy val startMethod =
     findStaticMethod(
@@ -196,6 +203,17 @@ private[client] class Shim_v0_12 extends Shim {
   override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] =
     getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq
 
+  override def getPartitionsByFilter(
+      hive: Hive,
+      table: Table,
+      predicates: Seq[Expression]): Seq[Partition] = {
+    // getPartitionsByFilter() doesn't support binary comparison ops in Hive 0.12.
+    // See HIVE-4888.
+    logDebug("Hive 0.12 doesn't support predicate pushdown to metastore. " +
+      "Please use Hive 0.13 or higher.")
+    getAllPartitions(hive, table)
+  }
+
   override def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor =
     getCommandProcessorMethod.invoke(null, token, conf).asInstanceOf[CommandProcessor]
 
@@ -267,6 +285,12 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
       classOf[Hive],
       "getAllPartitionsOf",
       classOf[Table])
+  private lazy val getPartitionsByFilterMethod =
+    findMethod(
+      classOf[Hive],
+      "getPartitionsByFilter",
+      classOf[Table],
+      classOf[String])
   private lazy val getCommandProcessorMethod =
     findStaticMethod(
       classOf[CommandProcessorFactory],
@@ -288,6 +312,51 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
   override def getAllPartitions(hive: Hive, table: Table): Seq[Partition] =
     getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]].toSeq
 
+  /**
+   * Converts catalyst expression to the format that Hive's getPartitionsByFilter() expects, i.e.
+   * a string that represents partition predicates like "str_key=\"value\" and int_key=1 ...".
+   *
+   * Unsupported predicates are skipped.
+   */
+  def convertFilters(table: Table, filters: Seq[Expression]): String = {
+    // hive varchar is treated as catalyst string, but hive varchar can't be pushed down.
+    val varcharKeys = table.getPartitionKeys
+      .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME))
+      .map(col => col.getName).toSet
+
+    filters.collect {
+      case op @ BinaryComparison(a: Attribute, Literal(v, _: IntegralType)) =>
+        s"${a.name} ${op.symbol} $v"
+      case op @ BinaryComparison(Literal(v, _: IntegralType), a: Attribute) =>
+        s"$v ${op.symbol} ${a.name}"
+      case op @ BinaryComparison(a: Attribute, Literal(v, _: StringType))
+          if !varcharKeys.contains(a.name) =>
+        s"""${a.name} ${op.symbol} "$v""""
+      case op @ BinaryComparison(Literal(v, _: StringType), a: Attribute)
+          if !varcharKeys.contains(a.name) =>
+        s""""$v" ${op.symbol} ${a.name}"""
+    }.mkString(" and ")
+  }
+
+  override def getPartitionsByFilter(
+      hive: Hive,
+      table: Table,
+      predicates: Seq[Expression]): Seq[Partition] = {
+
+    // Hive getPartitionsByFilter() takes a string that represents partition
+    // predicates like "str_key=\"value\" and int_key=1 ..."
+    val filter = convertFilters(table, predicates)
+    val partitions =
+      if (filter.isEmpty) {
+        getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]]
+      } else {
+        logDebug(s"Hive metastore filter is '$filter'.")
+        getPartitionsByFilterMethod.invoke(hive, table, filter).asInstanceOf[JArrayList[Partition]]
+      }
+
+    partitions.toSeq
+  }
+
   override def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor =
     getCommandProcessorMethod.invoke(null, Array(token), conf).asInstanceOf[CommandProcessor]
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
index d33da8242cc1d..ba7eb15a1c0c6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala
@@ -44,7 +44,7 @@ private[hive]
 case class HiveTableScan(
     requestedAttributes: Seq[Attribute],
     relation: MetastoreRelation,
-    partitionPruningPred: Option[Expression])(
+    partitionPruningPred: Seq[Expression])(
     @transient val context: HiveContext)
   extends LeafNode {
 
@@ -56,7 +56,7 @@ case class HiveTableScan(
 
   // Bind all partition key attribute references in the partition pruning predicate for later
   // evaluation.
-  private[this] val boundPruningPred = partitionPruningPred.map { pred =>
+  private[this] val boundPruningPred = partitionPruningPred.reduceLeftOption(And).map { pred =>
     require(
       pred.dataType == BooleanType,
       s"Data type of predicate $pred must be BooleanType rather than ${pred.dataType}.")
@@ -133,7 +133,8 @@ case class HiveTableScan(
   protected override def doExecute(): RDD[InternalRow] = if (!relation.hiveQlTable.isPartitioned) {
     hadoopReader.makeRDDForTable(relation.hiveQlTable)
   } else {
-    hadoopReader.makeRDDForPartitionedTable(prunePartitions(relation.hiveQlPartitions))
+    hadoopReader.makeRDDForPartitionedTable(
+      prunePartitions(relation.getHiveQlPartitions(partitionPruningPred)))
   }
 
   override def output: Seq[Attribute] = attributes
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 0f217bc66869f..3662a4352f55d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -21,6 +21,7 @@ import java.io.File
 import java.util.{Set => JavaSet}
 
 import org.apache.hadoop.hive.conf.HiveConf
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.hadoop.hive.ql.exec.FunctionRegistry
 import org.apache.hadoop.hive.ql.io.avro.{AvroContainerInputFormat, AvroContainerOutputFormat}
 import org.apache.hadoop.hive.ql.metadata.Table
@@ -87,7 +88,9 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
 
   /** Sets up the system initially or after a RESET command */
   protected override def configure(): Map[String, String] =
-    temporaryConfig ++ Map("hive.metastore.warehouse.dir" -> warehousePath.toString)
+    temporaryConfig ++ Map(
+      ConfVars.METASTOREWAREHOUSE.varname -> warehousePath.toString,
+      ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname -> "true")
 
   val testTempDir = Utils.createTempDir()
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
new file mode 100644
index 0000000000000..0efcf80bd4ea7
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.client
+
+import scala.collection.JavaConversions._
+
+import org.apache.hadoop.hive.metastore.api.FieldSchema
+import org.apache.hadoop.hive.serde.serdeConstants
+
+import org.apache.spark.{Logging, SparkFunSuite}
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types._
+
+/**
+ * A set of tests for the filter conversion logic used when pushing partition pruning into the
+ * metastore
+ */
+class FiltersSuite extends SparkFunSuite with Logging {
+  private val shim = new Shim_v0_13
+
+  private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test")
+  private val varCharCol = new FieldSchema()
+  varCharCol.setName("varchar")
+  varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME)
+  testTable.setPartCols(varCharCol :: Nil)
+
+  filterTest("string filter",
+    (a("stringcol", StringType) > Literal("test")) :: Nil,
+    "stringcol > \"test\"")
+
+  filterTest("string filter backwards",
+    (Literal("test") > a("stringcol", StringType)) :: Nil,
+    "\"test\" > stringcol")
+
+  filterTest("int filter",
+    (a("intcol", IntegerType) === Literal(1)) :: Nil,
+    "intcol = 1")
+
+  filterTest("int filter backwards",
+    (Literal(1) === a("intcol", IntegerType)) :: Nil,
+    "1 = intcol")
+
+  filterTest("int and string filter",
+    (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil,
+    "1 = intcol and \"a\" = strcol")
+
+  filterTest("skip varchar",
+    (Literal("") === a("varchar", StringType)) :: Nil,
+    "")
+
+  private def filterTest(name: String, filters: Seq[Expression], result: String) = {
+    test(name){
+      val converted = shim.convertFilters(testTable, filters)
+      if (converted != result) {
+        fail(
+          s"Expected filters ${filters.mkString(",")} to convert to '$result' but got '$converted'")
+      }
+    }
+  }
+
+  private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)()
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
index d52e162acbd04..3eb127e23d486 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala
@@ -20,7 +20,9 @@ package org.apache.spark.sql.hive.client
 import java.io.File
 
 import org.apache.spark.{Logging, SparkFunSuite}
+import org.apache.spark.sql.catalyst.expressions.{NamedExpression, Literal, AttributeReference, EqualTo}
 import org.apache.spark.sql.catalyst.util.quietly
+import org.apache.spark.sql.types.IntegerType
 import org.apache.spark.util.Utils
 
 /**
@@ -151,6 +153,12 @@ class VersionsSuite extends SparkFunSuite with Logging {
       client.getAllPartitions(client.getTable("default", "src_part"))
     }
 
+    test(s"$version: getPartitionsByFilter") {
+      client.getPartitionsByFilter(client.getTable("default", "src_part"), Seq(EqualTo(
+        AttributeReference("key", IntegerType, false)(NamedExpression.newExprId),
+        Literal(1))))
+    }
+
     test(s"$version: loadPartition") {
       client.loadPartition(
         emptyDir,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
index de6a41ce5bfcb..e83a7dc77e329 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
@@ -151,7 +151,7 @@ class PruningSuite extends HiveComparisonTest with BeforeAndAfter {
         case p @ HiveTableScan(columns, relation, _) =>
           val columnNames = columns.map(_.name)
           val partValues = if (relation.table.isPartitioned) {
-            p.prunePartitions(relation.hiveQlPartitions).map(_.getValues)
+            p.prunePartitions(relation.getHiveQlPartitions()).map(_.getValues)
           } else {
             Seq.empty
           }

From 4863c11ea9d9f94799fbe6ae5a60860f0740a44d Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 15:23:28 -0700
Subject: [PATCH 270/549] [SPARK-9155][SQL] codegen StringSpace

Jira https://issues.apache.org/jira/browse/SPARK-9155

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7531 from tarekauel/SPARK-9155 and squashes the following commits:

423c426 [Tarek Auel] [SPARK-9155] language typo fix
e34bd1b [Tarek Auel] [SPARK-9155] moved creation of blank string to UTF8String
4bc33e6 [Tarek Auel] [SPARK-9155] codegen StringSpace
---
 .../sql/catalyst/expressions/stringOperations.scala  | 12 +++++++-----
 .../org/apache/spark/unsafe/types/UTF8String.java    | 10 ++++++++++
 .../apache/spark/unsafe/types/UTF8StringSuite.java   |  8 ++++++++
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 6608036f01318..e42be85367aeb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -593,17 +593,19 @@ case class StringReverse(child: Expression) extends UnaryExpression with String2
  * Returns a n spaces string.
  */
 case class StringSpace(child: Expression)
-  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
+  extends UnaryExpression with ImplicitCastInputTypes {
 
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(IntegerType)
 
   override def nullSafeEval(s: Any): Any = {
-    val length = s.asInstanceOf[Integer]
+    val length = s.asInstanceOf[Int]
+    UTF8String.blankString(if (length < 0) 0 else length)
+  }
 
-    val spaces = new Array[Byte](if (length < 0) 0 else length)
-    java.util.Arrays.fill(spaces, ' '.asInstanceOf[Byte])
-    UTF8String.fromBytes(spaces)
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (length) =>
+      s"""${ev.primitive} = UTF8String.blankString(($length < 0) ? 0 : $length);""")
   }
 
   override def prettyName: String = "space"
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 3eecd657e6ef9..819639f300177 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -20,6 +20,7 @@
 import javax.annotation.Nonnull;
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
 
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
@@ -77,6 +78,15 @@ public static UTF8String fromString(String str) {
     }
   }
 
+  /**
+   * Creates an UTF8String that contains `length` spaces.
+   */
+  public static UTF8String blankString(int length) {
+    byte[] spaces = new byte[length];
+    Arrays.fill(spaces, (byte) ' ');
+    return fromBytes(spaces);
+  }
+
   protected UTF8String(Object base, long offset, int size) {
     this.base = base;
     this.offset = offset;
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 7d0c49e2fb84c..6a21c27461163 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -286,4 +286,12 @@ public void levenshteinDistance() {
     assertEquals(
         UTF8String.fromString("世界千世").levenshteinDistance(UTF8String.fromString("千a世b")),4);
   }
+
+  @Test
+  public void createBlankString() {
+    assertEquals(fromString(" "), blankString(1));
+    assertEquals(fromString("  "), blankString(2));
+    assertEquals(fromString("   "), blankString(3));
+    assertEquals(fromString(""), blankString(0));
+  }
 }

From c9db8eaa42387c03cde12c1d145a6f72872def71 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 15:32:46 -0700
Subject: [PATCH 271/549] [SPARK-9159][SQL] codegen ascii, base64, unbase64

Jira: https://issues.apache.org/jira/browse/SPARK-9159

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7542 from tarekauel/SPARK-9159 and squashes the following commits:

772e6bc [Tarek Auel] [SPARK-9159][SQL] codegen ascii, base64, unbase64
---
 .../expressions/stringOperations.scala        | 37 ++++++++++++++++---
 .../expressions/StringExpressionsSuite.scala  |  2 +-
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index e42be85367aeb..e660d499fabd3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -742,8 +742,7 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
 /**
  * Returns the numeric value of the first character of str.
  */
-case class Ascii(child: Expression)
-  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
+case class Ascii(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[DataType] = Seq(StringType)
@@ -756,13 +755,25 @@ case class Ascii(child: Expression)
       0
     }
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (child) => {
+      val bytes = ctx.freshName("bytes")
+      s"""
+        byte[] $bytes = $child.getBytes();
+        if ($bytes.length > 0) {
+          ${ev.primitive} = (int) $bytes[0];
+        } else {
+          ${ev.primitive} = 0;
+        }
+       """})
+  }
 }
 
 /**
  * Converts the argument from binary to a base 64 string.
  */
-case class Base64(child: Expression)
-  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
+case class Base64(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(BinaryType)
@@ -772,19 +783,33 @@ case class Base64(child: Expression)
       org.apache.commons.codec.binary.Base64.encodeBase64(
         bytes.asInstanceOf[Array[Byte]]))
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (child) => {
+      s"""${ev.primitive} = UTF8String.fromBytes(
+            org.apache.commons.codec.binary.Base64.encodeBase64($child));
+       """})
+  }
+
 }
 
 /**
  * Converts the argument from a base 64 string to BINARY.
  */
-case class UnBase64(child: Expression)
-  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
+case class UnBase64(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def dataType: DataType = BinaryType
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
   protected override def nullSafeEval(string: Any): Any =
     org.apache.commons.codec.binary.Base64.decodeBase64(string.asInstanceOf[UTF8String].toString)
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (child) => {
+      s"""
+         ${ev.primitive} = org.apache.commons.codec.binary.Base64.decodeBase64($child.toString());
+       """})
+  }
 }
 
 /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index d5731229df3bb..67d97cd30b039 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -290,7 +290,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Base64(b), "AQIDBA==", create_row(bytes))
     checkEvaluation(Base64(b), "", create_row(Array[Byte]()))
     checkEvaluation(Base64(b), null, create_row(null))
-    checkEvaluation(Base64(Literal.create(null, StringType)), null, create_row("abdef"))
+    checkEvaluation(Base64(Literal.create(null, BinaryType)), null, create_row("abdef"))
 
     checkEvaluation(UnBase64(a), null, create_row(null))
     checkEvaluation(UnBase64(Literal.create(null, StringType)), null, create_row("abdef"))

From dac7dbf5a6bd663ef3acaa5b2249b31140fa6857 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 16:11:56 -0700
Subject: [PATCH 272/549] [SPARK-9160][SQL] codegen encode, decode

Jira: https://issues.apache.org/jira/browse/SPARK-9160

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7543 from tarekauel/SPARK-9160 and squashes the following commits:

7528f0e [Tarek Auel] [SPARK-9160][SQL] codegen encode, decode
---
 .../expressions/stringOperations.scala        | 25 +++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index e660d499fabd3..a5682428b3d40 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -818,7 +818,7 @@ case class UnBase64(child: Expression) extends UnaryExpression with ImplicitCast
  * If either argument is null, the result will also be null.
  */
 case class Decode(bin: Expression, charset: Expression)
-  extends BinaryExpression with ImplicitCastInputTypes with CodegenFallback {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def left: Expression = bin
   override def right: Expression = charset
@@ -829,6 +829,17 @@ case class Decode(bin: Expression, charset: Expression)
     val fromCharset = input2.asInstanceOf[UTF8String].toString
     UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]], fromCharset))
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (bytes, charset) =>
+      s"""
+        try {
+          ${ev.primitive} = UTF8String.fromString(new String($bytes, $charset.toString()));
+        } catch (java.io.UnsupportedEncodingException e) {
+          org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        }
+      """)
+  }
 }
 
 /**
@@ -837,7 +848,7 @@ case class Decode(bin: Expression, charset: Expression)
  * If either argument is null, the result will also be null.
 */
 case class Encode(value: Expression, charset: Expression)
-  extends BinaryExpression with ImplicitCastInputTypes with CodegenFallback {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def left: Expression = value
   override def right: Expression = charset
@@ -848,6 +859,16 @@ case class Encode(value: Expression, charset: Expression)
     val toCharset = input2.asInstanceOf[UTF8String].toString
     input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
   }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (string, charset) =>
+      s"""
+        try {
+          ${ev.primitive} = $string.toString().getBytes($charset.toString());
+        } catch (java.io.UnsupportedEncodingException e) {
+          org.apache.spark.unsafe.PlatformDependent.throwException(e);
+        }""")
+  }
 }
 
 /**

From a1064df0ee3daf496800be84293345a10e1497d9 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Mon, 20 Jul 2015 16:42:43 -0700
Subject: [PATCH 273/549] [SPARK-8125] [SQL] Accelerates Parquet schema merging
 and partition discovery

This PR tries to accelerate Parquet schema discovery and `HadoopFsRelation` partition discovery.  The acceleration is done by the following means:

- Turning off schema merging by default

  Schema merging is not the most common case, but requires reading footers of all Parquet part-files and can be very slow.

- Avoiding `FileSystem.globStatus()` call when possible

  `FileSystem.globStatus()` may issue multiple synchronous RPC calls, and can be very slow (esp. on S3).  This PR adds `SparkHadoopUtil.globPathIfNecessary()`, which only issues RPC calls when the path contain glob-pattern specific character(s) (`{}[]*?\`).

  This is especially useful when converting a metastore Parquet table with lots of partitions, since Spark SQL adds all partition directories as the input paths, and currently we do a `globStatus` call on each input path sequentially.

- Listing leaf files in parallel when the number of input paths exceeds a threshold

  Listing leaf files is required by partition discovery.  Currently it is done on driver side, and can be slow when there are lots of (nested) directories, since each `FileSystem.listStatus()` call issues an RPC.  In this PR, we list leaf files in a BFS style, and resort to a Spark job once we found that the number of directories need to be listed exceed a threshold.

  The threshold is controlled by `SQLConf` option `spark.sql.sources.parallelPartitionDiscovery.threshold`, which defaults to 32.

- Discovering Parquet schema in parallel

  Currently, schema merging is also done on driver side, and needs to read footers of all part-files.  This PR uses a Spark job to do schema merging.  Together with task side metadata reading in Parquet 1.7.0, we never read any footers on driver side now.

Author: Cheng Lian <lian@databricks.com>

Closes #7396 from liancheng/accel-parquet and squashes the following commits:

5598efc [Cheng Lian] Uses ParquetInputFormat[InternalRow] instead of ParquetInputFormat[Row]
ff32cd0 [Cheng Lian] Excludes directories while listing leaf files
3c580f1 [Cheng Lian] Fixes test failure caused by making "mergeSchema" default to "false"
b1646aa [Cheng Lian] Should allow empty input paths
32e5f0d [Cheng Lian] Moves schema merging to executor side
---
 .../apache/spark/deploy/SparkHadoopUtil.scala |   8 +
 .../apache/spark/sql/DataFrameReader.scala    |  12 +-
 .../scala/org/apache/spark/sql/SQLConf.scala  |  10 +-
 .../sql/parquet/ParquetTableOperations.scala  |  14 +-
 .../apache/spark/sql/parquet/newParquet.scala | 158 +++++++++++++-----
 .../org/apache/spark/sql/sources/ddl.scala    |   8 +-
 .../apache/spark/sql/sources/interfaces.scala | 120 ++++++++++---
 .../ParquetPartitionDiscoverySuite.scala      |  18 +-
 8 files changed, 258 insertions(+), 90 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 6b14d407a6380..e06b06e06fb4a 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -239,6 +239,14 @@ class SparkHadoopUtil extends Logging {
     }.getOrElse(Seq.empty[Path])
   }
 
+  def globPathIfNecessary(pattern: Path): Seq[Path] = {
+    if (pattern.toString.exists("{}[]*?\\".toSet.contains)) {
+      globPath(pattern)
+    } else {
+      Seq(pattern)
+    }
+  }
+
   /**
    * Lists all the files in a directory with the specified prefix, and does not end with the
    * given suffix. The returned {{FileStatus}} instances are sorted by the modification times of
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 9b23df4843c06..0e37ad3e12e08 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql
 import java.util.Properties
 
 import org.apache.hadoop.fs.Path
-import org.apache.spark.Partition
+import org.apache.spark.{Logging, Partition}
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
@@ -40,7 +40,7 @@ import org.apache.spark.sql.types.StructType
  * @since 1.4.0
  */
 @Experimental
-class DataFrameReader private[sql](sqlContext: SQLContext) {
+class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging {
 
   /**
    * Specifies the input data source format.
@@ -251,7 +251,13 @@ class DataFrameReader private[sql](sqlContext: SQLContext) {
     if (paths.isEmpty) {
       sqlContext.emptyDataFrame
     } else {
-      val globbedPaths = paths.map(new Path(_)).flatMap(SparkHadoopUtil.get.globPath).toArray
+      val globbedPaths = paths.flatMap { path =>
+        val hdfsPath = new Path(path)
+        val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
+        val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
+        SparkHadoopUtil.get.globPathIfNecessary(qualified)
+      }.toArray
+
       sqlContext.baseRelationToDataFrame(
         new ParquetRelation2(
           globbedPaths.map(_.toString), None, None, extraOptions.toMap)(sqlContext))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 84d3271ceb738..78c780bdc5797 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -242,7 +242,7 @@ private[spark] object SQLConf {
     doc = "Whether the query analyzer should be case sensitive or not.")
 
   val PARQUET_SCHEMA_MERGING_ENABLED = booleanConf("spark.sql.parquet.mergeSchema",
-    defaultValue = Some(true),
+    defaultValue = Some(false),
     doc = "When true, the Parquet data source merges schemas collected from all data files, " +
           "otherwise the schema is picked from the summary file or a random data file " +
           "if no summary file is available.")
@@ -376,6 +376,11 @@ private[spark] object SQLConf {
   val OUTPUT_COMMITTER_CLASS =
     stringConf("spark.sql.sources.outputCommitterClass", isPublic = false)
 
+  val PARALLEL_PARTITION_DISCOVERY_THRESHOLD = intConf(
+    key = "spark.sql.sources.parallelPartitionDiscovery.threshold",
+    defaultValue = Some(32),
+    doc = "<TODO>")
+
   // Whether to perform eager analysis when constructing a dataframe.
   // Set to false when debugging requires the ability to look at invalid query plans.
   val DATAFRAME_EAGER_ANALYSIS = booleanConf(
@@ -495,6 +500,9 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
   private[spark] def partitionColumnTypeInferenceEnabled(): Boolean =
     getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE)
 
+  private[spark] def parallelPartitionDiscoveryThreshold: Int =
+    getConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD)
+
   // Do not use a value larger than 4000 as the default value of this property.
   // See the comments of SCHEMA_STRING_LENGTH_THRESHOLD above for more information.
   private[spark] def schemaStringLengthThreshold: Int = getConf(SCHEMA_STRING_LENGTH_THRESHOLD)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 9058b09375291..28cba5e54d69e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -426,6 +426,7 @@ private[parquet] class AppendingParquetOutputFormat(offset: Int)
   }
 }
 
+// TODO Removes this class after removing old Parquet support code
 /**
  * We extend ParquetInputFormat in order to have more control over which
  * RecordFilter we want to use.
@@ -433,8 +434,6 @@ private[parquet] class AppendingParquetOutputFormat(offset: Int)
 private[parquet] class FilteringParquetRowInputFormat
   extends org.apache.parquet.hadoop.ParquetInputFormat[InternalRow] with Logging {
 
-  private var fileStatuses = Map.empty[Path, FileStatus]
-
   override def createRecordReader(
       inputSplit: InputSplit,
       taskAttemptContext: TaskAttemptContext): RecordReader[Void, InternalRow] = {
@@ -455,17 +454,6 @@ private[parquet] class FilteringParquetRowInputFormat
 
 }
 
-private[parquet] object FilteringParquetRowInputFormat {
-  private val footerCache = CacheBuilder.newBuilder()
-    .maximumSize(20000)
-    .build[FileStatus, Footer]()
-
-  private val blockLocationCache = CacheBuilder.newBuilder()
-    .maximumSize(20000)
-    .expireAfterWrite(15, TimeUnit.MINUTES)  // Expire locations since HDFS files might move
-    .build[FileStatus, Array[BlockLocation]]()
-}
-
 private[parquet] object FileSystemHelper {
   def listFiles(pathStr: String, conf: Configuration): Seq[Path] = {
     val origPath = new Path(pathStr)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 01dd6f471bd7c..e683eb0126004 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -22,7 +22,7 @@ import java.util.{List => JList}
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable
-import scala.util.Try
+import scala.util.{Failure, Try}
 
 import com.google.common.base.Objects
 import org.apache.hadoop.fs.{FileStatus, Path}
@@ -31,12 +31,11 @@ import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 import org.apache.parquet.filter2.predicate.FilterApi
 import org.apache.parquet.hadoop._
-import org.apache.parquet.hadoop.metadata.{FileMetaData, CompressionCodecName}
+import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.apache.parquet.hadoop.util.ContextUtil
 import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.rdd.RDD._
 import org.apache.spark.sql._
@@ -278,19 +277,13 @@ private[sql] class ParquetRelation2(
     // Create the function to set input paths at the driver side.
     val setInputPaths = ParquetRelation2.initializeDriverSideJobFunc(inputFiles) _
 
-    val footers = inputFiles.map(f => metadataCache.footers(f.getPath))
-
     Utils.withDummyCallSite(sqlContext.sparkContext) {
-      // TODO Stop using `FilteringParquetRowInputFormat` and overriding `getPartition`.
-      // After upgrading to Parquet 1.6.0, we should be able to stop caching `FileStatus` objects
-      // and footers. Especially when a global arbitrative schema (either from metastore or data
-      // source DDL) is available.
       new SqlNewHadoopRDD(
         sc = sqlContext.sparkContext,
         broadcastedConf = broadcastedConf,
         initDriverSideJobFuncOpt = Some(setInputPaths),
         initLocalJobFuncOpt = Some(initLocalJobFuncOpt),
-        inputFormatClass = classOf[FilteringParquetRowInputFormat],
+        inputFormatClass = classOf[ParquetInputFormat[InternalRow]],
         keyClass = classOf[Void],
         valueClass = classOf[InternalRow]) {
 
@@ -306,12 +299,6 @@ private[sql] class ParquetRelation2(
             f.getAccessTime, f.getPermission, f.getOwner, f.getGroup, pathWithEscapedAuthority)
         }.toSeq
 
-        @transient val cachedFooters = footers.map { f =>
-          // In order to encode the authority of a Path containing special characters such as /,
-          // we need to use the string returned by the URI of the path to create a new Path.
-          new Footer(escapePathUserInfo(f.getFile), f.getParquetMetadata)
-        }.toSeq
-
         private def escapePathUserInfo(path: Path): Path = {
           val uri = path.toUri
           new Path(new URI(
@@ -321,13 +308,10 @@ private[sql] class ParquetRelation2(
 
         // Overridden so we can inject our own cached files statuses.
         override def getPartitions: Array[SparkPartition] = {
-          val inputFormat = if (cacheMetadata) {
-            new FilteringParquetRowInputFormat {
-              override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatuses
-              override def getFooters(jobContext: JobContext): JList[Footer] = cachedFooters
+          val inputFormat = new ParquetInputFormat[InternalRow] {
+            override def listStatus(jobContext: JobContext): JList[FileStatus] = {
+              if (cacheMetadata) cachedStatuses else super.listStatus(jobContext)
             }
-          } else {
-            new FilteringParquetRowInputFormat
           }
 
           val jobContext = newJobContext(getConf(isDriverSide = true), jobId)
@@ -348,9 +332,6 @@ private[sql] class ParquetRelation2(
     // `FileStatus` objects of all "_common_metadata" files.
     private var commonMetadataStatuses: Array[FileStatus] = _
 
-    // Parquet footer cache.
-    var footers: Map[Path, Footer] = _
-
     // `FileStatus` objects of all data files (Parquet part-files).
     var dataStatuses: Array[FileStatus] = _
 
@@ -376,20 +357,6 @@ private[sql] class ParquetRelation2(
       commonMetadataStatuses =
         leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)
 
-      footers = {
-        val conf = SparkHadoopUtil.get.conf
-        val taskSideMetaData = conf.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)
-        val rawFooters = if (shouldMergeSchemas) {
-          ParquetFileReader.readAllFootersInParallel(
-            conf, seqAsJavaList(leaves), taskSideMetaData)
-        } else {
-          ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(
-            conf, seqAsJavaList(leaves), taskSideMetaData)
-        }
-
-        rawFooters.map(footer => footer.getFile -> footer).toMap
-      }
-
       // If we already get the schema, don't need to re-compute it since the schema merging is
       // time-consuming.
       if (dataSchema == null) {
@@ -422,7 +389,7 @@ private[sql] class ParquetRelation2(
       // Always tries the summary files first if users don't require a merged schema.  In this case,
       // "_common_metadata" is more preferable than "_metadata" because it doesn't contain row
       // groups information, and could be much smaller for large Parquet files with lots of row
-      // groups.
+      // groups.  If no summary file is available, falls back to some random part-file.
       //
       // NOTE: Metadata stored in the summary files are merged from all part-files.  However, for
       // user defined key-value metadata (in which we store Spark SQL schema), Parquet doesn't know
@@ -457,10 +424,10 @@ private[sql] class ParquetRelation2(
 
       assert(
         filesToTouch.nonEmpty || maybeDataSchema.isDefined || maybeMetastoreSchema.isDefined,
-        "No schema defined, " +
-          s"and no Parquet data file or summary file found under ${paths.mkString(", ")}.")
+        "No predefined schema found, " +
+          s"and no Parquet data files or summary files found under ${paths.mkString(", ")}.")
 
-      ParquetRelation2.readSchema(filesToTouch.map(f => footers.apply(f.getPath)), sqlContext)
+      ParquetRelation2.mergeSchemasInParallel(filesToTouch, sqlContext)
     }
   }
 }
@@ -519,6 +486,7 @@ private[sql] object ParquetRelation2 extends Logging {
   private[parquet] def initializeDriverSideJobFunc(
       inputFiles: Array[FileStatus])(job: Job): Unit = {
     // We side the input paths at the driver side.
+    logInfo(s"Reading Parquet file(s) from ${inputFiles.map(_.getPath).mkString(", ")}")
     if (inputFiles.nonEmpty) {
       FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*)
     }
@@ -543,7 +511,7 @@ private[sql] object ParquetRelation2 extends Logging {
         .getKeyValueMetaData
         .toMap
         .get(RowReadSupport.SPARK_METADATA_KEY)
-      if (serializedSchema == None) {
+      if (serializedSchema.isEmpty) {
         // Falls back to Parquet schema if no Spark SQL schema found.
         Some(parseParquetSchema(metadata.getSchema))
       } else if (!seen.contains(serializedSchema.get)) {
@@ -646,4 +614,106 @@ private[sql] object ParquetRelation2 extends Logging {
       .filter(_.nullable)
     StructType(parquetSchema ++ missingFields)
   }
+
+  /**
+   * Figures out a merged Parquet schema with a distributed Spark job.
+   *
+   * Note that locality is not taken into consideration here because:
+   *
+   *  1. For a single Parquet part-file, in most cases the footer only resides in the last block of
+   *     that file.  Thus we only need to retrieve the location of the last block.  However, Hadoop
+   *     `FileSystem` only provides API to retrieve locations of all blocks, which can be
+   *     potentially expensive.
+   *
+   *  2. This optimization is mainly useful for S3, where file metadata operations can be pretty
+   *     slow.  And basically locality is not available when using S3 (you can't run computation on
+   *     S3 nodes).
+   */
+  def mergeSchemasInParallel(
+      filesToTouch: Seq[FileStatus], sqlContext: SQLContext): Option[StructType] = {
+    val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString
+    val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp
+    val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec
+    val serializedConf = new SerializableConfiguration(sqlContext.sparkContext.hadoopConfiguration)
+
+    // HACK ALERT:
+    //
+    // Parquet requires `FileStatus`es to read footers.  Here we try to send cached `FileStatus`es
+    // to executor side to avoid fetching them again.  However, `FileStatus` is not `Serializable`
+    // but only `Writable`.  What makes it worth, for some reason, `FileStatus` doesn't play well
+    // with `SerializableWritable[T]` and always causes a weird `IllegalStateException`.  These
+    // facts virtually prevents us to serialize `FileStatus`es.
+    //
+    // Since Parquet only relies on path and length information of those `FileStatus`es to read
+    // footers, here we just extract them (which can be easily serialized), send them to executor
+    // side, and resemble fake `FileStatus`es there.
+    val partialFileStatusInfo = filesToTouch.map(f => (f.getPath.toString, f.getLen))
+
+    // Issues a Spark job to read Parquet schema in parallel.
+    val partiallyMergedSchemas =
+      sqlContext
+        .sparkContext
+        .parallelize(partialFileStatusInfo)
+        .mapPartitions { iterator =>
+          // Resembles fake `FileStatus`es with serialized path and length information.
+          val fakeFileStatuses = iterator.map { case (path, length) =>
+            new FileStatus(length, false, 0, 0, 0, 0, null, null, null, new Path(path))
+          }.toSeq
+
+          // Skips row group information since we only need the schema
+          val skipRowGroups = true
+
+          // Reads footers in multi-threaded manner within each task
+          val footers =
+            ParquetFileReader.readAllFootersInParallel(
+              serializedConf.value, fakeFileStatuses, skipRowGroups)
+
+          // Converter used to convert Parquet `MessageType` to Spark SQL `StructType`
+          val converter =
+            new CatalystSchemaConverter(
+              assumeBinaryIsString = assumeBinaryIsString,
+              assumeInt96IsTimestamp = assumeInt96IsTimestamp,
+              followParquetFormatSpec = followParquetFormatSpec)
+
+          footers.map { footer =>
+            ParquetRelation2.readSchemaFromFooter(footer, converter)
+          }.reduceOption(_ merge _).iterator
+        }.collect()
+
+    partiallyMergedSchemas.reduceOption(_ merge _)
+  }
+
+  /**
+   * Reads Spark SQL schema from a Parquet footer.  If a valid serialized Spark SQL schema string
+   * can be found in the file metadata, returns the deserialized [[StructType]], otherwise, returns
+   * a [[StructType]] converted from the [[MessageType]] stored in this footer.
+   */
+  def readSchemaFromFooter(
+      footer: Footer, converter: CatalystSchemaConverter): StructType = {
+    val fileMetaData = footer.getParquetMetadata.getFileMetaData
+    fileMetaData
+      .getKeyValueMetaData
+      .toMap
+      .get(RowReadSupport.SPARK_METADATA_KEY)
+      .flatMap(deserializeSchemaString)
+      .getOrElse(converter.convert(fileMetaData.getSchema))
+  }
+
+  private def deserializeSchemaString(schemaString: String): Option[StructType] = {
+    // Tries to deserialize the schema string as JSON first, then falls back to the case class
+    // string parser (data generated by older versions of Spark SQL uses this format).
+    Try(DataType.fromJson(schemaString).asInstanceOf[StructType]).recover {
+      case _: Throwable =>
+        logInfo(
+          s"Serialized Spark schema in Parquet key-value metadata is not in JSON format, " +
+            "falling back to the deprecated DataType.fromCaseClassString parser.")
+        DataType.fromCaseClassString(schemaString).asInstanceOf[StructType]
+    }.recoverWith {
+      case cause: Throwable =>
+        logWarning(
+          "Failed to parse and ignored serialized Spark schema in " +
+            s"Parquet key-value metadata:\n\t$schemaString", cause)
+        Failure(cause)
+    }.toOption
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index d7440c55bd4a6..5a8c97c773ee6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -247,7 +247,9 @@ private[sql] object ResolvedDataSource {
           val caseInsensitiveOptions = new CaseInsensitiveMap(options)
           val paths = {
             val patternPath = new Path(caseInsensitiveOptions("path"))
-            SparkHadoopUtil.get.globPath(patternPath).map(_.toString).toArray
+            val fs = patternPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
+            val qualifiedPattern = patternPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
+            SparkHadoopUtil.get.globPathIfNecessary(qualifiedPattern).map(_.toString).toArray
           }
 
           val dataSchema =
@@ -272,7 +274,9 @@ private[sql] object ResolvedDataSource {
           val caseInsensitiveOptions = new CaseInsensitiveMap(options)
           val paths = {
             val patternPath = new Path(caseInsensitiveOptions("path"))
-            SparkHadoopUtil.get.globPath(patternPath).map(_.toString).toArray
+            val fs = patternPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
+            val qualifiedPattern = patternPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
+            SparkHadoopUtil.get.globPathIfNecessary(qualifiedPattern).map(_.toString).toArray
           }
           dataSource.createRelation(sqlContext, paths, None, None, caseInsensitiveOptions)
         case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 5d7cc2ff55af1..2cd8b358d81c6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -18,21 +18,23 @@
 package org.apache.spark.sql.sources
 
 import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
 import scala.util.Try
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
 import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
 
+import org.apache.spark.{Logging, SparkContext}
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row, SaveMode, SQLContext}
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.execution.RDDConversions
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
+import org.apache.spark.sql.execution.RDDConversions
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql._
 import org.apache.spark.util.SerializableConfiguration
 
 /**
@@ -367,7 +369,9 @@ abstract class OutputWriter {
  */
 @Experimental
 abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[PartitionSpec])
-  extends BaseRelation {
+  extends BaseRelation with Logging {
+
+  logInfo("Constructing HadoopFsRelation")
 
   def this() = this(None)
 
@@ -382,36 +386,40 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
 
     var leafDirToChildrenFiles = mutable.Map.empty[Path, Array[FileStatus]]
 
-    def refresh(): Unit = {
-      // We don't filter files/directories whose name start with "_" except "_temporary" here, as
-      // specific data sources may take advantages over them (e.g. Parquet _metadata and
-      // _common_metadata files). "_temporary" directories are explicitly ignored since failed
-      // tasks/jobs may leave partial/corrupted data files there.
-      def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
-        if (status.getPath.getName.toLowerCase == "_temporary") {
-          Set.empty
+    private def listLeafFiles(paths: Array[String]): Set[FileStatus] = {
+      if (paths.length >= sqlContext.conf.parallelPartitionDiscoveryThreshold) {
+        HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sqlContext.sparkContext)
+      } else {
+        val statuses = paths.flatMap { path =>
+          val hdfsPath = new Path(path)
+          val fs = hdfsPath.getFileSystem(hadoopConf)
+          val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
+
+          logInfo(s"Listing $qualified on driver")
+          Try(fs.listStatus(qualified)).getOrElse(Array.empty)
+        }.filterNot { status =>
+          val name = status.getPath.getName
+          name.toLowerCase == "_temporary" || name.startsWith(".")
+        }
+
+        val (dirs, files) = statuses.partition(_.isDir)
+
+        if (dirs.isEmpty) {
+          files.toSet
         } else {
-          val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
-          val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
-          files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
+          files.toSet ++ listLeafFiles(dirs.map(_.getPath.toString))
         }
       }
+    }
 
-      leafFiles.clear()
+    def refresh(): Unit = {
+      val files = listLeafFiles(paths)
 
-      val statuses = paths.flatMap { path =>
-        val hdfsPath = new Path(path)
-        val fs = hdfsPath.getFileSystem(hadoopConf)
-        val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
-        Try(fs.getFileStatus(qualified)).toOption.toArray.flatMap(listLeafFilesAndDirs(fs, _))
-      }.filterNot { status =>
-        // SPARK-8037: Ignores files like ".DS_Store" and other hidden files/directories
-        status.getPath.getName.startsWith(".")
-      }
+      leafFiles.clear()
+      leafDirToChildrenFiles.clear()
 
-      val files = statuses.filterNot(_.isDir)
       leafFiles ++= files.map(f => f.getPath -> f).toMap
-      leafDirToChildrenFiles ++= files.groupBy(_.getPath.getParent)
+      leafDirToChildrenFiles ++= files.toArray.groupBy(_.getPath.getParent)
     }
   }
 
@@ -666,3 +674,63 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
    */
   def prepareJobForWrite(job: Job): OutputWriterFactory
 }
+
+private[sql] object HadoopFsRelation extends Logging {
+  // We don't filter files/directories whose name start with "_" except "_temporary" here, as
+  // specific data sources may take advantages over them (e.g. Parquet _metadata and
+  // _common_metadata files). "_temporary" directories are explicitly ignored since failed
+  // tasks/jobs may leave partial/corrupted data files there.  Files and directories whose name
+  // start with "." are also ignored.
+  def listLeafFiles(fs: FileSystem, status: FileStatus): Array[FileStatus] = {
+    logInfo(s"Listing ${status.getPath}")
+    val name = status.getPath.getName.toLowerCase
+    if (name == "_temporary" || name.startsWith(".")) {
+      Array.empty
+    } else {
+      val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
+      files ++ dirs.flatMap(dir => listLeafFiles(fs, dir))
+    }
+  }
+
+  // `FileStatus` is Writable but not serializable.  What make it worse, somehow it doesn't play
+  // well with `SerializableWritable`.  So there seems to be no way to serialize a `FileStatus`.
+  // Here we use `FakeFileStatus` to extract key components of a `FileStatus` to serialize it from
+  // executor side and reconstruct it on driver side.
+  case class FakeFileStatus(
+      path: String,
+      length: Long,
+      isDir: Boolean,
+      blockReplication: Short,
+      blockSize: Long,
+      modificationTime: Long,
+      accessTime: Long)
+
+  def listLeafFilesInParallel(
+      paths: Array[String],
+      hadoopConf: Configuration,
+      sparkContext: SparkContext): Set[FileStatus] = {
+    logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}")
+
+    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
+    val fakeStatuses = sparkContext.parallelize(paths).flatMap { path =>
+      val hdfsPath = new Path(path)
+      val fs = hdfsPath.getFileSystem(serializableConfiguration.value)
+      val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
+      Try(listLeafFiles(fs, fs.getFileStatus(qualified))).getOrElse(Array.empty)
+    }.map { status =>
+      FakeFileStatus(
+        status.getPath.toString,
+        status.getLen,
+        status.isDir,
+        status.getReplication,
+        status.getBlockSize,
+        status.getModificationTime,
+        status.getAccessTime)
+    }.collect()
+
+    fakeStatuses.map { f =>
+      new FileStatus(
+        f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path))
+    }.toSet
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index d0ebb11b063f0..37b0a9fbf7a4e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -447,7 +447,12 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         (1 to 10).map(i => (i, i.toString)).toDF("intField", "stringField"),
         makePartitionDir(base, defaultPartitionName, "pi" -> 2))
 
-      sqlContext.read.format("parquet").load(base.getCanonicalPath).registerTempTable("t")
+      sqlContext
+        .read
+        .option("mergeSchema", "true")
+        .format("parquet")
+        .load(base.getCanonicalPath)
+        .registerTempTable("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -583,4 +588,15 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
           Seq("a", "a, b"),
           Seq("file:/tmp/foo/a=1", "file:/tmp/foo/a=1/b=foo")))
   }
+
+  test("Parallel partition discovery") {
+    withTempPath { dir =>
+      withSQLConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD.key -> "1") {
+        val path = dir.getCanonicalPath
+        val df = sqlContext.range(5).select('id as 'a, 'id as 'b, 'id as 'c).coalesce(1)
+        df.write.partitionBy("b", "c").parquet(path)
+        checkAnswer(sqlContext.read.parquet(path), df)
+      }
+    }
+  }
 }

From a5d05819afcc9b19aeae4817d842205f32b34335 Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Mon, 20 Jul 2015 16:49:55 -0700
Subject: [PATCH 274/549] [SPARK-9198] [MLLIB] [PYTHON] Fixed typo in pyspark
 sparsevector doc tests

Several places in the PySpark SparseVector docs have one defined as:
```
SparseVector(4, [2, 4], [1.0, 2.0])
```
The index 4 goes out of bounds (but this is not checked).

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #7541 from jkbradley/sparsevec-doc-typo-fix and squashes the following commits:

c806a65 [Joseph K. Bradley] fixed doc test
e2dcb23 [Joseph K. Bradley] Fixed typo in pyspark sparsevector doc tests
---
 python/pyspark/mllib/linalg.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 529bd75894c96..334dc8e38bb8f 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -566,7 +566,7 @@ def dot(self, other):
         25.0
         >>> a.dot(array.array('d', [1., 2., 3., 4.]))
         22.0
-        >>> b = SparseVector(4, [2, 4], [1.0, 2.0])
+        >>> b = SparseVector(4, [2], [1.0])
         >>> a.dot(b)
         0.0
         >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
@@ -624,11 +624,11 @@ def squared_distance(self, other):
         11.0
         >>> a.squared_distance(np.array([1., 2., 3., 4.]))
         11.0
-        >>> b = SparseVector(4, [2, 4], [1.0, 2.0])
+        >>> b = SparseVector(4, [2], [1.0])
         >>> a.squared_distance(b)
-        30.0
+        26.0
         >>> b.squared_distance(a)
-        30.0
+        26.0
         >>> b.squared_distance([1., 2.])
         Traceback (most recent call last):
             ...

From ff3c72dbafa16c6158fc36619f3c38344c452ba0 Mon Sep 17 00:00:00 2001
From: Meihua Wu <meihuawu@umich.edu>
Date: Mon, 20 Jul 2015 17:03:46 -0700
Subject: [PATCH 275/549] [SPARK-9175] [MLLIB] BLAS.gemm fails to update matrix
 C when alpha==0 and beta!=1

Fix BLAS.gemm to update matrix C when alpha==0 and beta!=1
Also include unit tests to verify the fix.

mengxr brkyvz

Author: Meihua Wu <meihuawu@umich.edu>

Closes #7503 from rotationsymmetry/fix_BLAS_gemm and squashes the following commits:

fce199c [Meihua Wu] Fix BLAS.gemm to update C when alpha==0 and beta!=1
---
 .../org/apache/spark/mllib/linalg/BLAS.scala     |  4 ++--
 .../apache/spark/mllib/linalg/BLASSuite.scala    | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
index 3523f1804325d..9029093e0fa08 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
@@ -303,8 +303,8 @@ private[spark] object BLAS extends Serializable with Logging {
       C: DenseMatrix): Unit = {
     require(!C.isTransposed,
       "The matrix C cannot be the product of a transpose() call. C.isTransposed must be false.")
-    if (alpha == 0.0) {
-      logDebug("gemm: alpha is equal to 0. Returning C.")
+    if (alpha == 0.0 && beta == 1.0) {
+      logDebug("gemm: alpha is equal to 0 and beta is equal to 1. Returning C.")
     } else {
       A match {
         case sparse: SparseMatrix => gemm(alpha, sparse, B, beta, C)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
index b0f3f71113c57..d119e0b50a393 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
@@ -200,8 +200,14 @@ class BLASSuite extends SparkFunSuite {
     val C10 = C1.copy
     val C11 = C1.copy
     val C12 = C1.copy
+    val C13 = C1.copy
+    val C14 = C1.copy
+    val C15 = C1.copy
+    val C16 = C1.copy
     val expected2 = new DenseMatrix(4, 2, Array(2.0, 1.0, 4.0, 2.0, 4.0, 0.0, 4.0, 3.0))
     val expected3 = new DenseMatrix(4, 2, Array(2.0, 2.0, 4.0, 2.0, 8.0, 0.0, 6.0, 6.0))
+    val expected4 = new DenseMatrix(4, 2, Array(5.0, 0.0, 10.0, 5.0, 0.0, 0.0, 5.0, 0.0))
+    val expected5 = C1.copy
 
     gemm(1.0, dA, B, 2.0, C1)
     gemm(1.0, sA, B, 2.0, C2)
@@ -248,6 +254,16 @@ class BLASSuite extends SparkFunSuite {
     assert(C10 ~== expected2 absTol 1e-15)
     assert(C11 ~== expected3 absTol 1e-15)
     assert(C12 ~== expected3 absTol 1e-15)
+
+    gemm(0, dA, B, 5, C13)
+    gemm(0, sA, B, 5, C14)
+    gemm(0, dA, B, 1, C15)
+    gemm(0, sA, B, 1, C16)
+    assert(C13 ~== expected4 absTol 1e-15)
+    assert(C14 ~== expected4 absTol 1e-15)
+    assert(C15 ~== expected5 absTol 1e-15)
+    assert(C16 ~== expected5 absTol 1e-15)
+
   }
 
   test("gemv") {

From 66bb8003b949860b8652542e1232bc48665448c2 Mon Sep 17 00:00:00 2001
From: Carson Wang <carson.wang@intel.com>
Date: Mon, 20 Jul 2015 18:08:59 -0700
Subject: [PATCH 276/549] [SPARK-9187] [WEBUI] Timeline view may show negative
 value for running tasks

For running tasks, the executorRunTime metrics is 0 which causes negative executorComputingTime in the timeline. It also causes an incorrect SchedulerDelay time.
![timelinenegativevalue](https://cloud.githubusercontent.com/assets/9278199/8770953/f4362378-2eec-11e5-81e6-a06a07c04794.png)

Author: Carson Wang <carson.wang@intel.com>

Closes #7526 from carsonwang/timeline-negValue and squashes the following commits:

7b17db2 [Carson Wang] Fix negative value in timeline view
---
 .../org/apache/spark/ui/jobs/StagePage.scala  | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index 27b82aaddd2e4..6e077bf3e70d5 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -537,20 +537,27 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
           (metricsOpt.flatMap(_.shuffleWriteMetrics
             .map(_.shuffleWriteTime)).getOrElse(0L) / 1e6).toLong
         val shuffleWriteTimeProportion = toProportion(shuffleWriteTime)
-        val executorComputingTime = metricsOpt.map(_.executorRunTime).getOrElse(0L) -
-          shuffleReadTime - shuffleWriteTime
-        val executorComputingTimeProportion = toProportion(executorComputingTime)
+
         val serializationTime = metricsOpt.map(_.resultSerializationTime).getOrElse(0L)
         val serializationTimeProportion = toProportion(serializationTime)
         val deserializationTime = metricsOpt.map(_.executorDeserializeTime).getOrElse(0L)
         val deserializationTimeProportion = toProportion(deserializationTime)
         val gettingResultTime = getGettingResultTime(taskUIData.taskInfo, currentTime)
         val gettingResultTimeProportion = toProportion(gettingResultTime)
-        val schedulerDelay = totalExecutionTime -
-          (executorComputingTime + shuffleReadTime + shuffleWriteTime +
-            serializationTime + deserializationTime + gettingResultTime)
-        val schedulerDelayProportion =
-          (100 - executorComputingTimeProportion - shuffleReadTimeProportion -
+        val schedulerDelay =
+          metricsOpt.map(getSchedulerDelay(taskInfo, _, currentTime)).getOrElse(0L)
+        val schedulerDelayProportion = toProportion(schedulerDelay)
+
+        val executorOverhead = serializationTime + deserializationTime
+        val executorRunTime = if (taskInfo.running) {
+          totalExecutionTime - executorOverhead - gettingResultTime
+        } else {
+          metricsOpt.map(_.executorRunTime).getOrElse(
+            totalExecutionTime - executorOverhead - gettingResultTime)
+        }
+        val executorComputingTime = executorRunTime - shuffleReadTime - shuffleWriteTime
+        val executorComputingTimeProportion =
+          (100 - schedulerDelayProportion - shuffleReadTimeProportion -
             shuffleWriteTimeProportion - serializationTimeProportion -
             deserializationTimeProportion - gettingResultTimeProportion)
 

From 047ccc8c9a88e74f7bc87709ee5d531f1d7a4228 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 18:16:49 -0700
Subject: [PATCH 277/549] [SPARK-9178][SQL] Add an empty string constant to
 UTF8String

Jira: https://issues.apache.org/jira/browse/SPARK-9178

In order to avoid calls of `UTF8String.fromString("")` this pr adds an `EMPTY_STRING` constant to `UTF8String`. An `UTF8String` is immutable, so we can use a constant, isn't it?

I searched for current usage of `UTF8String.fromString("")` with
`grep -R  "UTF8String.fromString(\"\")" .`

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7509 from tarekauel/SPARK-9178 and squashes the following commits:

8d6c405 [Tarek Auel] [SPARK-9178] revert intellij indents
3627b80 [Tarek Auel] [SPARK-9178] revert concat tests changes
3f5fbf5 [Tarek Auel] [SPARK-9178] rebase and add final to UTF8String.EMPTY_UTF8
47cda68 [Tarek Auel] Merge branch 'master' into SPARK-9178
4a37344 [Tarek Auel] [SPARK-9178] changed name to EMPTY_UTF8, added tests
748b87a [Tarek Auel] [SPARK-9178] Add empty string constant to UTF8String
---
 .../apache/spark/unsafe/types/UTF8String.java |  2 +
 .../spark/unsafe/types/UTF8StringSuite.java   | 76 +++++++++----------
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 819639f300177..fc63fe537d226 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -50,6 +50,8 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
     5, 5, 5, 5,
     6, 6, 6, 6};
 
+  public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");
+
   /**
    * Creates an UTF8String from byte array, which should be encoded in UTF-8.
    *
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 6a21c27461163..d730b1d1384f5 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -54,6 +54,14 @@ public void basicTest() throws UnsupportedEncodingException {
     checkBasic("大 千 世 界", 7);
   }
 
+  @Test
+  public void emptyStringTest() {
+    assertEquals(fromString(""), EMPTY_UTF8);
+    assertEquals(fromBytes(new byte[0]), EMPTY_UTF8);
+    assertEquals(0, EMPTY_UTF8.numChars());
+    assertEquals(0, EMPTY_UTF8.numBytes());
+  }
+
   @Test
   public void compareTo() {
     assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
@@ -88,9 +96,9 @@ public void upperAndLower() {
 
   @Test
   public void concatTest() {
-    assertEquals(fromString(""), concat());
+    assertEquals(EMPTY_UTF8, concat());
     assertEquals(null, concat((UTF8String) null));
-    assertEquals(fromString(""), concat(fromString("")));
+    assertEquals(EMPTY_UTF8, concat(EMPTY_UTF8));
     assertEquals(fromString("ab"), concat(fromString("ab")));
     assertEquals(fromString("ab"), concat(fromString("a"), fromString("b")));
     assertEquals(fromString("abc"), concat(fromString("a"), fromString("b"), fromString("c")));
@@ -109,8 +117,8 @@ public void concatWsTest() {
     // If separator is null, concatWs should skip all null inputs and never return null.
     UTF8String sep = fromString("哈哈");
     assertEquals(
-      fromString(""),
-      concatWs(sep, fromString("")));
+      EMPTY_UTF8,
+      concatWs(sep, EMPTY_UTF8));
     assertEquals(
       fromString("ab"),
       concatWs(sep, fromString("ab")));
@@ -127,7 +135,7 @@ public void concatWsTest() {
       fromString("a"),
       concatWs(sep, fromString("a"), null, null));
     assertEquals(
-      fromString(""),
+      EMPTY_UTF8,
       concatWs(sep, null, null, null));
     assertEquals(
       fromString("数据哈哈砖头"),
@@ -136,7 +144,7 @@ public void concatWsTest() {
 
   @Test
   public void contains() {
-    assertTrue(fromString("").contains(fromString("")));
+    assertTrue(EMPTY_UTF8.contains(EMPTY_UTF8));
     assertTrue(fromString("hello").contains(fromString("ello")));
     assertFalse(fromString("hello").contains(fromString("vello")));
     assertFalse(fromString("hello").contains(fromString("hellooo")));
@@ -147,7 +155,7 @@ public void contains() {
 
   @Test
   public void startsWith() {
-    assertTrue(fromString("").startsWith(fromString("")));
+    assertTrue(EMPTY_UTF8.startsWith(EMPTY_UTF8));
     assertTrue(fromString("hello").startsWith(fromString("hell")));
     assertFalse(fromString("hello").startsWith(fromString("ell")));
     assertFalse(fromString("hello").startsWith(fromString("hellooo")));
@@ -158,7 +166,7 @@ public void startsWith() {
 
   @Test
   public void endsWith() {
-    assertTrue(fromString("").endsWith(fromString("")));
+    assertTrue(EMPTY_UTF8.endsWith(EMPTY_UTF8));
     assertTrue(fromString("hello").endsWith(fromString("ello")));
     assertFalse(fromString("hello").endsWith(fromString("ellov")));
     assertFalse(fromString("hello").endsWith(fromString("hhhello")));
@@ -169,7 +177,7 @@ public void endsWith() {
 
   @Test
   public void substring() {
-    assertEquals(fromString(""), fromString("hello").substring(0, 0));
+    assertEquals(EMPTY_UTF8, fromString("hello").substring(0, 0));
     assertEquals(fromString("el"), fromString("hello").substring(1, 3));
     assertEquals(fromString("数"), fromString("数据砖头").substring(0, 1));
     assertEquals(fromString("据砖"), fromString("数据砖头").substring(1, 3));
@@ -183,9 +191,9 @@ public void trims() {
     assertEquals(fromString("hello "), fromString("  hello ").trimLeft());
     assertEquals(fromString("  hello"), fromString("  hello ").trimRight());
 
-    assertEquals(fromString(""), fromString("  ").trim());
-    assertEquals(fromString(""), fromString("  ").trimLeft());
-    assertEquals(fromString(""), fromString("  ").trimRight());
+    assertEquals(EMPTY_UTF8, fromString("  ").trim());
+    assertEquals(EMPTY_UTF8, fromString("  ").trimLeft());
+    assertEquals(EMPTY_UTF8, fromString("  ").trimRight());
 
     assertEquals(fromString("数据砖头"), fromString("  数据砖头 ").trim());
     assertEquals(fromString("数据砖头 "), fromString("  数据砖头 ").trimLeft());
@@ -198,9 +206,9 @@ public void trims() {
 
   @Test
   public void indexOf() {
-    assertEquals(0, fromString("").indexOf(fromString(""), 0));
-    assertEquals(-1, fromString("").indexOf(fromString("l"), 0));
-    assertEquals(0, fromString("hello").indexOf(fromString(""), 0));
+    assertEquals(0, EMPTY_UTF8.indexOf(EMPTY_UTF8, 0));
+    assertEquals(-1, EMPTY_UTF8.indexOf(fromString("l"), 0));
+    assertEquals(0, fromString("hello").indexOf(EMPTY_UTF8, 0));
     assertEquals(2, fromString("hello").indexOf(fromString("l"), 0));
     assertEquals(3, fromString("hello").indexOf(fromString("l"), 3));
     assertEquals(-1, fromString("hello").indexOf(fromString("a"), 0));
@@ -215,7 +223,7 @@ public void indexOf() {
   @Test
   public void reverse() {
     assertEquals(fromString("olleh"), fromString("hello").reverse());
-    assertEquals(fromString(""), fromString("").reverse());
+    assertEquals(EMPTY_UTF8, EMPTY_UTF8.reverse());
     assertEquals(fromString("者行孙"), fromString("孙行者").reverse());
     assertEquals(fromString("者行孙 olleh"), fromString("hello 孙行者").reverse());
   }
@@ -224,7 +232,7 @@ public void reverse() {
   public void repeat() {
     assertEquals(fromString("数d数d数d数d数d"), fromString("数d").repeat(5));
     assertEquals(fromString("数d"), fromString("数d").repeat(1));
-    assertEquals(fromString(""), fromString("数d").repeat(-1));
+    assertEquals(EMPTY_UTF8, fromString("数d").repeat(-1));
   }
 
   @Test
@@ -234,14 +242,14 @@ public void pad() {
     assertEquals(fromString("?hello"), fromString("hello").lpad(6, fromString("????")));
     assertEquals(fromString("???????hello"), fromString("hello").lpad(12, fromString("????")));
     assertEquals(fromString("?????hello"), fromString("hello").lpad(10, fromString("?????")));
-    assertEquals(fromString("???????"), fromString("").lpad(7, fromString("?????")));
+    assertEquals(fromString("???????"), EMPTY_UTF8.lpad(7, fromString("?????")));
 
     assertEquals(fromString("hel"), fromString("hello").rpad(3, fromString("????")));
     assertEquals(fromString("hello"), fromString("hello").rpad(5, fromString("????")));
     assertEquals(fromString("hello?"), fromString("hello").rpad(6, fromString("????")));
     assertEquals(fromString("hello???????"), fromString("hello").rpad(12, fromString("????")));
     assertEquals(fromString("hello?????"), fromString("hello").rpad(10, fromString("?????")));
-    assertEquals(fromString("???????"), fromString("").rpad(7, fromString("?????")));
+    assertEquals(fromString("???????"), EMPTY_UTF8.rpad(7, fromString("?????")));
 
 
     assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, fromString("????")));
@@ -265,26 +273,16 @@ public void pad() {
   
   @Test
   public void levenshteinDistance() {
-    assertEquals(
-        UTF8String.fromString("").levenshteinDistance(UTF8String.fromString("")), 0);
-    assertEquals(
-        UTF8String.fromString("").levenshteinDistance(UTF8String.fromString("a")), 1);
-    assertEquals(
-        UTF8String.fromString("aaapppp").levenshteinDistance(UTF8String.fromString("")), 7);
-    assertEquals(
-        UTF8String.fromString("frog").levenshteinDistance(UTF8String.fromString("fog")), 1);
-    assertEquals(
-        UTF8String.fromString("fly").levenshteinDistance(UTF8String.fromString("ant")),3);
-    assertEquals(
-        UTF8String.fromString("elephant").levenshteinDistance(UTF8String.fromString("hippo")), 7);
-    assertEquals(
-        UTF8String.fromString("hippo").levenshteinDistance(UTF8String.fromString("elephant")), 7);
-    assertEquals(
-        UTF8String.fromString("hippo").levenshteinDistance(UTF8String.fromString("zzzzzzzz")), 8);
-    assertEquals(
-        UTF8String.fromString("hello").levenshteinDistance(UTF8String.fromString("hallo")),1);
-    assertEquals(
-        UTF8String.fromString("世界千世").levenshteinDistance(UTF8String.fromString("千a世b")),4);
+    assertEquals(EMPTY_UTF8.levenshteinDistance(EMPTY_UTF8), 0);
+    assertEquals(EMPTY_UTF8.levenshteinDistance(fromString("a")), 1);
+    assertEquals(fromString("aaapppp").levenshteinDistance(EMPTY_UTF8), 7);
+    assertEquals(fromString("frog").levenshteinDistance(fromString("fog")), 1);
+    assertEquals(fromString("fly").levenshteinDistance(fromString("ant")),3);
+    assertEquals(fromString("elephant").levenshteinDistance(fromString("hippo")), 7);
+    assertEquals(fromString("hippo").levenshteinDistance(fromString("elephant")), 7);
+    assertEquals(fromString("hippo").levenshteinDistance(fromString("zzzzzzzz")), 8);
+    assertEquals(fromString("hello").levenshteinDistance(fromString("hallo")),1);
+    assertEquals(fromString("世界千世").levenshteinDistance(fromString("千a世b")),4);
   }
 
   @Test

From 6853ac7c8c76003160fc861ddcc8e8e39e4a5924 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 18:21:05 -0700
Subject: [PATCH 278/549] [SPARK-9156][SQL] codegen StringSplit

Jira: https://issues.apache.org/jira/browse/SPARK-9156

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7547 from tarekauel/SPARK-9156 and squashes the following commits:

0be2700 [Tarek Auel] [SPARK-9156][SQL] indention fix
b860eaf [Tarek Auel] [SPARK-9156][SQL] codegen StringSplit
5ad6a1f [Tarek Auel] [SPARK-9156] codegen StringSplit
---
 .../sql/catalyst/expressions/stringOperations.scala  | 12 ++++++++----
 .../org/apache/spark/unsafe/types/UTF8String.java    |  9 +++++++++
 .../apache/spark/unsafe/types/UTF8StringSuite.java   | 11 +++++++++++
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index a5682428b3d40..5c1908d55576a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -615,7 +615,7 @@ case class StringSpace(child: Expression)
  * Splits str around pat (pattern is a regular expression).
  */
 case class StringSplit(str: Expression, pattern: Expression)
-  extends BinaryExpression with ImplicitCastInputTypes with CodegenFallback {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   override def left: Expression = str
   override def right: Expression = pattern
@@ -623,9 +623,13 @@ case class StringSplit(str: Expression, pattern: Expression)
   override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
 
   override def nullSafeEval(string: Any, regex: Any): Any = {
-    val splits =
-      string.asInstanceOf[UTF8String].toString.split(regex.asInstanceOf[UTF8String].toString, -1)
-    splits.toSeq.map(UTF8String.fromString)
+    string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1).toSeq
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (str, pattern) =>
+      s"""${ev.primitive} = scala.collection.JavaConversions.asScalaBuffer(
+            java.util.Arrays.asList($str.split($pattern, -1)));""")
   }
 
   override def prettyName: String = "split"
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index fc63fe537d226..ed354f7f877f1 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -487,6 +487,15 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
     return fromBytes(result);
   }
 
+  public UTF8String[] split(UTF8String pattern, int limit) {
+    String[] splits = toString().split(pattern.toString(), limit);
+    UTF8String[] res = new UTF8String[splits.length];
+    for (int i = 0; i < res.length; i++) {
+      res[i] = fromString(splits[i]);
+    }
+    return res;
+  }
+
   @Override
   public String toString() {
     try {
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index d730b1d1384f5..1f5572c509bdb 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -18,6 +18,7 @@
 package org.apache.spark.unsafe.types;
 
 import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
 
 import org.junit.Test;
 
@@ -270,6 +271,16 @@ public void pad() {
       fromString("数据砖头孙行者孙行者孙行"),
       fromString("数据砖头").rpad(12, fromString("孙行者")));
   }
+
+  @Test
+  public void split() {
+    assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), -1),
+      new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi")}));
+    assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
+      new UTF8String[]{fromString("ab"), fromString("def,ghi")}));
+    assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
+      new UTF8String[]{fromString("ab"), fromString("def,ghi")}));
+  }
   
   @Test
   public void levenshteinDistance() {

From e90543e5366808332bbde18d78cccd4d064a3338 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 20 Jul 2015 18:23:51 -0700
Subject: [PATCH 279/549] [SPARK-9142][SQL] Removing unnecessary self types in
 expressions.

Also added documentation to expressions to explain the important traits and abstract classes.

Author: Reynold Xin <rxin@databricks.com>

Closes #7550 from rxin/remove-self-types and squashes the following commits:

b2a3ec1 [Reynold Xin] [SPARK-9142][SQL] Removing unnecessary self types in expressions.
---
 .../expressions/ExpectsInputTypes.scala       |  4 +--
 .../sql/catalyst/expressions/Expression.scala | 33 +++++++++++--------
 .../expressions/codegen/CodegenFallback.scala |  2 +-
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
index ded89e85dea79..abe6457747550 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion.ImplicitTypeCasts
  *
  * Most function expressions (e.g. [[Substring]] should extends [[ImplicitCastInputTypes]]) instead.
  */
-trait ExpectsInputTypes { self: Expression =>
+trait ExpectsInputTypes extends Expression {
 
   /**
    * Expected input types from child expressions. The i-th position in the returned seq indicates
@@ -60,6 +60,6 @@ trait ExpectsInputTypes { self: Expression =>
 /**
  * A mixin for the analyzer to perform implicit type casting using [[ImplicitTypeCasts]].
  */
-trait ImplicitCastInputTypes extends ExpectsInputTypes { self: Expression =>
+trait ImplicitCastInputTypes extends ExpectsInputTypes {
   // No other methods
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index da599b8963340..aada25276adb7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -19,19 +19,12 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedAttribute}
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types._
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-// This file defines the basic expression abstract classes in Catalyst, including:
-// Expression: the base expression abstract class
-// LeafExpression
-// UnaryExpression
-// BinaryExpression
-// BinaryOperator
-//
-// For details, see their classdocs.
+// This file defines the basic expression abstract classes in Catalyst.
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /**
@@ -39,9 +32,21 @@ import org.apache.spark.sql.types._
  *
  * If an expression wants to be exposed in the function registry (so users can call it with
  * "name(arguments...)", the concrete implementation must be a case class whose constructor
- * arguments are all Expressions types.
+ * arguments are all Expressions types. See [[Substring]] for an example.
+ *
+ * There are a few important traits:
+ *
+ * - [[Nondeterministic]]: an expression that is not deterministic.
+ * - [[Unevaluable]]: an expression that is not supposed to be evaluated.
+ * - [[CodegenFallback]]: an expression that does not have code gen implemented and falls back to
+ *                        interpreted mode.
+ *
+ * - [[LeafExpression]]: an expression that has no child.
+ * - [[UnaryExpression]]: an expression that has one child.
+ * - [[BinaryExpression]]: an expression that has two children.
+ * - [[BinaryOperator]]: a special case of [[BinaryExpression]] that requires two children to have
+ *                       the same output data type.
  *
- * See [[Substring]] for an example.
  */
 abstract class Expression extends TreeNode[Expression] {
 
@@ -176,7 +181,7 @@ abstract class Expression extends TreeNode[Expression] {
  * An expression that cannot be evaluated. Some expressions don't live past analysis or optimization
  * time (e.g. Star). This trait is used by those expressions.
  */
-trait Unevaluable { self: Expression =>
+trait Unevaluable extends Expression {
 
   override def eval(input: InternalRow = null): Any =
     throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
@@ -185,11 +190,11 @@ trait Unevaluable { self: Expression =>
     throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
 }
 
+
 /**
  * An expression that is nondeterministic.
  */
-trait Nondeterministic { self: Expression =>
-
+trait Nondeterministic extends Expression {
   override def deterministic: Boolean = false
 }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala
index bf4f600cb26e5..6b187f05604fd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala
@@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.expressions.Expression
 /**
  * A trait that can be used to provide a fallback mode for expression code generation.
  */
-trait CodegenFallback { self: Expression =>
+trait CodegenFallback extends Expression {
 
   protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     ctx.references += this

From 936a96cb31a6dd7d8685bce05103e779ca02e763 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 19:17:59 -0700
Subject: [PATCH 280/549] [SPARK-9164] [SQL] codegen hex/unhex

Jira: https://issues.apache.org/jira/browse/SPARK-9164

The diff looks heavy, but I just moved the `hex` and `unhex` methods to `object Hex`.  This allows me to call them from `eval` and `codeGen`

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7548 from tarekauel/SPARK-9164 and squashes the following commits:

dd91c57 [Tarek Auel] [SPARK-9164][SQL] codegen hex/unhex
---
 .../spark/sql/catalyst/expressions/math.scala | 96 +++++++++++--------
 1 file changed, 57 insertions(+), 39 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 7ce64d29ba59a..7a9be02ba45b3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -489,28 +489,8 @@ object Hex {
     (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
     array
   }
-}
 
-/**
- * If the argument is an INT or binary, hex returns the number as a STRING in hexadecimal format.
- * Otherwise if the number is a STRING, it converts each character into its hex representation
- * and returns the resulting STRING. Negative numbers would be treated as two's complement.
- */
-case class Hex(child: Expression)
-  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
-
-  override def inputTypes: Seq[AbstractDataType] =
-    Seq(TypeCollection(LongType, BinaryType, StringType))
-
-  override def dataType: DataType = StringType
-
-  protected override def nullSafeEval(num: Any): Any = child.dataType match {
-    case LongType => hex(num.asInstanceOf[Long])
-    case BinaryType => hex(num.asInstanceOf[Array[Byte]])
-    case StringType => hex(num.asInstanceOf[UTF8String].getBytes)
-  }
-
-  private[this] def hex(bytes: Array[Byte]): UTF8String = {
+  def hex(bytes: Array[Byte]): UTF8String = {
     val length = bytes.length
     val value = new Array[Byte](length * 2)
     var i = 0
@@ -522,7 +502,7 @@ case class Hex(child: Expression)
     UTF8String.fromBytes(value)
   }
 
-  private def hex(num: Long): UTF8String = {
+  def hex(num: Long): UTF8String = {
     // Extract the hex digits of num into value[] from right to left
     val value = new Array[Byte](16)
     var numBuf = num
@@ -534,24 +514,8 @@ case class Hex(child: Expression)
     } while (numBuf != 0)
     UTF8String.fromBytes(java.util.Arrays.copyOfRange(value, value.length - len, value.length))
   }
-}
 
-/**
- * Performs the inverse operation of HEX.
- * Resulting characters are returned as a byte array.
- */
-case class Unhex(child: Expression)
-  extends UnaryExpression with ImplicitCastInputTypes with CodegenFallback {
-
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
-
-  override def nullable: Boolean = true
-  override def dataType: DataType = BinaryType
-
-  protected override def nullSafeEval(num: Any): Any =
-    unhex(num.asInstanceOf[UTF8String].getBytes)
-
-  private[this] def unhex(bytes: Array[Byte]): Array[Byte] = {
+  def unhex(bytes: Array[Byte]): Array[Byte] = {
     val out = new Array[Byte]((bytes.length + 1) >> 1)
     var i = 0
     if ((bytes.length & 0x01) != 0) {
@@ -583,6 +547,60 @@ case class Unhex(child: Expression)
   }
 }
 
+/**
+ * If the argument is an INT or binary, hex returns the number as a STRING in hexadecimal format.
+ * Otherwise if the number is a STRING, it converts each character into its hex representation
+ * and returns the resulting STRING. Negative numbers would be treated as two's complement.
+ */
+case class Hex(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(TypeCollection(LongType, BinaryType, StringType))
+
+  override def dataType: DataType = StringType
+
+  protected override def nullSafeEval(num: Any): Any = child.dataType match {
+    case LongType => Hex.hex(num.asInstanceOf[Long])
+    case BinaryType => Hex.hex(num.asInstanceOf[Array[Byte]])
+    case StringType => Hex.hex(num.asInstanceOf[UTF8String].getBytes)
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (c) => {
+      val hex = Hex.getClass.getName.stripSuffix("$")
+      s"${ev.primitive} = " + (child.dataType match {
+        case StringType => s"""$hex.hex($c.getBytes());"""
+        case _ => s"""$hex.hex($c);"""
+      })
+    })
+  }
+}
+
+/**
+ * Performs the inverse operation of HEX.
+ * Resulting characters are returned as a byte array.
+ */
+case class Unhex(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  override def nullable: Boolean = true
+  override def dataType: DataType = BinaryType
+
+  protected override def nullSafeEval(num: Any): Any =
+    Hex.unhex(num.asInstanceOf[UTF8String].getBytes)
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (c) => {
+      val hex = Hex.getClass.getName.stripSuffix("$")
+      s"""
+        ${ev.primitive} = $hex.unhex($c.getBytes());
+        ${ev.isNull} = ${ev.primitive} == null;
+       """
+    })
+  }
+}
+
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////

From 2bdf9914ab709bf9c1cdd17fc5dd7a69f6d46f29 Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA <yuu.ishikawa@gmail.com>
Date: Tue, 21 Jul 2015 11:38:22 +0900
Subject: [PATCH 281/549] [SPARK-9052] [SPARKR] Fix comments after curly braces

[[SPARK-9052] Fix comments after curly braces - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9052)

This is the full result of lintr at the rivision:011551620faa87107a787530f074af3d9be7e695.
[[SPARK-9052] the result of lint-r at the revision:011551620faa87107a787530f074af3d9be7e695](https://gist.github.com/yu-iskw/e7246041b173a3f29482)

This is the difference of the result between before and after.
https://gist.github.com/yu-iskw/e7246041b173a3f29482/revisions

Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com>

Closes #7440 from yu-iskw/SPARK-9052 and squashes the following commits:

015d738 [Yu ISHIKAWA] Fix the indentations and move the placement of commna
5cc30fe [Yu ISHIKAWA] Fix the indentation in a condition
4ead0e5 [Yu ISHIKAWA] [SPARK-9052][SparkR] Fix comments after curly braces
---
 R/pkg/R/schema.R | 13 ++++++++-----
 R/pkg/R/utils.R  | 33 ++++++++++++++++++++++-----------
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R
index 06df430687682..79c744ef29c23 100644
--- a/R/pkg/R/schema.R
+++ b/R/pkg/R/schema.R
@@ -69,11 +69,14 @@ structType.structField <- function(x, ...) {
 #' @param ... further arguments passed to or from other methods
 print.structType <- function(x, ...) {
   cat("StructType\n",
-      sapply(x$fields(), function(field) { paste("|-", "name = \"", field$name(),
-                                           "\", type = \"", field$dataType.toString(),
-                                           "\", nullable = ", field$nullable(), "\n",
-                                           sep = "") })
-      , sep = "")
+      sapply(x$fields(),
+             function(field) {
+               paste("|-", "name = \"", field$name(),
+                     "\", type = \"", field$dataType.toString(),
+                     "\", nullable = ", field$nullable(), "\n",
+                     sep = "")
+             }),
+      sep = "")
 }
 
 #' structField
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 950ba74dbe017..3f45589a50443 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -390,14 +390,17 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) {
       for (i in 1:nodeLen) {
         processClosure(node[[i]], oldEnv, defVars, checkedFuncs, newEnv)
       }
-    } else {  # if node[[1]] is length of 1, check for some R special functions.
+    } else {
+      # if node[[1]] is length of 1, check for some R special functions.
       nodeChar <- as.character(node[[1]])
-      if (nodeChar == "{" || nodeChar == "(") {  # Skip start symbol.
+      if (nodeChar == "{" || nodeChar == "(") {
+        # Skip start symbol.
         for (i in 2:nodeLen) {
           processClosure(node[[i]], oldEnv, defVars, checkedFuncs, newEnv)
         }
       } else if (nodeChar == "<-" || nodeChar == "=" ||
-                   nodeChar == "<<-") { # Assignment Ops.
+                   nodeChar == "<<-") {
+        # Assignment Ops.
         defVar <- node[[2]]
         if (length(defVar) == 1 && typeof(defVar) == "symbol") {
           # Add the defined variable name into defVars.
@@ -408,14 +411,16 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) {
         for (i in 3:nodeLen) {
           processClosure(node[[i]], oldEnv, defVars, checkedFuncs, newEnv)
         }
-      } else if (nodeChar == "function") {  # Function definition.
+      } else if (nodeChar == "function") {
+        # Function definition.
         # Add parameter names.
         newArgs <- names(node[[2]])
         lapply(newArgs, function(arg) { addItemToAccumulator(defVars, arg) })
         for (i in 3:nodeLen) {
           processClosure(node[[i]], oldEnv, defVars, checkedFuncs, newEnv)
         }
-      } else if (nodeChar == "$") {  # Skip the field.
+      } else if (nodeChar == "$") {
+        # Skip the field.
         processClosure(node[[2]], oldEnv, defVars, checkedFuncs, newEnv)
       } else if (nodeChar == "::" || nodeChar == ":::") {
         processClosure(node[[3]], oldEnv, defVars, checkedFuncs, newEnv)
@@ -429,7 +434,8 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) {
                (typeof(node) == "symbol" || typeof(node) == "language")) {
     # Base case: current AST node is a leaf node and a symbol or a function call.
     nodeChar <- as.character(node)
-    if (!nodeChar %in% defVars$data) {  # Not a function parameter or local variable.
+    if (!nodeChar %in% defVars$data) {
+      # Not a function parameter or local variable.
       func.env <- oldEnv
       topEnv <- parent.env(.GlobalEnv)
       # Search in function environment, and function's enclosing environments
@@ -439,20 +445,24 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) {
       while (!identical(func.env, topEnv)) {
         # Namespaces other than "SparkR" will not be searched.
         if (!isNamespace(func.env) ||
-              (getNamespaceName(func.env) == "SparkR" &&
-              !(nodeChar %in% getNamespaceExports("SparkR")))) {  # Only include SparkR internals.
+            (getNamespaceName(func.env) == "SparkR" &&
+               !(nodeChar %in% getNamespaceExports("SparkR")))) {
+          # Only include SparkR internals.
+
           # Set parameter 'inherits' to FALSE since we do not need to search in
           # attached package environments.
           if (tryCatch(exists(nodeChar, envir = func.env, inherits = FALSE),
                        error = function(e) { FALSE })) {
             obj <- get(nodeChar, envir = func.env, inherits = FALSE)
-            if (is.function(obj)) {  # If the node is a function call.
+            if (is.function(obj)) {
+              # If the node is a function call.
               funcList <- mget(nodeChar, envir = checkedFuncs, inherits = F,
                                ifnotfound = list(list(NULL)))[[1]]
               found <- sapply(funcList, function(func) {
                 ifelse(identical(func, obj), TRUE, FALSE)
               })
-              if (sum(found) > 0) {  # If function has been examined, ignore.
+              if (sum(found) > 0) {
+                # If function has been examined, ignore.
                 break
               }
               # Function has not been examined, record it and recursively clean its closure.
@@ -495,7 +505,8 @@ cleanClosure <- function(func, checkedFuncs = new.env()) {
     # environment. First, function's arguments are added to defVars.
     defVars <- initAccumulator()
     argNames <- names(as.list(args(func)))
-    for (i in 1:(length(argNames) - 1)) {  # Remove the ending NULL in pairlist.
+    for (i in 1:(length(argNames) - 1)) {
+      # Remove the ending NULL in pairlist.
       addItemToAccumulator(defVars, argNames[i])
     }
     # Recursively examine variables in the function body.

From 1cbdd8991898912a8471a7070c472a0edb92487c Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 20 Jul 2015 20:49:38 -0700
Subject: [PATCH 282/549] [SPARK-9201] [ML] Initial integration of MLlib +
 SparkR using RFormula

This exposes the SparkR:::glm() and SparkR:::predict() APIs. It was necessary to change RFormula to silently drop the label column if it was missing from the input dataset, which is kind of a hack but necessary to integrate with the Pipeline API.

The umbrella design doc for MLlib + SparkR integration can be viewed here: https://docs.google.com/document/d/10NZNSEurN2EdWM31uFYsgayIPfCFHiuIu3pCWrUmP_c/edit

mengxr

Author: Eric Liang <ekl@databricks.com>

Closes #7483 from ericl/spark-8774 and squashes the following commits:

3dfac0c [Eric Liang] update
17ef516 [Eric Liang] more comments
1753a0f [Eric Liang] make glm generic
b0f50f8 [Eric Liang] equivalence test
550d56d [Eric Liang] export methods
c015697 [Eric Liang] second pass
117949a [Eric Liang] comments
5afbc67 [Eric Liang] test label columns
6b7f15f [Eric Liang] Fri Jul 17 14:20:22 PDT 2015
3a63ae5 [Eric Liang] Fri Jul 17 13:41:52 PDT 2015
ce61367 [Eric Liang] Fri Jul 17 13:41:17 PDT 2015
0299c59 [Eric Liang] Fri Jul 17 13:40:32 PDT 2015
e37603f [Eric Liang] Fri Jul 17 12:15:03 PDT 2015
d417d0c [Eric Liang] Merge remote-tracking branch 'upstream/master' into spark-8774
29a2ce7 [Eric Liang] Merge branch 'spark-8774-1' into spark-8774
d1959d2 [Eric Liang] clarify comment
2db68aa [Eric Liang] second round of comments
dc3c943 [Eric Liang] address comments
5765ec6 [Eric Liang] fix style checks
1f361b0 [Eric Liang] doc
d33211b [Eric Liang] r support
fb0826b [Eric Liang] [SPARK-8774] Add R model formula with basic support as a transformer
---
 R/pkg/DESCRIPTION                             |  1 +
 R/pkg/NAMESPACE                               |  4 +
 R/pkg/R/generics.R                            |  4 +
 R/pkg/R/mllib.R                               | 73 +++++++++++++++++++
 R/pkg/inst/tests/test_mllib.R                 | 42 +++++++++++
 .../apache/spark/ml/feature/RFormula.scala    | 14 +++-
 .../apache/spark/ml/r/SparkRWrappers.scala    | 41 +++++++++++
 .../spark/ml/feature/RFormulaSuite.scala      |  9 +++
 8 files changed, 185 insertions(+), 3 deletions(-)
 create mode 100644 R/pkg/R/mllib.R
 create mode 100644 R/pkg/inst/tests/test_mllib.R
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index d028821534b1a..4949d86d20c91 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -29,6 +29,7 @@ Collate:
     'client.R'
     'context.R'
     'deserialize.R'
+    'mllib.R'
     'serialize.R'
     'sparkR.R'
     'utils.R'
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 331307c2077a5..5834813319bfd 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -10,6 +10,10 @@ export("sparkR.init")
 export("sparkR.stop")
 export("print.jobj")
 
+# MLlib integration
+exportMethods("glm",
+              "predict")
+
 # Job group lifecycle management methods
 export("setJobGroup",
        "clearJobGroup",
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index ebe6fbd97ce86..39b5586f7c90e 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -661,3 +661,7 @@ setGeneric("toRadians", function(x) { standardGeneric("toRadians") })
 #' @rdname column
 #' @export
 setGeneric("upper", function(x) { standardGeneric("upper") })
+
+#' @rdname glm
+#' @export
+setGeneric("glm")
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
new file mode 100644
index 0000000000000..258e354081fc1
--- /dev/null
+++ b/R/pkg/R/mllib.R
@@ -0,0 +1,73 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# mllib.R: Provides methods for MLlib integration
+
+#' @title S4 class that represents a PipelineModel
+#' @param model A Java object reference to the backing Scala PipelineModel
+#' @export
+setClass("PipelineModel", representation(model = "jobj"))
+
+#' Fits a generalized linear model
+#'
+#' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
+#'
+#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
+#'                operators are supported, including '~' and '+'.
+#' @param data DataFrame for training
+#' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
+#' @param lambda Regularization parameter
+#' @param alpha Elastic-net mixing parameter (see glmnet's documentation for details)
+#' @return a fitted MLlib model
+#' @rdname glm
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' data(iris)
+#' df <- createDataFrame(sqlContext, iris)
+#' model <- glm(Sepal_Length ~ Sepal_Width, df)
+#'}
+setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"),
+          function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0) {
+            family <- match.arg(family)
+            model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                 "fitRModelFormula", deparse(formula), data@sdf, family, lambda,
+                                 alpha)
+            return(new("PipelineModel", model = model))
+          })
+
+#' Make predictions from a model
+#'
+#' Makes predictions from a model produced by glm(), similarly to R's predict().
+#'
+#' @param model A fitted MLlib model
+#' @param newData DataFrame for testing
+#' @return DataFrame containing predicted values
+#' @rdname glm
+#' @export
+#' @examples
+#'\dontrun{
+#' model <- glm(y ~ x, trainingData)
+#' predicted <- predict(model, testData)
+#' showDF(predicted)
+#'}
+setMethod("predict", signature(object = "PipelineModel"),
+          function(object, newData) {
+            return(dataFrame(callJMethod(object@model, "transform", newData@sdf)))
+          })
diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R
new file mode 100644
index 0000000000000..a492763344ae6
--- /dev/null
+++ b/R/pkg/inst/tests/test_mllib.R
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib functions")
+
+# Tests for MLlib functions in SparkR
+
+sc <- sparkR.init()
+
+sqlContext <- sparkRSQL.init(sc)
+
+test_that("glm and predict", {
+  training <- createDataFrame(sqlContext, iris)
+  test <- select(training, "Sepal_Length")
+  model <- glm(Sepal_Width ~ Sepal_Length, training, family = "gaussian")
+  prediction <- predict(model, test)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+})
+
+test_that("predictions match with native glm", {
+  training <- createDataFrame(sqlContext, iris)
+  model <- glm(Sepal_Width ~ Sepal_Length, data = training)
+  vals <- collect(select(predict(model, training), "prediction"))
+  rVals <- predict(glm(Sepal.Width ~ Sepal.Length, data = iris), iris)
+  expect_true(all(abs(rVals - vals) < 1e-9), rVals - vals)
+})
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 56169f2a01fc9..f7b46efa10e90 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -73,12 +73,16 @@ class RFormula(override val uid: String)
     val withFeatures = transformFeatures.transformSchema(schema)
     if (hasLabelCol(schema)) {
       withFeatures
-    } else {
+    } else if (schema.exists(_.name == parsedFormula.get.label)) {
       val nullable = schema(parsedFormula.get.label).dataType match {
         case _: NumericType | BooleanType => false
         case _ => true
       }
       StructType(withFeatures.fields :+ StructField($(labelCol), DoubleType, nullable))
+    } else {
+      // Ignore the label field. This is a hack so that this transformer can also work on test
+      // datasets in a Pipeline.
+      withFeatures
     }
   }
 
@@ -92,10 +96,10 @@ class RFormula(override val uid: String)
   override def toString: String = s"RFormula(${get(formula)})"
 
   private def transformLabel(dataset: DataFrame): DataFrame = {
+    val labelName = parsedFormula.get.label
     if (hasLabelCol(dataset.schema)) {
       dataset
-    } else {
-      val labelName = parsedFormula.get.label
+    } else if (dataset.schema.exists(_.name == labelName)) {
       dataset.schema(labelName).dataType match {
         case _: NumericType | BooleanType =>
           dataset.withColumn($(labelCol), dataset(labelName).cast(DoubleType))
@@ -103,6 +107,10 @@ class RFormula(override val uid: String)
         case other =>
           throw new IllegalArgumentException("Unsupported type for label: " + other)
       }
+    } else {
+      // Ignore the label field. This is a hack so that this transformer can also work on test
+      // datasets in a Pipeline.
+      dataset
     }
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala
new file mode 100644
index 0000000000000..1ee080641e3e3
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.api.r
+
+import org.apache.spark.ml.feature.RFormula
+import org.apache.spark.ml.classification.LogisticRegression
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.ml.{Pipeline, PipelineModel}
+import org.apache.spark.sql.DataFrame
+
+private[r] object SparkRWrappers {
+  def fitRModelFormula(
+      value: String,
+      df: DataFrame,
+      family: String,
+      lambda: Double,
+      alpha: Double): PipelineModel = {
+    val formula = new RFormula().setFormula(value)
+    val estimator = family match {
+      case "gaussian" => new LinearRegression().setRegParam(lambda).setElasticNetParam(alpha)
+      case "binomial" => new LogisticRegression().setRegParam(lambda).setElasticNetParam(alpha)
+    }
+    val pipeline = new Pipeline().setStages(Array(formula, estimator))
+    pipeline.fit(df)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index fa8611b243a9f..79c4ccf02d4e0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -74,6 +74,15 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("allow missing label column for test datasets") {
+    val formula = new RFormula().setFormula("y ~ x").setLabelCol("label")
+    val original = sqlContext.createDataFrame(Seq((0, 1.0), (2, 2.0))).toDF("x", "_not_y")
+    val resultSchema = formula.transformSchema(original.schema)
+    assert(resultSchema.length == 3)
+    assert(!resultSchema.exists(_.name == "label"))
+    assert(resultSchema.toString == formula.transform(original).schema.toString)
+  }
+
 // TODO(ekl) enable after we implement string label support
 //  test("transform string label") {
 //    val formula = new RFormula().setFormula("name ~ id")

From a3c7a3ce32697ad293b8bcaf29f9384c8255b37f Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 22:08:12 -0700
Subject: [PATCH 283/549] [SPARK-9132][SPARK-9163][SQL] codegen conv

Jira: https://issues.apache.org/jira/browse/SPARK-9132
https://issues.apache.org/jira/browse/SPARK-9163

rxin as you proposed in the Jira ticket, I just moved the logic to a separate object. I haven't changed anything of the logic of `NumberConverter`.

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7552 from tarekauel/SPARK-9163 and squashes the following commits:

40dcde9 [Tarek Auel] [SPARK-9132][SPARK-9163][SQL] style fix
fa985bd [Tarek Auel] [SPARK-9132][SPARK-9163][SQL] codegen conv
---
 .../spark/sql/catalyst/expressions/math.scala | 204 ++++--------------
 .../sql/catalyst/util/NumberConverter.scala   | 176 +++++++++++++++
 .../expressions/MathFunctionsSuite.scala      |   4 +-
 .../catalyst/util/NumberConverterSuite.scala  |  40 ++++
 4 files changed, 263 insertions(+), 161 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 7a9be02ba45b3..68cca0ad3d067 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckSuccess, TypeCheckFailure}
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.util.NumberConverter
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -164,7 +165,7 @@ case class Cosh(child: Expression) extends UnaryMathExpression(math.cosh, "COSH"
  * @param toBaseExpr to which base
  */
 case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expression)
-  extends Expression with ImplicitCastInputTypes with CodegenFallback {
+  extends Expression with ImplicitCastInputTypes {
 
   override def foldable: Boolean = numExpr.foldable && fromBaseExpr.foldable && toBaseExpr.foldable
 
@@ -179,169 +180,54 @@ case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expre
   /** Returns the result of evaluating this expression on a given input Row */
   override def eval(input: InternalRow): Any = {
     val num = numExpr.eval(input)
-    val fromBase = fromBaseExpr.eval(input)
-    val toBase = toBaseExpr.eval(input)
-    if (num == null || fromBase == null || toBase == null) {
-      null
-    } else {
-      conv(
-        num.asInstanceOf[UTF8String].getBytes,
-        fromBase.asInstanceOf[Int],
-        toBase.asInstanceOf[Int])
-    }
-  }
-
-  private val value = new Array[Byte](64)
-
-  /**
-   * Divide x by m as if x is an unsigned 64-bit integer. Examples:
-   * unsignedLongDiv(-1, 2) == Long.MAX_VALUE unsignedLongDiv(6, 3) == 2
-   * unsignedLongDiv(0, 5) == 0
-   *
-   * @param x is treated as unsigned
-   * @param m is treated as signed
-   */
-  private def unsignedLongDiv(x: Long, m: Int): Long = {
-    if (x >= 0) {
-      x / m
-    } else {
-      // Let uval be the value of the unsigned long with the same bits as x
-      // Two's complement => x = uval - 2*MAX - 2
-      // => uval = x + 2*MAX + 2
-      // Now, use the fact: (a+b)/c = a/c + b/c + (a%c+b%c)/c
-      x / m + 2 * (Long.MaxValue / m) + 2 / m + (x % m + 2 * (Long.MaxValue % m) + 2 % m) / m
-    }
-  }
-
-  /**
-   * Decode v into value[].
-   *
-   * @param v is treated as an unsigned 64-bit integer
-   * @param radix must be between MIN_RADIX and MAX_RADIX
-   */
-  private def decode(v: Long, radix: Int): Unit = {
-    var tmpV = v
-    java.util.Arrays.fill(value, 0.asInstanceOf[Byte])
-    var i = value.length - 1
-    while (tmpV != 0) {
-      val q = unsignedLongDiv(tmpV, radix)
-      value(i) = (tmpV - q * radix).asInstanceOf[Byte]
-      tmpV = q
-      i -= 1
-    }
-  }
-
-  /**
-   * Convert value[] into a long. On overflow, return -1 (as mySQL does). If a
-   * negative digit is found, ignore the suffix starting there.
-   *
-   * @param radix  must be between MIN_RADIX and MAX_RADIX
-   * @param fromPos is the first element that should be conisdered
-   * @return the result should be treated as an unsigned 64-bit integer.
-   */
-  private def encode(radix: Int, fromPos: Int): Long = {
-    var v: Long = 0L
-    val bound = unsignedLongDiv(-1 - radix, radix) // Possible overflow once
-    // val
-    // exceeds this value
-    var i = fromPos
-    while (i < value.length && value(i) >= 0) {
-      if (v >= bound) {
-        // Check for overflow
-        if (unsignedLongDiv(-1 - value(i), radix) < v) {
-          return -1
+    if (num != null) {
+      val fromBase = fromBaseExpr.eval(input)
+      if (fromBase != null) {
+        val toBase = toBaseExpr.eval(input)
+        if (toBase != null) {
+          NumberConverter.convert(
+            num.asInstanceOf[UTF8String].getBytes,
+            fromBase.asInstanceOf[Int],
+            toBase.asInstanceOf[Int])
+        } else {
+          null
         }
-      }
-      v = v * radix + value(i)
-      i += 1
-    }
-    v
-  }
-
-  /**
-   * Convert the bytes in value[] to the corresponding chars.
-   *
-   * @param radix must be between MIN_RADIX and MAX_RADIX
-   * @param fromPos is the first nonzero element
-   */
-  private def byte2char(radix: Int, fromPos: Int): Unit = {
-    var i = fromPos
-    while (i < value.length) {
-      value(i) = Character.toUpperCase(Character.forDigit(value(i), radix)).asInstanceOf[Byte]
-      i += 1
-    }
-  }
-
-  /**
-   * Convert the chars in value[] to the corresponding integers. Convert invalid
-   * characters to -1.
-   *
-   * @param radix must be between MIN_RADIX and MAX_RADIX
-   * @param fromPos is the first nonzero element
-   */
-  private def char2byte(radix: Int, fromPos: Int): Unit = {
-    var i = fromPos
-    while ( i < value.length) {
-      value(i) = Character.digit(value(i), radix).asInstanceOf[Byte]
-      i += 1
-    }
-  }
-
-  /**
-   * Convert numbers between different number bases. If toBase>0 the result is
-   * unsigned, otherwise it is signed.
-   * NB: This logic is borrowed from org.apache.hadoop.hive.ql.ud.UDFConv
-   */
-  private def conv(n: Array[Byte] , fromBase: Int, toBase: Int ): UTF8String = {
-    if (fromBase < Character.MIN_RADIX || fromBase > Character.MAX_RADIX
-      || Math.abs(toBase) < Character.MIN_RADIX
-      || Math.abs(toBase) > Character.MAX_RADIX) {
-      return null
-    }
-
-    if (n.length == 0) {
-      return null
-    }
-
-    var (negative, first) = if (n(0) == '-') (true, 1) else (false, 0)
-
-    // Copy the digits in the right side of the array
-    var i = 1
-    while (i <= n.length - first) {
-      value(value.length - i) = n(n.length - i)
-      i += 1
-    }
-    char2byte(fromBase, value.length - n.length + first)
-
-    // Do the conversion by going through a 64 bit integer
-    var v = encode(fromBase, value.length - n.length + first)
-    if (negative && toBase > 0) {
-      if (v < 0) {
-        v = -1
       } else {
-        v = -v
+        null
       }
+    } else {
+      null
     }
-    if (toBase < 0 && v < 0) {
-      v = -v
-      negative = true
-    }
-    decode(v, Math.abs(toBase))
-
-    // Find the first non-zero digit or the last digits if all are zero.
-    val firstNonZeroPos = {
-      val firstNonZero = value.indexWhere( _ != 0)
-      if (firstNonZero != -1) firstNonZero else value.length - 1
-    }
-
-    byte2char(Math.abs(toBase), firstNonZeroPos)
+  }
 
-    var resultStartPos = firstNonZeroPos
-    if (negative && toBase < 0) {
-      resultStartPos = firstNonZeroPos - 1
-      value(resultStartPos) = '-'
-    }
-    UTF8String.fromBytes(java.util.Arrays.copyOfRange(value, resultStartPos, value.length))
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val numGen = numExpr.gen(ctx)
+    val from = fromBaseExpr.gen(ctx)
+    val to = toBaseExpr.gen(ctx)
+
+    val numconv = NumberConverter.getClass.getName.stripSuffix("$")
+    s"""
+       ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+       ${numGen.code}
+       boolean ${ev.isNull} = ${numGen.isNull};
+       if (!${ev.isNull}) {
+         ${from.code}
+         if (!${from.isNull}) {
+           ${to.code}
+           if (!${to.isNull}) {
+             ${ev.primitive} = $numconv.convert(${numGen.primitive}.getBytes(),
+               ${from.primitive}, ${to.primitive});
+             if (${ev.primitive} == null) {
+               ${ev.isNull} = true;
+             }
+           } else {
+             ${ev.isNull} = true;
+           }
+         } else {
+           ${ev.isNull} = true;
+         }
+       }
+     """
   }
 }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala
new file mode 100644
index 0000000000000..9fefc5656aac0
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util
+
+import org.apache.spark.unsafe.types.UTF8String
+
+object NumberConverter {
+
+  private val value = new Array[Byte](64)
+
+  /**
+   * Divide x by m as if x is an unsigned 64-bit integer. Examples:
+   * unsignedLongDiv(-1, 2) == Long.MAX_VALUE unsignedLongDiv(6, 3) == 2
+   * unsignedLongDiv(0, 5) == 0
+   *
+   * @param x is treated as unsigned
+   * @param m is treated as signed
+   */
+  private def unsignedLongDiv(x: Long, m: Int): Long = {
+    if (x >= 0) {
+      x / m
+    } else {
+      // Let uval be the value of the unsigned long with the same bits as x
+      // Two's complement => x = uval - 2*MAX - 2
+      // => uval = x + 2*MAX + 2
+      // Now, use the fact: (a+b)/c = a/c + b/c + (a%c+b%c)/c
+      x / m + 2 * (Long.MaxValue / m) + 2 / m + (x % m + 2 * (Long.MaxValue % m) + 2 % m) / m
+    }
+  }
+
+  /**
+   * Decode v into value[].
+   *
+   * @param v is treated as an unsigned 64-bit integer
+   * @param radix must be between MIN_RADIX and MAX_RADIX
+   */
+  private def decode(v: Long, radix: Int): Unit = {
+    var tmpV = v
+    java.util.Arrays.fill(value, 0.asInstanceOf[Byte])
+    var i = value.length - 1
+    while (tmpV != 0) {
+      val q = unsignedLongDiv(tmpV, radix)
+      value(i) = (tmpV - q * radix).asInstanceOf[Byte]
+      tmpV = q
+      i -= 1
+    }
+  }
+
+  /**
+   * Convert value[] into a long. On overflow, return -1 (as mySQL does). If a
+   * negative digit is found, ignore the suffix starting there.
+   *
+   * @param radix  must be between MIN_RADIX and MAX_RADIX
+   * @param fromPos is the first element that should be conisdered
+   * @return the result should be treated as an unsigned 64-bit integer.
+   */
+  private def encode(radix: Int, fromPos: Int): Long = {
+    var v: Long = 0L
+    val bound = unsignedLongDiv(-1 - radix, radix) // Possible overflow once
+    // val
+    // exceeds this value
+    var i = fromPos
+    while (i < value.length && value(i) >= 0) {
+      if (v >= bound) {
+        // Check for overflow
+        if (unsignedLongDiv(-1 - value(i), radix) < v) {
+          return -1
+        }
+      }
+      v = v * radix + value(i)
+      i += 1
+    }
+    v
+  }
+
+  /**
+   * Convert the bytes in value[] to the corresponding chars.
+   *
+   * @param radix must be between MIN_RADIX and MAX_RADIX
+   * @param fromPos is the first nonzero element
+   */
+  private def byte2char(radix: Int, fromPos: Int): Unit = {
+    var i = fromPos
+    while (i < value.length) {
+      value(i) = Character.toUpperCase(Character.forDigit(value(i), radix)).asInstanceOf[Byte]
+      i += 1
+    }
+  }
+
+  /**
+   * Convert the chars in value[] to the corresponding integers. Convert invalid
+   * characters to -1.
+   *
+   * @param radix must be between MIN_RADIX and MAX_RADIX
+   * @param fromPos is the first nonzero element
+   */
+  private def char2byte(radix: Int, fromPos: Int): Unit = {
+    var i = fromPos
+    while ( i < value.length) {
+      value(i) = Character.digit(value(i), radix).asInstanceOf[Byte]
+      i += 1
+    }
+  }
+
+  /**
+   * Convert numbers between different number bases. If toBase>0 the result is
+   * unsigned, otherwise it is signed.
+   * NB: This logic is borrowed from org.apache.hadoop.hive.ql.ud.UDFConv
+   */
+  def convert(n: Array[Byte] , fromBase: Int, toBase: Int ): UTF8String = {
+    if (fromBase < Character.MIN_RADIX || fromBase > Character.MAX_RADIX
+      || Math.abs(toBase) < Character.MIN_RADIX
+      || Math.abs(toBase) > Character.MAX_RADIX) {
+      return null
+    }
+
+    if (n.length == 0) {
+      return null
+    }
+
+    var (negative, first) = if (n(0) == '-') (true, 1) else (false, 0)
+
+    // Copy the digits in the right side of the array
+    var i = 1
+    while (i <= n.length - first) {
+      value(value.length - i) = n(n.length - i)
+      i += 1
+    }
+    char2byte(fromBase, value.length - n.length + first)
+
+    // Do the conversion by going through a 64 bit integer
+    var v = encode(fromBase, value.length - n.length + first)
+    if (negative && toBase > 0) {
+      if (v < 0) {
+        v = -1
+      } else {
+        v = -v
+      }
+    }
+    if (toBase < 0 && v < 0) {
+      v = -v
+      negative = true
+    }
+    decode(v, Math.abs(toBase))
+
+    // Find the first non-zero digit or the last digits if all are zero.
+    val firstNonZeroPos = {
+      val firstNonZero = value.indexWhere( _ != 0)
+      if (firstNonZero != -1) firstNonZero else value.length - 1
+    }
+
+    byte2char(Math.abs(toBase), firstNonZeroPos)
+
+    var resultStartPos = firstNonZeroPos
+    if (negative && toBase < 0) {
+      resultStartPos = firstNonZeroPos - 1
+      value(resultStartPos) = '-'
+    }
+    UTF8String.fromBytes(java.util.Arrays.copyOfRange(value, resultStartPos, value.length))
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 04acd5b5ff4d1..a2b0fad7b7a04 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -115,8 +115,8 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(-16)), "-F")
     checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(16)), "FFFFFFFFFFFFFFF1")
     checkEvaluation(Conv(Literal("big"), Literal(36), Literal(16)), "3A48")
-    checkEvaluation(Conv(Literal(null), Literal(36), Literal(16)), null)
-    checkEvaluation(Conv(Literal("3"), Literal(null), Literal(16)), null)
+    checkEvaluation(Conv(Literal.create(null, StringType), Literal(36), Literal(16)), null)
+    checkEvaluation(Conv(Literal("3"), Literal.create(null, IntegerType), Literal(16)), null)
     checkEvaluation(
       Conv(Literal("1234"), Literal(10), Literal(37)), null)
     checkEvaluation(
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala
new file mode 100644
index 0000000000000..13265a1ff1c7f
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.util.NumberConverter.convert
+import org.apache.spark.unsafe.types.UTF8String
+
+class NumberConverterSuite extends SparkFunSuite {
+
+  private[this] def checkConv(n: String, fromBase: Int, toBase: Int, expected: String): Unit = {
+    assert(convert(UTF8String.fromString(n).getBytes, fromBase, toBase) ===
+      UTF8String.fromString(expected))
+  }
+
+  test("convert") {
+    checkConv("3", 10, 2, "11")
+    checkConv("-15", 10, -16, "-F")
+    checkConv("-15", 10, 16, "FFFFFFFFFFFFFFF1")
+    checkConv("big", 36, 16, "3A48")
+    checkConv("9223372036854775807", 36, 16, "FFFFFFFFFFFFFFFF")
+    checkConv("11abc", 10, 16, "B")
+  }
+
+}

From 4d97be95300f729391c17b4c162e3c7fba09b8bf Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Mon, 20 Jul 2015 22:15:10 -0700
Subject: [PATCH 284/549] [SPARK-9204][ML] Add default params test for
 linearyregression suite

Author: Holden Karau <holden@pigscanfly.ca>

Closes #7553 from holdenk/SPARK-9204-add-default-params-test-to-linear-regression and squashes the following commits:

630ba19 [Holden Karau] style fix
faa08a3 [Holden Karau] Add default params test for linearyregression suite
---
 .../ml/regression/LinearRegressionSuite.scala | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 374002c5b4fdd..7cdda3db88ad1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.ml.regression
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.mllib.linalg.{DenseVector, Vectors}
 import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
 import org.apache.spark.mllib.util.TestingUtils._
@@ -55,6 +56,30 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   }
 
+  test("params") {
+    ParamsSuite.checkParams(new LinearRegression)
+    val model = new LinearRegressionModel("linearReg", Vectors.dense(0.0), 0.0)
+    ParamsSuite.checkParams(model)
+  }
+
+  test("linear regression: default params") {
+    val lir = new LinearRegression
+    assert(lir.getLabelCol === "label")
+    assert(lir.getFeaturesCol === "features")
+    assert(lir.getPredictionCol === "prediction")
+    assert(lir.getRegParam === 0.0)
+    assert(lir.getElasticNetParam === 0.0)
+    assert(lir.getFitIntercept)
+    val model = lir.fit(dataset)
+    model.transform(dataset)
+      .select("label", "prediction")
+      .collect()
+    assert(model.getFeaturesCol === "features")
+    assert(model.getPredictionCol === "prediction")
+    assert(model.intercept !== 0.0)
+    assert(model.hasParent)
+  }
+
   test("linear regression with intercept without regularization") {
     val trainer = new LinearRegression
     val model = trainer.fit(dataset)

From c032b0bf92130dc4facb003f0deaeb1228aefded Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Mon, 20 Jul 2015 22:38:05 -0700
Subject: [PATCH 285/549] [SPARK-8797] [SPARK-9146] [SPARK-9145] [SPARK-9147]
 Support NaN ordering and equality comparisons in Spark SQL

This patch addresses an issue where queries that sorted float or double columns containing NaN values could fail with "Comparison method violates its general contract!" errors from TimSort.  The root of this problem is that `NaN > anything`, `NaN == anything`, and `NaN < anything` all return `false`.

Per the design specified in SPARK-9079, we have decided that `NaN = NaN` should return true and that NaN should appear last when sorting in ascending order (i.e. it is larger than any other numeric value).

In addition to implementing these semantics, this patch also adds canonicalization of NaN values in UnsafeRow, which is necessary in order to be able to do binary equality comparisons on equal NaNs that might have different bit representations (see SPARK-9147).

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7194 from JoshRosen/nan and squashes the following commits:

983d4fc [Josh Rosen] Merge remote-tracking branch 'origin/master' into nan
88bd73c [Josh Rosen] Fix Row.equals()
a702e2e [Josh Rosen] normalization -> canonicalization
a7267cf [Josh Rosen] Normalize NaNs in UnsafeRow
fe629ae [Josh Rosen] Merge remote-tracking branch 'origin/master' into nan
fbb2a29 [Josh Rosen] Fix NaN comparisons in BinaryComparison expressions
c1fd4fe [Josh Rosen] Fold NaN test into existing test framework
b31eb19 [Josh Rosen] Uncomment failing tests
7fe67af [Josh Rosen] Support NaN == NaN (SPARK-9145)
58bad2c [Josh Rosen] Revert "Compare rows' string representations to work around NaN incomparability."
fc6b4d2 [Josh Rosen] Update CodeGenerator
3998ef2 [Josh Rosen] Remove unused code
a2ba2e7 [Josh Rosen] Fix prefix comparision for NaNs
a30d371 [Josh Rosen] Compare rows' string representations to work around NaN incomparability.
6f03f85 [Josh Rosen] Fix bug in Double / Float ordering
42a1ad5 [Josh Rosen] Stop filtering NaNs in UnsafeExternalSortSuite
bfca524 [Josh Rosen] Change ordering so that NaN is maximum value.
8d7be61 [Josh Rosen] Update randomized test to use ScalaTest's assume()
b20837b [Josh Rosen] Add failing test for new NaN comparision ordering
5b88b2b [Josh Rosen] Fix compilation of CodeGenerationSuite
d907b5b [Josh Rosen] Merge remote-tracking branch 'origin/master' into nan
630ebc5 [Josh Rosen] Specify an ordering for NaN values.
9bf195a [Josh Rosen] Re-enable NaNs in CodeGenerationSuite to produce more regression tests
13fc06a [Josh Rosen] Add regression test for NaN sorting issue
f9efbb5 [Josh Rosen] Fix ORDER BY NULL
e7dc4fb [Josh Rosen] Add very generic test for ordering
7d5c13e [Josh Rosen] Add regression test for SPARK-8782 (ORDER BY NULL)
b55875a [Josh Rosen] Generate doubles and floats over entire possible range.
5acdd5c [Josh Rosen] Infinity and NaN are interesting.
ab76cbd [Josh Rosen] Move code to Catalyst package.
d2b4a4a [Josh Rosen] Add random data generator test utilities to Spark SQL.
---
 .../unsafe/sort/PrefixComparators.java        |  5 ++-
 .../scala/org/apache/spark/util/Utils.scala   | 28 +++++++++++++
 .../org/apache/spark/util/UtilsSuite.scala    | 31 +++++++++++++++
 .../unsafe/sort/PrefixComparatorsSuite.scala  | 25 ++++++++++++
 .../sql/catalyst/expressions/UnsafeRow.java   |  6 +++
 .../main/scala/org/apache/spark/sql/Row.scala | 24 ++++++++----
 .../expressions/codegen/CodeGenerator.scala   |  4 ++
 .../sql/catalyst/expressions/predicates.scala | 22 +++++++++--
 .../apache/spark/sql/types/DoubleType.scala   |  5 ++-
 .../apache/spark/sql/types/FloatType.scala    |  5 ++-
 .../expressions/CodeGenerationSuite.scala     | 39 +++++++++++++++++++
 .../catalyst/expressions/PredicateSuite.scala | 13 ++++---
 .../expressions/UnsafeRowConverterSuite.scala | 22 +++++++++++
 .../org/apache/spark/sql/DataFrameSuite.scala | 22 +++++++++++
 .../scala/org/apache/spark/sql/RowSuite.scala | 12 ++++++
 .../execution/UnsafeExternalSortSuite.scala   |  6 +--
 16 files changed, 243 insertions(+), 26 deletions(-)

diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
index 438742565c51d..bf1bc5dffba78 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
@@ -23,6 +23,7 @@
 
 import org.apache.spark.annotation.Private;
 import org.apache.spark.unsafe.types.UTF8String;
+import org.apache.spark.util.Utils;
 
 @Private
 public class PrefixComparators {
@@ -82,7 +83,7 @@ public static final class FloatPrefixComparator extends PrefixComparator {
     public int compare(long aPrefix, long bPrefix) {
       float a = Float.intBitsToFloat((int) aPrefix);
       float b = Float.intBitsToFloat((int) bPrefix);
-      return (a < b) ? -1 : (a > b) ? 1 : 0;
+      return Utils.nanSafeCompareFloats(a, b);
     }
 
     public long computePrefix(float value) {
@@ -97,7 +98,7 @@ public static final class DoublePrefixComparator extends PrefixComparator {
     public int compare(long aPrefix, long bPrefix) {
       double a = Double.longBitsToDouble(aPrefix);
       double b = Double.longBitsToDouble(bPrefix);
-      return (a < b) ? -1 : (a > b) ? 1 : 0;
+      return Utils.nanSafeCompareDoubles(a, b);
     }
 
     public long computePrefix(double value) {
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index e6374f17d858f..c5816949cd360 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -1586,6 +1586,34 @@ private[spark] object Utils extends Logging {
     hashAbs
   }
 
+  /**
+   * NaN-safe version of [[java.lang.Double.compare()]] which allows NaN values to be compared
+   * according to semantics where NaN == NaN and NaN > any non-NaN double.
+   */
+  def nanSafeCompareDoubles(x: Double, y: Double): Int = {
+    val xIsNan: Boolean = java.lang.Double.isNaN(x)
+    val yIsNan: Boolean = java.lang.Double.isNaN(y)
+    if ((xIsNan && yIsNan) || (x == y)) 0
+    else if (xIsNan) 1
+    else if (yIsNan) -1
+    else if (x > y) 1
+    else -1
+  }
+
+  /**
+   * NaN-safe version of [[java.lang.Float.compare()]] which allows NaN values to be compared
+   * according to semantics where NaN == NaN and NaN > any non-NaN float.
+   */
+  def nanSafeCompareFloats(x: Float, y: Float): Int = {
+    val xIsNan: Boolean = java.lang.Float.isNaN(x)
+    val yIsNan: Boolean = java.lang.Float.isNaN(y)
+    if ((xIsNan && yIsNan) || (x == y)) 0
+    else if (xIsNan) 1
+    else if (yIsNan) -1
+    else if (x > y) 1
+    else -1
+  }
+
   /** Returns the system properties map that is thread-safe to iterator over. It gets the
     * properties which have been set explicitly, as well as those for which only a default value
     * has been defined. */
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index c7638507c88c6..8f7e402d5f2a6 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.util
 
 import java.io.{File, ByteArrayOutputStream, ByteArrayInputStream, FileOutputStream}
+import java.lang.{Double => JDouble, Float => JFloat}
 import java.net.{BindException, ServerSocket, URI}
 import java.nio.{ByteBuffer, ByteOrder}
 import java.text.DecimalFormatSymbols
@@ -689,4 +690,34 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     // scalastyle:on println
     assert(buffer.toString === "t circular test circular\n")
   }
+
+  test("nanSafeCompareDoubles") {
+    def shouldMatchDefaultOrder(a: Double, b: Double): Unit = {
+      assert(Utils.nanSafeCompareDoubles(a, b) === JDouble.compare(a, b))
+      assert(Utils.nanSafeCompareDoubles(b, a) === JDouble.compare(b, a))
+    }
+    shouldMatchDefaultOrder(0d, 0d)
+    shouldMatchDefaultOrder(0d, 1d)
+    shouldMatchDefaultOrder(Double.MinValue, Double.MaxValue)
+    assert(Utils.nanSafeCompareDoubles(Double.NaN, Double.NaN) === 0)
+    assert(Utils.nanSafeCompareDoubles(Double.NaN, Double.PositiveInfinity) === 1)
+    assert(Utils.nanSafeCompareDoubles(Double.NaN, Double.NegativeInfinity) === 1)
+    assert(Utils.nanSafeCompareDoubles(Double.PositiveInfinity, Double.NaN) === -1)
+    assert(Utils.nanSafeCompareDoubles(Double.NegativeInfinity, Double.NaN) === -1)
+  }
+
+  test("nanSafeCompareFloats") {
+    def shouldMatchDefaultOrder(a: Float, b: Float): Unit = {
+      assert(Utils.nanSafeCompareFloats(a, b) === JFloat.compare(a, b))
+      assert(Utils.nanSafeCompareFloats(b, a) === JFloat.compare(b, a))
+    }
+    shouldMatchDefaultOrder(0f, 0f)
+    shouldMatchDefaultOrder(1f, 1f)
+    shouldMatchDefaultOrder(Float.MinValue, Float.MaxValue)
+    assert(Utils.nanSafeCompareFloats(Float.NaN, Float.NaN) === 0)
+    assert(Utils.nanSafeCompareFloats(Float.NaN, Float.PositiveInfinity) === 1)
+    assert(Utils.nanSafeCompareFloats(Float.NaN, Float.NegativeInfinity) === 1)
+    assert(Utils.nanSafeCompareFloats(Float.PositiveInfinity, Float.NaN) === -1)
+    assert(Utils.nanSafeCompareFloats(Float.NegativeInfinity, Float.NaN) === -1)
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
index dd505dfa7d758..dc03e374b51db 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
@@ -47,4 +47,29 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
     forAll (regressionTests) { (s1: String, s2: String) => testPrefixComparison(s1, s2) }
     forAll { (s1: String, s2: String) => testPrefixComparison(s1, s2) }
   }
+
+  test("float prefix comparator handles NaN properly") {
+    val nan1: Float = java.lang.Float.intBitsToFloat(0x7f800001)
+    val nan2: Float = java.lang.Float.intBitsToFloat(0x7fffffff)
+    assert(nan1.isNaN)
+    assert(nan2.isNaN)
+    val nan1Prefix = PrefixComparators.FLOAT.computePrefix(nan1)
+    val nan2Prefix = PrefixComparators.FLOAT.computePrefix(nan2)
+    assert(nan1Prefix === nan2Prefix)
+    val floatMaxPrefix = PrefixComparators.FLOAT.computePrefix(Float.MaxValue)
+    assert(PrefixComparators.FLOAT.compare(nan1Prefix, floatMaxPrefix) === 1)
+  }
+
+  test("double prefix comparator handles NaNs properly") {
+    val nan1: Double = java.lang.Double.longBitsToDouble(0x7ff0000000000001L)
+    val nan2: Double = java.lang.Double.longBitsToDouble(0x7fffffffffffffffL)
+    assert(nan1.isNaN)
+    assert(nan2.isNaN)
+    val nan1Prefix = PrefixComparators.DOUBLE.computePrefix(nan1)
+    val nan2Prefix = PrefixComparators.DOUBLE.computePrefix(nan2)
+    assert(nan1Prefix === nan2Prefix)
+    val doubleMaxPrefix = PrefixComparators.DOUBLE.computePrefix(Double.MaxValue)
+    assert(PrefixComparators.DOUBLE.compare(nan1Prefix, doubleMaxPrefix) === 1)
+  }
+
 }
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 87294a0e21441..8cd9e7bc60a03 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -215,6 +215,9 @@ public void setLong(int ordinal, long value) {
   public void setDouble(int ordinal, double value) {
     assertIndexIsValid(ordinal);
     setNotNullAt(ordinal);
+    if (Double.isNaN(value)) {
+      value = Double.NaN;
+    }
     PlatformDependent.UNSAFE.putDouble(baseObject, getFieldOffset(ordinal), value);
   }
 
@@ -243,6 +246,9 @@ public void setByte(int ordinal, byte value) {
   public void setFloat(int ordinal, float value) {
     assertIndexIsValid(ordinal);
     setNotNullAt(ordinal);
+    if (Float.isNaN(value)) {
+      value = Float.NaN;
+    }
     PlatformDependent.UNSAFE.putFloat(baseObject, getFieldOffset(ordinal), value);
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
index 2cb64d00935de..91449479fa539 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala
@@ -403,20 +403,28 @@ trait Row extends Serializable {
       if (!isNullAt(i)) {
         val o1 = get(i)
         val o2 = other.get(i)
-        if (o1.isInstanceOf[Array[Byte]]) {
-          // handle equality of Array[Byte]
-          val b1 = o1.asInstanceOf[Array[Byte]]
-          if (!o2.isInstanceOf[Array[Byte]] ||
-            !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
+        o1 match {
+          case b1: Array[Byte] =>
+            if (!o2.isInstanceOf[Array[Byte]] ||
+                !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
+              return false
+            }
+          case f1: Float if java.lang.Float.isNaN(f1) =>
+            if (!o2.isInstanceOf[Float] || ! java.lang.Float.isNaN(o2.asInstanceOf[Float])) {
+              return false
+            }
+          case d1: Double if java.lang.Double.isNaN(d1) =>
+            if (!o2.isInstanceOf[Double] || ! java.lang.Double.isNaN(o2.asInstanceOf[Double])) {
+              return false
+            }
+          case _ => if (o1 != o2) {
             return false
           }
-        } else if (o1 != o2) {
-          return false
         }
       }
       i += 1
     }
-    return true
+    true
   }
 
   /* ---------------------- utility methods for Scala ---------------------- */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 10f411ff7451a..606f770cb4f7b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -194,6 +194,8 @@ class CodeGenContext {
    */
   def genEqual(dataType: DataType, c1: String, c2: String): String = dataType match {
     case BinaryType => s"java.util.Arrays.equals($c1, $c2)"
+    case FloatType => s"(java.lang.Float.isNaN($c1) && java.lang.Float.isNaN($c2)) || $c1 == $c2"
+    case DoubleType => s"(java.lang.Double.isNaN($c1) && java.lang.Double.isNaN($c2)) || $c1 == $c2"
     case dt: DataType if isPrimitiveType(dt) => s"$c1 == $c2"
     case other => s"$c1.equals($c2)"
   }
@@ -204,6 +206,8 @@ class CodeGenContext {
   def genComp(dataType: DataType, c1: String, c2: String): String = dataType match {
     // java boolean doesn't support > or < operator
     case BooleanType => s"($c1 == $c2 ? 0 : ($c1 ? 1 : -1))"
+    case DoubleType => s"org.apache.spark.util.Utils.nanSafeCompareDoubles($c1, $c2)"
+    case FloatType => s"org.apache.spark.util.Utils.nanSafeCompareFloats($c1, $c2)"
     // use c1 - c2 may overflow
     case dt: DataType if isPrimitiveType(dt) => s"($c1 > $c2 ? 1 : $c1 < $c2 ? -1 : 0)"
     case BinaryType => s"org.apache.spark.sql.catalyst.util.TypeUtils.compareBinary($c1, $c2)"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 40ec3df224ce1..a53ec31ee6a4b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.types._
+import org.apache.spark.util.Utils
 
 
 object InterpretedPredicate {
@@ -222,7 +223,9 @@ case class Or(left: Expression, right: Expression) extends BinaryOperator with P
 abstract class BinaryComparison extends BinaryOperator with Predicate {
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    if (ctx.isPrimitiveType(left.dataType)) {
+    if (ctx.isPrimitiveType(left.dataType)
+        && left.dataType != FloatType
+        && left.dataType != DoubleType) {
       // faster version
       defineCodeGen(ctx, ev, (c1, c2) => s"$c1 $symbol $c2")
     } else {
@@ -254,8 +257,15 @@ case class EqualTo(left: Expression, right: Expression) extends BinaryComparison
   override def symbol: String = "="
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
-    if (left.dataType != BinaryType) input1 == input2
-    else java.util.Arrays.equals(input1.asInstanceOf[Array[Byte]], input2.asInstanceOf[Array[Byte]])
+    if (left.dataType == FloatType) {
+      Utils.nanSafeCompareFloats(input1.asInstanceOf[Float], input2.asInstanceOf[Float]) == 0
+    } else if (left.dataType == DoubleType) {
+      Utils.nanSafeCompareDoubles(input1.asInstanceOf[Double], input2.asInstanceOf[Double]) == 0
+    } else if (left.dataType != BinaryType) {
+      input1 == input2
+    } else {
+      java.util.Arrays.equals(input1.asInstanceOf[Array[Byte]], input2.asInstanceOf[Array[Byte]])
+    }
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
@@ -280,7 +290,11 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp
     } else if (input1 == null || input2 == null) {
       false
     } else {
-      if (left.dataType != BinaryType) {
+      if (left.dataType == FloatType) {
+        Utils.nanSafeCompareFloats(input1.asInstanceOf[Float], input2.asInstanceOf[Float]) == 0
+      } else if (left.dataType == DoubleType) {
+        Utils.nanSafeCompareDoubles(input1.asInstanceOf[Double], input2.asInstanceOf[Double]) == 0
+      } else if (left.dataType != BinaryType) {
         input1 == input2
       } else {
         java.util.Arrays.equals(input1.asInstanceOf[Array[Byte]], input2.asInstanceOf[Array[Byte]])
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DoubleType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DoubleType.scala
index 986c2ab055386..2a1bf0938e5a8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DoubleType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DoubleType.scala
@@ -23,6 +23,7 @@ import scala.reflect.runtime.universe.typeTag
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.ScalaReflectionLock
+import org.apache.spark.util.Utils
 
 /**
  * :: DeveloperApi ::
@@ -37,7 +38,9 @@ class DoubleType private() extends FractionalType {
   @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[InternalType] }
   private[sql] val numeric = implicitly[Numeric[Double]]
   private[sql] val fractional = implicitly[Fractional[Double]]
-  private[sql] val ordering = implicitly[Ordering[InternalType]]
+  private[sql] val ordering = new Ordering[Double] {
+    override def compare(x: Double, y: Double): Int = Utils.nanSafeCompareDoubles(x, y)
+  }
   private[sql] val asIntegral = DoubleAsIfIntegral
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/FloatType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/FloatType.scala
index 9bd48ece83a1c..08e22252aef82 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/FloatType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/FloatType.scala
@@ -23,6 +23,7 @@ import scala.reflect.runtime.universe.typeTag
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.ScalaReflectionLock
+import org.apache.spark.util.Utils
 
 /**
  * :: DeveloperApi ::
@@ -37,7 +38,9 @@ class FloatType private() extends FractionalType {
   @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[InternalType] }
   private[sql] val numeric = implicitly[Numeric[Float]]
   private[sql] val fractional = implicitly[Fractional[Float]]
-  private[sql] val ordering = implicitly[Ordering[InternalType]]
+  private[sql] val ordering = new Ordering[Float] {
+    override def compare(x: Float, y: Float): Int = Utils.nanSafeCompareFloats(x, y)
+  }
   private[sql] val asIntegral = FloatAsIfIntegral
 
   /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
index e05218a23aa73..f4fbc49677ca3 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala
@@ -17,9 +17,14 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import scala.math._
+
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.RandomDataGenerator
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{DataTypeTestUtils, NullType, StructField, StructType}
 
 /**
  * Additional tests for code generation.
@@ -43,6 +48,40 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper {
     futures.foreach(Await.result(_, 10.seconds))
   }
 
+  // Test GenerateOrdering for all common types. For each type, we construct random input rows that
+  // contain two columns of that type, then for pairs of randomly-generated rows we check that
+  // GenerateOrdering agrees with RowOrdering.
+  (DataTypeTestUtils.atomicTypes ++ Set(NullType)).foreach { dataType =>
+    test(s"GenerateOrdering with $dataType") {
+      val rowOrdering = RowOrdering.forSchema(Seq(dataType, dataType))
+      val genOrdering = GenerateOrdering.generate(
+        BoundReference(0, dataType, nullable = true).asc ::
+          BoundReference(1, dataType, nullable = true).asc :: Nil)
+      val rowType = StructType(
+        StructField("a", dataType, nullable = true) ::
+          StructField("b", dataType, nullable = true) :: Nil)
+      val maybeDataGenerator = RandomDataGenerator.forType(rowType, nullable = false)
+      assume(maybeDataGenerator.isDefined)
+      val randGenerator = maybeDataGenerator.get
+      val toCatalyst = CatalystTypeConverters.createToCatalystConverter(rowType)
+      for (_ <- 1 to 50) {
+        val a = toCatalyst(randGenerator()).asInstanceOf[InternalRow]
+        val b = toCatalyst(randGenerator()).asInstanceOf[InternalRow]
+        withClue(s"a = $a, b = $b") {
+          assert(genOrdering.compare(a, a) === 0)
+          assert(genOrdering.compare(b, b) === 0)
+          assert(rowOrdering.compare(a, a) === 0)
+          assert(rowOrdering.compare(b, b) === 0)
+          assert(signum(genOrdering.compare(a, b)) === -1 * signum(genOrdering.compare(b, a)))
+          assert(signum(rowOrdering.compare(a, b)) === -1 * signum(rowOrdering.compare(b, a)))
+          assert(
+            signum(rowOrdering.compare(a, b)) === signum(genOrdering.compare(a, b)),
+            "Generated and non-generated orderings should agree")
+        }
+      }
+    }
+  }
+
   test("SPARK-8443: split wide projections into blocks due to JVM code size limit") {
     val length = 5000
     val expressions = List.fill(length)(EqualTo(Literal(1), Literal(1)))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
index 2173a0c25c645..0bc2812a5dc83 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -136,11 +136,14 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(And(InSet(one, hS), InSet(two, hS)), true)
   }
 
-  private val smallValues = Seq(1, Decimal(1), Array(1.toByte), "a").map(Literal(_))
-  private val largeValues = Seq(2, Decimal(2), Array(2.toByte), "b").map(Literal(_))
-
-  private val equalValues1 = smallValues
-  private val equalValues2 = Seq(1, Decimal(1), Array(1.toByte), "a").map(Literal(_))
+  private val smallValues = Seq(1, Decimal(1), Array(1.toByte), "a", 0f, 0d).map(Literal(_))
+  private val largeValues =
+    Seq(2, Decimal(2), Array(2.toByte), "b", Float.NaN, Double.NaN).map(Literal(_))
+
+  private val equalValues1 =
+    Seq(1, Decimal(1), Array(1.toByte), "a", Float.NaN, Double.NaN).map(Literal(_))
+  private val equalValues2 =
+    Seq(1, Decimal(1), Array(1.toByte), "a", Float.NaN, Double.NaN).map(Literal(_))
 
   test("BinaryComparison: <") {
     for (i <- 0 until smallValues.length) {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index d00aeb4dfbf47..dff5faf9f6ec8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -316,4 +316,26 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11))
   }
 
+  test("NaN canonicalization") {
+    val fieldTypes: Array[DataType] = Array(FloatType, DoubleType)
+
+    val row1 = new SpecificMutableRow(fieldTypes)
+    row1.setFloat(0, java.lang.Float.intBitsToFloat(0x7f800001))
+    row1.setDouble(1, java.lang.Double.longBitsToDouble(0x7ff0000000000001L))
+
+    val row2 = new SpecificMutableRow(fieldTypes)
+    row2.setFloat(0, java.lang.Float.intBitsToFloat(0x7fffffff))
+    row2.setDouble(1, java.lang.Double.longBitsToDouble(0x7fffffffffffffffL))
+
+    val converter = new UnsafeRowConverter(fieldTypes)
+    val row1Buffer = new Array[Byte](converter.getSizeRequirement(row1))
+    val row2Buffer = new Array[Byte](converter.getSizeRequirement(row2))
+    converter.writeRow(
+      row1, row1Buffer, PlatformDependent.BYTE_ARRAY_OFFSET, row1Buffer.length, null)
+    converter.writeRow(
+      row2, row2Buffer, PlatformDependent.BYTE_ARRAY_OFFSET, row2Buffer.length, null)
+
+    assert(row1Buffer.toSeq === row2Buffer.toSeq)
+  }
+
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 192cc0a6e5d7c..f67f2c60c0e16 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql
 import java.io.File
 
 import scala.language.postfixOps
+import scala.util.Random
 
 import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation
 import org.apache.spark.sql.functions._
@@ -742,6 +743,27 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
     df.col("t.``")
   }
 
+  test("SPARK-8797: sort by float column containing NaN should not crash") {
+    val inputData = Seq.fill(10)(Tuple1(Float.NaN)) ++ (1 to 1000).map(x => Tuple1(x.toFloat))
+    val df = Random.shuffle(inputData).toDF("a")
+    df.orderBy("a").collect()
+  }
+
+  test("SPARK-8797: sort by double column containing NaN should not crash") {
+    val inputData = Seq.fill(10)(Tuple1(Double.NaN)) ++ (1 to 1000).map(x => Tuple1(x.toDouble))
+    val df = Random.shuffle(inputData).toDF("a")
+    df.orderBy("a").collect()
+  }
+
+  test("NaN is greater than all other non-NaN numeric values") {
+    val maxDouble = Seq(Double.NaN, Double.PositiveInfinity, Double.MaxValue)
+      .map(Tuple1.apply).toDF("a").selectExpr("max(a)").first()
+    assert(java.lang.Double.isNaN(maxDouble.getDouble(0)))
+    val maxFloat = Seq(Float.NaN, Float.PositiveInfinity, Float.MaxValue)
+      .map(Tuple1.apply).toDF("a").selectExpr("max(a)").first()
+    assert(java.lang.Float.isNaN(maxFloat.getFloat(0)))
+  }
+
   test("SPARK-8072: Better Exception for Duplicate Columns") {
     // only one duplicate column present
     val e = intercept[org.apache.spark.sql.AnalysisException] {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
index d84b57af9c882..7cc6ffd7548d0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
@@ -73,4 +73,16 @@ class RowSuite extends SparkFunSuite {
       row.getAs[Int]("c")
     }
   }
+
+  test("float NaN == NaN") {
+    val r1 = Row(Float.NaN)
+    val r2 = Row(Float.NaN)
+    assert(r1 === r2)
+  }
+
+  test("double NaN == NaN") {
+    val r1 = Row(Double.NaN)
+    val r2 = Row(Double.NaN)
+    assert(r1 === r2)
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
index 4f4c1f28564cb..5fe73f7e0b072 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
@@ -83,11 +83,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     randomDataGenerator <- RandomDataGenerator.forType(dataType, nullable)
   ) {
     test(s"sorting on $dataType with nullable=$nullable, sortOrder=$sortOrder") {
-      val inputData = Seq.fill(1000)(randomDataGenerator()).filter {
-        case d: Double => !d.isNaN
-        case f: Float => !java.lang.Float.isNaN(f)
-        case x => true
-      }
+      val inputData = Seq.fill(1000)(randomDataGenerator())
       val inputDf = TestSQLContext.createDataFrame(
         TestSQLContext.sparkContext.parallelize(Random.shuffle(inputData).map(v => Row(v))),
         StructType(StructField("a", dataType, nullable = true) :: Nil)

From 560b355ccd038ca044726c9c9fcffd14d02e6696 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 22:43:30 -0700
Subject: [PATCH 286/549] [SPARK-9157] [SQL] codegen substring

https://issues.apache.org/jira/browse/SPARK-9157

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7534 from tarekauel/SPARK-9157 and squashes the following commits:

e65e3e9 [Tarek Auel] [SPARK-9157] indent fix
44e89f8 [Tarek Auel] [SPARK-9157] use EMPTY_UTF8
37d54c4 [Tarek Auel] Merge branch 'master' into SPARK-9157
60732ea [Tarek Auel] [SPARK-9157] created substringSQL in UTF8String
18c3576 [Tarek Auel] [SPARK-9157][SQL] remove slice pos
1a2e611 [Tarek Auel] [SPARK-9157][SQL] codegen substring
---
 .../expressions/stringOperations.scala        | 87 ++++++++++---------
 .../apache/spark/unsafe/types/UTF8String.java | 12 +++
 .../spark/unsafe/types/UTF8StringSuite.java   | 19 ++++
 3 files changed, 75 insertions(+), 43 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 5c1908d55576a..438215e8e6e37 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -640,7 +640,7 @@ case class StringSplit(str: Expression, pattern: Expression)
  * Defined for String and Binary types.
  */
 case class Substring(str: Expression, pos: Expression, len: Expression)
-  extends Expression with ImplicitCastInputTypes with CodegenFallback {
+  extends Expression with ImplicitCastInputTypes {
 
   def this(str: Expression, pos: Expression) = {
     this(str, pos, Literal(Integer.MAX_VALUE))
@@ -649,58 +649,59 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
   override def foldable: Boolean = str.foldable && pos.foldable && len.foldable
   override def nullable: Boolean = str.nullable || pos.nullable || len.nullable
 
-  override def dataType: DataType = {
-    if (!resolved) {
-      throw new UnresolvedException(this, s"Cannot resolve since $children are not resolved")
-    }
-    if (str.dataType == BinaryType) str.dataType else StringType
-  }
+  override def dataType: DataType = StringType
 
   override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType, IntegerType)
 
   override def children: Seq[Expression] = str :: pos :: len :: Nil
 
-  @inline
-  def slicePos(startPos: Int, sliceLen: Int, length: () => Int): (Int, Int) = {
-    // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
-    // negative indices for start positions. If a start index i is greater than 0, it
-    // refers to element i-1 in the sequence. If a start index i is less than 0, it refers
-    // to the -ith element before the end of the sequence. If a start index i is 0, it
-    // refers to the first element.
-
-    val start = startPos match {
-      case pos if pos > 0 => pos - 1
-      case neg if neg < 0 => length() + neg
-      case _ => 0
-    }
-
-    val end = sliceLen match {
-      case max if max == Integer.MAX_VALUE => max
-      case x => start + x
+  override def eval(input: InternalRow): Any = {
+    val stringEval = str.eval(input)
+    if (stringEval != null) {
+      val posEval = pos.eval(input)
+      if (posEval != null) {
+        val lenEval = len.eval(input)
+        if (lenEval != null) {
+          stringEval.asInstanceOf[UTF8String]
+            .substringSQL(posEval.asInstanceOf[Int], lenEval.asInstanceOf[Int])
+        } else {
+          null
+        }
+      } else {
+        null
+      }
+    } else {
+      null
     }
-
-    (start, end)
   }
 
-  override def eval(input: InternalRow): Any = {
-    val string = str.eval(input)
-    val po = pos.eval(input)
-    val ln = len.eval(input)
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val strGen = str.gen(ctx)
+    val posGen = pos.gen(ctx)
+    val lenGen = len.gen(ctx)
 
-    if ((string == null) || (po == null) || (ln == null)) {
-      null
-    } else {
-      val start = po.asInstanceOf[Int]
-      val length = ln.asInstanceOf[Int]
-      string match {
-        case ba: Array[Byte] =>
-          val (st, end) = slicePos(start, length, () => ba.length)
-          ba.slice(st, end)
-        case s: UTF8String =>
-          val (st, end) = slicePos(start, length, () => s.numChars())
-          s.substring(st, end)
+    val start = ctx.freshName("start")
+    val end = ctx.freshName("end")
+
+    s"""
+      ${strGen.code}
+      boolean ${ev.isNull} = ${strGen.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${posGen.code}
+        if (!${posGen.isNull}) {
+          ${lenGen.code}
+          if (!${lenGen.isNull}) {
+            ${ev.primitive} = ${strGen.primitive}
+              .substringSQL(${posGen.primitive}, ${lenGen.primitive});
+          } else {
+            ${ev.isNull} = true;
+          }
+        } else {
+          ${ev.isNull} = true;
+        }
       }
-    }
+     """
   }
 }
 
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index ed354f7f877f1..946d355f1fc28 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -165,6 +165,18 @@ public UTF8String substring(final int start, final int until) {
     return fromBytes(bytes);
   }
 
+  public UTF8String substringSQL(int pos, int length) {
+    // Information regarding the pos calculation:
+    // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
+    // negative indices for start positions. If a start index i is greater than 0, it
+    // refers to element i-1 in the sequence. If a start index i is less than 0, it refers
+    // to the -ith element before the end of the sequence. If a start index i is 0, it
+    // refers to the first element.
+    int start = (pos > 0) ? pos -1 : ((pos < 0) ? numChars() + pos : 0);
+    int end = (length == Integer.MAX_VALUE) ? Integer.MAX_VALUE : start + length;
+    return substring(start, end);
+  }
+
   /**
    * Returns whether this contains `substring` or not.
    */
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 1f5572c509bdb..e2a5628ff4d93 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -272,6 +272,25 @@ public void pad() {
       fromString("数据砖头").rpad(12, fromString("孙行者")));
   }
 
+  @Test
+  public void substringSQL() {
+    UTF8String e = fromString("example");
+    assertEquals(e.substringSQL(0, 2), fromString("ex"));
+    assertEquals(e.substringSQL(1, 2), fromString("ex"));
+    assertEquals(e.substringSQL(0, 7), fromString("example"));
+    assertEquals(e.substringSQL(1, 2), fromString("ex"));
+    assertEquals(e.substringSQL(0, 100), fromString("example"));
+    assertEquals(e.substringSQL(1, 100), fromString("example"));
+    assertEquals(e.substringSQL(2, 2), fromString("xa"));
+    assertEquals(e.substringSQL(1, 6), fromString("exampl"));
+    assertEquals(e.substringSQL(2, 100), fromString("xample"));
+    assertEquals(e.substringSQL(0, 0), fromString(""));
+    assertEquals(e.substringSQL(100, 4), EMPTY_UTF8);
+    assertEquals(e.substringSQL(0, Integer.MAX_VALUE), fromString("example"));
+    assertEquals(e.substringSQL(1, Integer.MAX_VALUE), fromString("example"));
+    assertEquals(e.substringSQL(2, Integer.MAX_VALUE), fromString("xample"));
+  }
+
   @Test
   public void split() {
     assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), -1),

From 67570beed5950974126a91eacd48fd0fedfeb141 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 20 Jul 2015 22:48:13 -0700
Subject: [PATCH 287/549] [SPARK-9208][SQL] Remove variant of DataFrame string
 functions that accept column names.

It can be ambiguous whether that is a string literal or a column name.

cc marmbrus

Author: Reynold Xin <rxin@databricks.com>

Closes #7556 from rxin/str-exprs and squashes the following commits:

92afa83 [Reynold Xin] [SPARK-9208][SQL] Remove variant of DataFrame string functions that accept column names.
---
 .../sql/catalyst/optimizer/Optimizer.scala    |   2 +-
 .../org/apache/spark/sql/functions.scala      | 459 ++----------------
 .../spark/sql/DataFrameFunctionsSuite.scala   |   8 +-
 .../spark/sql/MathExpressionsSuite.scala      |   1 -
 .../spark/sql/StringFunctionsSuite.scala      |  59 +--
 5 files changed, 74 insertions(+), 455 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index fafdae07c92f0..9c45b196245da 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -684,7 +684,7 @@ object CombineLimits extends Rule[LogicalPlan] {
 }
 
 /**
- * Removes the inner [[CaseConversionExpression]] that are unnecessary because
+ * Removes the inner case conversion expressions that are unnecessary because
  * the inner conversion is overwritten by the outer one.
  */
 object SimplifyCaseConversionExpressions extends Rule[LogicalPlan] {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 41b25d1836481..8fa017610b63c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -69,7 +69,7 @@ object functions {
   def column(colName: String): Column = Column(colName)
 
   /**
-   * Convert a number from one base to another for the specified expressions
+   * Convert a number in string format from one base to another.
    *
    * @group math_funcs
    * @since 1.5.0
@@ -77,15 +77,6 @@ object functions {
   def conv(num: Column, fromBase: Int, toBase: Int): Column =
     Conv(num.expr, lit(fromBase).expr, lit(toBase).expr)
 
-  /**
-   * Convert a number from one base to another for the specified expressions
-   *
-   * @group math_funcs
-   * @since 1.5.0
-   */
-  def conv(numColName: String, fromBase: Int, toBase: Int): Column =
-    conv(Column(numColName), fromBase, toBase)
-
   /**
    * Creates a [[Column]] of literal value.
    *
@@ -627,14 +618,6 @@ object functions {
    */
   def isNaN(e: Column): Column = IsNaN(e.expr)
 
-  /**
-   * Converts a string expression to lower case.
-   *
-   * @group normal_funcs
-   * @since 1.3.0
-   */
-  def lower(e: Column): Column = Lower(e.expr)
-
   /**
    * A column expression that generates monotonically increasing 64-bit integers.
    *
@@ -791,14 +774,6 @@ object functions {
     struct((colName +: colNames).map(col) : _*)
   }
 
-  /**
-   * Converts a string expression to upper case.
-   *
-   * @group normal_funcs
-   * @since 1.3.0
-   */
-  def upper(e: Column): Column = Upper(e.expr)
-
   /**
    * Computes bitwise NOT.
    *
@@ -1106,9 +1081,8 @@ object functions {
    * @since 1.5.0
    */
   @scala.annotation.varargs
-  def greatest(exprs: Column*): Column = if (exprs.length < 2) {
-    sys.error("GREATEST takes at least 2 parameters")
-  } else {
+  def greatest(exprs: Column*): Column = {
+    require(exprs.length > 1, "greatest requires at least 2 arguments.")
     Greatest(exprs.map(_.expr))
   }
 
@@ -1120,9 +1094,7 @@ object functions {
    * @since 1.5.0
    */
   @scala.annotation.varargs
-  def greatest(columnName: String, columnNames: String*): Column = if (columnNames.isEmpty) {
-    sys.error("GREATEST takes at least 2 parameters")
-  } else {
+  def greatest(columnName: String, columnNames: String*): Column = {
     greatest((columnName +: columnNames).map(Column.apply): _*)
   }
 
@@ -1134,14 +1106,6 @@ object functions {
     */
   def hex(column: Column): Column = Hex(column.expr)
 
-  /**
-   * Computes hex value of the given input.
-   *
-   * @group math_funcs
-   * @since 1.5.0
-   */
-  def hex(colName: String): Column = hex(Column(colName))
-
   /**
    * Inverse of hex. Interprets each pair of characters as a hexadecimal number
    * and converts to the byte representation of number.
@@ -1151,15 +1115,6 @@ object functions {
    */
   def unhex(column: Column): Column = Unhex(column.expr)
 
-  /**
-   * Inverse of hex. Interprets each pair of characters as a hexadecimal number
-   * and converts to the byte representation of number.
-   *
-   * @group math_funcs
-   * @since 1.5.0
-   */
-  def unhex(colName: String): Column = unhex(Column(colName))
-
   /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
@@ -1233,9 +1188,8 @@ object functions {
    * @since 1.5.0
    */
   @scala.annotation.varargs
-  def least(exprs: Column*): Column = if (exprs.length < 2) {
-    sys.error("LEAST takes at least 2 parameters")
-  } else {
+  def least(exprs: Column*): Column = {
+    require(exprs.length > 1, "least requires at least 2 arguments.")
     Least(exprs.map(_.expr))
   }
 
@@ -1247,9 +1201,7 @@ object functions {
    * @since 1.5.0
    */
   @scala.annotation.varargs
-  def least(columnName: String, columnNames: String*): Column = if (columnNames.isEmpty) {
-    sys.error("LEAST takes at least 2 parameters")
-  } else {
+  def least(columnName: String, columnNames: String*): Column = {
     least((columnName +: columnNames).map(Column.apply): _*)
   }
 
@@ -1639,7 +1591,8 @@ object functions {
   //////////////////////////////////////////////////////////////////////////////////////////////
 
   /**
-   * Calculates the MD5 digest and returns the value as a 32 character hex string.
+   * Calculates the MD5 digest of a binary column and returns the value
+   * as a 32 character hex string.
    *
    * @group misc_funcs
    * @since 1.5.0
@@ -1647,15 +1600,8 @@ object functions {
   def md5(e: Column): Column = Md5(e.expr)
 
   /**
-   * Calculates the MD5 digest and returns the value as a 32 character hex string.
-   *
-   * @group misc_funcs
-   * @since 1.5.0
-   */
-  def md5(columnName: String): Column = md5(Column(columnName))
-
-  /**
-   * Calculates the SHA-1 digest and returns the value as a 40 character hex string.
+   * Calculates the SHA-1 digest of a binary column and returns the value
+   * as a 40 character hex string.
    *
    * @group misc_funcs
    * @since 1.5.0
@@ -1663,15 +1609,11 @@ object functions {
   def sha1(e: Column): Column = Sha1(e.expr)
 
   /**
-   * Calculates the SHA-1 digest and returns the value as a 40 character hex string.
+   * Calculates the SHA-2 family of hash functions of a binary column and
+   * returns the value as a hex string.
    *
-   * @group misc_funcs
-   * @since 1.5.0
-   */
-  def sha1(columnName: String): Column = sha1(Column(columnName))
-
-  /**
-   * Calculates the SHA-2 family of hash functions and returns the value as a hex string.
+   * @param e column to compute SHA-2 on.
+   * @param numBits one of 224, 256, 384, or 512.
    *
    * @group misc_funcs
    * @since 1.5.0
@@ -1683,29 +1625,14 @@ object functions {
   }
 
   /**
-   * Calculates the SHA-2 family of hash functions and returns the value as a hex string.
-   *
-   * @group misc_funcs
-   * @since 1.5.0
-   */
-  def sha2(columnName: String, numBits: Int): Column = sha2(Column(columnName), numBits)
-
-  /**
-   * Calculates the cyclic redundancy check value and returns the value as a bigint.
+   * Calculates the cyclic redundancy check value  (CRC32) of a binary column and
+   * returns the value as a bigint.
    *
    * @group misc_funcs
    * @since 1.5.0
    */
   def crc32(e: Column): Column = Crc32(e.expr)
 
-  /**
-   * Calculates the cyclic redundancy check value and returns the value as a bigint.
-   *
-   * @group misc_funcs
-   * @since 1.5.0
-   */
-  def crc32(columnName: String): Column = crc32(Column(columnName))
-
   //////////////////////////////////////////////////////////////////////////////////////////////
   // String functions
   //////////////////////////////////////////////////////////////////////////////////////////////
@@ -1719,19 +1646,6 @@ object functions {
   @scala.annotation.varargs
   def concat(exprs: Column*): Column = Concat(exprs.map(_.expr))
 
-  /**
-   * Concatenates input strings together into a single string.
-   *
-   * This is the variant of concat that takes in the column names.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  @scala.annotation.varargs
-  def concat(columnName: String, columnNames: String*): Column = {
-    concat((columnName +: columnNames).map(Column.apply): _*)
-  }
-
   /**
    * Concatenates input strings together into a single string, using the given separator.
    *
@@ -1743,19 +1657,6 @@ object functions {
     ConcatWs(Literal.create(sep, StringType) +: exprs.map(_.expr))
   }
 
-  /**
-   * Concatenates input strings together into a single string, using the given separator.
-   *
-   * This is the variant of concat_ws that takes in the column names.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  @scala.annotation.varargs
-  def concat_ws(sep: String, columnName: String, columnNames: String*): Column = {
-    concat_ws(sep, (columnName +: columnNames).map(Column.apply) : _*)
-  }
-
   /**
    * Computes the length of a given string / binary value.
    *
@@ -1765,23 +1666,20 @@ object functions {
   def length(e: Column): Column = Length(e.expr)
 
   /**
-   * Computes the length of a given string / binary column.
+   * Converts a string expression to lower case.
    *
    * @group string_funcs
-   * @since 1.5.0
+   * @since 1.3.0
    */
-  def length(columnName: String): Column = length(Column(columnName))
+  def lower(e: Column): Column = Lower(e.expr)
 
   /**
-   * Formats the number X to a format like '#,###,###.##', rounded to d decimal places,
-   * and returns the result as a string.
-   * If d is 0, the result has no decimal point or fractional part.
-   * If d < 0, the result will be null.
+   * Converts a string expression to upper case.
    *
    * @group string_funcs
-   * @since 1.5.0
+   * @since 1.3.0
    */
-  def format_number(x: Column, d: Int): Column = FormatNumber(x.expr, lit(d).expr)
+  def upper(e: Column): Column = Upper(e.expr)
 
   /**
    * Formats the number X to a format like '#,###,###.##', rounded to d decimal places,
@@ -1792,57 +1690,31 @@ object functions {
    * @group string_funcs
    * @since 1.5.0
    */
-  def format_number(columnXName: String, d: Int): Column = {
-    format_number(Column(columnXName), d)
-  }
+  def format_number(x: Column, d: Int): Column = FormatNumber(x.expr, lit(d).expr)
 
   /**
-   * Computes the Levenshtein distance of the two given strings.
+   * Computes the Levenshtein distance of the two given string columns.
    * @group string_funcs
    * @since 1.5.0
    */
   def levenshtein(l: Column, r: Column): Column = Levenshtein(l.expr, r.expr)
 
-  /**
-   * Computes the Levenshtein distance of the two given strings.
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def levenshtein(leftColumnName: String, rightColumnName: String): Column =
-    levenshtein(Column(leftColumnName), Column(rightColumnName))
-
-  /**
-   * Computes the numeric value of the first character of the specified string value.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def ascii(e: Column): Column = Ascii(e.expr)
-
   /**
    * Computes the numeric value of the first character of the specified string column.
    *
    * @group string_funcs
    * @since 1.5.0
    */
-  def ascii(columnName: String): Column = ascii(Column(columnName))
+  def ascii(e: Column): Column = Ascii(e.expr)
 
   /**
-   * Trim the spaces from both ends for the specified string value.
+   * Trim the spaces from both ends for the specified string column.
    *
    * @group string_funcs
    * @since 1.5.0
    */
   def trim(e: Column): Column = StringTrim(e.expr)
 
-  /**
-   * Trim the spaces from both ends for the specified column.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def trim(columnName: String): Column = trim(Column(columnName))
-
   /**
    * Trim the spaces from left end for the specified string value.
    *
@@ -1851,14 +1723,6 @@ object functions {
    */
   def ltrim(e: Column): Column = StringTrimLeft(e.expr)
 
-  /**
-   * Trim the spaces from left end for the specified column.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def ltrim(columnName: String): Column = ltrim(Column(columnName))
-
   /**
    * Trim the spaces from right end for the specified string value.
    *
@@ -1867,25 +1731,6 @@ object functions {
    */
   def rtrim(e: Column): Column = StringTrimRight(e.expr)
 
-  /**
-   * Trim the spaces from right end for the specified column.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def rtrim(columnName: String): Column = rtrim(Column(columnName))
-
-  /**
-   * Format strings in printf-style.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  @scala.annotation.varargs
-  def formatString(format: Column, arguments: Column*): Column = {
-    StringFormat((format +: arguments).map(_.expr): _*)
-  }
-
   /**
    * Format strings in printf-style.
    * NOTE: `format` is the string value of the formatter, not column name.
@@ -1898,18 +1743,6 @@ object functions {
     StringFormat(lit(format).expr +: arguNames.map(Column(_).expr): _*)
   }
 
-  /**
-   * Locate the position of the first occurrence of substr value in the given string.
-   * Returns null if either of the arguments are null.
-   *
-   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
-   * could not be found in str.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def instr(substr: String, sub: String): Column = instr(Column(substr), Column(sub))
-
   /**
    * Locate the position of the first occurrence of substr column in the given string.
    * Returns null if either of the arguments are null.
@@ -1920,10 +1753,10 @@ object functions {
    * @group string_funcs
    * @since 1.5.0
    */
-  def instr(substr: Column, sub: Column): Column = StringInstr(substr.expr, sub.expr)
+  def instr(str: Column, substring: String): Column = StringInstr(str.expr, lit(substring).expr)
 
   /**
-   * Locate the position of the first occurrence of substr.
+   * Locate the position of the first occurrence of substr in a string column.
    *
    * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
    * could not be found in str.
@@ -1931,77 +1764,26 @@ object functions {
    * @group string_funcs
    * @since 1.5.0
    */
-  def locate(substr: String, str: String): Column = {
-    locate(Column(substr), Column(str))
+  def locate(substr: String, str: Column): Column = {
+    new StringLocate(lit(substr).expr, str.expr)
   }
 
   /**
-   * Locate the position of the first occurrence of substr.
+   * Locate the position of the first occurrence of substr in a string column, after position pos.
    *
-   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+   * NOTE: The position is not zero based, but 1 based index. returns 0 if substr
    * could not be found in str.
    *
    * @group string_funcs
    * @since 1.5.0
    */
-  def locate(substr: Column, str: Column): Column = {
-    new StringLocate(substr.expr, str.expr)
+  def locate(substr: String, str: Column, pos: Int): Column = {
+    StringLocate(lit(substr).expr, str.expr, lit(pos).expr)
   }
 
   /**
-   * Locate the position of the first occurrence of substr in a given string after position pos.
-   *
-   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
-   * could not be found in str.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def locate(substr: String, str: String, pos: String): Column = {
-    locate(Column(substr), Column(str), Column(pos))
-  }
-
-  /**
-   * Locate the position of the first occurrence of substr in a given string after position pos.
-   *
-   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
-   * could not be found in str.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def locate(substr: Column, str: Column, pos: Column): Column = {
-    StringLocate(substr.expr, str.expr, pos.expr)
-  }
-
-  /**
-   * Locate the position of the first occurrence of substr in a given string after position pos.
-   *
-   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
-   * could not be found in str.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def locate(substr: Column, str: Column, pos: Int): Column = {
-    StringLocate(substr.expr, str.expr, lit(pos).expr)
-  }
-
-  /**
-   * Locate the position of the first occurrence of substr in a given string after position pos.
-   *
-   * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
-   * could not be found in str.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def locate(substr: String, str: String, pos: Int): Column = {
-    locate(Column(substr), Column(str), lit(pos))
-  }
-
-  /**
-   * Computes the specified value from binary to a base64 string.
+   * Computes the BASE64 encoding of a binary column and returns it as a string column.
+   * This is the reverse of unbase64.
    *
    * @group string_funcs
    * @since 1.5.0
@@ -2009,67 +1791,22 @@ object functions {
   def base64(e: Column): Column = Base64(e.expr)
 
   /**
-   * Computes the specified column from binary to a base64 string.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def base64(columnName: String): Column = base64(Column(columnName))
-
-  /**
-   * Computes the specified value from a base64 string to binary.
+   * Decodes a BASE64 encoded string column and returns it as a binary column.
+   * This is the reverse of base64.
    *
    * @group string_funcs
    * @since 1.5.0
    */
   def unbase64(e: Column): Column = UnBase64(e.expr)
 
-  /**
-   * Computes the specified column from a base64 string to binary.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def unbase64(columnName: String): Column = unbase64(Column(columnName))
-
   /**
    * Left-padded with pad to a length of len.
    *
    * @group string_funcs
    * @since 1.5.0
    */
-  def lpad(str: String, len: String, pad: String): Column = {
-    lpad(Column(str), Column(len), Column(pad))
-  }
-
-  /**
-   * Left-padded with pad to a length of len.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def lpad(str: Column, len: Column, pad: Column): Column = {
-    StringLPad(str.expr, len.expr, pad.expr)
-  }
-
-  /**
-   * Left-padded with pad to a length of len.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def lpad(str: Column, len: Int, pad: Column): Column = {
-    StringLPad(str.expr, lit(len).expr, pad.expr)
-  }
-
-  /**
-   * Left-padded with pad to a length of len.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def lpad(str: String, len: Int, pad: String): Column = {
-    lpad(Column(str), len, Column(pad))
+  def lpad(str: Column, len: Int, pad: String): Column = {
+    StringLPad(str.expr, lit(len).expr, lit(pad).expr)
   }
 
   /**
@@ -2082,18 +1819,6 @@ object functions {
    */
   def encode(value: Column, charset: String): Column = Encode(value.expr, lit(charset).expr)
 
-  /**
-   * Computes the first argument into a binary from a string using the provided character set
-   * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
-   * If either argument is null, the result will also be null.
-   * NOTE: charset represents the string value of the character set, not the column name.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def encode(columnName: String, charset: String): Column =
-    encode(Column(columnName), charset)
-
   /**
    * Computes the first argument into a string from a binary using the provided character set
    * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
@@ -2104,106 +1829,24 @@ object functions {
    */
   def decode(value: Column, charset: String): Column = Decode(value.expr, lit(charset).expr)
 
-  /**
-   * Computes the first argument into a string from a binary using the provided character set
-   * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').
-   * If either argument is null, the result will also be null.
-   * NOTE: charset represents the string value of the character set, not the column name.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def decode(columnName: String, charset: String): Column =
-    decode(Column(columnName), charset)
-
-  /**
-   * Right-padded with pad to a length of len.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def rpad(str: String, len: String, pad: String): Column = {
-    rpad(Column(str), Column(len), Column(pad))
-  }
-
-  /**
-   * Right-padded with pad to a length of len.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def rpad(str: Column, len: Column, pad: Column): Column = {
-    StringRPad(str.expr, len.expr, pad.expr)
-  }
-
-  /**
-   * Right-padded with pad to a length of len.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def rpad(str: String, len: Int, pad: String): Column = {
-    rpad(Column(str), len, Column(pad))
-  }
-
   /**
    * Right-padded with pad to a length of len.
    *
    * @group string_funcs
    * @since 1.5.0
    */
-  def rpad(str: Column, len: Int, pad: Column): Column = {
-    StringRPad(str.expr, lit(len).expr, pad.expr)
-  }
-
-  /**
-   * Repeat the string value of the specified column n times.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def repeat(strColumn: String, timesColumn: String): Column = {
-    repeat(Column(strColumn), Column(timesColumn))
+  def rpad(str: Column, len: Int, pad: String): Column = {
+    StringRPad(str.expr, lit(len).expr, lit(pad).expr)
   }
 
   /**
-   * Repeat the string expression value n times.
+   * Repeats a string column n times, and returns it as a new string column.
    *
    * @group string_funcs
    * @since 1.5.0
    */
-  def repeat(str: Column, times: Column): Column = {
-    StringRepeat(str.expr, times.expr)
-  }
-
-  /**
-   * Repeat the string value of the specified column n times.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def repeat(strColumn: String, times: Int): Column = {
-    repeat(Column(strColumn), times)
-  }
-
-  /**
-   * Repeat the string expression value n times.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def repeat(str: Column, times: Int): Column = {
-    StringRepeat(str.expr, lit(times).expr)
-  }
-
-  /**
-   * Splits str around pattern (pattern is a regular expression).
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def split(strColumnName: String, pattern: String): Column = {
-    split(Column(strColumnName), pattern)
+  def repeat(str: Column, n: Int): Column = {
+    StringRepeat(str.expr, lit(n).expr)
   }
 
   /**
@@ -2217,16 +1860,6 @@ object functions {
     StringSplit(str.expr, lit(pattern).expr)
   }
 
-  /**
-   * Reversed the string for the specified column.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  def reverse(str: String): Column = {
-    reverse(Column(str))
-  }
-
   /**
    * Reversed the string for the specified value.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 29f1197a8543c..8d2ff2f9690d6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -160,7 +160,7 @@ class DataFrameFunctionsSuite extends QueryTest {
   test("misc md5 function") {
     val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b")
     checkAnswer(
-      df.select(md5($"a"), md5("b")),
+      df.select(md5($"a"), md5($"b")),
       Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c"))
 
     checkAnswer(
@@ -171,7 +171,7 @@ class DataFrameFunctionsSuite extends QueryTest {
   test("misc sha1 function") {
     val df = Seq(("ABC", "ABC".getBytes)).toDF("a", "b")
     checkAnswer(
-      df.select(sha1($"a"), sha1("b")),
+      df.select(sha1($"a"), sha1($"b")),
       Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8"))
 
     val dfEmpty = Seq(("", "".getBytes)).toDF("a", "b")
@@ -183,7 +183,7 @@ class DataFrameFunctionsSuite extends QueryTest {
   test("misc sha2 function") {
     val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b")
     checkAnswer(
-      df.select(sha2($"a", 256), sha2("b", 256)),
+      df.select(sha2($"a", 256), sha2($"b", 256)),
       Row("b5d4045c3f466fa91fe2cc6abe79232a1a57cdf104f7a26e716e0a1e2789df78",
         "7192385c3c0605de55bb9476ce1d90748190ecb32a8eed7f5207b30cf6a1fe89"))
 
@@ -200,7 +200,7 @@ class DataFrameFunctionsSuite extends QueryTest {
   test("misc crc32 function") {
     val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b")
     checkAnswer(
-      df.select(crc32($"a"), crc32("b")),
+      df.select(crc32($"a"), crc32($"b")),
       Row(2743272264L, 2180413220L))
 
     checkAnswer(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index a51523f1a7a0f..21256704a5b16 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -176,7 +176,6 @@ class MathExpressionsSuite extends QueryTest {
   test("conv") {
     val df = Seq(("333", 10, 2)).toDF("num", "fromBase", "toBase")
     checkAnswer(df.select(conv('num, 10, 16)), Row("14D"))
-    checkAnswer(df.select(conv("num", 10, 16)), Row("14D"))
     checkAnswer(df.select(conv(lit(100), 2, 16)), Row("4"))
     checkAnswer(df.select(conv(lit(3122234455L), 10, 16)), Row("BA198457"))
     checkAnswer(df.selectExpr("conv(num, fromBase, toBase)"), Row("101001101"))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 413f3858d6764..4551192b157ff 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -52,14 +52,14 @@ class StringFunctionsSuite extends QueryTest {
 
   test("string Levenshtein distance") {
     val df = Seq(("kitten", "sitting"), ("frog", "fog")).toDF("l", "r")
-    checkAnswer(df.select(levenshtein("l", "r")), Seq(Row(3), Row(1)))
+    checkAnswer(df.select(levenshtein($"l", $"r")), Seq(Row(3), Row(1)))
     checkAnswer(df.selectExpr("levenshtein(l, r)"), Seq(Row(3), Row(1)))
   }
 
   test("string ascii function") {
     val df = Seq(("abc", "")).toDF("a", "b")
     checkAnswer(
-      df.select(ascii($"a"), ascii("b")),
+      df.select(ascii($"a"), ascii($"b")),
       Row(97, 0))
 
     checkAnswer(
@@ -71,8 +71,8 @@ class StringFunctionsSuite extends QueryTest {
     val bytes = Array[Byte](1, 2, 3, 4)
     val df = Seq((bytes, "AQIDBA==")).toDF("a", "b")
     checkAnswer(
-      df.select(base64("a"), base64($"a"), unbase64("b"), unbase64($"b")),
-      Row("AQIDBA==", "AQIDBA==", bytes, bytes))
+      df.select(base64($"a"), unbase64($"b")),
+      Row("AQIDBA==", bytes))
 
     checkAnswer(
       df.selectExpr("base64(a)", "unbase64(b)"),
@@ -85,12 +85,8 @@ class StringFunctionsSuite extends QueryTest {
     // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("大千世界", "utf-8", bytes)).toDF("a", "b", "c")
     checkAnswer(
-      df.select(
-        encode($"a", "utf-8"),
-        encode("a", "utf-8"),
-        decode($"c", "utf-8"),
-        decode("c", "utf-8")),
-      Row(bytes, bytes, "大千世界", "大千世界"))
+      df.select(encode($"a", "utf-8"), decode($"c", "utf-8")),
+      Row(bytes, "大千世界"))
 
     checkAnswer(
       df.selectExpr("encode(a, 'utf-8')", "decode(c, 'utf-8')"),
@@ -114,8 +110,8 @@ class StringFunctionsSuite extends QueryTest {
     val df = Seq(("aa%d%s", 123, "cc")).toDF("a", "b", "c")
 
     checkAnswer(
-      df.select(formatString($"a", $"b", $"c"), formatString("aa%d%s", "b", "c")),
-      Row("aa123cc", "aa123cc"))
+      df.select(formatString("aa%d%s", "b", "c")),
+      Row("aa123cc"))
 
     checkAnswer(
       df.selectExpr("printf(a, b, c)"),
@@ -126,8 +122,8 @@ class StringFunctionsSuite extends QueryTest {
     val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c")
 
     checkAnswer(
-      df.select(instr($"a", $"b"), instr("a", "b")),
-      Row(1, 1))
+      df.select(instr($"a", "aa")),
+      Row(1))
 
     checkAnswer(
       df.selectExpr("instr(a, b)"),
@@ -138,10 +134,8 @@ class StringFunctionsSuite extends QueryTest {
     val df = Seq(("aaads", "aa", "zz", 1)).toDF("a", "b", "c", "d")
 
     checkAnswer(
-      df.select(
-        locate($"b", $"a"), locate("b", "a"), locate($"b", $"a", 1),
-        locate("b", "a", 1), locate($"b", $"a", $"d"), locate("b", "a", "d")),
-      Row(1, 1, 2, 2, 2, 2))
+      df.select(locate("aa", $"a"), locate("aa", $"a", 1)),
+      Row(1, 2))
 
     checkAnswer(
       df.selectExpr("locate(b, a)", "locate(b, a, d)"),
@@ -152,10 +146,8 @@ class StringFunctionsSuite extends QueryTest {
     val df = Seq(("hi", 5, "??")).toDF("a", "b", "c")
 
     checkAnswer(
-      df.select(
-        lpad($"a", $"b", $"c"), rpad("a", "b", "c"),
-        lpad($"a", 1, $"c"), rpad("a", 1, "c")),
-      Row("???hi", "hi???", "h", "h"))
+      df.select(lpad($"a", 1, "c"), lpad($"a", 5, "??"), rpad($"a", 1, "c"), rpad($"a", 5, "??")),
+      Row("h", "???hi", "h", "hi???"))
 
     checkAnswer(
       df.selectExpr("lpad(a, b, c)", "rpad(a, b, c)", "lpad(a, 1, c)", "rpad(a, 1, c)"),
@@ -166,9 +158,8 @@ class StringFunctionsSuite extends QueryTest {
     val df = Seq(("hi", 2)).toDF("a", "b")
 
     checkAnswer(
-      df.select(
-        repeat($"a", 2), repeat("a", 2), repeat($"a", $"b"), repeat("a", "b")),
-      Row("hihi", "hihi", "hihi", "hihi"))
+      df.select(repeat($"a", 2)),
+      Row("hihi"))
 
     checkAnswer(
       df.selectExpr("repeat(a, 2)", "repeat(a, b)"),
@@ -179,7 +170,7 @@ class StringFunctionsSuite extends QueryTest {
     val df = Seq(("hi", "hhhi")).toDF("a", "b")
 
     checkAnswer(
-      df.select(reverse($"a"), reverse("b")),
+      df.select(reverse($"a"), reverse($"b")),
       Row("ih", "ihhh"))
 
     checkAnswer(
@@ -199,10 +190,8 @@ class StringFunctionsSuite extends QueryTest {
     val df = Seq(("aa2bb3cc", "[1-9]+")).toDF("a", "b")
 
     checkAnswer(
-      df.select(
-        split($"a", "[1-9]+"),
-        split("a", "[1-9]+")),
-      Row(Seq("aa", "bb", "cc"), Seq("aa", "bb", "cc")))
+      df.select(split($"a", "[1-9]+")),
+      Row(Seq("aa", "bb", "cc")))
 
     checkAnswer(
       df.selectExpr("split(a, '[1-9]+')"),
@@ -212,8 +201,8 @@ class StringFunctionsSuite extends QueryTest {
   test("string / binary length function") {
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123)).toDF("a", "b", "c")
     checkAnswer(
-      df.select(length($"a"), length("a"), length($"b"), length("b")),
-      Row(3, 3, 4, 4))
+      df.select(length($"a"), length($"b")),
+      Row(3, 4))
 
     checkAnswer(
       df.selectExpr("length(a)", "length(b)"),
@@ -243,10 +232,8 @@ class StringFunctionsSuite extends QueryTest {
           "h") // decimal 7.128381
 
     checkAnswer(
-      df.select(
-        format_number($"f", 4),
-        format_number("f", 4)),
-      Row("5.0000", "5.0000"))
+      df.select(format_number($"f", 4)),
+      Row("5.0000"))
 
     checkAnswer(
       df.selectExpr("format_number(b, e)"), // convert the 1st argument to integer

From 48f8fd46b32973f1f3b865da80345698cb1a71c7 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Mon, 20 Jul 2015 23:28:35 -0700
Subject: [PATCH 288/549] [SPARK-9023] [SQL] Followup for #7456 (Efficiency
 improvements for UnsafeRows in Exchange)

This patch addresses code review feedback from #7456.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7551 from JoshRosen/unsafe-exchange-followup and squashes the following commits:

76dbdf8 [Josh Rosen] Add comments + more methods to UnsafeRowSerializer
3d7a1f2 [Josh Rosen] Add writeToStream() method to UnsafeRow
---
 .../sql/catalyst/expressions/UnsafeRow.java   | 33 ++++++++
 .../sql/execution/UnsafeRowSerializer.scala   | 80 +++++++++++++------
 .../org/apache/spark/sql/UnsafeRowSuite.scala | 71 ++++++++++++++++
 3 files changed, 161 insertions(+), 23 deletions(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 8cd9e7bc60a03..6ce03a48e9538 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -17,6 +17,9 @@
 
 package org.apache.spark.sql.catalyst.expressions;
 
+import java.io.IOException;
+import java.io.OutputStream;
+
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.util.ObjectPool;
 import org.apache.spark.unsafe.PlatformDependent;
@@ -371,6 +374,36 @@ public InternalRow copy() {
     }
   }
 
+  /**
+   * Write this UnsafeRow's underlying bytes to the given OutputStream.
+   *
+   * @param out the stream to write to.
+   * @param writeBuffer a byte array for buffering chunks of off-heap data while writing to the
+   *                    output stream. If this row is backed by an on-heap byte array, then this
+   *                    buffer will not be used and may be null.
+   */
+  public void writeToStream(OutputStream out, byte[] writeBuffer) throws IOException {
+    if (baseObject instanceof byte[]) {
+      int offsetInByteArray = (int) (PlatformDependent.BYTE_ARRAY_OFFSET - baseOffset);
+      out.write((byte[]) baseObject, offsetInByteArray, sizeInBytes);
+    } else {
+      int dataRemaining = sizeInBytes;
+      long rowReadPosition = baseOffset;
+      while (dataRemaining > 0) {
+        int toTransfer = Math.min(writeBuffer.length, dataRemaining);
+        PlatformDependent.copyMemory(
+          baseObject,
+          rowReadPosition,
+          writeBuffer,
+          PlatformDependent.BYTE_ARRAY_OFFSET,
+          toTransfer);
+        out.write(writeBuffer, 0, toTransfer);
+        rowReadPosition += toTransfer;
+        dataRemaining -= toTransfer;
+      }
+    }
+  }
+
   @Override
   public boolean anyNull() {
     return BitSetMethods.anySet(baseObject, baseOffset, bitSetWidthInBytes);
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
index 19503ed00056c..318550e5ed899 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
@@ -49,8 +49,16 @@ private[sql] class UnsafeRowSerializer(numFields: Int) extends Serializer with S
 
 private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInstance {
 
+  /**
+   * Marks the end of a stream written with [[serializeStream()]].
+   */
   private[this] val EOF: Int = -1
 
+  /**
+   * Serializes a stream of UnsafeRows. Within the stream, each record consists of a record
+   * length (stored as a 4-byte integer, written high byte first), followed by the record's bytes.
+   * The end of the stream is denoted by a record with the special length `EOF` (-1).
+   */
   override def serializeStream(out: OutputStream): SerializationStream = new SerializationStream {
     private[this] var writeBuffer: Array[Byte] = new Array[Byte](4096)
     private[this] val dOut: DataOutputStream = new DataOutputStream(out)
@@ -59,32 +67,31 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst
       val row = value.asInstanceOf[UnsafeRow]
       assert(row.getPool == null, "UnsafeRowSerializer does not support ObjectPool")
       dOut.writeInt(row.getSizeInBytes)
-      var dataRemaining: Int = row.getSizeInBytes
-      val baseObject = row.getBaseObject
-      var rowReadPosition: Long = row.getBaseOffset
-      while (dataRemaining > 0) {
-        val toTransfer: Int = Math.min(writeBuffer.length, dataRemaining)
-        PlatformDependent.copyMemory(
-          baseObject,
-          rowReadPosition,
-          writeBuffer,
-          PlatformDependent.BYTE_ARRAY_OFFSET,
-          toTransfer)
-        out.write(writeBuffer, 0, toTransfer)
-        rowReadPosition += toTransfer
-        dataRemaining -= toTransfer
-      }
+      row.writeToStream(out, writeBuffer)
       this
     }
+
     override def writeKey[T: ClassTag](key: T): SerializationStream = {
+      // The key is only needed on the map side when computing partition ids. It does not need to
+      // be shuffled.
       assert(key.isInstanceOf[Int])
       this
     }
-    override def writeAll[T: ClassTag](iter: Iterator[T]): SerializationStream =
+
+    override def writeAll[T: ClassTag](iter: Iterator[T]): SerializationStream = {
+      // This method is never called by shuffle code.
       throw new UnsupportedOperationException
-    override def writeObject[T: ClassTag](t: T): SerializationStream =
+    }
+
+    override def writeObject[T: ClassTag](t: T): SerializationStream = {
+      // This method is never called by shuffle code.
       throw new UnsupportedOperationException
-    override def flush(): Unit = dOut.flush()
+    }
+
+    override def flush(): Unit = {
+      dOut.flush()
+    }
+
     override def close(): Unit = {
       writeBuffer = null
       dOut.writeInt(EOF)
@@ -95,6 +102,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst
   override def deserializeStream(in: InputStream): DeserializationStream = {
     new DeserializationStream {
       private[this] val dIn: DataInputStream = new DataInputStream(in)
+      // 1024 is a default buffer size; this buffer will grow to accommodate larger rows
       private[this] var rowBuffer: Array[Byte] = new Array[Byte](1024)
       private[this] var row: UnsafeRow = new UnsafeRow()
       private[this] var rowTuple: (Int, UnsafeRow) = (0, row)
@@ -126,14 +134,40 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst
           }
         }
       }
-      override def asIterator: Iterator[Any] = throw new UnsupportedOperationException
-      override def readKey[T: ClassTag](): T = throw new UnsupportedOperationException
-      override def readValue[T: ClassTag](): T = throw new UnsupportedOperationException
-      override def readObject[T: ClassTag](): T = throw new UnsupportedOperationException
-      override def close(): Unit = dIn.close()
+
+      override def asIterator: Iterator[Any] = {
+        // This method is never called by shuffle code.
+        throw new UnsupportedOperationException
+      }
+
+      override def readKey[T: ClassTag](): T = {
+        // We skipped serialization of the key in writeKey(), so just return a dummy value since
+        // this is going to be discarded anyways.
+        null.asInstanceOf[T]
+      }
+
+      override def readValue[T: ClassTag](): T = {
+        val rowSize = dIn.readInt()
+        if (rowBuffer.length < rowSize) {
+          rowBuffer = new Array[Byte](rowSize)
+        }
+        ByteStreams.readFully(in, rowBuffer, 0, rowSize)
+        row.pointTo(rowBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, rowSize, null)
+        row.asInstanceOf[T]
+      }
+
+      override def readObject[T: ClassTag](): T = {
+        // This method is never called by shuffle code.
+        throw new UnsupportedOperationException
+      }
+
+      override def close(): Unit = {
+        dIn.close()
+      }
     }
   }
 
+  // These methods are never called by shuffle code.
   override def serialize[T: ClassTag](t: T): ByteBuffer = throw new UnsupportedOperationException
   override def deserialize[T: ClassTag](bytes: ByteBuffer): T =
     throw new UnsupportedOperationException
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
new file mode 100644
index 0000000000000..3854dc1b7a3d1
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import java.io.ByteArrayOutputStream
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection}
+import org.apache.spark.sql.types.{IntegerType, StringType}
+import org.apache.spark.unsafe.PlatformDependent
+import org.apache.spark.unsafe.memory.MemoryAllocator
+import org.apache.spark.unsafe.types.UTF8String
+
+class UnsafeRowSuite extends SparkFunSuite {
+  test("writeToStream") {
+    val row = InternalRow.apply(UTF8String.fromString("hello"), UTF8String.fromString("world"), 123)
+    val arrayBackedUnsafeRow: UnsafeRow =
+      UnsafeProjection.create(Seq(StringType, StringType, IntegerType)).apply(row)
+    assert(arrayBackedUnsafeRow.getBaseObject.isInstanceOf[Array[Byte]])
+    val bytesFromArrayBackedRow: Array[Byte] = {
+      val baos = new ByteArrayOutputStream()
+      arrayBackedUnsafeRow.writeToStream(baos, null)
+      baos.toByteArray
+    }
+    val bytesFromOffheapRow: Array[Byte] = {
+      val offheapRowPage = MemoryAllocator.UNSAFE.allocate(arrayBackedUnsafeRow.getSizeInBytes)
+      try {
+        PlatformDependent.copyMemory(
+          arrayBackedUnsafeRow.getBaseObject,
+          arrayBackedUnsafeRow.getBaseOffset,
+          offheapRowPage.getBaseObject,
+          offheapRowPage.getBaseOffset,
+          arrayBackedUnsafeRow.getSizeInBytes
+        )
+        val offheapUnsafeRow: UnsafeRow = new UnsafeRow()
+        offheapUnsafeRow.pointTo(
+          offheapRowPage.getBaseObject,
+          offheapRowPage.getBaseOffset,
+          3, // num fields
+          arrayBackedUnsafeRow.getSizeInBytes,
+          null // object pool
+        )
+        assert(offheapUnsafeRow.getBaseObject === null)
+        val baos = new ByteArrayOutputStream()
+        val writeBuffer = new Array[Byte](1024)
+        offheapUnsafeRow.writeToStream(baos, writeBuffer)
+        baos.toByteArray
+      } finally {
+        MemoryAllocator.UNSAFE.free(offheapRowPage)
+      }
+    }
+
+    assert(bytesFromArrayBackedRow === bytesFromOffheapRow)
+  }
+}

From 228ab65a4eeef8a42eb4713edf72b50590f63176 Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Mon, 20 Jul 2015 23:31:08 -0700
Subject: [PATCH 289/549] [SPARK-9179] [BUILD] Use default primary author if
 unspecified

Fixes feature introduced in #7508 to use the default value if nothing is specified in command line

cc liancheng rxin pwendell

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #7558 from shivaram/merge-script-fix and squashes the following commits:

7092141 [Shivaram Venkataraman] Use default primary author if unspecified
---
 dev/merge_spark_pr.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
index d586a57481aa1..ad4b76695c9ff 100755
--- a/dev/merge_spark_pr.py
+++ b/dev/merge_spark_pr.py
@@ -133,6 +133,8 @@ def merge_pr(pr_num, target_ref, title, body, pr_repo_desc):
     primary_author = raw_input(
         "Enter primary author in the format of \"name <email>\" [%s]: " %
         distinct_authors[0])
+    if primary_author == "":
+        primary_author = distinct_authors[0]
 
     commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name,
                       '--pretty=format:%h [%an] %s']).split("\n\n")

From 1ddd0f2f1688560f88470e312b72af04364e2d49 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Mon, 20 Jul 2015 23:33:07 -0700
Subject: [PATCH 290/549] [SPARK-9161][SQL] codegen FormatNumber

Jira https://issues.apache.org/jira/browse/SPARK-9161

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7545 from tarekauel/SPARK-9161 and squashes the following commits:

21425c8 [Tarek Auel] [SPARK-9161][SQL] codegen FormatNumber
---
 .../expressions/stringOperations.scala        | 68 +++++++++++++++----
 1 file changed, 54 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 438215e8e6e37..92fefe1585b23 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -902,22 +902,15 @@ case class FormatNumber(x: Expression, d: Expression)
   @transient
   private val numberFormat: DecimalFormat = new DecimalFormat("")
 
-  override def eval(input: InternalRow): Any = {
-    val xObject = x.eval(input)
-    if (xObject == null) {
+  override protected def nullSafeEval(xObject: Any, dObject: Any): Any = {
+    val dValue = dObject.asInstanceOf[Int]
+    if (dValue < 0) {
       return null
     }
 
-    val dObject = d.eval(input)
-
-    if (dObject == null || dObject.asInstanceOf[Int] < 0) {
-      return null
-    }
-    val dValue = dObject.asInstanceOf[Int]
-
     if (dValue != lastDValue) {
       // construct a new DecimalFormat only if a new dValue
-      pattern.delete(0, pattern.length())
+      pattern.delete(0, pattern.length)
       pattern.append("#,###,###,###,###,###,##0")
 
       // decimal place
@@ -930,9 +923,10 @@ case class FormatNumber(x: Expression, d: Expression)
           pattern.append("0")
         }
       }
-      val dFormat = new DecimalFormat(pattern.toString())
-      lastDValue = dValue;
-      numberFormat.applyPattern(dFormat.toPattern())
+      val dFormat = new DecimalFormat(pattern.toString)
+      lastDValue = dValue
+
+      numberFormat.applyPattern(dFormat.toPattern)
     }
 
     x.dataType match {
@@ -947,6 +941,52 @@ case class FormatNumber(x: Expression, d: Expression)
     }
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (num, d) => {
+
+      def typeHelper(p: String): String = {
+        x.dataType match {
+          case _ : DecimalType => s"""$p.toJavaBigDecimal()"""
+          case _ => s"$p"
+        }
+      }
+
+      val sb = classOf[StringBuffer].getName
+      val df = classOf[DecimalFormat].getName
+      val lastDValue = ctx.freshName("lastDValue")
+      val pattern = ctx.freshName("pattern")
+      val numberFormat = ctx.freshName("numberFormat")
+      val i = ctx.freshName("i")
+      val dFormat = ctx.freshName("dFormat")
+      ctx.addMutableState("int", lastDValue, s"$lastDValue = -100;")
+      ctx.addMutableState(sb, pattern, s"$pattern = new $sb();")
+      ctx.addMutableState(df, numberFormat, s"""$numberFormat = new $df("");""")
+
+      s"""
+        if ($d >= 0) {
+          $pattern.delete(0, $pattern.length());
+          if ($d != $lastDValue) {
+            $pattern.append("#,###,###,###,###,###,##0");
+
+            if ($d > 0) {
+              $pattern.append(".");
+              for (int $i = 0; $i < $d; $i++) {
+                $pattern.append("0");
+              }
+            }
+            $df $dFormat = new $df($pattern.toString());
+            $lastDValue = $d;
+            $numberFormat.applyPattern($dFormat.toPattern());
+            ${ev.primitive} = UTF8String.fromString($numberFormat.format(${typeHelper(num)}));
+          }
+        } else {
+          ${ev.primitive} = null;
+          ${ev.isNull} = true;
+        }
+       """
+    })
+  }
+
   override def prettyName: String = "format_number"
 }
 

From d38c5029a2ca845e2782096044a6412b653c9f95 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Tue, 21 Jul 2015 15:08:44 +0800
Subject: [PATCH 291/549] [SPARK-9100] [SQL] Adds DataFrame reader/writer
 shortcut methods for ORC

This PR adds DataFrame reader/writer shortcut methods for ORC in both Scala and Python.

Author: Cheng Lian <lian@databricks.com>

Closes #7444 from liancheng/spark-9100 and squashes the following commits:

284d043 [Cheng Lian] Fixes PySpark test cases and addresses PR comments
e0b09fb [Cheng Lian] Adds DataFrame reader/writer shortcut methods for ORC
---
 python/pyspark/sql/readwriter.py              |  44 ++++++++++++++++--
 .../sql/orc_partitioned/._SUCCESS.crc         | Bin 0 -> 8 bytes
 .../test_support/sql/orc_partitioned/_SUCCESS |   0
 ...9af031-b970-49d6-ad39-30460a0be2c8.orc.crc | Bin 0 -> 12 bytes
 ...0-829af031-b970-49d6-ad39-30460a0be2c8.orc | Bin 0 -> 168 bytes
 ...9af031-b970-49d6-ad39-30460a0be2c8.orc.crc | Bin 0 -> 12 bytes
 ...0-829af031-b970-49d6-ad39-30460a0be2c8.orc | Bin 0 -> 168 bytes
 .../apache/spark/sql/DataFrameReader.scala    |   9 ++++
 .../apache/spark/sql/DataFrameWriter.scala    |  12 +++++
 .../hive/orc/OrcHadoopFsRelationSuite.scala   |   3 +-
 .../hive/orc/OrcPartitionDiscoverySuite.scala |  14 +++---
 .../spark/sql/hive/orc/OrcQuerySuite.scala    |  12 ++---
 .../apache/spark/sql/hive/orc/OrcTest.scala   |   8 ++--
 13 files changed, 79 insertions(+), 23 deletions(-)
 create mode 100644 python/test_support/sql/orc_partitioned/._SUCCESS.crc
 create mode 100755 python/test_support/sql/orc_partitioned/_SUCCESS
 create mode 100644 python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc
 create mode 100755 python/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc
 create mode 100644 python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc
 create mode 100755 python/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 882a03090ec13..dea8bad79e187 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -146,14 +146,28 @@ def table(self, tableName):
         return self._df(self._jreader.table(tableName))
 
     @since(1.4)
-    def parquet(self, *path):
+    def parquet(self, *paths):
         """Loads a Parquet file, returning the result as a :class:`DataFrame`.
 
         >>> df = sqlContext.read.parquet('python/test_support/sql/parquet_partitioned')
         >>> df.dtypes
         [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
         """
-        return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, path)))
+        return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, paths)))
+
+    @since(1.5)
+    def orc(self, path):
+        """
+        Loads an ORC file, returning the result as a :class:`DataFrame`.
+
+        ::Note: Currently ORC support is only available together with
+        :class:`HiveContext`.
+
+        >>> df = hiveContext.read.orc('python/test_support/sql/orc_partitioned')
+        >>> df.dtypes
+        [('a', 'bigint'), ('b', 'int'), ('c', 'int')]
+        """
+        return self._df(self._jreader.orc(path))
 
     @since(1.4)
     def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None,
@@ -378,6 +392,29 @@ def parquet(self, path, mode=None, partitionBy=None):
             self.partitionBy(partitionBy)
         self._jwrite.parquet(path)
 
+    def orc(self, path, mode=None, partitionBy=None):
+        """Saves the content of the :class:`DataFrame` in ORC format at the specified path.
+
+        ::Note: Currently ORC support is only available together with
+        :class:`HiveContext`.
+
+        :param path: the path in any Hadoop supported file system
+        :param mode: specifies the behavior of the save operation when data already exists.
+
+            * ``append``: Append contents of this :class:`DataFrame` to existing data.
+            * ``overwrite``: Overwrite existing data.
+            * ``ignore``: Silently ignore this operation if data already exists.
+            * ``error`` (default case): Throw an exception if data already exists.
+        :param partitionBy: names of partitioning columns
+
+        >>> orc_df = hiveContext.read.orc('python/test_support/sql/orc_partitioned')
+        >>> orc_df.write.orc(os.path.join(tempfile.mkdtemp(), 'data'))
+        """
+        self.mode(mode)
+        if partitionBy is not None:
+            self.partitionBy(partitionBy)
+        self._jwrite.orc(path)
+
     @since(1.4)
     def jdbc(self, url, table, mode=None, properties={}):
         """Saves the content of the :class:`DataFrame` to a external database table via JDBC.
@@ -408,7 +445,7 @@ def _test():
     import os
     import tempfile
     from pyspark.context import SparkContext
-    from pyspark.sql import Row, SQLContext
+    from pyspark.sql import Row, SQLContext, HiveContext
     import pyspark.sql.readwriter
 
     os.chdir(os.environ["SPARK_HOME"])
@@ -420,6 +457,7 @@ def _test():
     globs['os'] = os
     globs['sc'] = sc
     globs['sqlContext'] = SQLContext(sc)
+    globs['hiveContext'] = HiveContext(sc)
     globs['df'] = globs['sqlContext'].read.parquet('python/test_support/sql/parquet_partitioned')
 
     (failure_count, test_count) = doctest.testmod(
diff --git a/python/test_support/sql/orc_partitioned/._SUCCESS.crc b/python/test_support/sql/orc_partitioned/._SUCCESS.crc
new file mode 100644
index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c
GIT binary patch
literal 8
PcmYc;N@ieSU}69O2$TUk

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/orc_partitioned/_SUCCESS b/python/test_support/sql/orc_partitioned/_SUCCESS
new file mode 100755
index 0000000000000..e69de29bb2d1d
diff --git a/python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc b/python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc
new file mode 100644
index 0000000000000000000000000000000000000000..834cf0b7f227244a3ccda18809a0bb49d27b59d2
GIT binary patch
literal 12
TcmYc;N@ieSU}CV!x?uzW5r_im

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc b/python/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc
new file mode 100755
index 0000000000000000000000000000000000000000..49438018733565be297429b4f9349450441230f9
GIT binary patch
literal 168
zcmeYda^_`V;9?PC;$Tz<G6cDp85kIZxHuSv*f<yj7$g`t85j-#h0GWj9w$9c5Kfrz
z;0e>k9-$<MS%My$xeW^q7z~9ORDoIrxPWGGA?ZwKVBlaDaF^l&lJXMl3UV5ZN*o+K
wLX3=3jEPE294rQmRzNLQNLm~w_;HvCFiJEu2q-Zz@MwrOePL$y4{{a-0D%M+=Kufz

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc b/python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc
new file mode 100644
index 0000000000000000000000000000000000000000..693dceeee3ef2ff6fb24e8ac7fd006ffd7463074
GIT binary patch
literal 12
TcmYc;N@ieSU}8vHaCb8R6J-O@

literal 0
HcmV?d00001

diff --git a/python/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc b/python/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc
new file mode 100755
index 0000000000000000000000000000000000000000..4cbb95ae0242cd99ed08f5d28ae7531dc55ccee3
GIT binary patch
literal 168
zcmeYda^_`V;9?PC;$Tz<G6cDp85kIZxHuSv*f^L3m?W4u85j;QFfo`hFg#9roFJSq
z;lUH8i#<X~4zmP3Hgg*m8Za0NGpGW!3UC3<;6l=w&cMLIEZ{E11tjGq*cIe77?n6U
xc!U@kr5F>Hm^fGr7_ER>tdO)gOz`6{6JV5RXb@0hV&KsbZTiB@>>uPT3IK<`7W)7I

literal 0
HcmV?d00001

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 0e37ad3e12e08..f1c1ddf898986 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -264,6 +264,15 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging {
     }
   }
 
+  /**
+   * Loads an ORC file and returns the result as a [[DataFrame]].
+   *
+   * @param path input path
+   * @since 1.5.0
+   * @note Currently, this method can only be used together with `HiveContext`.
+   */
+  def orc(path: String): DataFrame = format("orc").load(path)
+
   /**
    * Returns the specified table as a [[DataFrame]].
    *
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 5548b26cb8f80..3e7b9cd7976c3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -280,6 +280,18 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    */
   def parquet(path: String): Unit = format("parquet").save(path)
 
+  /**
+   * Saves the content of the [[DataFrame]] in ORC format at the specified path.
+   * This is equivalent to:
+   * {{{
+   *   format("orc").save(path)
+   * }}}
+   *
+   * @since 1.5.0
+   * @note Currently, this method can only be used together with `HiveContext`.
+   */
+  def orc(path: String): Unit = format("orc").save(path)
+
   ///////////////////////////////////////////////////////////////////////////////////////
   // Builder pattern config options
   ///////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
index 080af5bb23c16..af3f468aaa5e9 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala
@@ -41,8 +41,7 @@ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest {
           .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1))
           .toDF("a", "b", "p1")
           .write
-          .format("orc")
-          .save(partitionDir.toString)
+          .orc(partitionDir.toString)
       }
 
       val dataSchemaWithPartition =
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
index 3c2efe329bfd5..d463e8fd626f9 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala
@@ -49,13 +49,13 @@ class OrcPartitionDiscoverySuite extends QueryTest with BeforeAndAfterAll {
 
   def makeOrcFile[T <: Product: ClassTag: TypeTag](
       data: Seq[T], path: File): Unit = {
-    data.toDF().write.format("orc").mode("overwrite").save(path.getCanonicalPath)
+    data.toDF().write.mode("overwrite").orc(path.getCanonicalPath)
   }
 
 
   def makeOrcFile[T <: Product: ClassTag: TypeTag](
       df: DataFrame, path: File): Unit = {
-    df.write.format("orc").mode("overwrite").save(path.getCanonicalPath)
+    df.write.mode("overwrite").orc(path.getCanonicalPath)
   }
 
   protected def withTempTable(tableName: String)(f: => Unit): Unit = {
@@ -90,7 +90,7 @@ class OrcPartitionDiscoverySuite extends QueryTest with BeforeAndAfterAll {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      read.format("orc").load(base.getCanonicalPath).registerTempTable("t")
+      read.orc(base.getCanonicalPath).registerTempTable("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -137,7 +137,7 @@ class OrcPartitionDiscoverySuite extends QueryTest with BeforeAndAfterAll {
           makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps))
       }
 
-      read.format("orc").load(base.getCanonicalPath).registerTempTable("t")
+      read.orc(base.getCanonicalPath).registerTempTable("t")
 
       withTempTable("t") {
         checkAnswer(
@@ -187,9 +187,8 @@ class OrcPartitionDiscoverySuite extends QueryTest with BeforeAndAfterAll {
       }
 
       read
-        .format("orc")
         .option(ConfVars.DEFAULTPARTITIONNAME.varname, defaultPartitionName)
-        .load(base.getCanonicalPath)
+        .orc(base.getCanonicalPath)
         .registerTempTable("t")
 
       withTempTable("t") {
@@ -230,9 +229,8 @@ class OrcPartitionDiscoverySuite extends QueryTest with BeforeAndAfterAll {
       }
 
       read
-        .format("orc")
         .option(ConfVars.DEFAULTPARTITIONNAME.varname, defaultPartitionName)
-        .load(base.getCanonicalPath)
+        .orc(base.getCanonicalPath)
         .registerTempTable("t")
 
       withTempTable("t") {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index ca131faaeef05..744d462938141 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -63,14 +63,14 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
 
     withOrcFile(data) { file =>
       checkAnswer(
-        sqlContext.read.format("orc").load(file),
+        sqlContext.read.orc(file),
         data.toDF().collect())
     }
   }
 
   test("Read/write binary data") {
     withOrcFile(BinaryData("test".getBytes("utf8")) :: Nil) { file =>
-      val bytes = read.format("orc").load(file).head().getAs[Array[Byte]](0)
+      val bytes = read.orc(file).head().getAs[Array[Byte]](0)
       assert(new String(bytes, "utf8") === "test")
     }
   }
@@ -88,7 +88,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
 
     withOrcFile(data) { file =>
       checkAnswer(
-        read.format("orc").load(file),
+        read.orc(file),
         data.toDF().collect())
     }
   }
@@ -158,7 +158,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
 
     withOrcFile(data) { file =>
       checkAnswer(
-        read.format("orc").load(file),
+        read.orc(file),
         Row(Seq.fill(5)(null): _*))
     }
   }
@@ -310,7 +310,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
              """.stripMargin)
 
           val errorMessage = intercept[AnalysisException] {
-            sqlContext.read.format("orc").load(path)
+            sqlContext.read.orc(path)
           }.getMessage
 
           assert(errorMessage.contains("Failed to discover schema from ORC files"))
@@ -323,7 +323,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
                |SELECT key, value FROM single
              """.stripMargin)
 
-          val df = sqlContext.read.format("orc").load(path)
+          val df = sqlContext.read.orc(path)
           assert(df.schema === singleRowDF.schema.asNullable)
           checkAnswer(df, singleRowDF)
         }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
index 5daf691aa8c53..9d76d6503a3e6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
@@ -39,7 +39,7 @@ private[sql] trait OrcTest extends SQLTestUtils {
       (data: Seq[T])
       (f: String => Unit): Unit = {
     withTempPath { file =>
-      sparkContext.parallelize(data).toDF().write.format("orc").save(file.getCanonicalPath)
+      sparkContext.parallelize(data).toDF().write.orc(file.getCanonicalPath)
       f(file.getCanonicalPath)
     }
   }
@@ -51,7 +51,7 @@ private[sql] trait OrcTest extends SQLTestUtils {
   protected def withOrcDataFrame[T <: Product: ClassTag: TypeTag]
       (data: Seq[T])
       (f: DataFrame => Unit): Unit = {
-    withOrcFile(data)(path => f(sqlContext.read.format("orc").load(path)))
+    withOrcFile(data)(path => f(sqlContext.read.orc(path)))
   }
 
   /**
@@ -70,11 +70,11 @@ private[sql] trait OrcTest extends SQLTestUtils {
 
   protected def makeOrcFile[T <: Product: ClassTag: TypeTag](
       data: Seq[T], path: File): Unit = {
-    data.toDF().write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath)
+    data.toDF().write.mode(SaveMode.Overwrite).orc(path.getCanonicalPath)
   }
 
   protected def makeOrcFile[T <: Product: ClassTag: TypeTag](
       df: DataFrame, path: File): Unit = {
-    df.write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath)
+    df.write.mode(SaveMode.Overwrite).orc(path.getCanonicalPath)
   }
 }

From 8c8f0ef59e12b6f13d5a0bf2d7bf1248b5c1e369 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Tue, 21 Jul 2015 00:48:07 -0700
Subject: [PATCH 292/549] [SPARK-8255] [SPARK-8256] [SQL] Add
 regex_extract/regex_replace

Add expressions `regex_extract` & `regex_replace`

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7468 from chenghao-intel/regexp and squashes the following commits:

e5ea476 [Cheng Hao] minor update for documentation
ef96fd6 [Cheng Hao] update the code gen
72cf28f [Cheng Hao] Add more log for compilation error
4e11381 [Cheng Hao] Add regexp_replace / regexp_extract support
---
 python/pyspark/sql/functions.py               |  30 +++
 .../catalyst/analysis/FunctionRegistry.scala  |   2 +
 .../expressions/codegen/CodeGenerator.scala   |   5 +-
 .../expressions/stringOperations.scala        | 217 +++++++++++++++++-
 .../expressions/ExpressionEvalHelper.scala    |   1 -
 .../expressions/StringExpressionsSuite.scala  |  35 +++
 .../org/apache/spark/sql/functions.scala      |  21 ++
 .../spark/sql/StringFunctionsSuite.scala      |  16 ++
 8 files changed, 323 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 031745a1c4d3b..3c134faa0a765 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -46,6 +46,8 @@
     'monotonicallyIncreasingId',
     'rand',
     'randn',
+    'regexp_extract',
+    'regexp_replace',
     'sha1',
     'sha2',
     'sparkPartitionId',
@@ -343,6 +345,34 @@ def levenshtein(left, right):
     return Column(jc)
 
 
+@ignore_unicode_prefix
+@since(1.5)
+def regexp_extract(str, pattern, idx):
+    """Extract a specific(idx) group identified by a java regex, from the specified string column.
+
+    >>> df = sqlContext.createDataFrame([('100-200',)], ['str'])
+    >>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
+    [Row(d=u'100')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx)
+    return Column(jc)
+
+
+@ignore_unicode_prefix
+@since(1.5)
+def regexp_replace(str, pattern, replacement):
+    """Replace all substrings of the specified string value that match regexp with rep.
+
+    >>> df = sqlContext.createDataFrame([('100-200',)], ['str'])
+    >>> df.select(regexp_replace('str', '(\\d+)', '##').alias('d')).collect()
+    [Row(d=u'##-##')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement)
+    return Column(jc)
+
+
 @ignore_unicode_prefix
 @since(1.5)
 def md5(col):
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 71e87b98d86fc..aec392379c186 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -161,6 +161,8 @@ object FunctionRegistry {
     expression[Lower]("lower"),
     expression[Length]("length"),
     expression[Levenshtein]("levenshtein"),
+    expression[RegExpExtract]("regexp_extract"),
+    expression[RegExpReplace]("regexp_replace"),
     expression[StringInstr]("instr"),
     expression[StringLocate]("locate"),
     expression[StringLPad]("lpad"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 606f770cb4f7b..319dcd1c04316 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -297,8 +297,9 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
       evaluator.cook(code)
     } catch {
       case e: Exception =>
-        logError(s"failed to compile:\n $code", e)
-        throw e
+        val msg = s"failed to compile:\n $code"
+        logError(msg, e)
+        throw new Exception(msg, e)
     }
     evaluator.getClazz().newInstance().asInstanceOf[GeneratedClass]
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 92fefe1585b23..fe57d17f1ec14 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.text.DecimalFormat
 import java.util.Locale
-import java.util.regex.Pattern
+import java.util.regex.{MatchResult, Pattern}
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedException
@@ -876,6 +876,221 @@ case class Encode(value: Expression, charset: Expression)
   }
 }
 
+/**
+ * Replace all substrings of str that match regexp with rep.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status.
+ */
+case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression)
+  extends Expression with ImplicitCastInputTypes {
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+  // last replacement string, we don't want to convert a UTF8String => java.langString every time.
+  @transient private var lastReplacement: String = _
+  @transient private var lastReplacementInUTF8: UTF8String = _
+  // result buffer write by Matcher
+  @transient private val result: StringBuffer = new StringBuffer
+
+  override def nullable: Boolean = subject.nullable || regexp.nullable || rep.nullable
+  override def foldable: Boolean = subject.foldable && regexp.foldable && rep.foldable
+
+  override def eval(input: InternalRow): Any = {
+    val s = subject.eval(input)
+    if (null != s) {
+      val p = regexp.eval(input)
+      if (null != p) {
+        val r = rep.eval(input)
+        if (null != r) {
+          if (!p.equals(lastRegex)) {
+            // regex value changed
+            lastRegex = p.asInstanceOf[UTF8String]
+            pattern = Pattern.compile(lastRegex.toString)
+          }
+          if (!r.equals(lastReplacementInUTF8)) {
+            // replacement string changed
+            lastReplacementInUTF8 = r.asInstanceOf[UTF8String]
+            lastReplacement = lastReplacementInUTF8.toString
+          }
+          val m = pattern.matcher(s.toString())
+          result.delete(0, result.length())
+
+          while (m.find) {
+            m.appendReplacement(result, lastReplacement)
+          }
+          m.appendTail(result)
+
+          return UTF8String.fromString(result.toString)
+        }
+      }
+    }
+
+    null
+  }
+
+  override def dataType: DataType = StringType
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, StringType)
+  override def children: Seq[Expression] = subject :: regexp :: rep :: Nil
+  override def prettyName: String = "regexp_replace"
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val termLastRegex = ctx.freshName("lastRegex")
+    val termPattern = ctx.freshName("pattern")
+
+    val termLastReplacement = ctx.freshName("lastReplacement")
+    val termLastReplacementInUTF8 = ctx.freshName("lastReplacementInUTF8")
+
+    val termResult = ctx.freshName("result")
+
+    val classNameUTF8String = classOf[UTF8String].getCanonicalName
+    val classNamePattern = classOf[Pattern].getCanonicalName
+    val classNameString = classOf[java.lang.String].getCanonicalName
+    val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName
+
+    ctx.addMutableState(classNameUTF8String,
+      termLastRegex, s"${termLastRegex} = null;")
+    ctx.addMutableState(classNamePattern,
+      termPattern, s"${termPattern} = null;")
+    ctx.addMutableState(classNameString,
+      termLastReplacement, s"${termLastReplacement} = null;")
+    ctx.addMutableState(classNameUTF8String,
+      termLastReplacementInUTF8, s"${termLastReplacementInUTF8} = null;")
+    ctx.addMutableState(classNameStringBuffer,
+      termResult, s"${termResult} = new $classNameStringBuffer();")
+
+    val evalSubject = subject.gen(ctx)
+    val evalRegexp = regexp.gen(ctx)
+    val evalRep = rep.gen(ctx)
+
+    s"""
+      ${evalSubject.code}
+      boolean ${ev.isNull} = ${evalSubject.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${evalSubject.isNull}) {
+        ${evalRegexp.code}
+        if (!${evalRegexp.isNull}) {
+          ${evalRep.code}
+          if (!${evalRep.isNull}) {
+            if (!${evalRegexp.primitive}.equals(${termLastRegex})) {
+              // regex value changed
+              ${termLastRegex} = ${evalRegexp.primitive};
+              ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString());
+            }
+            if (!${evalRep.primitive}.equals(${termLastReplacementInUTF8})) {
+              // replacement string changed
+              ${termLastReplacementInUTF8} = ${evalRep.primitive};
+              ${termLastReplacement} = ${termLastReplacementInUTF8}.toString();
+            }
+            ${termResult}.delete(0, ${termResult}.length());
+            ${classOf[java.util.regex.Matcher].getCanonicalName} m =
+                                   ${termPattern}.matcher(${evalSubject.primitive}.toString());
+
+            while (m.find()) {
+              m.appendReplacement(${termResult}, ${termLastReplacement});
+            }
+            m.appendTail(${termResult});
+            ${ev.primitive} = ${classNameUTF8String}.fromString(${termResult}.toString());
+            ${ev.isNull} = false;
+          }
+        }
+      }
+    """
+  }
+}
+
+/**
+ * Extract a specific(idx) group identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status.
+ */
+case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expression)
+  extends Expression with ImplicitCastInputTypes {
+  def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+
+  // last regex in string, we will update the pattern iff regexp value changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def nullable: Boolean = subject.nullable || regexp.nullable || idx.nullable
+  override def foldable: Boolean = subject.foldable && regexp.foldable && idx.foldable
+
+  override def eval(input: InternalRow): Any = {
+    val s = subject.eval(input)
+    if (null != s) {
+      val p = regexp.eval(input)
+      if (null != p) {
+        val r = idx.eval(input)
+        if (null != r) {
+          if (!p.equals(lastRegex)) {
+            // regex value changed
+            lastRegex = p.asInstanceOf[UTF8String]
+            pattern = Pattern.compile(lastRegex.toString)
+          }
+          val m = pattern.matcher(s.toString())
+          if (m.find) {
+            val mr: MatchResult = m.toMatchResult
+            return UTF8String.fromString(mr.group(r.asInstanceOf[Int]))
+          }
+          return UTF8String.EMPTY_UTF8
+        }
+      }
+    }
+
+    null
+  }
+
+  override def dataType: DataType = StringType
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+  override def prettyName: String = "regexp_extract"
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val termLastRegex = ctx.freshName("lastRegex")
+    val termPattern = ctx.freshName("pattern")
+    val classNameUTF8String = classOf[UTF8String].getCanonicalName
+    val classNamePattern = classOf[Pattern].getCanonicalName
+
+    ctx.addMutableState(classNameUTF8String, termLastRegex, s"${termLastRegex} = null;")
+    ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;")
+
+    val evalSubject = subject.gen(ctx)
+    val evalRegexp = regexp.gen(ctx)
+    val evalIdx = idx.gen(ctx)
+
+    s"""
+      ${ctx.javaType(dataType)} ${ev.primitive} = null;
+      boolean ${ev.isNull} = true;
+      ${evalSubject.code}
+      if (!${evalSubject.isNull}) {
+        ${evalRegexp.code}
+        if (!${evalRegexp.isNull}) {
+          ${evalIdx.code}
+          if (!${evalIdx.isNull}) {
+            if (!${evalRegexp.primitive}.equals(${termLastRegex})) {
+              // regex value changed
+              ${termLastRegex} = ${evalRegexp.primitive};
+              ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString());
+            }
+            ${classOf[java.util.regex.Matcher].getCanonicalName} m =
+                                   ${termPattern}.matcher(${evalSubject.primitive}.toString());
+            if (m.find()) {
+              ${classOf[java.util.regex.MatchResult].getCanonicalName} mr = m.toMatchResult();
+              ${ev.primitive} = ${classNameUTF8String}.fromString(mr.group(${evalIdx.primitive}));
+              ${ev.isNull} = false;
+            } else {
+              ${ev.primitive} = ${classNameUTF8String}.EMPTY_UTF8;
+              ${ev.isNull} = false;
+            }
+          }
+        }
+      }
+    """
+  }
+}
+
 /**
  * Formats the number X to a format like '#,###,###.##', rounded to D decimal places,
  * and returns the result as a string. If D is 0, the result has no decimal point or
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 7a96044d35a09..6e17ffcda9dc4 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -79,7 +79,6 @@ trait ExpressionEvalHelper {
         fail(
           s"""
             |Code generation of $expression failed:
-            |${evaluated.code}
             |$e
           """.stripMargin)
     }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 67d97cd30b039..96c540ab36f08 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -464,6 +464,41 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(StringSpace(s1), null, row2)
   }
 
+  test("RegexReplace") {
+    val row1 = create_row("100-200", "(\\d+)", "num")
+    val row2 = create_row("100-200", "(\\d+)", "###")
+    val row3 = create_row("100-200", "(-)", "###")
+
+    val s = 's.string.at(0)
+    val p = 'p.string.at(1)
+    val r = 'r.string.at(2)
+
+    val expr = RegExpReplace(s, p, r)
+    checkEvaluation(expr, "num-num", row1)
+    checkEvaluation(expr, "###-###", row2)
+    checkEvaluation(expr, "100###200", row3)
+  }
+
+  test("RegexExtract") {
+    val row1 = create_row("100-200", "(\\d+)-(\\d+)", 1)
+    val row2 = create_row("100-200", "(\\d+)-(\\d+)", 2)
+    val row3 = create_row("100-200", "(\\d+).*", 1)
+    val row4 = create_row("100-200", "([a-z])", 1)
+
+    val s = 's.string.at(0)
+    val p = 'p.string.at(1)
+    val r = 'r.int.at(2)
+
+    val expr = RegExpExtract(s, p, r)
+    checkEvaluation(expr, "100", row1)
+    checkEvaluation(expr, "200", row2)
+    checkEvaluation(expr, "100", row3)
+    checkEvaluation(expr, "", row4) // will not match anything, empty string get
+
+    val expr1 = new RegExpExtract(s, p)
+    checkEvaluation(expr1, "100", row1)
+  }
+
   test("SPLIT") {
     val s1 = 'a.string.at(0)
     val s2 = 'b.string.at(1)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 8fa017610b63c..6d60dae624b0c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1781,6 +1781,27 @@ object functions {
     StringLocate(lit(substr).expr, str.expr, lit(pos).expr)
   }
 
+
+  /**
+   * Extract a specific(idx) group identified by a java regex, from the specified string column.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def regexp_extract(e: Column, exp: String, groupIdx: Int): Column = {
+    RegExpExtract(e.expr, lit(exp).expr, lit(groupIdx).expr)
+  }
+
+  /**
+   * Replace all substrings of the specified string value that match regexp with rep.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def regexp_replace(e: Column, pattern: String, replacement: String): Column = {
+    RegExpReplace(e.expr, lit(pattern).expr, lit(replacement).expr)
+  }
+
   /**
    * Computes the BASE64 encoding of a binary column and returns it as a string column.
    * This is the reverse of unbase64.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 4551192b157ff..d1f855903ca4b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -56,6 +56,22 @@ class StringFunctionsSuite extends QueryTest {
     checkAnswer(df.selectExpr("levenshtein(l, r)"), Seq(Row(3), Row(1)))
   }
 
+  test("string regex_replace / regex_extract") {
+    val df = Seq(("100-200", "")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(
+        regexp_replace($"a", "(\\d+)", "num"),
+        regexp_extract($"a", "(\\d+)-(\\d+)", 1)),
+      Row("num-num", "100"))
+
+    checkAnswer(
+      df.selectExpr(
+        "regexp_replace(a, '(\\d+)', 'num')",
+        "regexp_extract(a, '(\\d+)-(\\d+)', 2)"),
+      Row("num-num", "200"))
+  }
+
   test("string ascii function") {
     val df = Seq(("abc", "")).toDF("a", "b")
     checkAnswer(

From 560c658a7462844c698b5bda09a4cfb4094fd65b Mon Sep 17 00:00:00 2001
From: Pedro Rodriguez <ski.rodriguez@gmail.com>
Date: Tue, 21 Jul 2015 00:53:20 -0700
Subject: [PATCH 293/549] [SPARK-8230][SQL] Add array/map size method

Pull Request for: https://issues.apache.org/jira/browse/SPARK-8230

Primary issue resolved is to implement array/map size for Spark SQL. Code is ready for review by a committer. Chen Hao is on the JIRA ticket, but I don't know his username on github, rxin is also on JIRA ticket.

Things to review:
1. Where to put added functions namespace wise, they seem to be part of a few operations on collections which includes `sort_array` and `array_contains`. Hence the name given `collectionOperations.scala` and `_collection_functions` in python.
2. In Python code, should it be in a `1.5.0` function array or in a collections array?
3. Are there any missing methods on the `Size` case class? Looks like many of these functions have generated Java code, is that also needed in this case?
4. Something else?

Author: Pedro Rodriguez <ski.rodriguez@gmail.com>
Author: Pedro Rodriguez <prodriguez@trulia.com>

Closes #7462 from EntilZha/SPARK-8230 and squashes the following commits:

9a442ae [Pedro Rodriguez] fixed functions and sorted __all__
9aea3bb [Pedro Rodriguez] removed imports from python docs
15d4bf1 [Pedro Rodriguez] Added null test case and changed to nullSafeCodeGen
d88247c [Pedro Rodriguez] removed python code
bd5f0e4 [Pedro Rodriguez] removed duplicate function from rebase/merge
59931b4 [Pedro Rodriguez] fixed compile bug instroduced when merging
c187175 [Pedro Rodriguez] updated code to add size to __all__ directly and removed redundent pretty print
130839f [Pedro Rodriguez] fixed failing test
aa9bade [Pedro Rodriguez] fix style
e093473 [Pedro Rodriguez] updated python code with docs, switched classes/traits implemented, added (failing) expression tests
0449377 [Pedro Rodriguez] refactored code to use better abstract classes/traits and implementations
9a1a2ff [Pedro Rodriguez] added unit tests for map size
2bfbcb6 [Pedro Rodriguez] added unit test for size
20df2b4 [Pedro Rodriguez] Finished working version of size function and added it to python
b503e75 [Pedro Rodriguez] First attempt at implementing size for maps and arrays
99a6a5c [Pedro Rodriguez] fixed failing test
cac75ac [Pedro Rodriguez] fix style
933d843 [Pedro Rodriguez] updated python code with docs, switched classes/traits implemented, added (failing) expression tests
42bb7d4 [Pedro Rodriguez] refactored code to use better abstract classes/traits and implementations
f9c3b8a [Pedro Rodriguez] added unit tests for map size
2515d9f [Pedro Rodriguez] added documentation
0e60541 [Pedro Rodriguez] added unit test for size
acf9853 [Pedro Rodriguez] Finished working version of size function and added it to python
84a5d38 [Pedro Rodriguez] First attempt at implementing size for maps and arrays
---
 python/pyspark/sql/functions.py               | 15 ++++++
 .../catalyst/analysis/FunctionRegistry.scala  |  4 +-
 .../expressions/collectionOperations.scala    | 37 +++++++++++++++
 .../CollectionFunctionsSuite.scala            | 46 +++++++++++++++++++
 .../org/apache/spark/sql/functions.scala      | 20 ++++++++
 .../spark/sql/DataFrameFunctionsSuite.scala   | 31 +++++++++++++
 6 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 3c134faa0a765..719e623a1a11f 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -50,6 +50,7 @@
     'regexp_replace',
     'sha1',
     'sha2',
+    'size',
     'sparkPartitionId',
     'struct',
     'udf',
@@ -825,6 +826,20 @@ def weekofyear(col):
     return Column(sc._jvm.functions.weekofyear(col))
 
 
+@since(1.5)
+def size(col):
+    """
+    Collection function: returns the length of the array or map stored in the column.
+    :param col: name of column or expression
+
+    >>> df = sqlContext.createDataFrame([([1, 2, 3],),([1],),([],)], ['data'])
+    >>> df.select(size(df.data)).collect()
+    [Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.size(_to_java_column(col)))
+
+
 class UserDefinedFunction(object):
     """
     User defined function in Python
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index aec392379c186..13523720daff0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -195,8 +195,10 @@ object FunctionRegistry {
     expression[Quarter]("quarter"),
     expression[Second]("second"),
     expression[WeekOfYear]("weekofyear"),
-    expression[Year]("year")
+    expression[Year]("year"),
 
+    // collection functions
+    expression[Size]("size")
   )
 
   val builtin: FunctionRegistry = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
new file mode 100644
index 0000000000000..2d92dcf23a86e
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.types._
+
+/**
+ * Given an array or map, returns its size.
+ */
+case class Size(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+  override def dataType: DataType = IntegerType
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(ArrayType, MapType))
+
+  override def nullSafeEval(value: Any): Int = child.dataType match {
+    case ArrayType(_, _) => value.asInstanceOf[Seq[Any]].size
+    case MapType(_, _, _) => value.asInstanceOf[Map[Any, Any]].size
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, c => s"${ev.primitive} = ($c).size();")
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala
new file mode 100644
index 0000000000000..28c41b57169f9
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types._
+
+
+class CollectionFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("Array and Map Size") {
+    val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType))
+    val a1 = Literal.create(Seq[Integer](), ArrayType(IntegerType))
+    val a2 = Literal.create(Seq(1, 2), ArrayType(IntegerType))
+
+    checkEvaluation(Size(a0), 3)
+    checkEvaluation(Size(a1), 0)
+    checkEvaluation(Size(a2), 2)
+
+    val m0 = Literal.create(Map("a" -> "a", "b" -> "b"), MapType(StringType, StringType))
+    val m1 = Literal.create(Map[String, String](), MapType(StringType, StringType))
+    val m2 = Literal.create(Map("a" -> "a"), MapType(StringType, StringType))
+
+    checkEvaluation(Size(m0), 2)
+    checkEvaluation(Size(m1), 0)
+    checkEvaluation(Size(m2), 1)
+
+    checkEvaluation(Literal.create(null, MapType(StringType, StringType)), null)
+    checkEvaluation(Literal.create(null, ArrayType(StringType)), null)
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 6d60dae624b0c..60b089180c876 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -42,6 +42,7 @@ import org.apache.spark.util.Utils
  * @groupname misc_funcs Misc functions
  * @groupname window_funcs Window functions
  * @groupname string_funcs String functions
+ * @groupname collection_funcs Collection functions
  * @groupname Ungrouped Support functions for DataFrames.
  * @since 1.3.0
  */
@@ -2053,6 +2054,25 @@ object functions {
    */
   def weekofyear(columnName: String): Column = weekofyear(Column(columnName))
 
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  // Collection functions
+  //////////////////////////////////////////////////////////////////////////////////////////////
+
+  /**
+   * Returns length of array or map
+   * @group collection_funcs
+   * @since 1.5.0
+   */
+  def size(columnName: String): Column = size(Column(columnName))
+
+  /**
+   * Returns length of array or map
+   * @group collection_funcs
+   * @since 1.5.0
+   */
+  def size(column: Column): Column = Size(column.expr)
+
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   //////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 8d2ff2f9690d6..1baec5d37699d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -267,4 +267,35 @@ class DataFrameFunctionsSuite extends QueryTest {
     )
   }
 
+  test("array size function") {
+    val df = Seq(
+      (Array[Int](1, 2), "x"),
+      (Array[Int](), "y"),
+      (Array[Int](1, 2, 3), "z")
+    ).toDF("a", "b")
+    checkAnswer(
+      df.select(size("a")),
+      Seq(Row(2), Row(0), Row(3))
+    )
+    checkAnswer(
+      df.selectExpr("size(a)"),
+      Seq(Row(2), Row(0), Row(3))
+    )
+  }
+
+  test("map size function") {
+    val df = Seq(
+      (Map[Int, Int](1 -> 1, 2 -> 2), "x"),
+      (Map[Int, Int](), "y"),
+      (Map[Int, Int](1 -> 1, 2 -> 2, 3 -> 3), "z")
+    ).toDF("a", "b")
+    checkAnswer(
+      df.select(size("a")),
+      Seq(Row(2), Row(0), Row(3))
+    )
+    checkAnswer(
+      df.selectExpr("size(a)"),
+      Seq(Row(2), Row(0), Row(3))
+    )
+  }
 }

From ae230596b866d8e369bd061256c4cc569dba430a Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Tue, 21 Jul 2015 00:56:57 -0700
Subject: [PATCH 294/549] [SPARK-9173][SQL]UnionPushDown should also support
 Intersect and Except

JIRA: https://issues.apache.org/jira/browse/SPARK-9173

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7540 from yjshen/union_pushdown and squashes the following commits:

278510a [Yijie Shen] rename UnionPushDown to SetOperationPushDown
91741c1 [Yijie Shen] Add UnionPushDown support for intersect and except
---
 .../sql/catalyst/optimizer/Optimizer.scala    | 47 +++++++++--
 .../optimizer/SetOperationPushDownSuite.scala | 82 +++++++++++++++++++
 .../optimizer/UnionPushdownSuite.scala        | 61 --------------
 3 files changed, 120 insertions(+), 70 deletions(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala
 delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 9c45b196245da..e42f0b9a247e3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -40,7 +40,7 @@ object DefaultOptimizer extends Optimizer {
       ReplaceDistinctWithAggregate) ::
     Batch("Operator Optimizations", FixedPoint(100),
       // Operator push down
-      UnionPushDown,
+      SetOperationPushDown,
       SamplePushDown,
       PushPredicateThroughJoin,
       PushPredicateThroughProject,
@@ -84,23 +84,24 @@ object SamplePushDown extends Rule[LogicalPlan] {
 }
 
 /**
- * Pushes operations to either side of a Union.
+ * Pushes operations to either side of a Union, Intersect or Except.
  */
-object UnionPushDown extends Rule[LogicalPlan] {
+object SetOperationPushDown extends Rule[LogicalPlan] {
 
   /**
    * Maps Attributes from the left side to the corresponding Attribute on the right side.
    */
-  private def buildRewrites(union: Union): AttributeMap[Attribute] = {
-    assert(union.left.output.size == union.right.output.size)
+  private def buildRewrites(bn: BinaryNode): AttributeMap[Attribute] = {
+    assert(bn.isInstanceOf[Union] || bn.isInstanceOf[Intersect] || bn.isInstanceOf[Except])
+    assert(bn.left.output.size == bn.right.output.size)
 
-    AttributeMap(union.left.output.zip(union.right.output))
+    AttributeMap(bn.left.output.zip(bn.right.output))
   }
 
   /**
-   * Rewrites an expression so that it can be pushed to the right side of a Union operator.
-   * This method relies on the fact that the output attributes of a union are always equal
-   * to the left child's output.
+   * Rewrites an expression so that it can be pushed to the right side of a
+   * Union, Intersect or Except operator. This method relies on the fact that the output attributes
+   * of a union/intersect/except are always equal to the left child's output.
    */
   private def pushToRight[A <: Expression](e: A, rewrites: AttributeMap[Attribute]) = {
     val result = e transform {
@@ -126,6 +127,34 @@ object UnionPushDown extends Rule[LogicalPlan] {
       Union(
         Project(projectList, left),
         Project(projectList.map(pushToRight(_, rewrites)), right))
+
+    // Push down filter into intersect
+    case Filter(condition, i @ Intersect(left, right)) =>
+      val rewrites = buildRewrites(i)
+      Intersect(
+        Filter(condition, left),
+        Filter(pushToRight(condition, rewrites), right))
+
+    // Push down projection into intersect
+    case Project(projectList, i @ Intersect(left, right)) =>
+      val rewrites = buildRewrites(i)
+      Intersect(
+        Project(projectList, left),
+        Project(projectList.map(pushToRight(_, rewrites)), right))
+
+    // Push down filter into except
+    case Filter(condition, e @ Except(left, right)) =>
+      val rewrites = buildRewrites(e)
+      Except(
+        Filter(condition, left),
+        Filter(pushToRight(condition, rewrites), right))
+
+    // Push down projection into except
+    case Project(projectList, e @ Except(left, right)) =>
+      val rewrites = buildRewrites(e)
+      Except(
+        Project(projectList, left),
+        Project(projectList.map(pushToRight(_, rewrites)), right))
   }
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala
new file mode 100644
index 0000000000000..49c979bc7d72c
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationPushDownSuite.scala
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.dsl.expressions._
+
+class SetOperationPushDownSuite extends PlanTest {
+  object Optimize extends RuleExecutor[LogicalPlan] {
+    val batches =
+      Batch("Subqueries", Once,
+        EliminateSubQueries) ::
+      Batch("Union Pushdown", Once,
+        SetOperationPushDown) :: Nil
+  }
+
+  val testRelation = LocalRelation('a.int, 'b.int, 'c.int)
+  val testRelation2 = LocalRelation('d.int, 'e.int, 'f.int)
+  val testUnion = Union(testRelation, testRelation2)
+  val testIntersect = Intersect(testRelation, testRelation2)
+  val testExcept = Except(testRelation, testRelation2)
+
+  test("union/intersect/except: filter to each side") {
+    val unionQuery = testUnion.where('a === 1)
+    val intersectQuery = testIntersect.where('b < 10)
+    val exceptQuery = testExcept.where('c >= 5)
+
+    val unionOptimized = Optimize.execute(unionQuery.analyze)
+    val intersectOptimized = Optimize.execute(intersectQuery.analyze)
+    val exceptOptimized = Optimize.execute(exceptQuery.analyze)
+
+    val unionCorrectAnswer =
+      Union(testRelation.where('a === 1), testRelation2.where('d === 1)).analyze
+    val intersectCorrectAnswer =
+      Intersect(testRelation.where('b < 10), testRelation2.where('e < 10)).analyze
+    val exceptCorrectAnswer =
+      Except(testRelation.where('c >= 5), testRelation2.where('f >= 5)).analyze
+
+    comparePlans(unionOptimized, unionCorrectAnswer)
+    comparePlans(intersectOptimized, intersectCorrectAnswer)
+    comparePlans(exceptOptimized, exceptCorrectAnswer)
+  }
+
+  test("union/intersect/except: project to each side") {
+    val unionQuery = testUnion.select('a)
+    val intersectQuery = testIntersect.select('b, 'c)
+    val exceptQuery = testExcept.select('a, 'b, 'c)
+
+    val unionOptimized = Optimize.execute(unionQuery.analyze)
+    val intersectOptimized = Optimize.execute(intersectQuery.analyze)
+    val exceptOptimized = Optimize.execute(exceptQuery.analyze)
+
+    val unionCorrectAnswer =
+      Union(testRelation.select('a), testRelation2.select('d)).analyze
+    val intersectCorrectAnswer =
+      Intersect(testRelation.select('b, 'c), testRelation2.select('e, 'f)).analyze
+    val exceptCorrectAnswer =
+      Except(testRelation.select('a, 'b, 'c), testRelation2.select('d, 'e, 'f)).analyze
+
+    comparePlans(unionOptimized, unionCorrectAnswer)
+    comparePlans(intersectOptimized, intersectCorrectAnswer)
+    comparePlans(exceptOptimized, exceptCorrectAnswer)  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala
deleted file mode 100644
index ec379489a6d1e..0000000000000
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.optimizer
-
-import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
-import org.apache.spark.sql.catalyst.plans.PlanTest
-import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.catalyst.rules._
-import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.dsl.expressions._
-
-class UnionPushDownSuite extends PlanTest {
-  object Optimize extends RuleExecutor[LogicalPlan] {
-    val batches =
-      Batch("Subqueries", Once,
-        EliminateSubQueries) ::
-      Batch("Union Pushdown", Once,
-        UnionPushDown) :: Nil
-  }
-
-  val testRelation = LocalRelation('a.int, 'b.int, 'c.int)
-  val testRelation2 = LocalRelation('d.int, 'e.int, 'f.int)
-  val testUnion = Union(testRelation, testRelation2)
-
-  test("union: filter to each side") {
-    val query = testUnion.where('a === 1)
-
-    val optimized = Optimize.execute(query.analyze)
-
-    val correctAnswer =
-      Union(testRelation.where('a === 1), testRelation2.where('d === 1)).analyze
-
-    comparePlans(optimized, correctAnswer)
-  }
-
-  test("union: project to each side") {
-    val query = testUnion.select('b)
-
-    val optimized = Optimize.execute(query.analyze)
-
-    val correctAnswer =
-      Union(testRelation.select('b), testRelation2.select('e)).analyze
-
-    comparePlans(optimized, correctAnswer)
-  }
-}

From 6364735bcc67ecb0e9c7e5076d214ed88e927430 Mon Sep 17 00:00:00 2001
From: Kay Ousterhout <kayousterhout@gmail.com>
Date: Tue, 21 Jul 2015 01:12:51 -0700
Subject: [PATCH 295/549] [SPARK-8875] Remove BlockStoreShuffleFetcher class

The shuffle code has gotten increasingly difficult to read as it has evolved, and many classes
have evolved significantly since they were originally created. The BlockStoreShuffleFetcher class
now serves little purpose other than to make the code more difficult to read; this commit moves its
functionality into the ShuffleBlockFetcherIterator class.

cc massie JoshRosen (Josh, this PR also removes the Try you pointed out as being confusing / not necessarily useful in a previous comment).  Matt, would be helpful to know whether this will interfere in any negative ways with your new shuffle PR (I took a look and it seems like this should still cleanly integrate with your parquet work, but want to double check).

Author: Kay Ousterhout <kayousterhout@gmail.com>

Closes #7268 from kayousterhout/SPARK-8875 and squashes the following commits:

2b24a97 [Kay Ousterhout] Fixed DAGSchedulerSuite compile error
98a1831 [Kay Ousterhout] Merge remote-tracking branch 'upstream/master' into SPARK-8875
90f0e89 [Kay Ousterhout] Fixed broken test
14bfcbb [Kay Ousterhout] Last style fix
bc69d2b [Kay Ousterhout] Style improvements based on Josh's code review
ad3c8d1 [Kay Ousterhout] Better documentation for MapOutputTracker methods
0bc0e59 [Kay Ousterhout] [SPARK-8875] Remove BlockStoreShuffleFetcher class
---
 .../org/apache/spark/MapOutputTracker.scala   | 62 ++++++++++----
 .../hash/BlockStoreShuffleFetcher.scala       | 85 -------------------
 .../shuffle/hash/HashShuffleReader.scala      | 19 +++--
 .../storage/ShuffleBlockFetcherIterator.scala | 72 ++++++++++------
 .../apache/spark/MapOutputTrackerSuite.scala  | 28 +++---
 .../scala/org/apache/spark/ShuffleSuite.scala | 12 +--
 .../spark/scheduler/DAGSchedulerSuite.scala   | 32 +++----
 .../shuffle/hash/HashShuffleReaderSuite.scala | 14 +--
 .../ShuffleBlockFetcherIteratorSuite.scala    | 18 ++--
 .../apache/spark/util/AkkaUtilsSuite.scala    | 22 +++--
 10 files changed, 172 insertions(+), 192 deletions(-)
 delete mode 100644 core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala

diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
index 862ffe868f58f..92218832d256f 100644
--- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
+++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala
@@ -21,14 +21,14 @@ import java.io._
 import java.util.concurrent.ConcurrentHashMap
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}
 
-import scala.collection.mutable.{HashMap, HashSet, Map}
+import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map}
 import scala.collection.JavaConversions._
 import scala.reflect.ClassTag
 
 import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv, RpcCallContext, RpcEndpoint}
 import org.apache.spark.scheduler.MapStatus
 import org.apache.spark.shuffle.MetadataFetchFailedException
-import org.apache.spark.storage.BlockManagerId
+import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockId}
 import org.apache.spark.util._
 
 private[spark] sealed trait MapOutputTrackerMessage
@@ -124,10 +124,18 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
   }
 
   /**
-   * Called from executors to get the server URIs and output sizes of the map outputs of
-   * a given shuffle.
+   * Called from executors to get the server URIs and output sizes for each shuffle block that
+   * needs to be read from a given reduce task.
+   *
+   * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId,
+   *         and the second item is a sequence of (shuffle block id, shuffle block size) tuples
+   *         describing the shuffle blocks that are stored at that block manager.
    */
-  def getServerStatuses(shuffleId: Int, reduceId: Int): Array[(BlockManagerId, Long)] = {
+  def getMapSizesByExecutorId(shuffleId: Int, reduceId: Int)
+  : Seq[(BlockManagerId, Seq[(BlockId, Long)])] = {
+    logDebug(s"Fetching outputs for shuffle $shuffleId, reduce $reduceId")
+    val startTime = System.currentTimeMillis
+
     val statuses = mapStatuses.get(shuffleId).orNull
     if (statuses == null) {
       logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them")
@@ -167,6 +175,9 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
           }
         }
       }
+      logDebug(s"Fetching map output location for shuffle $shuffleId, reduce $reduceId took " +
+        s"${System.currentTimeMillis - startTime} ms")
+
       if (fetchedStatuses != null) {
         fetchedStatuses.synchronized {
           return MapOutputTracker.convertMapStatuses(shuffleId, reduceId, fetchedStatuses)
@@ -421,23 +432,38 @@ private[spark] object MapOutputTracker extends Logging {
     }
   }
 
-  // Convert an array of MapStatuses to locations and sizes for a given reduce ID. If
-  // any of the statuses is null (indicating a missing location due to a failed mapper),
-  // throw a FetchFailedException.
+  /**
+   * Converts an array of MapStatuses for a given reduce ID to a sequence that, for each block
+   * manager ID, lists the shuffle block ids and corresponding shuffle block sizes stored at that
+   * block manager.
+   *
+   * If any of the statuses is null (indicating a missing location due to a failed mapper),
+   * throws a FetchFailedException.
+   *
+   * @param shuffleId Identifier for the shuffle
+   * @param reduceId Identifier for the reduce task
+   * @param statuses List of map statuses, indexed by map ID.
+   * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId,
+   *         and the second item is a sequence of (shuffle block id, shuffle block size) tuples
+   *         describing the shuffle blocks that are stored at that block manager.
+   */
   private def convertMapStatuses(
       shuffleId: Int,
       reduceId: Int,
-      statuses: Array[MapStatus]): Array[(BlockManagerId, Long)] = {
+      statuses: Array[MapStatus]): Seq[(BlockManagerId, Seq[(BlockId, Long)])] = {
     assert (statuses != null)
-    statuses.map {
-      status =>
-        if (status == null) {
-          logError("Missing an output location for shuffle " + shuffleId)
-          throw new MetadataFetchFailedException(
-            shuffleId, reduceId, "Missing an output location for shuffle " + shuffleId)
-        } else {
-          (status.location, status.getSizeForBlock(reduceId))
-        }
+    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(BlockId, Long)]]
+    for ((status, mapId) <- statuses.zipWithIndex) {
+      if (status == null) {
+        val errorMessage = s"Missing an output location for shuffle $shuffleId"
+        logError(errorMessage)
+        throw new MetadataFetchFailedException(shuffleId, reduceId, errorMessage)
+      } else {
+        splitsByAddress.getOrElseUpdate(status.location, ArrayBuffer()) +=
+          ((ShuffleBlockId(shuffleId, mapId, reduceId), status.getSizeForBlock(reduceId)))
+      }
     }
+
+    splitsByAddress.toSeq
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala
deleted file mode 100644
index 9d8e7e9f03aea..0000000000000
--- a/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.shuffle.hash
-
-import java.io.InputStream
-
-import scala.collection.mutable.{ArrayBuffer, HashMap}
-import scala.util.{Failure, Success}
-
-import org.apache.spark._
-import org.apache.spark.shuffle.FetchFailedException
-import org.apache.spark.storage.{BlockId, BlockManager, BlockManagerId, ShuffleBlockFetcherIterator,
-  ShuffleBlockId}
-
-private[hash] object BlockStoreShuffleFetcher extends Logging {
-  def fetchBlockStreams(
-      shuffleId: Int,
-      reduceId: Int,
-      context: TaskContext,
-      blockManager: BlockManager,
-      mapOutputTracker: MapOutputTracker)
-    : Iterator[(BlockId, InputStream)] =
-  {
-    logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId))
-
-    val startTime = System.currentTimeMillis
-    val statuses = mapOutputTracker.getServerStatuses(shuffleId, reduceId)
-    logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format(
-      shuffleId, reduceId, System.currentTimeMillis - startTime))
-
-    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]
-    for (((address, size), index) <- statuses.zipWithIndex) {
-      splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
-    }
-
-    val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map {
-      case (address, splits) =>
-        (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2)))
-    }
-
-    val blockFetcherItr = new ShuffleBlockFetcherIterator(
-      context,
-      blockManager.shuffleClient,
-      blockManager,
-      blocksByAddress,
-      // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility
-      SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024)
-
-    // Make sure that fetch failures are wrapped inside a FetchFailedException for the scheduler
-    blockFetcherItr.map { blockPair =>
-      val blockId = blockPair._1
-      val blockOption = blockPair._2
-      blockOption match {
-        case Success(inputStream) => {
-          (blockId, inputStream)
-        }
-        case Failure(e) => {
-          blockId match {
-            case ShuffleBlockId(shufId, mapId, _) =>
-              val address = statuses(mapId.toInt)._1
-              throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e)
-            case _ =>
-              throw new SparkException(
-                "Failed to get block " + blockId + ", which is not a shuffle block", e)
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala
index d5c9880659dd3..de79fa56f017b 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleReader.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.shuffle.hash
 
-import org.apache.spark.{InterruptibleIterator, MapOutputTracker, SparkEnv, TaskContext}
+import org.apache.spark.{InterruptibleIterator, Logging, MapOutputTracker, SparkEnv, TaskContext}
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.shuffle.{BaseShuffleHandle, ShuffleReader}
-import org.apache.spark.storage.BlockManager
+import org.apache.spark.storage.{BlockManager, ShuffleBlockFetcherIterator}
 import org.apache.spark.util.CompletionIterator
 import org.apache.spark.util.collection.ExternalSorter
 
@@ -31,8 +31,8 @@ private[spark] class HashShuffleReader[K, C](
     context: TaskContext,
     blockManager: BlockManager = SparkEnv.get.blockManager,
     mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker)
-  extends ShuffleReader[K, C]
-{
+  extends ShuffleReader[K, C] with Logging {
+
   require(endPartition == startPartition + 1,
     "Hash shuffle currently only supports fetching one partition")
 
@@ -40,11 +40,16 @@ private[spark] class HashShuffleReader[K, C](
 
   /** Read the combined key-values for this reduce task */
   override def read(): Iterator[Product2[K, C]] = {
-    val blockStreams = BlockStoreShuffleFetcher.fetchBlockStreams(
-      handle.shuffleId, startPartition, context, blockManager, mapOutputTracker)
+    val blockFetcherItr = new ShuffleBlockFetcherIterator(
+      context,
+      blockManager.shuffleClient,
+      blockManager,
+      mapOutputTracker.getMapSizesByExecutorId(handle.shuffleId, startPartition),
+      // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility
+      SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024)
 
     // Wrap the streams for compression based on configuration
-    val wrappedStreams = blockStreams.map { case (blockId, inputStream) =>
+    val wrappedStreams = blockFetcherItr.map { case (blockId, inputStream) =>
       blockManager.wrapForCompression(blockId, inputStream)
     }
 
diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
index e49e39679e940..a759ceb96ec1e 100644
--- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
+++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala
@@ -21,18 +21,19 @@ import java.io.InputStream
 import java.util.concurrent.LinkedBlockingQueue
 
 import scala.collection.mutable.{ArrayBuffer, HashSet, Queue}
-import scala.util.{Failure, Try}
+import scala.util.control.NonFatal
 
-import org.apache.spark.{Logging, TaskContext}
+import org.apache.spark.{Logging, SparkException, TaskContext}
 import org.apache.spark.network.buffer.ManagedBuffer
 import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient}
+import org.apache.spark.shuffle.FetchFailedException
 import org.apache.spark.util.Utils
 
 /**
  * An iterator that fetches multiple blocks. For local blocks, it fetches from the local block
  * manager. For remote blocks, it fetches them using the provided BlockTransferService.
  *
- * This creates an iterator of (BlockID, Try[InputStream]) tuples so the caller can handle blocks
+ * This creates an iterator of (BlockID, InputStream) tuples so the caller can handle blocks
  * in a pipelined fashion as they are received.
  *
  * The implementation throttles the remote fetches to they don't exceed maxBytesInFlight to avoid
@@ -53,7 +54,7 @@ final class ShuffleBlockFetcherIterator(
     blockManager: BlockManager,
     blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])],
     maxBytesInFlight: Long)
-  extends Iterator[(BlockId, Try[InputStream])] with Logging {
+  extends Iterator[(BlockId, InputStream)] with Logging {
 
   import ShuffleBlockFetcherIterator._
 
@@ -115,7 +116,7 @@ final class ShuffleBlockFetcherIterator(
   private[storage] def releaseCurrentResultBuffer(): Unit = {
     // Release the current buffer if necessary
     currentResult match {
-      case SuccessFetchResult(_, _, buf) => buf.release()
+      case SuccessFetchResult(_, _, _, buf) => buf.release()
       case _ =>
     }
     currentResult = null
@@ -132,7 +133,7 @@ final class ShuffleBlockFetcherIterator(
     while (iter.hasNext) {
       val result = iter.next()
       result match {
-        case SuccessFetchResult(_, _, buf) => buf.release()
+        case SuccessFetchResult(_, _, _, buf) => buf.release()
         case _ =>
       }
     }
@@ -157,7 +158,7 @@ final class ShuffleBlockFetcherIterator(
             // Increment the ref count because we need to pass this to a different thread.
             // This needs to be released after use.
             buf.retain()
-            results.put(new SuccessFetchResult(BlockId(blockId), sizeMap(blockId), buf))
+            results.put(new SuccessFetchResult(BlockId(blockId), address, sizeMap(blockId), buf))
             shuffleMetrics.incRemoteBytesRead(buf.size)
             shuffleMetrics.incRemoteBlocksFetched(1)
           }
@@ -166,7 +167,7 @@ final class ShuffleBlockFetcherIterator(
 
         override def onBlockFetchFailure(blockId: String, e: Throwable): Unit = {
           logError(s"Failed to get block(s) from ${req.address.host}:${req.address.port}", e)
-          results.put(new FailureFetchResult(BlockId(blockId), e))
+          results.put(new FailureFetchResult(BlockId(blockId), address, e))
         }
       }
     )
@@ -238,12 +239,12 @@ final class ShuffleBlockFetcherIterator(
         shuffleMetrics.incLocalBlocksFetched(1)
         shuffleMetrics.incLocalBytesRead(buf.size)
         buf.retain()
-        results.put(new SuccessFetchResult(blockId, 0, buf))
+        results.put(new SuccessFetchResult(blockId, blockManager.blockManagerId, 0, buf))
       } catch {
         case e: Exception =>
           // If we see an exception, stop immediately.
           logError(s"Error occurred while fetching local blocks", e)
-          results.put(new FailureFetchResult(blockId, e))
+          results.put(new FailureFetchResult(blockId, blockManager.blockManagerId, e))
           return
       }
     }
@@ -275,12 +276,14 @@ final class ShuffleBlockFetcherIterator(
   override def hasNext: Boolean = numBlocksProcessed < numBlocksToFetch
 
   /**
-   * Fetches the next (BlockId, Try[InputStream]). If a task fails, the ManagedBuffers
+   * Fetches the next (BlockId, InputStream). If a task fails, the ManagedBuffers
    * underlying each InputStream will be freed by the cleanup() method registered with the
    * TaskCompletionListener. However, callers should close() these InputStreams
    * as soon as they are no longer needed, in order to release memory as early as possible.
+   *
+   * Throws a FetchFailedException if the next block could not be fetched.
    */
-  override def next(): (BlockId, Try[InputStream]) = {
+  override def next(): (BlockId, InputStream) = {
     numBlocksProcessed += 1
     val startFetchWait = System.currentTimeMillis()
     currentResult = results.take()
@@ -289,7 +292,7 @@ final class ShuffleBlockFetcherIterator(
     shuffleMetrics.incFetchWaitTime(stopFetchWait - startFetchWait)
 
     result match {
-      case SuccessFetchResult(_, size, _) => bytesInFlight -= size
+      case SuccessFetchResult(_, _, size, _) => bytesInFlight -= size
       case _ =>
     }
     // Send fetch requests up to maxBytesInFlight
@@ -298,19 +301,28 @@ final class ShuffleBlockFetcherIterator(
       sendRequest(fetchRequests.dequeue())
     }
 
-    val iteratorTry: Try[InputStream] = result match {
-      case FailureFetchResult(_, e) =>
-        Failure(e)
-      case SuccessFetchResult(blockId, _, buf) =>
-        // There is a chance that createInputStream can fail (e.g. fetching a local file that does
-        // not exist, SPARK-4085). In that case, we should propagate the right exception so
-        // the scheduler gets a FetchFailedException.
-        Try(buf.createInputStream()).map { inputStream =>
-          new BufferReleasingInputStream(inputStream, this)
+    result match {
+      case FailureFetchResult(blockId, address, e) =>
+        throwFetchFailedException(blockId, address, e)
+
+      case SuccessFetchResult(blockId, address, _, buf) =>
+        try {
+          (result.blockId, new BufferReleasingInputStream(buf.createInputStream(), this))
+        } catch {
+          case NonFatal(t) =>
+            throwFetchFailedException(blockId, address, t)
         }
     }
+  }
 
-    (result.blockId, iteratorTry)
+  private def throwFetchFailedException(blockId: BlockId, address: BlockManagerId, e: Throwable) = {
+    blockId match {
+      case ShuffleBlockId(shufId, mapId, reduceId) =>
+        throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e)
+      case _ =>
+        throw new SparkException(
+          "Failed to get block " + blockId + ", which is not a shuffle block", e)
+    }
   }
 }
 
@@ -366,16 +378,22 @@ object ShuffleBlockFetcherIterator {
    */
   private[storage] sealed trait FetchResult {
     val blockId: BlockId
+    val address: BlockManagerId
   }
 
   /**
    * Result of a fetch from a remote block successfully.
    * @param blockId block id
+   * @param address BlockManager that the block was fetched from.
    * @param size estimated size of the block, used to calculate bytesInFlight.
    *             Note that this is NOT the exact bytes.
    * @param buf [[ManagedBuffer]] for the content.
    */
-  private[storage] case class SuccessFetchResult(blockId: BlockId, size: Long, buf: ManagedBuffer)
+  private[storage] case class SuccessFetchResult(
+      blockId: BlockId,
+      address: BlockManagerId,
+      size: Long,
+      buf: ManagedBuffer)
     extends FetchResult {
     require(buf != null)
     require(size >= 0)
@@ -384,8 +402,12 @@ object ShuffleBlockFetcherIterator {
   /**
    * Result of a fetch from a remote block unsuccessfully.
    * @param blockId block id
+   * @param address BlockManager that the block was attempted to be fetched from
    * @param e the failure exception
    */
-  private[storage] case class FailureFetchResult(blockId: BlockId, e: Throwable)
+  private[storage] case class FailureFetchResult(
+      blockId: BlockId,
+      address: BlockManagerId,
+      e: Throwable)
     extends FetchResult
 }
diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala
index 7a1961137cce5..af4e68950f75a 100644
--- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala
@@ -17,13 +17,15 @@
 
 package org.apache.spark
 
+import scala.collection.mutable.ArrayBuffer
+
 import org.mockito.Mockito._
 import org.mockito.Matchers.{any, isA}
 
 import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcCallContext, RpcEnv}
 import org.apache.spark.scheduler.{CompressedMapStatus, MapStatus}
 import org.apache.spark.shuffle.FetchFailedException
-import org.apache.spark.storage.BlockManagerId
+import org.apache.spark.storage.{BlockManagerId, ShuffleBlockId}
 
 class MapOutputTrackerSuite extends SparkFunSuite {
   private val conf = new SparkConf
@@ -55,9 +57,11 @@ class MapOutputTrackerSuite extends SparkFunSuite {
         Array(1000L, 10000L)))
     tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000),
         Array(10000L, 1000L)))
-    val statuses = tracker.getServerStatuses(10, 0)
-    assert(statuses.toSeq === Seq((BlockManagerId("a", "hostA", 1000), size1000),
-                                  (BlockManagerId("b", "hostB", 1000), size10000)))
+    val statuses = tracker.getMapSizesByExecutorId(10, 0)
+    assert(statuses.toSet ===
+      Seq((BlockManagerId("a", "hostA", 1000), ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000))),
+          (BlockManagerId("b", "hostB", 1000), ArrayBuffer((ShuffleBlockId(10, 1, 0), size10000))))
+        .toSet)
     tracker.stop()
     rpcEnv.shutdown()
   }
@@ -75,10 +79,10 @@ class MapOutputTrackerSuite extends SparkFunSuite {
     tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000),
       Array(compressedSize10000, compressedSize1000)))
     assert(tracker.containsShuffle(10))
-    assert(tracker.getServerStatuses(10, 0).nonEmpty)
+    assert(tracker.getMapSizesByExecutorId(10, 0).nonEmpty)
     tracker.unregisterShuffle(10)
     assert(!tracker.containsShuffle(10))
-    assert(tracker.getServerStatuses(10, 0).isEmpty)
+    assert(tracker.getMapSizesByExecutorId(10, 0).isEmpty)
 
     tracker.stop()
     rpcEnv.shutdown()
@@ -104,7 +108,7 @@ class MapOutputTrackerSuite extends SparkFunSuite {
     // The remaining reduce task might try to grab the output despite the shuffle failure;
     // this should cause it to fail, and the scheduler will ignore the failure due to the
     // stage already being aborted.
-    intercept[FetchFailedException] { tracker.getServerStatuses(10, 1) }
+    intercept[FetchFailedException] { tracker.getMapSizesByExecutorId(10, 1) }
 
     tracker.stop()
     rpcEnv.shutdown()
@@ -126,23 +130,23 @@ class MapOutputTrackerSuite extends SparkFunSuite {
     masterTracker.registerShuffle(10, 1)
     masterTracker.incrementEpoch()
     slaveTracker.updateEpoch(masterTracker.getEpoch)
-    intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) }
+    intercept[FetchFailedException] { slaveTracker.getMapSizesByExecutorId(10, 0) }
 
     val size1000 = MapStatus.decompressSize(MapStatus.compressSize(1000L))
     masterTracker.registerMapOutput(10, 0, MapStatus(
       BlockManagerId("a", "hostA", 1000), Array(1000L)))
     masterTracker.incrementEpoch()
     slaveTracker.updateEpoch(masterTracker.getEpoch)
-    assert(slaveTracker.getServerStatuses(10, 0).toSeq ===
-      Seq((BlockManagerId("a", "hostA", 1000), size1000)))
+    assert(slaveTracker.getMapSizesByExecutorId(10, 0) ===
+      Seq((BlockManagerId("a", "hostA", 1000), ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000)))))
 
     masterTracker.unregisterMapOutput(10, 0, BlockManagerId("a", "hostA", 1000))
     masterTracker.incrementEpoch()
     slaveTracker.updateEpoch(masterTracker.getEpoch)
-    intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) }
+    intercept[FetchFailedException] { slaveTracker.getMapSizesByExecutorId(10, 0) }
 
     // failure should be cached
-    intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) }
+    intercept[FetchFailedException] { slaveTracker.getMapSizesByExecutorId(10, 0) }
 
     masterTracker.stop()
     slaveTracker.stop()
diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
index c3c2b1ffc1efa..b68102bfb949f 100644
--- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
@@ -66,8 +66,8 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
 
     // All blocks must have non-zero size
     (0 until NUM_BLOCKS).foreach { id =>
-      val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, id)
-      assert(statuses.forall(s => s._2 > 0))
+      val statuses = SparkEnv.get.mapOutputTracker.getMapSizesByExecutorId(shuffleId, id)
+      assert(statuses.forall(_._2.forall(blockIdSizePair => blockIdSizePair._2 > 0)))
     }
   }
 
@@ -105,8 +105,8 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
     assert(c.count === 4)
 
     val blockSizes = (0 until NUM_BLOCKS).flatMap { id =>
-      val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, id)
-      statuses.map(x => x._2)
+      val statuses = SparkEnv.get.mapOutputTracker.getMapSizesByExecutorId(shuffleId, id)
+      statuses.flatMap(_._2.map(_._2))
     }
     val nonEmptyBlocks = blockSizes.filter(x => x > 0)
 
@@ -130,8 +130,8 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
     assert(c.count === 4)
 
     val blockSizes = (0 until NUM_BLOCKS).flatMap { id =>
-      val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, id)
-      statuses.map(x => x._2)
+      val statuses = SparkEnv.get.mapOutputTracker.getMapSizesByExecutorId(shuffleId, id)
+      statuses.flatMap(_._2.map(_._2))
     }
     val nonEmptyBlocks = blockSizes.filter(x => x > 0)
 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index 86728cb2b62af..3462a82c9cdd3 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -483,8 +483,8 @@ class DAGSchedulerSuite
     complete(taskSets(0), Seq(
         (Success, makeMapStatus("hostA", 1)),
         (Success, makeMapStatus("hostB", 1))))
-    assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
-           Array(makeBlockManagerId("hostA"), makeBlockManagerId("hostB")))
+    assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet ===
+      HashSet(makeBlockManagerId("hostA"), makeBlockManagerId("hostB")))
     complete(taskSets(1), Seq((Success, 42)))
     assert(results === Map(0 -> 42))
     assertDataStructuresEmpty()
@@ -510,8 +510,8 @@ class DAGSchedulerSuite
     // have the 2nd attempt pass
     complete(taskSets(2), Seq((Success, makeMapStatus("hostA", reduceRdd.partitions.size))))
     // we can see both result blocks now
-    assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1.host) ===
-      Array("hostA", "hostB"))
+    assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1.host).toSet ===
+      HashSet("hostA", "hostB"))
     complete(taskSets(3), Seq((Success, 43)))
     assert(results === Map(0 -> 42, 1 -> 43))
     assertDataStructuresEmpty()
@@ -527,8 +527,8 @@ class DAGSchedulerSuite
       (Success, makeMapStatus("hostA", reduceRdd.partitions.size)),
       (Success, makeMapStatus("hostB", reduceRdd.partitions.size))))
     // The MapOutputTracker should know about both map output locations.
-    assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1.host) ===
-      Array("hostA", "hostB"))
+    assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1.host).toSet ===
+      HashSet("hostA", "hostB"))
 
     // The first result task fails, with a fetch failure for the output from the first mapper.
     runEvent(CompletionEvent(
@@ -577,10 +577,10 @@ class DAGSchedulerSuite
       (Success, makeMapStatus("hostA", 2)),
       (Success, makeMapStatus("hostB", 2))))
     // The MapOutputTracker should know about both map output locations.
-    assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1.host) ===
-      Array("hostA", "hostB"))
-    assert(mapOutputTracker.getServerStatuses(shuffleId, 1).map(_._1.host) ===
-      Array("hostA", "hostB"))
+    assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1.host).toSet ===
+      HashSet("hostA", "hostB"))
+    assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 1).map(_._1.host).toSet ===
+      HashSet("hostA", "hostB"))
 
     // The first result task fails, with a fetch failure for the output from the first mapper.
     runEvent(CompletionEvent(
@@ -713,8 +713,8 @@ class DAGSchedulerSuite
     taskSet.tasks(1).epoch = newEpoch
     runEvent(CompletionEvent(taskSet.tasks(1), Success, makeMapStatus("hostA",
       reduceRdd.partitions.size), null, createFakeTaskInfo(), null))
-    assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
-           Array(makeBlockManagerId("hostB"), makeBlockManagerId("hostA")))
+    assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet ===
+           HashSet(makeBlockManagerId("hostB"), makeBlockManagerId("hostA")))
     complete(taskSets(1), Seq((Success, 42), (Success, 43)))
     assert(results === Map(0 -> 42, 1 -> 43))
     assertDataStructuresEmpty()
@@ -809,8 +809,8 @@ class DAGSchedulerSuite
        (Success, makeMapStatus("hostB", 1))))
     // have hostC complete the resubmitted task
     complete(taskSets(1), Seq((Success, makeMapStatus("hostC", 1))))
-    assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
-           Array(makeBlockManagerId("hostC"), makeBlockManagerId("hostB")))
+    assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet ===
+           HashSet(makeBlockManagerId("hostC"), makeBlockManagerId("hostB")))
     complete(taskSets(2), Seq((Success, 42)))
     assert(results === Map(0 -> 42))
     assertDataStructuresEmpty()
@@ -981,8 +981,8 @@ class DAGSchedulerSuite
     submit(reduceRdd, Array(0))
     complete(taskSets(0), Seq(
         (Success, makeMapStatus("hostA", 1))))
-    assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) ===
-           Array(makeBlockManagerId("hostA")))
+    assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet ===
+           HashSet(makeBlockManagerId("hostA")))
 
     // Reducer should run on the same host that map task ran
     val reduceTaskSet = taskSets(1)
diff --git a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala
index 28ca68698e3dc..6c9cb448e7833 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala
@@ -115,11 +115,15 @@ class HashShuffleReaderSuite extends SparkFunSuite with LocalSparkContext {
     // Make a mocked MapOutputTracker for the shuffle reader to use to determine what
     // shuffle data to read.
     val mapOutputTracker = mock(classOf[MapOutputTracker])
-    // Test a scenario where all data is local, just to avoid creating a bunch of additional mocks
-    // for the code to read data over the network.
-    val statuses: Array[(BlockManagerId, Long)] =
-      Array.fill(numMaps)((localBlockManagerId, byteOutputStream.size().toLong))
-    when(mapOutputTracker.getServerStatuses(shuffleId, reduceId)).thenReturn(statuses)
+    when(mapOutputTracker.getMapSizesByExecutorId(shuffleId, reduceId)).thenReturn {
+      // Test a scenario where all data is local, to avoid creating a bunch of additional mocks
+      // for the code to read data over the network.
+      val shuffleBlockIdsAndSizes = (0 until numMaps).map { mapId =>
+        val shuffleBlockId = ShuffleBlockId(shuffleId, mapId, reduceId)
+        (shuffleBlockId, byteOutputStream.size().toLong)
+      }
+      Seq((localBlockManagerId, shuffleBlockIdsAndSizes))
+    }
 
     // Create a mocked shuffle handle to pass into HashShuffleReader.
     val shuffleHandle = {
diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
index 9ced4148d7206..64f3fbdcebed9 100644
--- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
@@ -33,6 +33,7 @@ import org.apache.spark.{SparkFunSuite, TaskContextImpl}
 import org.apache.spark.network._
 import org.apache.spark.network.buffer.ManagedBuffer
 import org.apache.spark.network.shuffle.BlockFetchingListener
+import org.apache.spark.shuffle.FetchFailedException
 
 
 class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodTester {
@@ -106,13 +107,11 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT
     for (i <- 0 until 5) {
       assert(iterator.hasNext, s"iterator should have 5 elements but actually has $i elements")
       val (blockId, inputStream) = iterator.next()
-      assert(inputStream.isSuccess,
-        s"iterator should have 5 elements defined but actually has $i elements")
 
       // Make sure we release buffers when a wrapped input stream is closed.
       val mockBuf = localBlocks.getOrElse(blockId, remoteBlocks(blockId))
       // Note: ShuffleBlockFetcherIterator wraps input streams in a BufferReleasingInputStream
-      val wrappedInputStream = inputStream.get.asInstanceOf[BufferReleasingInputStream]
+      val wrappedInputStream = inputStream.asInstanceOf[BufferReleasingInputStream]
       verify(mockBuf, times(0)).release()
       val delegateAccess = PrivateMethod[InputStream]('delegate)
 
@@ -175,11 +174,11 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT
       48 * 1024 * 1024)
 
     verify(blocks(ShuffleBlockId(0, 0, 0)), times(0)).release()
-    iterator.next()._2.get.close() // close() first block's input stream
+    iterator.next()._2.close() // close() first block's input stream
     verify(blocks(ShuffleBlockId(0, 0, 0)), times(1)).release()
 
     // Get the 2nd block but do not exhaust the iterator
-    val subIter = iterator.next()._2.get
+    val subIter = iterator.next()._2
 
     // Complete the task; then the 2nd block buffer should be exhausted
     verify(blocks(ShuffleBlockId(0, 1, 0)), times(0)).release()
@@ -239,9 +238,10 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT
     // Continue only after the mock calls onBlockFetchFailure
     sem.acquire()
 
-    // The first block should be defined, and the last two are not defined (due to failure)
-    assert(iterator.next()._2.isSuccess)
-    assert(iterator.next()._2.isFailure)
-    assert(iterator.next()._2.isFailure)
+    // The first block should be returned without an exception, and the last two should throw
+    // FetchFailedExceptions (due to failure)
+    iterator.next()
+    intercept[FetchFailedException] { iterator.next() }
+    intercept[FetchFailedException] { iterator.next() }
   }
 }
diff --git a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala
index 6c40685484ed4..61601016e005e 100644
--- a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.util
 
+import scala.collection.mutable.ArrayBuffer
+
 import java.util.concurrent.TimeoutException
 
 import akka.actor.ActorNotFound
@@ -24,7 +26,7 @@ import akka.actor.ActorNotFound
 import org.apache.spark._
 import org.apache.spark.rpc.RpcEnv
 import org.apache.spark.scheduler.MapStatus
-import org.apache.spark.storage.BlockManagerId
+import org.apache.spark.storage.{BlockManagerId, ShuffleBlockId}
 import org.apache.spark.SSLSampleConfigs._
 
 
@@ -107,8 +109,9 @@ class AkkaUtilsSuite extends SparkFunSuite with LocalSparkContext with ResetSyst
     slaveTracker.updateEpoch(masterTracker.getEpoch)
 
     // this should succeed since security off
-    assert(slaveTracker.getServerStatuses(10, 0).toSeq ===
-           Seq((BlockManagerId("a", "hostA", 1000), size1000)))
+    assert(slaveTracker.getMapSizesByExecutorId(10, 0).toSeq ===
+           Seq((BlockManagerId("a", "hostA", 1000),
+             ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000)))))
 
     rpcEnv.shutdown()
     slaveRpcEnv.shutdown()
@@ -153,8 +156,9 @@ class AkkaUtilsSuite extends SparkFunSuite with LocalSparkContext with ResetSyst
     slaveTracker.updateEpoch(masterTracker.getEpoch)
 
     // this should succeed since security on and passwords match
-    assert(slaveTracker.getServerStatuses(10, 0).toSeq ===
-           Seq((BlockManagerId("a", "hostA", 1000), size1000)))
+    assert(slaveTracker.getMapSizesByExecutorId(10, 0) ===
+           Seq((BlockManagerId("a", "hostA", 1000),
+             ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000)))))
 
     rpcEnv.shutdown()
     slaveRpcEnv.shutdown()
@@ -232,8 +236,8 @@ class AkkaUtilsSuite extends SparkFunSuite with LocalSparkContext with ResetSyst
     slaveTracker.updateEpoch(masterTracker.getEpoch)
 
     // this should succeed since security off
-    assert(slaveTracker.getServerStatuses(10, 0).toSeq ===
-      Seq((BlockManagerId("a", "hostA", 1000), size1000)))
+    assert(slaveTracker.getMapSizesByExecutorId(10, 0) ===
+      Seq((BlockManagerId("a", "hostA", 1000), ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000)))))
 
     rpcEnv.shutdown()
     slaveRpcEnv.shutdown()
@@ -278,8 +282,8 @@ class AkkaUtilsSuite extends SparkFunSuite with LocalSparkContext with ResetSyst
     masterTracker.incrementEpoch()
     slaveTracker.updateEpoch(masterTracker.getEpoch)
 
-    assert(slaveTracker.getServerStatuses(10, 0).toSeq ===
-      Seq((BlockManagerId("a", "hostA", 1000), size1000)))
+    assert(slaveTracker.getMapSizesByExecutorId(10, 0) ===
+      Seq((BlockManagerId("a", "hostA", 1000), ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000)))))
 
     rpcEnv.shutdown()
     slaveRpcEnv.shutdown()

From f5b6dc5e3e7e3b586096b71164f052318b840e8a Mon Sep 17 00:00:00 2001
From: Michael Allman <michael@videoamp.com>
Date: Tue, 21 Jul 2015 11:14:31 +0100
Subject: [PATCH 296/549] [SPARK-8401] [BUILD] Scala version switching build
 enhancements

These commits address a few minor issues in the Scala cross-version support in the build:

  1. Correct two missing `${scala.binary.version}` pom file substitutions.
  2. Don't update `scala.binary.version` in parent POM. This property is set through profiles.
  3. Update the source of the generated scaladocs in `docs/_plugins/copy_api_dirs.rb`.
  4. Factor common code out of `dev/change-version-to-*.sh` and add some validation. We also test `sed` to see if it's GNU sed and try `gsed` as an alternative if not. This prevents the script from running with a non-GNU sed.

This is my original work and I license this work to the Spark project under the Apache License.

Author: Michael Allman <michael@videoamp.com>

Closes #6832 from mallman/scala-versions and squashes the following commits:

cde2f17 [Michael Allman] Delete dev/change-version-to-*.sh, replacing them with single dev/change-scala-version.sh script that takes a version as argument
02296f2 [Michael Allman] Make the scala version change scripts cross-platform by restricting ourselves to POSIX sed syntax instead of looking for GNU sed
ad9b40a [Michael Allman] Factor change-scala-version.sh out of change-version-to-*.sh, adding command line argument validation and testing for GNU sed
bdd20bf [Michael Allman] Update source of scaladocs when changing Scala version
475088e [Michael Allman] Replace jackson-module-scala_2.10 with jackson-module-scala_${scala.binary.version}
---
 core/pom.xml                         |  2 +-
 dev/change-scala-version.sh          | 66 ++++++++++++++++++++++++++++
 dev/change-version-to-2.10.sh        | 26 -----------
 dev/change-version-to-2.11.sh        | 26 -----------
 dev/create-release/create-release.sh |  6 +--
 docs/building-spark.md               |  2 +-
 pom.xml                              |  2 +-
 7 files changed, 72 insertions(+), 58 deletions(-)
 create mode 100755 dev/change-scala-version.sh
 delete mode 100755 dev/change-version-to-2.10.sh
 delete mode 100755 dev/change-version-to-2.11.sh

diff --git a/core/pom.xml b/core/pom.xml
index 73f7a75cab9d3..95f36eb348698 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -261,7 +261,7 @@
     </dependency>
     <dependency>
       <groupId>com.fasterxml.jackson.module</groupId>
-      <artifactId>jackson-module-scala_2.10</artifactId>
+      <artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
     </dependency>
     <dependency>
       <groupId>org.apache.derby</groupId>
diff --git a/dev/change-scala-version.sh b/dev/change-scala-version.sh
new file mode 100755
index 0000000000000..b81c00c9d6d9d
--- /dev/null
+++ b/dev/change-scala-version.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+usage() {
+  echo "Usage: $(basename $0) <version>" 1>&2
+  exit 1
+}
+
+if [ $# -ne 1 ]; then
+  usage
+fi
+
+TO_VERSION=$1
+
+VALID_VERSIONS=( 2.10 2.11 )
+
+check_scala_version() {
+  for i in ${VALID_VERSIONS[*]}; do [ $i = "$1" ] && return 0; done
+  echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2
+  exit 1
+}
+
+check_scala_version "$TO_VERSION"
+
+if [ $TO_VERSION = "2.11" ]; then
+  FROM_VERSION="2.10"
+else
+  FROM_VERSION="2.11"
+fi
+
+sed_i() {
+  sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2"
+}
+
+export -f sed_i
+
+BASEDIR=$(dirname $0)/..
+find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \
+  -exec bash -c "sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' {}" \;
+
+# Also update <scala.binary.version> in parent POM
+# Match any scala binary version to ensure idempotency
+sed_i '1,/<scala\.binary\.version>[0-9]*\.[0-9]*</s/<scala\.binary\.version>[0-9]*\.[0-9]*</<scala.binary.version>'$TO_VERSION'</' \
+  "$BASEDIR/pom.xml"
+
+# Update source of scaladocs
+echo "$BASEDIR/docs/_plugins/copy_api_dirs.rb"
+sed_i 's/scala\-'$FROM_VERSION'/scala\-'$TO_VERSION'/' "$BASEDIR/docs/_plugins/copy_api_dirs.rb"
diff --git a/dev/change-version-to-2.10.sh b/dev/change-version-to-2.10.sh
deleted file mode 100755
index c4adb1f96b7d3..0000000000000
--- a/dev/change-version-to-2.10.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Note that this will not necessarily work as intended with non-GNU sed (e.g. OS X)
-BASEDIR=$(dirname $0)/..
-find $BASEDIR -name 'pom.xml' | grep -v target \
-  | xargs -I {} sed -i -e 's/\(artifactId.*\)_2.11/\1_2.10/g' {}
-
-# Also update <scala.binary.version> in parent POM
-sed -i -e '0,/<scala\.binary\.version>2.11</s//<scala.binary.version>2.10</' $BASEDIR/pom.xml
diff --git a/dev/change-version-to-2.11.sh b/dev/change-version-to-2.11.sh
deleted file mode 100755
index d370019dec34d..0000000000000
--- a/dev/change-version-to-2.11.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Note that this will not necessarily work as intended with non-GNU sed (e.g. OS X)
-BASEDIR=$(dirname $0)/..
-find $BASEDIR -name 'pom.xml' | grep -v target \
-  | xargs -I {} sed -i -e 's/\(artifactId.*\)_2.10/\1_2.11/g' {}
-
-# Also update <scala.binary.version> in parent POM
-sed -i -e '0,/<scala\.binary\.version>2.10</s//<scala.binary.version>2.11</' $BASEDIR/pom.xml
diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh
index 30190dcd41ec5..86a7a4068c40e 100755
--- a/dev/create-release/create-release.sh
+++ b/dev/create-release/create-release.sh
@@ -122,13 +122,13 @@ if [[ ! "$@" =~ --skip-publish ]]; then
     -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
     clean install
 
-  ./dev/change-version-to-2.11.sh
+  ./dev/change-scala-version.sh 2.11
   
   build/mvn -DskipTests -Pyarn -Phive -Prelease\
     -Dscala-2.11 -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
     clean install
 
-  ./dev/change-version-to-2.10.sh
+  ./dev/change-scala-version.sh 2.10
 
   pushd $SPARK_REPO
 
@@ -205,7 +205,7 @@ if [[ ! "$@" =~ --skip-package ]]; then
 
     # TODO There should probably be a flag to make-distribution to allow 2.11 support
     if [[ $FLAGS == *scala-2.11* ]]; then
-      ./dev/change-version-to-2.11.sh
+      ./dev/change-scala-version.sh 2.11
     fi
 
     export ZINC_PORT=$ZINC_PORT
diff --git a/docs/building-spark.md b/docs/building-spark.md
index 2128fdffecc05..a5da3b39502e2 100644
--- a/docs/building-spark.md
+++ b/docs/building-spark.md
@@ -124,7 +124,7 @@ mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -Phive -Phive-thriftserver -Dskip
 # Building for Scala 2.11
 To produce a Spark package compiled with Scala 2.11, use the `-Dscala-2.11` property:
 
-    dev/change-version-to-2.11.sh
+    dev/change-scala-version.sh 2.11
     mvn -Pyarn -Phadoop-2.4 -Dscala-2.11 -DskipTests clean package
 
 Spark does not yet support its JDBC component for Scala 2.11.
diff --git a/pom.xml b/pom.xml
index 2de0c35fbd51a..1f44dc8abe1d4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -614,7 +614,7 @@
            15.0, which causes runtime incompatibility issues. -->
       <dependency>
         <groupId>com.fasterxml.jackson.module</groupId>
-        <artifactId>jackson-module-scala_2.10</artifactId>
+        <artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
         <version>${fasterxml.jackson.version}</version>
         <exclusions>
           <exclusion>

From be5c5d3741256697cc76938a8ed6f609eb2d4b11 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Tue, 21 Jul 2015 08:25:50 -0700
Subject: [PATCH 297/549] [SPARK-9081] [SPARK-9168] [SQL] nanvl & dropna/fillna
 supporting nan as well

JIRA:
https://issues.apache.org/jira/browse/SPARK-9081
https://issues.apache.org/jira/browse/SPARK-9168

This PR target at two modifications:
1.  Change `isNaN` to return `false` on `null` input
2.  Make `dropna` and `fillna` to fill/drop NaN values as well
3.  Implement `nanvl`

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7523 from yjshen/fillna_dropna and squashes the following commits:

f0a51db [Yijie Shen] make coalesce untouched and implement nanvl
1d3e35f [Yijie Shen] make Coalesce aware of NaN in order to support fillna
2760cbc [Yijie Shen] change isNaN(null) to false as well as implement dropna
---
 .../catalyst/analysis/FunctionRegistry.scala  |   1 +
 .../catalyst/expressions/nullFunctions.scala  | 104 ++++++++++++++----
 .../sql/catalyst/expressions/predicates.scala |   5 +-
 .../expressions/NullFunctionsSuite.scala      |  39 ++++++-
 .../scala/org/apache/spark/sql/Column.scala   |   2 +-
 .../spark/sql/DataFrameNaFunctions.scala      |  52 +++++----
 .../org/apache/spark/sql/functions.scala      |  13 ++-
 .../spark/sql/ColumnExpressionSuite.scala     |  25 ++++-
 .../spark/sql/DataFrameNaFunctionsSuite.scala |  69 ++++++------
 9 files changed, 222 insertions(+), 88 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 13523720daff0..e3d8d2adf2135 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -89,6 +89,7 @@ object FunctionRegistry {
     expression[CreateStruct]("struct"),
     expression[CreateNamedStruct]("named_struct"),
     expression[Sqrt]("sqrt"),
+    expression[NaNvl]("nanvl"),
 
     // math functions
     expression[Acos]("acos"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
index 98c67084642e3..287718fab7f0d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullFunctions.scala
@@ -83,7 +83,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression {
 
 
 /**
- * Evaluates to `true` if it's NaN or null
+ * Evaluates to `true` iff it's NaN.
  */
 case class IsNaN(child: Expression) extends UnaryExpression
   with Predicate with ImplicitCastInputTypes {
@@ -95,7 +95,7 @@ case class IsNaN(child: Expression) extends UnaryExpression
   override def eval(input: InternalRow): Any = {
     val value = child.eval(input)
     if (value == null) {
-      true
+      false
     } else {
       child.dataType match {
         case DoubleType => value.asInstanceOf[Double].isNaN
@@ -107,26 +107,65 @@ case class IsNaN(child: Expression) extends UnaryExpression
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val eval = child.gen(ctx)
     child.dataType match {
-      case FloatType =>
+      case DoubleType | FloatType =>
         s"""
           ${eval.code}
           boolean ${ev.isNull} = false;
           ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-          if (${eval.isNull}) {
-            ${ev.primitive} = true;
-          } else {
-            ${ev.primitive} = Float.isNaN(${eval.primitive});
-          }
+          ${ev.primitive} = !${eval.isNull} && Double.isNaN(${eval.primitive});
         """
-      case DoubleType =>
+    }
+  }
+}
+
+/**
+ * An Expression evaluates to `left` iff it's not NaN, or evaluates to `right` otherwise.
+ * This Expression is useful for mapping NaN values to null.
+ */
+case class NaNvl(left: Expression, right: Expression)
+    extends BinaryExpression with ImplicitCastInputTypes {
+
+  override def dataType: DataType = left.dataType
+
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(TypeCollection(DoubleType, FloatType), TypeCollection(DoubleType, FloatType))
+
+  override def eval(input: InternalRow): Any = {
+    val value = left.eval(input)
+    if (value == null) {
+      null
+    } else {
+      left.dataType match {
+        case DoubleType =>
+          if (!value.asInstanceOf[Double].isNaN) value else right.eval(input)
+        case FloatType =>
+          if (!value.asInstanceOf[Float].isNaN) value else right.eval(input)
+      }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val leftGen = left.gen(ctx)
+    val rightGen = right.gen(ctx)
+    left.dataType match {
+      case DoubleType | FloatType =>
         s"""
-          ${eval.code}
+          ${leftGen.code}
           boolean ${ev.isNull} = false;
           ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-          if (${eval.isNull}) {
-            ${ev.primitive} = true;
+          if (${leftGen.isNull}) {
+            ${ev.isNull} = true;
           } else {
-            ${ev.primitive} = Double.isNaN(${eval.primitive});
+            if (!Double.isNaN(${leftGen.primitive})) {
+              ${ev.primitive} = ${leftGen.primitive};
+            } else {
+              ${rightGen.code}
+              if (${rightGen.isNull}) {
+                ${ev.isNull} = true;
+              } else {
+                ${ev.primitive} = ${rightGen.primitive};
+              }
+            }
           }
         """
     }
@@ -186,8 +225,15 @@ case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate
     var numNonNulls = 0
     var i = 0
     while (i < childrenArray.length && numNonNulls < n) {
-      if (childrenArray(i).eval(input) != null) {
-        numNonNulls += 1
+      val evalC = childrenArray(i).eval(input)
+      if (evalC != null) {
+        childrenArray(i).dataType match {
+          case DoubleType =>
+            if (!evalC.asInstanceOf[Double].isNaN) numNonNulls += 1
+          case FloatType =>
+            if (!evalC.asInstanceOf[Float].isNaN) numNonNulls += 1
+          case _ => numNonNulls += 1
+        }
       }
       i += 1
     }
@@ -198,14 +244,26 @@ case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate
     val nonnull = ctx.freshName("nonnull")
     val code = children.map { e =>
       val eval = e.gen(ctx)
-      s"""
-        if ($nonnull < $n) {
-          ${eval.code}
-          if (!${eval.isNull}) {
-            $nonnull += 1;
-          }
-        }
-      """
+      e.dataType match {
+        case DoubleType | FloatType =>
+          s"""
+            if ($nonnull < $n) {
+              ${eval.code}
+              if (!${eval.isNull} && !Double.isNaN(${eval.primitive})) {
+                $nonnull += 1;
+              }
+            }
+          """
+        case _ =>
+          s"""
+            if ($nonnull < $n) {
+              ${eval.code}
+              if (!${eval.isNull}) {
+                $nonnull += 1;
+              }
+            }
+          """
+      }
     }.mkString("\n")
     s"""
       int $nonnull = 0;
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index a53ec31ee6a4b..3f1bd2a925fe7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenFallback, GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.util.TypeUtils
-import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
@@ -121,7 +121,6 @@ case class InSet(child: Expression, hset: Set[Any])
   }
 }
 
-
 case class And(left: Expression, right: Expression) extends BinaryOperator with Predicate {
 
   override def inputType: AbstractDataType = BooleanType
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
index 765cc7a969b5d..0728f6695c39d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
@@ -49,12 +49,22 @@ class NullFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(IsNaN(Literal(Double.NaN)), true)
     checkEvaluation(IsNaN(Literal(Float.NaN)), true)
     checkEvaluation(IsNaN(Literal(math.log(-3))), true)
-    checkEvaluation(IsNaN(Literal.create(null, DoubleType)), true)
+    checkEvaluation(IsNaN(Literal.create(null, DoubleType)), false)
     checkEvaluation(IsNaN(Literal(Double.PositiveInfinity)), false)
     checkEvaluation(IsNaN(Literal(Float.MaxValue)), false)
     checkEvaluation(IsNaN(Literal(5.5f)), false)
   }
 
+  test("nanvl") {
+    checkEvaluation(NaNvl(Literal(5.0), Literal.create(null, DoubleType)), 5.0)
+    checkEvaluation(NaNvl(Literal.create(null, DoubleType), Literal(5.0)), null)
+    checkEvaluation(NaNvl(Literal.create(null, DoubleType), Literal(Double.NaN)), null)
+    checkEvaluation(NaNvl(Literal(Double.NaN), Literal(5.0)), 5.0)
+    checkEvaluation(NaNvl(Literal(Double.NaN), Literal.create(null, DoubleType)), null)
+    assert(NaNvl(Literal(Double.NaN), Literal(Double.NaN)).
+      eval(EmptyRow).asInstanceOf[Double].isNaN)
+  }
+
   test("coalesce") {
     testAllTypes { (value: Any, tpe: DataType) =>
       val lit = Literal.create(value, tpe)
@@ -66,4 +76,31 @@ class NullFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       checkEvaluation(Coalesce(Seq(nullLit, nullLit, lit)), value)
     }
   }
+
+  test("AtLeastNNonNulls") {
+    val mix = Seq(Literal("x"),
+      Literal.create(null, StringType),
+      Literal.create(null, DoubleType),
+      Literal(Double.NaN),
+      Literal(5f))
+
+    val nanOnly = Seq(Literal("x"),
+      Literal(10.0),
+      Literal(Float.NaN),
+      Literal(math.log(-2)),
+      Literal(Double.MaxValue))
+
+    val nullOnly = Seq(Literal("x"),
+      Literal.create(null, DoubleType),
+      Literal.create(null, DecimalType.Unlimited),
+      Literal(Float.MaxValue),
+      Literal(false))
+
+    checkEvaluation(AtLeastNNonNulls(2, mix), true, EmptyRow)
+    checkEvaluation(AtLeastNNonNulls(3, mix), false, EmptyRow)
+    checkEvaluation(AtLeastNNonNulls(3, nanOnly), true, EmptyRow)
+    checkEvaluation(AtLeastNNonNulls(4, nanOnly), false, EmptyRow)
+    checkEvaluation(AtLeastNNonNulls(3, nullOnly), true, EmptyRow)
+    checkEvaluation(AtLeastNNonNulls(4, nullOnly), false, EmptyRow)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index 221cd04c6d288..6e2a6525bf17e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -401,7 +401,7 @@ class Column(protected[sql] val expr: Expression) extends Logging {
   }
 
   /**
-   * True if the current expression is NaN or null
+   * True if the current expression is NaN.
    *
    * @group expr_ops
    * @since 1.5.0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala
index 8681a56c82f1e..a4fd4cf3b330b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala
@@ -37,24 +37,24 @@ import org.apache.spark.sql.types._
 final class DataFrameNaFunctions private[sql](df: DataFrame) {
 
   /**
-   * Returns a new [[DataFrame]] that drops rows containing any null values.
+   * Returns a new [[DataFrame]] that drops rows containing any null or NaN values.
    *
    * @since 1.3.1
    */
   def drop(): DataFrame = drop("any", df.columns)
 
   /**
-   * Returns a new [[DataFrame]] that drops rows containing null values.
+   * Returns a new [[DataFrame]] that drops rows containing null or NaN values.
    *
-   * If `how` is "any", then drop rows containing any null values.
-   * If `how` is "all", then drop rows only if every column is null for that row.
+   * If `how` is "any", then drop rows containing any null or NaN values.
+   * If `how` is "all", then drop rows only if every column is null or NaN for that row.
    *
    * @since 1.3.1
    */
   def drop(how: String): DataFrame = drop(how, df.columns)
 
   /**
-   * Returns a new [[DataFrame]] that drops rows containing any null values
+   * Returns a new [[DataFrame]] that drops rows containing any null or NaN values
    * in the specified columns.
    *
    * @since 1.3.1
@@ -62,7 +62,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
   def drop(cols: Array[String]): DataFrame = drop(cols.toSeq)
 
   /**
-   * (Scala-specific) Returns a new [[DataFrame ]] that drops rows containing any null values
+   * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing any null or NaN values
    * in the specified columns.
    *
    * @since 1.3.1
@@ -70,22 +70,22 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
   def drop(cols: Seq[String]): DataFrame = drop(cols.size, cols)
 
   /**
-   * Returns a new [[DataFrame]] that drops rows containing null values
+   * Returns a new [[DataFrame]] that drops rows containing null or NaN values
    * in the specified columns.
    *
-   * If `how` is "any", then drop rows containing any null values in the specified columns.
-   * If `how` is "all", then drop rows only if every specified column is null for that row.
+   * If `how` is "any", then drop rows containing any null or NaN values in the specified columns.
+   * If `how` is "all", then drop rows only if every specified column is null or NaN for that row.
    *
    * @since 1.3.1
    */
   def drop(how: String, cols: Array[String]): DataFrame = drop(how, cols.toSeq)
 
   /**
-   * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing null values
+   * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing null or NaN values
    * in the specified columns.
    *
-   * If `how` is "any", then drop rows containing any null values in the specified columns.
-   * If `how` is "all", then drop rows only if every specified column is null for that row.
+   * If `how` is "any", then drop rows containing any null or NaN values in the specified columns.
+   * If `how` is "all", then drop rows only if every specified column is null or NaN for that row.
    *
    * @since 1.3.1
    */
@@ -98,15 +98,16 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
   }
 
   /**
-   * Returns a new [[DataFrame]] that drops rows containing less than `minNonNulls` non-null values.
+   * Returns a new [[DataFrame]] that drops rows containing
+   * less than `minNonNulls` non-null and non-NaN values.
    *
    * @since 1.3.1
    */
   def drop(minNonNulls: Int): DataFrame = drop(minNonNulls, df.columns)
 
   /**
-   * Returns a new [[DataFrame]] that drops rows containing less than `minNonNulls` non-null
-   * values in the specified columns.
+   * Returns a new [[DataFrame]] that drops rows containing
+   * less than `minNonNulls` non-null and non-NaN values in the specified columns.
    *
    * @since 1.3.1
    */
@@ -114,32 +115,33 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
 
   /**
    * (Scala-specific) Returns a new [[DataFrame]] that drops rows containing less than
-   * `minNonNulls` non-null values in the specified columns.
+   * `minNonNulls` non-null and non-NaN values in the specified columns.
    *
    * @since 1.3.1
    */
   def drop(minNonNulls: Int, cols: Seq[String]): DataFrame = {
-    // Filtering condition -- only keep the row if it has at least `minNonNulls` non-null values.
+    // Filtering condition:
+    // only keep the row if it has at least `minNonNulls` non-null and non-NaN values.
     val predicate = AtLeastNNonNulls(minNonNulls, cols.map(name => df.resolve(name)))
     df.filter(Column(predicate))
   }
 
   /**
-   * Returns a new [[DataFrame]] that replaces null values in numeric columns with `value`.
+   * Returns a new [[DataFrame]] that replaces null or NaN values in numeric columns with `value`.
    *
    * @since 1.3.1
    */
   def fill(value: Double): DataFrame = fill(value, df.columns)
 
   /**
-   * Returns a new [[DataFrame ]] that replaces null values in string columns with `value`.
+   * Returns a new [[DataFrame]] that replaces null values in string columns with `value`.
    *
    * @since 1.3.1
    */
   def fill(value: String): DataFrame = fill(value, df.columns)
 
   /**
-   * Returns a new [[DataFrame]] that replaces null values in specified numeric columns.
+   * Returns a new [[DataFrame]] that replaces null or NaN values in specified numeric columns.
    * If a specified column is not a numeric column, it is ignored.
    *
    * @since 1.3.1
@@ -147,7 +149,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
   def fill(value: Double, cols: Array[String]): DataFrame = fill(value, cols.toSeq)
 
   /**
-   * (Scala-specific) Returns a new [[DataFrame]] that replaces null values in specified
+   * (Scala-specific) Returns a new [[DataFrame]] that replaces null or NaN values in specified
    * numeric columns. If a specified column is not a numeric column, it is ignored.
    *
    * @since 1.3.1
@@ -391,7 +393,13 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
    * Returns a [[Column]] expression that replaces null value in `col` with `replacement`.
    */
   private def fillCol[T](col: StructField, replacement: T): Column = {
-    coalesce(df.col("`" + col.name + "`"), lit(replacement).cast(col.dataType)).as(col.name)
+    col.dataType match {
+      case DoubleType | FloatType =>
+        coalesce(nanvl(df.col("`" + col.name + "`"), lit(null)),
+          lit(replacement).cast(col.dataType)).as(col.name)
+      case _ =>
+        coalesce(df.col("`" + col.name + "`"), lit(replacement).cast(col.dataType)).as(col.name)
+    }
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 60b089180c876..d94d7335828c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -595,7 +595,7 @@ object functions {
   }
 
   /**
-   * Returns the first column that is not null.
+   * Returns the first column that is not null and not NaN.
    * {{{
    *   df.select(coalesce(df("a"), df("b")))
    * }}}
@@ -612,7 +612,7 @@ object functions {
   def explode(e: Column): Column = Explode(e.expr)
 
   /**
-   * Return true if the column is NaN or null
+   * Return true iff the column is NaN.
    *
    * @group normal_funcs
    * @since 1.5.0
@@ -636,6 +636,15 @@ object functions {
    */
   def monotonicallyIncreasingId(): Column = execution.expressions.MonotonicallyIncreasingID()
 
+  /**
+   * Return an alternative value `r` if `l` is NaN.
+   * This function is useful for mapping NaN values to null.
+   *
+   * @group normal_funcs
+   * @since 1.5.0
+   */
+  def nanvl(l: Column, r: Column): Column = NaNvl(l.expr, r.expr)
+
   /**
    * Unary minus, i.e. negate the expression.
    * {{{
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index 6bd5804196853..1f9f7118c3f04 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -211,15 +211,34 @@ class ColumnExpressionSuite extends QueryTest {
 
     checkAnswer(
       testData.select($"a".isNaN, $"b".isNaN),
-      Row(true, true) :: Row(true, true) :: Row(true, true) :: Row(false, false) :: Nil)
+      Row(true, true) :: Row(true, true) :: Row(false, false) :: Row(false, false) :: Nil)
 
     checkAnswer(
       testData.select(isNaN($"a"), isNaN($"b")),
-      Row(true, true) :: Row(true, true) :: Row(true, true) :: Row(false, false) :: Nil)
+      Row(true, true) :: Row(true, true) :: Row(false, false) :: Row(false, false) :: Nil)
 
     checkAnswer(
       ctx.sql("select isnan(15), isnan('invalid')"),
-      Row(false, true))
+      Row(false, false))
+  }
+
+  test("nanvl") {
+    val testData = ctx.createDataFrame(ctx.sparkContext.parallelize(
+      Row(null, 3.0, Double.NaN, Double.PositiveInfinity) :: Nil),
+      StructType(Seq(StructField("a", DoubleType), StructField("b", DoubleType),
+        StructField("c", DoubleType), StructField("d", DoubleType))))
+
+    checkAnswer(
+      testData.select(
+        nanvl($"a", lit(5)), nanvl($"b", lit(10)),
+        nanvl($"c", lit(null).cast(DoubleType)), nanvl($"d", lit(10))),
+      Row(null, 3.0, null, Double.PositiveInfinity)
+    )
+    testData.registerTempTable("t")
+    checkAnswer(
+      ctx.sql("select nanvl(a, 5), nanvl(b, 10), nanvl(c, null), nanvl(d, 10) from t"),
+      Row(null, 3.0, null, Double.PositiveInfinity)
+    )
   }
 
   test("===") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala
index 495701d4f616c..dbe3b44ee2c79 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala
@@ -30,8 +30,10 @@ class DataFrameNaFunctionsSuite extends QueryTest {
       ("Bob", 16, 176.5),
       ("Alice", null, 164.3),
       ("David", 60, null),
+      ("Nina", 25, Double.NaN),
       ("Amy", null, null),
-      (null, null, null)).toDF("name", "age", "height")
+      (null, null, null)
+      ).toDF("name", "age", "height")
   }
 
   test("drop") {
@@ -39,12 +41,12 @@ class DataFrameNaFunctionsSuite extends QueryTest {
     val rows = input.collect()
 
     checkAnswer(
-      input.na.drop("name" :: Nil),
-      rows(0) :: rows(1) :: rows(2) :: rows(3) :: Nil)
+      input.na.drop("name" :: Nil).select("name"),
+      Row("Bob") :: Row("Alice") :: Row("David") :: Row("Nina") :: Row("Amy") :: Nil)
 
     checkAnswer(
-      input.na.drop("age" :: Nil),
-      rows(0) :: rows(2) :: Nil)
+      input.na.drop("age" :: Nil).select("name"),
+      Row("Bob") :: Row("David") :: Row("Nina") :: Nil)
 
     checkAnswer(
       input.na.drop("age" :: "height" :: Nil),
@@ -67,8 +69,8 @@ class DataFrameNaFunctionsSuite extends QueryTest {
     val rows = input.collect()
 
     checkAnswer(
-      input.na.drop("all"),
-      rows(0) :: rows(1) :: rows(2) :: rows(3) :: Nil)
+      input.na.drop("all").select("name"),
+      Row("Bob") :: Row("Alice") :: Row("David") :: Row("Nina") :: Row("Amy") :: Nil)
 
     checkAnswer(
       input.na.drop("any"),
@@ -79,8 +81,8 @@ class DataFrameNaFunctionsSuite extends QueryTest {
       rows(0) :: Nil)
 
     checkAnswer(
-      input.na.drop("all", Seq("age", "height")),
-      rows(0) :: rows(1) :: rows(2) :: Nil)
+      input.na.drop("all", Seq("age", "height")).select("name"),
+      Row("Bob") :: Row("Alice") :: Row("David") :: Row("Nina") :: Nil)
   }
 
   test("drop with threshold") {
@@ -108,6 +110,7 @@ class DataFrameNaFunctionsSuite extends QueryTest {
       Row("Bob", 16, 176.5) ::
         Row("Alice", 50, 164.3) ::
         Row("David", 60, 50.6) ::
+        Row("Nina", 25, 50.6) ::
         Row("Amy", 50, 50.6) ::
         Row(null, 50, 50.6) :: Nil)
 
@@ -117,17 +120,19 @@ class DataFrameNaFunctionsSuite extends QueryTest {
     // string
     checkAnswer(
       input.na.fill("unknown").select("name"),
-      Row("Bob") :: Row("Alice") :: Row("David") :: Row("Amy") :: Row("unknown") :: Nil)
+      Row("Bob") :: Row("Alice") :: Row("David") ::
+        Row("Nina") :: Row("Amy") :: Row("unknown") :: Nil)
     assert(input.na.fill("unknown").columns.toSeq === input.columns.toSeq)
 
     // fill double with subset columns
     checkAnswer(
-      input.na.fill(50.6, "age" :: Nil),
-      Row("Bob", 16, 176.5) ::
-        Row("Alice", 50, 164.3) ::
-        Row("David", 60, null) ::
-        Row("Amy", 50, null) ::
-        Row(null, 50, null) :: Nil)
+      input.na.fill(50.6, "age" :: Nil).select("name", "age"),
+      Row("Bob", 16) ::
+        Row("Alice", 50) ::
+        Row("David", 60) ::
+        Row("Nina", 25) ::
+        Row("Amy", 50) ::
+        Row(null, 50) :: Nil)
 
     // fill string with subset columns
     checkAnswer(
@@ -164,29 +169,27 @@ class DataFrameNaFunctionsSuite extends QueryTest {
       16 -> 61,
       60 -> 6,
       164.3 -> 461.3  // Alice is really tall
-    ))
+    )).collect()
 
-    checkAnswer(
-      out,
-      Row("Bob", 61, 176.5) ::
-        Row("Alice", null, 461.3) ::
-        Row("David", 6, null) ::
-        Row("Amy", null, null) ::
-        Row(null, null, null) :: Nil)
+    assert(out(0) === Row("Bob", 61, 176.5))
+    assert(out(1) === Row("Alice", null, 461.3))
+    assert(out(2) === Row("David", 6, null))
+    assert(out(3).get(2).asInstanceOf[Double].isNaN)
+    assert(out(4) === Row("Amy", null, null))
+    assert(out(5) === Row(null, null, null))
 
     // Replace only the age column
     val out1 = input.na.replace("age", Map(
       16 -> 61,
       60 -> 6,
       164.3 -> 461.3  // Alice is really tall
-    ))
-
-    checkAnswer(
-      out1,
-      Row("Bob", 61, 176.5) ::
-        Row("Alice", null, 164.3) ::
-        Row("David", 6, null) ::
-        Row("Amy", null, null) ::
-        Row(null, null, null) :: Nil)
+    )).collect()
+
+    assert(out1(0) === Row("Bob", 61, 176.5))
+    assert(out1(1) === Row("Alice", null, 164.3))
+    assert(out1(2) === Row("David", 6, null))
+    assert(out1(3).get(2).asInstanceOf[Double].isNaN)
+    assert(out1(4) === Row("Amy", null, null))
+    assert(out1(5) === Row(null, null, null))
   }
 }

From df4ddb3120be28df381c11a36312620e58034b93 Mon Sep 17 00:00:00 2001
From: petz2000 <petz2000@gmail.com>
Date: Tue, 21 Jul 2015 08:50:43 -0700
Subject: [PATCH 298/549] [SPARK-8915] [DOCUMENTATION, MLLIB] Added @since tags
 to mllib.classification

Created since tags for methods in mllib.classification

Author: petz2000 <petz2000@gmail.com>

Closes #7371 from petz2000/add_since_mllib.classification and squashes the following commits:

39fe291 [petz2000] Removed whitespace in block comment
c9b1e03 [petz2000] Removed @since tags again from protected and private methods
cd759b6 [petz2000] Added @since tags to methods
---
 .../classification/ClassificationModel.scala    |  3 +++
 .../classification/LogisticRegression.scala     | 17 +++++++++++++++++
 .../spark/mllib/classification/NaiveBayes.scala |  3 +++
 .../apache/spark/mllib/classification/SVM.scala | 16 ++++++++++++++++
 4 files changed, 39 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
index 35a0db76f3a8c..ba73024e3c04d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
@@ -36,6 +36,7 @@ trait ClassificationModel extends Serializable {
    *
    * @param testData RDD representing data points to be predicted
    * @return an RDD[Double] where each entry contains the corresponding prediction
+   * @since 0.8.0
    */
   def predict(testData: RDD[Vector]): RDD[Double]
 
@@ -44,6 +45,7 @@ trait ClassificationModel extends Serializable {
    *
    * @param testData array representing a single data point
    * @return predicted category from the trained model
+   * @since 0.8.0
    */
   def predict(testData: Vector): Double
 
@@ -51,6 +53,7 @@ trait ClassificationModel extends Serializable {
    * Predict values for examples stored in a JavaRDD.
    * @param testData JavaRDD representing data points to be predicted
    * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction
+   * @since 0.8.0
    */
   def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] =
     predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]]
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index 2df4d21e8cd55..268642ac6a2f6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -85,6 +85,7 @@ class LogisticRegressionModel (
    * in Binary Logistic Regression. An example with prediction score greater than or equal to
    * this threshold is identified as an positive, and negative otherwise. The default value is 0.5.
    * It is only used for binary classification.
+   * @since 1.0.0
    */
   @Experimental
   def setThreshold(threshold: Double): this.type = {
@@ -96,6 +97,7 @@ class LogisticRegressionModel (
    * :: Experimental ::
    * Returns the threshold (if any) used for converting raw prediction scores into 0/1 predictions.
    * It is only used for binary classification.
+   * @since 1.3.0
    */
   @Experimental
   def getThreshold: Option[Double] = threshold
@@ -104,6 +106,7 @@ class LogisticRegressionModel (
    * :: Experimental ::
    * Clears the threshold so that `predict` will output raw prediction scores.
    * It is only used for binary classification.
+   * @since 1.0.0
    */
   @Experimental
   def clearThreshold(): this.type = {
@@ -155,6 +158,9 @@ class LogisticRegressionModel (
     }
   }
 
+  /**
+   * @since 1.3.0
+   */
   override def save(sc: SparkContext, path: String): Unit = {
     GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName,
       numFeatures, numClasses, weights, intercept, threshold)
@@ -162,6 +168,9 @@ class LogisticRegressionModel (
 
   override protected def formatVersion: String = "1.0"
 
+  /**
+   * @since 1.4.0
+   */
   override def toString: String = {
     s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}"
   }
@@ -169,6 +178,9 @@ class LogisticRegressionModel (
 
 object LogisticRegressionModel extends Loader[LogisticRegressionModel] {
 
+  /**
+   * @since 1.3.0
+   */
   override def load(sc: SparkContext, path: String): LogisticRegressionModel = {
     val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path)
     // Hard-code class name string in case it changes in the future
@@ -249,6 +261,7 @@ object LogisticRegressionWithSGD {
    * @param miniBatchFraction Fraction of data to be used per iteration.
    * @param initialWeights Initial set of weights to be used. Array should be equal in size to
    *        the number of features in the data.
+   * @since 1.0.0
    */
   def train(
       input: RDD[LabeledPoint],
@@ -271,6 +284,7 @@ object LogisticRegressionWithSGD {
    * @param stepSize Step size to be used for each iteration of gradient descent.
 
    * @param miniBatchFraction Fraction of data to be used per iteration.
+   * @since 1.0.0
    */
   def train(
       input: RDD[LabeledPoint],
@@ -292,6 +306,7 @@ object LogisticRegressionWithSGD {
 
    * @param numIterations Number of iterations of gradient descent to run.
    * @return a LogisticRegressionModel which has the weights and offset from training.
+   * @since 1.0.0
    */
   def train(
       input: RDD[LabeledPoint],
@@ -309,6 +324,7 @@ object LogisticRegressionWithSGD {
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
    * @return a LogisticRegressionModel which has the weights and offset from training.
+   * @since 1.0.0
    */
   def train(
       input: RDD[LabeledPoint],
@@ -345,6 +361,7 @@ class LogisticRegressionWithLBFGS
    * Set the number of possible outcomes for k classes classification problem in
    * Multinomial Logistic Regression.
    * By default, it is binary logistic regression so k will be set to 2.
+   * @since 1.3.0
    */
   @Experimental
   def setNumClasses(numClasses: Int): this.type = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 8cf4e15efe7a7..2df91c09421e9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -444,6 +444,7 @@ object NaiveBayes {
    *
    * @param input RDD of `(label, array of features)` pairs.  Every vector should be a frequency
    *              vector or a count vector.
+   * @since 0.9.0
    */
   def train(input: RDD[LabeledPoint]): NaiveBayesModel = {
     new NaiveBayes().run(input)
@@ -459,6 +460,7 @@ object NaiveBayes {
    * @param input RDD of `(label, array of features)` pairs.  Every vector should be a frequency
    *              vector or a count vector.
    * @param lambda The smoothing parameter
+   * @since 0.9.0
    */
   def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = {
     new NaiveBayes(lambda, Multinomial).run(input)
@@ -481,6 +483,7 @@ object NaiveBayes {
    *
    * @param modelType The type of NB model to fit from the enumeration NaiveBayesModels, can be
    *              multinomial or bernoulli
+   * @since 0.9.0
    */
   def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = {
     require(supportedModelTypes.contains(modelType),
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
index 348485560713e..5b54feeb10467 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
@@ -46,6 +46,7 @@ class SVMModel (
    * Sets the threshold that separates positive predictions from negative predictions. An example
    * with prediction score greater than or equal to this threshold is identified as an positive,
    * and negative otherwise. The default value is 0.0.
+   * @since 1.3.0
    */
   @Experimental
   def setThreshold(threshold: Double): this.type = {
@@ -56,6 +57,7 @@ class SVMModel (
   /**
    * :: Experimental ::
    * Returns the threshold (if any) used for converting raw prediction scores into 0/1 predictions.
+   * @since 1.3.0
    */
   @Experimental
   def getThreshold: Option[Double] = threshold
@@ -63,6 +65,7 @@ class SVMModel (
   /**
    * :: Experimental ::
    * Clears the threshold so that `predict` will output raw prediction scores.
+   * @since 1.0.0
    */
   @Experimental
   def clearThreshold(): this.type = {
@@ -81,6 +84,9 @@ class SVMModel (
     }
   }
 
+  /**
+   * @since 1.3.0
+   */
   override def save(sc: SparkContext, path: String): Unit = {
     GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName,
       numFeatures = weights.size, numClasses = 2, weights, intercept, threshold)
@@ -88,6 +94,9 @@ class SVMModel (
 
   override protected def formatVersion: String = "1.0"
 
+  /**
+   * @since 1.4.0
+   */
   override def toString: String = {
     s"${super.toString}, numClasses = 2, threshold = ${threshold.getOrElse("None")}"
   }
@@ -95,6 +104,9 @@ class SVMModel (
 
 object SVMModel extends Loader[SVMModel] {
 
+   /**
+   * @since 1.3.0
+   */
   override def load(sc: SparkContext, path: String): SVMModel = {
     val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path)
     // Hard-code class name string in case it changes in the future
@@ -173,6 +185,7 @@ object SVMWithSGD {
    * @param miniBatchFraction Fraction of data to be used per iteration.
    * @param initialWeights Initial set of weights to be used. Array should be equal in size to
    *        the number of features in the data.
+   * @since 0.8.0
    */
   def train(
       input: RDD[LabeledPoint],
@@ -196,6 +209,7 @@ object SVMWithSGD {
    * @param stepSize Step size to be used for each iteration of gradient descent.
    * @param regParam Regularization parameter.
    * @param miniBatchFraction Fraction of data to be used per iteration.
+   * @since 0.8.0
    */
   def train(
       input: RDD[LabeledPoint],
@@ -217,6 +231,7 @@ object SVMWithSGD {
    * @param regParam Regularization parameter.
    * @param numIterations Number of iterations of gradient descent to run.
    * @return a SVMModel which has the weights and offset from training.
+   * @since 0.8.0
    */
   def train(
       input: RDD[LabeledPoint],
@@ -235,6 +250,7 @@ object SVMWithSGD {
    * @param input RDD of (label, array of features) pairs.
    * @param numIterations Number of iterations of gradient descent to run.
    * @return a SVMModel which has the weights and offset from training.
+   * @since 0.8.0
    */
   def train(input: RDD[LabeledPoint], numIterations: Int): SVMModel = {
     train(input, numIterations, 1.0, 0.01, 1.0)

From 6592a6058eee6a27a5c91281ca19076284d62483 Mon Sep 17 00:00:00 2001
From: Grace <jie.huang@intel.com>
Date: Tue, 21 Jul 2015 11:35:49 -0500
Subject: [PATCH 299/549] [SPARK-9193] Avoid assigning tasks to "lost"
 executor(s)

Now, when some executors are killed by dynamic-allocation, it leads to some mis-assignment onto lost executors sometimes. Such kind of mis-assignment causes task failure(s) or even job failure if it repeats that errors for 4 times.

The root cause is that ***killExecutors*** doesn't remove those executors under killing ASAP. It depends on the ***OnDisassociated*** event to refresh the active working list later. The delay time really depends on your cluster status (from several milliseconds to sub-minute). When new tasks to be scheduled during that period of time, it will be assigned to those "active" but "under killing" executors. Then the tasks will be failed due to "executor lost". The better way is to exclude those executors under killing in the makeOffers(). Then all those tasks won't be allocated onto those executors "to be lost" any more.

Author: Grace <jie.huang@intel.com>

Closes #7528 from GraceH/AssignToLostExecutor and squashes the following commits:

ecc1da6 [Grace] scala style fix
6e2ed96 [Grace] Re-word makeOffers by more readable lines
b5546ce [Grace] Add comments about the fix
30a9ad0 [Grace] Avoid assigning tasks to lost executors
---
 .../cluster/CoarseGrainedSchedulerBackend.scala | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index f14c603ac6891..c65b3e517773e 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -169,9 +169,12 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
 
     // Make fake resource offers on all executors
     private def makeOffers() {
-      launchTasks(scheduler.resourceOffers(executorDataMap.map { case (id, executorData) =>
+      // Filter out executors under killing
+      val activeExecutors = executorDataMap.filterKeys(!executorsPendingToRemove.contains(_))
+      val workOffers = activeExecutors.map { case (id, executorData) =>
         new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
-      }.toSeq))
+      }.toSeq
+      launchTasks(scheduler.resourceOffers(workOffers))
     }
 
     override def onDisconnected(remoteAddress: RpcAddress): Unit = {
@@ -181,9 +184,13 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
 
     // Make fake resource offers on just one executor
     private def makeOffers(executorId: String) {
-      val executorData = executorDataMap(executorId)
-      launchTasks(scheduler.resourceOffers(
-        Seq(new WorkerOffer(executorId, executorData.executorHost, executorData.freeCores))))
+      // Filter out executors under killing
+      if (!executorsPendingToRemove.contains(executorId)) {
+        val executorData = executorDataMap(executorId)
+        val workOffers = Seq(
+          new WorkerOffer(executorId, executorData.executorHost, executorData.freeCores))
+        launchTasks(scheduler.resourceOffers(workOffers))
+      }
     }
 
     // Launch tasks returned by a set of resource offers

From f67da43c394c27ceb4e6bfd49e81be05e406aa29 Mon Sep 17 00:00:00 2001
From: Ben <benjaminpiering@gmail.com>
Date: Tue, 21 Jul 2015 09:51:13 -0700
Subject: [PATCH 300/549] [SPARK-9036] [CORE]
 SparkListenerExecutorMetricsUpdate messages not included in JsonProtocol

This PR implements a JSON serializer and deserializer in the JSONProtocol to handle the (de)serialization of SparkListenerExecutorMetricsUpdate events. It also includes a unit test in the JSONProtocolSuite file. This was implemented to satisfy the improvement request in the JIRA  issue SPARK-9036.

Author: Ben <benjaminpiering@gmail.com>

Closes #7555 from NamelessAnalyst/master and squashes the following commits:

fb4e3cc [Ben] Update JSON Protocol and tests
aa69517 [Ben] Update JSON Protocol and tests --Corrected Stage Attempt to Stage Attempt ID
33e5774 [Ben] Update JSON Protocol Tests
3f237e7 [Ben] Update JSON Protocol Tests
84ca798 [Ben] Update JSON Protocol Tests
cde57a0 [Ben] Update JSON Protocol Tests
8049600 [Ben] Update JSON Protocol Tests
c5bc061 [Ben] Update JSON Protocol Tests
6f25785 [Ben] Merge remote-tracking branch 'origin/master'
df2a609 [Ben] Update JSON Protocol
dcda80b [Ben] Update JSON Protocol
---
 .../org/apache/spark/util/JsonProtocol.scala  | 31 ++++++++-
 .../apache/spark/util/JsonProtocolSuite.scala | 69 ++++++++++++++++++-
 2 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
index adf69a4e78e71..a078f14af52a1 100644
--- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
@@ -92,8 +92,8 @@ private[spark] object JsonProtocol {
         executorRemovedToJson(executorRemoved)
       case logStart: SparkListenerLogStart =>
         logStartToJson(logStart)
-      // These aren't used, but keeps compiler happy
-      case SparkListenerExecutorMetricsUpdate(_, _) => JNothing
+      case metricsUpdate: SparkListenerExecutorMetricsUpdate =>
+        executorMetricsUpdateToJson(metricsUpdate)
     }
   }
 
@@ -224,6 +224,19 @@ private[spark] object JsonProtocol {
     ("Spark Version" -> SPARK_VERSION)
   }
 
+  def executorMetricsUpdateToJson(metricsUpdate: SparkListenerExecutorMetricsUpdate): JValue = {
+    val execId = metricsUpdate.execId
+    val taskMetrics = metricsUpdate.taskMetrics
+    ("Event" -> Utils.getFormattedClassName(metricsUpdate)) ~
+    ("Executor ID" -> execId) ~
+    ("Metrics Updated" -> taskMetrics.map { case (taskId, stageId, stageAttemptId, metrics) =>
+      ("Task ID" -> taskId) ~
+      ("Stage ID" -> stageId) ~
+      ("Stage Attempt ID" -> stageAttemptId) ~
+      ("Task Metrics" -> taskMetricsToJson(metrics))
+    })
+  }
+
   /** ------------------------------------------------------------------- *
    * JSON serialization methods for classes SparkListenerEvents depend on |
    * -------------------------------------------------------------------- */
@@ -463,6 +476,7 @@ private[spark] object JsonProtocol {
     val executorAdded = Utils.getFormattedClassName(SparkListenerExecutorAdded)
     val executorRemoved = Utils.getFormattedClassName(SparkListenerExecutorRemoved)
     val logStart = Utils.getFormattedClassName(SparkListenerLogStart)
+    val metricsUpdate = Utils.getFormattedClassName(SparkListenerExecutorMetricsUpdate)
 
     (json \ "Event").extract[String] match {
       case `stageSubmitted` => stageSubmittedFromJson(json)
@@ -481,6 +495,7 @@ private[spark] object JsonProtocol {
       case `executorAdded` => executorAddedFromJson(json)
       case `executorRemoved` => executorRemovedFromJson(json)
       case `logStart` => logStartFromJson(json)
+      case `metricsUpdate` => executorMetricsUpdateFromJson(json)
     }
   }
 
@@ -598,6 +613,18 @@ private[spark] object JsonProtocol {
     SparkListenerLogStart(sparkVersion)
   }
 
+  def executorMetricsUpdateFromJson(json: JValue): SparkListenerExecutorMetricsUpdate = {
+    val execInfo = (json \ "Executor ID").extract[String]
+    val taskMetrics = (json \ "Metrics Updated").extract[List[JValue]].map { json =>
+      val taskId = (json \ "Task ID").extract[Long]
+      val stageId = (json \ "Stage ID").extract[Int]
+      val stageAttemptId = (json \ "Stage Attempt ID").extract[Int]
+      val metrics = taskMetricsFromJson(json \ "Task Metrics")
+      (taskId, stageId, stageAttemptId, metrics)
+    }
+    SparkListenerExecutorMetricsUpdate(execInfo, taskMetrics)
+  }
+
   /** --------------------------------------------------------------------- *
    * JSON deserialization methods for classes SparkListenerEvents depend on |
    * ---------------------------------------------------------------------- */
diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
index e0ef9c70a5fc3..dde95f3778434 100644
--- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
@@ -83,6 +83,9 @@ class JsonProtocolSuite extends SparkFunSuite {
     val executorAdded = SparkListenerExecutorAdded(executorAddedTime, "exec1",
       new ExecutorInfo("Hostee.awesome.com", 11, logUrlMap))
     val executorRemoved = SparkListenerExecutorRemoved(executorRemovedTime, "exec2", "test reason")
+    val executorMetricsUpdate = SparkListenerExecutorMetricsUpdate("exec3", Seq(
+      (1L, 2, 3, makeTaskMetrics(300L, 400L, 500L, 600L, 700, 800,
+        hasHadoopInput = true, hasOutput = true))))
 
     testEvent(stageSubmitted, stageSubmittedJsonString)
     testEvent(stageCompleted, stageCompletedJsonString)
@@ -102,6 +105,7 @@ class JsonProtocolSuite extends SparkFunSuite {
     testEvent(applicationEnd, applicationEndJsonString)
     testEvent(executorAdded, executorAddedJsonString)
     testEvent(executorRemoved, executorRemovedJsonString)
+    testEvent(executorMetricsUpdate, executorMetricsUpdateJsonString)
   }
 
   test("Dependent Classes") {
@@ -440,10 +444,20 @@ class JsonProtocolSuite extends SparkFunSuite {
       case (e1: SparkListenerEnvironmentUpdate, e2: SparkListenerEnvironmentUpdate) =>
         assertEquals(e1.environmentDetails, e2.environmentDetails)
       case (e1: SparkListenerExecutorAdded, e2: SparkListenerExecutorAdded) =>
-        assert(e1.executorId == e1.executorId)
+        assert(e1.executorId === e1.executorId)
         assertEquals(e1.executorInfo, e2.executorInfo)
       case (e1: SparkListenerExecutorRemoved, e2: SparkListenerExecutorRemoved) =>
-        assert(e1.executorId == e1.executorId)
+        assert(e1.executorId === e1.executorId)
+      case (e1: SparkListenerExecutorMetricsUpdate, e2: SparkListenerExecutorMetricsUpdate) =>
+        assert(e1.execId === e2.execId)
+        assertSeqEquals[(Long, Int, Int, TaskMetrics)](e1.taskMetrics, e2.taskMetrics, (a, b) => {
+          val (taskId1, stageId1, stageAttemptId1, metrics1) = a
+          val (taskId2, stageId2, stageAttemptId2, metrics2) = b
+          assert(taskId1 === taskId2)
+          assert(stageId1 === stageId2)
+          assert(stageAttemptId1 === stageAttemptId2)
+          assertEquals(metrics1, metrics2)
+        })
       case (e1, e2) =>
         assert(e1 === e2)
       case _ => fail("Events don't match in types!")
@@ -1598,4 +1612,55 @@ class JsonProtocolSuite extends SparkFunSuite {
       |  "Removed Reason": "test reason"
       |}
     """
+
+  private val executorMetricsUpdateJsonString =
+  s"""
+     |{
+     |  "Event": "SparkListenerExecutorMetricsUpdate",
+     |  "Executor ID": "exec3",
+     |  "Metrics Updated": [
+     |  {
+     |    "Task ID": 1,
+     |    "Stage ID": 2,
+     |    "Stage Attempt ID": 3,
+     |    "Task Metrics": {
+     |    "Host Name": "localhost",
+     |    "Executor Deserialize Time": 300,
+     |    "Executor Run Time": 400,
+     |    "Result Size": 500,
+     |    "JVM GC Time": 600,
+     |    "Result Serialization Time": 700,
+     |    "Memory Bytes Spilled": 800,
+     |    "Disk Bytes Spilled": 0,
+     |    "Input Metrics": {
+     |      "Data Read Method": "Hadoop",
+     |      "Bytes Read": 2100,
+     |      "Records Read": 21
+     |    },
+     |    "Output Metrics": {
+     |      "Data Write Method": "Hadoop",
+     |      "Bytes Written": 1200,
+     |      "Records Written": 12
+     |    },
+     |    "Updated Blocks": [
+     |      {
+     |        "Block ID": "rdd_0_0",
+     |        "Status": {
+     |          "Storage Level": {
+     |            "Use Disk": true,
+     |            "Use Memory": true,
+     |            "Use ExternalBlockStore": false,
+     |            "Deserialized": false,
+     |            "Replication": 2
+     |          },
+     |          "Memory Size": 0,
+     |          "ExternalBlockStore Size": 0,
+     |          "Disk Size": 0
+     |        }
+     |      }
+     |    ]
+     |  }
+     |  }]
+     |}
+   """.stripMargin
 }

From 9a4fd875b39b6a1ef7038823d1c49b0826110fbc Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Tue, 21 Jul 2015 09:52:27 -0700
Subject: [PATCH 301/549] [SPARK-9128] [CORE] Get outerclasses and objects with
 only one method calling in ClosureCleaner

JIRA: https://issues.apache.org/jira/browse/SPARK-9128

Currently, in `ClosureCleaner`, the outerclasses and objects are retrieved using two different methods. However, the logic of the two methods is the same, and we can get both the outerclasses and objects with only one method calling.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7459 from viirya/remove_extra_closurecleaner and squashes the following commits:

7c9858d [Liang-Chi Hsieh] For comments.
a096941 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into remove_extra_closurecleaner
2ec5ce1 [Liang-Chi Hsieh] Remove unnecessary methods.
4df5a51 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into remove_extra_closurecleaner
dc110d1 [Liang-Chi Hsieh] Add method to get outerclasses and objects at the same time.
---
 .../apache/spark/util/ClosureCleaner.scala    | 32 +++--------
 .../spark/util/ClosureCleanerSuite2.scala     | 54 ++++++++-----------
 2 files changed, 28 insertions(+), 58 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
index 43626b4ef4880..ebead830c6466 100644
--- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
+++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala
@@ -49,45 +49,28 @@ private[spark] object ClosureCleaner extends Logging {
     cls.getName.contains("$anonfun$")
   }
 
-  // Get a list of the classes of the outer objects of a given closure object, obj;
+  // Get a list of the outer objects and their classes of a given closure object, obj;
   // the outer objects are defined as any closures that obj is nested within, plus
   // possibly the class that the outermost closure is in, if any. We stop searching
   // for outer objects beyond that because cloning the user's object is probably
   // not a good idea (whereas we can clone closure objects just fine since we
   // understand how all their fields are used).
-  private def getOuterClasses(obj: AnyRef): List[Class[_]] = {
+  private def getOuterClassesAndObjects(obj: AnyRef): (List[Class[_]], List[AnyRef]) = {
     for (f <- obj.getClass.getDeclaredFields if f.getName == "$outer") {
       f.setAccessible(true)
       val outer = f.get(obj)
       // The outer pointer may be null if we have cleaned this closure before
       if (outer != null) {
         if (isClosure(f.getType)) {
-          return f.getType :: getOuterClasses(outer)
+          val recurRet = getOuterClassesAndObjects(outer)
+          return (f.getType :: recurRet._1, outer :: recurRet._2)
         } else {
-          return f.getType :: Nil // Stop at the first $outer that is not a closure
+          return (f.getType :: Nil, outer :: Nil) // Stop at the first $outer that is not a closure
         }
       }
     }
-    Nil
+    (Nil, Nil)
   }
-
-  // Get a list of the outer objects for a given closure object.
-  private def getOuterObjects(obj: AnyRef): List[AnyRef] = {
-    for (f <- obj.getClass.getDeclaredFields if f.getName == "$outer") {
-      f.setAccessible(true)
-      val outer = f.get(obj)
-      // The outer pointer may be null if we have cleaned this closure before
-      if (outer != null) {
-        if (isClosure(f.getType)) {
-          return outer :: getOuterObjects(outer)
-        } else {
-          return outer :: Nil // Stop at the first $outer that is not a closure
-        }
-      }
-    }
-    Nil
-  }
-
   /**
    * Return a list of classes that represent closures enclosed in the given closure object.
    */
@@ -205,8 +188,7 @@ private[spark] object ClosureCleaner extends Logging {
 
     // A list of enclosing objects and their respective classes, from innermost to outermost
     // An outer object at a given index is of type outer class at the same index
-    val outerClasses = getOuterClasses(func)
-    val outerObjects = getOuterObjects(func)
+    val (outerClasses, outerObjects) = getOuterClassesAndObjects(func)
 
     // For logging purposes only
     val declaredFields = func.getClass.getDeclaredFields
diff --git a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite2.scala b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite2.scala
index 3147c937769d2..a829b099025e9 100644
--- a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite2.scala
+++ b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite2.scala
@@ -120,8 +120,8 @@ class ClosureCleanerSuite2 extends SparkFunSuite with BeforeAndAfterAll with Pri
   // Accessors for private methods
   private val _isClosure = PrivateMethod[Boolean]('isClosure)
   private val _getInnerClosureClasses = PrivateMethod[List[Class[_]]]('getInnerClosureClasses)
-  private val _getOuterClasses = PrivateMethod[List[Class[_]]]('getOuterClasses)
-  private val _getOuterObjects = PrivateMethod[List[AnyRef]]('getOuterObjects)
+  private val _getOuterClassesAndObjects =
+    PrivateMethod[(List[Class[_]], List[AnyRef])]('getOuterClassesAndObjects)
 
   private def isClosure(obj: AnyRef): Boolean = {
     ClosureCleaner invokePrivate _isClosure(obj)
@@ -131,12 +131,8 @@ class ClosureCleanerSuite2 extends SparkFunSuite with BeforeAndAfterAll with Pri
     ClosureCleaner invokePrivate _getInnerClosureClasses(closure)
   }
 
-  private def getOuterClasses(closure: AnyRef): List[Class[_]] = {
-    ClosureCleaner invokePrivate _getOuterClasses(closure)
-  }
-
-  private def getOuterObjects(closure: AnyRef): List[AnyRef] = {
-    ClosureCleaner invokePrivate _getOuterObjects(closure)
+  private def getOuterClassesAndObjects(closure: AnyRef): (List[Class[_]], List[AnyRef]) = {
+    ClosureCleaner invokePrivate _getOuterClassesAndObjects(closure)
   }
 
   test("get inner closure classes") {
@@ -171,14 +167,11 @@ class ClosureCleanerSuite2 extends SparkFunSuite with BeforeAndAfterAll with Pri
     val closure2 = () => localValue
     val closure3 = () => someSerializableValue
     val closure4 = () => someSerializableMethod()
-    val outerClasses1 = getOuterClasses(closure1)
-    val outerClasses2 = getOuterClasses(closure2)
-    val outerClasses3 = getOuterClasses(closure3)
-    val outerClasses4 = getOuterClasses(closure4)
-    val outerObjects1 = getOuterObjects(closure1)
-    val outerObjects2 = getOuterObjects(closure2)
-    val outerObjects3 = getOuterObjects(closure3)
-    val outerObjects4 = getOuterObjects(closure4)
+
+    val (outerClasses1, outerObjects1) = getOuterClassesAndObjects(closure1)
+    val (outerClasses2, outerObjects2) = getOuterClassesAndObjects(closure2)
+    val (outerClasses3, outerObjects3) = getOuterClassesAndObjects(closure3)
+    val (outerClasses4, outerObjects4) = getOuterClassesAndObjects(closure4)
 
     // The classes and objects should have the same size
     assert(outerClasses1.size === outerObjects1.size)
@@ -211,10 +204,8 @@ class ClosureCleanerSuite2 extends SparkFunSuite with BeforeAndAfterAll with Pri
       val x = 1
       val closure1 = () => 1
       val closure2 = () => x
-      val outerClasses1 = getOuterClasses(closure1)
-      val outerClasses2 = getOuterClasses(closure2)
-      val outerObjects1 = getOuterObjects(closure1)
-      val outerObjects2 = getOuterObjects(closure2)
+      val (outerClasses1, outerObjects1) = getOuterClassesAndObjects(closure1)
+      val (outerClasses2, outerObjects2) = getOuterClassesAndObjects(closure2)
       assert(outerClasses1.size === outerObjects1.size)
       assert(outerClasses2.size === outerObjects2.size)
       // These inner closures only reference local variables, and so do not have $outer pointers
@@ -227,12 +218,9 @@ class ClosureCleanerSuite2 extends SparkFunSuite with BeforeAndAfterAll with Pri
       val closure1 = () => 1
       val closure2 = () => y
       val closure3 = () => localValue
-      val outerClasses1 = getOuterClasses(closure1)
-      val outerClasses2 = getOuterClasses(closure2)
-      val outerClasses3 = getOuterClasses(closure3)
-      val outerObjects1 = getOuterObjects(closure1)
-      val outerObjects2 = getOuterObjects(closure2)
-      val outerObjects3 = getOuterObjects(closure3)
+      val (outerClasses1, outerObjects1) = getOuterClassesAndObjects(closure1)
+      val (outerClasses2, outerObjects2) = getOuterClassesAndObjects(closure2)
+      val (outerClasses3, outerObjects3) = getOuterClassesAndObjects(closure3)
       assert(outerClasses1.size === outerObjects1.size)
       assert(outerClasses2.size === outerObjects2.size)
       assert(outerClasses3.size === outerObjects3.size)
@@ -265,9 +253,9 @@ class ClosureCleanerSuite2 extends SparkFunSuite with BeforeAndAfterAll with Pri
     val closure1 = () => 1
     val closure2 = () => localValue
     val closure3 = () => someSerializableValue
-    val outerClasses1 = getOuterClasses(closure1)
-    val outerClasses2 = getOuterClasses(closure2)
-    val outerClasses3 = getOuterClasses(closure3)
+    val (outerClasses1, _) = getOuterClassesAndObjects(closure1)
+    val (outerClasses2, _) = getOuterClassesAndObjects(closure2)
+    val (outerClasses3, _) = getOuterClassesAndObjects(closure3)
 
     val fields1 = findAccessedFields(closure1, outerClasses1, findTransitively = false)
     val fields2 = findAccessedFields(closure2, outerClasses2, findTransitively = false)
@@ -307,10 +295,10 @@ class ClosureCleanerSuite2 extends SparkFunSuite with BeforeAndAfterAll with Pri
       val closure2 = () => a
       val closure3 = () => localValue
       val closure4 = () => someSerializableValue
-      val outerClasses1 = getOuterClasses(closure1)
-      val outerClasses2 = getOuterClasses(closure2)
-      val outerClasses3 = getOuterClasses(closure3)
-      val outerClasses4 = getOuterClasses(closure4)
+      val (outerClasses1, _) = getOuterClassesAndObjects(closure1)
+      val (outerClasses2, _) = getOuterClassesAndObjects(closure2)
+      val (outerClasses3, _) = getOuterClassesAndObjects(closure3)
+      val (outerClasses4, _) = getOuterClassesAndObjects(closure4)
 
       // First, find only fields accessed directly, not transitively, by these closures
       val fields1 = findAccessedFields(closure1, outerClasses1, findTransitively = false)

From 31954910d67c29874d2af22ee30590a7346a464c Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <lewandowski.jacek@gmail.com>
Date: Tue, 21 Jul 2015 09:53:33 -0700
Subject: [PATCH 302/549] [SPARK-7171] Added a method to retrieve metrics
 sources in TaskContext

Author: Jacek Lewandowski <lewandowski.jacek@gmail.com>

Closes #5805 from jacek-lewandowski/SPARK-7171 and squashes the following commits:

ed20bda [Jacek Lewandowski] SPARK-7171: Added a method to retrieve metrics sources in TaskContext
---
 .../scala/org/apache/spark/TaskContext.scala  |  9 ++++++++
 .../org/apache/spark/TaskContextImpl.scala    |  6 +++++
 .../org/apache/spark/executor/Executor.scala  |  5 ++++-
 .../apache/spark/metrics/MetricsSystem.scala  |  3 +++
 .../apache/spark/scheduler/DAGScheduler.scala |  1 +
 .../org/apache/spark/scheduler/Task.scala     |  8 ++++++-
 .../java/org/apache/spark/JavaAPISuite.java   |  2 +-
 .../org/apache/spark/CacheManagerSuite.scala  |  8 +++----
 .../org/apache/spark/rdd/PipedRDDSuite.scala  |  2 +-
 .../spark/scheduler/TaskContextSuite.scala    | 22 ++++++++++++++++---
 .../shuffle/hash/HashShuffleReaderSuite.scala |  2 +-
 .../ShuffleBlockFetcherIteratorSuite.scala    |  6 ++---
 12 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala
index e93eb93124e51..b48836d5c8897 100644
--- a/core/src/main/scala/org/apache/spark/TaskContext.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContext.scala
@@ -21,6 +21,7 @@ import java.io.Serializable
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.metrics.source.Source
 import org.apache.spark.unsafe.memory.TaskMemoryManager
 import org.apache.spark.util.TaskCompletionListener
 
@@ -148,6 +149,14 @@ abstract class TaskContext extends Serializable {
   @DeveloperApi
   def taskMetrics(): TaskMetrics
 
+  /**
+   * ::DeveloperApi::
+   * Returns all metrics sources with the given name which are associated with the instance
+   * which runs the task. For more information see [[org.apache.spark.metrics.MetricsSystem!]].
+   */
+  @DeveloperApi
+  def getMetricsSources(sourceName: String): Seq[Source]
+
   /**
    * Returns the manager for this task's managed memory.
    */
diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
index 6e394f1b12445..9ee168ae016f8 100644
--- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
+++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala
@@ -20,6 +20,8 @@ package org.apache.spark
 import scala.collection.mutable.{ArrayBuffer, HashMap}
 
 import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.metrics.MetricsSystem
+import org.apache.spark.metrics.source.Source
 import org.apache.spark.unsafe.memory.TaskMemoryManager
 import org.apache.spark.util.{TaskCompletionListener, TaskCompletionListenerException}
 
@@ -29,6 +31,7 @@ private[spark] class TaskContextImpl(
     override val taskAttemptId: Long,
     override val attemptNumber: Int,
     override val taskMemoryManager: TaskMemoryManager,
+    @transient private val metricsSystem: MetricsSystem,
     val runningLocally: Boolean = false,
     val taskMetrics: TaskMetrics = TaskMetrics.empty)
   extends TaskContext
@@ -95,6 +98,9 @@ private[spark] class TaskContextImpl(
 
   override def isInterrupted(): Boolean = interrupted
 
+  override def getMetricsSources(sourceName: String): Seq[Source] =
+    metricsSystem.getSourcesByName(sourceName)
+
   @transient private val accumulators = new HashMap[Long, Accumulable[_, _]]
 
   private[spark] override def registerAccumulator(a: Accumulable[_, _]): Unit = synchronized {
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 9087debde8c41..66624ffbe4790 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -210,7 +210,10 @@ private[spark] class Executor(
         // Run the actual task and measure its runtime.
         taskStart = System.currentTimeMillis()
         val (value, accumUpdates) = try {
-          task.run(taskAttemptId = taskId, attemptNumber = attemptNumber)
+          task.run(
+            taskAttemptId = taskId,
+            attemptNumber = attemptNumber,
+            metricsSystem = env.metricsSystem)
         } finally {
           // Note: this memory freeing logic is duplicated in DAGScheduler.runLocallyWithinThread;
           // when changing this, make sure to update both copies.
diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
index 67f64d5e278de..4517f465ebd3b 100644
--- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
+++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala
@@ -142,6 +142,9 @@ private[spark] class MetricsSystem private (
     } else { defaultName }
   }
 
+  def getSourcesByName(sourceName: String): Seq[Source] =
+    sources.filter(_.sourceName == sourceName)
+
   def registerSource(source: Source) {
     sources += source
     try {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 71a219a4f3414..b829d06923404 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -682,6 +682,7 @@ class DAGScheduler(
           taskAttemptId = 0,
           attemptNumber = 0,
           taskMemoryManager = taskMemoryManager,
+          metricsSystem = env.metricsSystem,
           runningLocally = true)
       TaskContext.setTaskContext(taskContext)
       try {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
index 76a19aeac4679..d11a00956a9a9 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -22,6 +22,7 @@ import java.nio.ByteBuffer
 
 import scala.collection.mutable.HashMap
 
+import org.apache.spark.metrics.MetricsSystem
 import org.apache.spark.{TaskContextImpl, TaskContext}
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.serializer.SerializerInstance
@@ -61,13 +62,18 @@ private[spark] abstract class Task[T](
    * @param attemptNumber how many times this task has been attempted (0 for the first attempt)
    * @return the result of the task along with updates of Accumulators.
    */
-  final def run(taskAttemptId: Long, attemptNumber: Int): (T, AccumulatorUpdates) = {
+  final def run(
+    taskAttemptId: Long,
+    attemptNumber: Int,
+    metricsSystem: MetricsSystem)
+  : (T, AccumulatorUpdates) = {
     context = new TaskContextImpl(
       stageId = stageId,
       partitionId = partitionId,
       taskAttemptId = taskAttemptId,
       attemptNumber = attemptNumber,
       taskMemoryManager = taskMemoryManager,
+      metricsSystem = metricsSystem,
       runningLocally = false)
     TaskContext.setTaskContext(context)
     context.taskMetrics.setHostname(Utils.localHostName())
diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java
index dfd86d3e51e7d..1b04a3b1cff0e 100644
--- a/core/src/test/java/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java
@@ -1011,7 +1011,7 @@ public void persist() {
   @Test
   public void iterator() {
     JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 2);
-    TaskContext context = new TaskContextImpl(0, 0, 0L, 0, null, false, new TaskMetrics());
+    TaskContext context = new TaskContextImpl(0, 0, 0L, 0, null, null, false, new TaskMetrics());
     Assert.assertEquals(1, rdd.iterator(rdd.partitions().get(0), context).next().intValue());
   }
 
diff --git a/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala b/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala
index af81e46a657d3..618a5fb24710f 100644
--- a/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala
@@ -65,7 +65,7 @@ class CacheManagerSuite extends SparkFunSuite with LocalSparkContext with Before
     // in blockManager.put is a losing battle. You have been warned.
     blockManager = sc.env.blockManager
     cacheManager = sc.env.cacheManager
-    val context = new TaskContextImpl(0, 0, 0, 0, null)
+    val context = new TaskContextImpl(0, 0, 0, 0, null, null)
     val computeValue = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY)
     val getValue = blockManager.get(RDDBlockId(rdd.id, split.index))
     assert(computeValue.toList === List(1, 2, 3, 4))
@@ -77,7 +77,7 @@ class CacheManagerSuite extends SparkFunSuite with LocalSparkContext with Before
     val result = new BlockResult(Array(5, 6, 7).iterator, DataReadMethod.Memory, 12)
     when(blockManager.get(RDDBlockId(0, 0))).thenReturn(Some(result))
 
-    val context = new TaskContextImpl(0, 0, 0, 0, null)
+    val context = new TaskContextImpl(0, 0, 0, 0, null, null)
     val value = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY)
     assert(value.toList === List(5, 6, 7))
   }
@@ -86,14 +86,14 @@ class CacheManagerSuite extends SparkFunSuite with LocalSparkContext with Before
     // Local computation should not persist the resulting value, so don't expect a put().
     when(blockManager.get(RDDBlockId(0, 0))).thenReturn(None)
 
-    val context = new TaskContextImpl(0, 0, 0, 0, null, true)
+    val context = new TaskContextImpl(0, 0, 0, 0, null, null, true)
     val value = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY)
     assert(value.toList === List(1, 2, 3, 4))
   }
 
   test("verify task metrics updated correctly") {
     cacheManager = sc.env.cacheManager
-    val context = new TaskContextImpl(0, 0, 0, 0, null)
+    val context = new TaskContextImpl(0, 0, 0, 0, null, null)
     cacheManager.getOrCompute(rdd3, split, context, StorageLevel.MEMORY_ONLY)
     assert(context.taskMetrics.updatedBlocks.getOrElse(Seq()).size === 2)
   }
diff --git a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala
index 32f04d54eff94..3e8816a4c65be 100644
--- a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala
@@ -175,7 +175,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext {
       }
       val hadoopPart1 = generateFakeHadoopPartition()
       val pipedRdd = new PipedRDD(nums, "printenv " + varName)
-      val tContext = new TaskContextImpl(0, 0, 0, 0, null)
+      val tContext = new TaskContextImpl(0, 0, 0, 0, null, null)
       val rddIter = pipedRdd.compute(hadoopPart1, tContext)
       val arr = rddIter.toArray
       assert(arr(0) == "/some/path")
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
index b9b0eccb0d834..9201d1e1f328b 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala
@@ -24,11 +24,27 @@ import org.scalatest.BeforeAndAfter
 
 import org.apache.spark._
 import org.apache.spark.rdd.RDD
-import org.apache.spark.util.{TaskCompletionListenerException, TaskCompletionListener}
+import org.apache.spark.util.{TaskCompletionListener, TaskCompletionListenerException}
+import org.apache.spark.metrics.source.JvmSource
 
 
 class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext {
 
+  test("provide metrics sources") {
+    val filePath = getClass.getClassLoader.getResource("test_metrics_config.properties").getFile
+    val conf = new SparkConf(loadDefaults = false)
+      .set("spark.metrics.conf", filePath)
+    sc = new SparkContext("local", "test", conf)
+    val rdd = sc.makeRDD(1 to 1)
+    val result = sc.runJob(rdd, (tc: TaskContext, it: Iterator[Int]) => {
+      tc.getMetricsSources("jvm").count {
+        case source: JvmSource => true
+        case _ => false
+      }
+    }).sum
+    assert(result > 0)
+  }
+
   test("calls TaskCompletionListener after failure") {
     TaskContextSuite.completed = false
     sc = new SparkContext("local", "test")
@@ -44,13 +60,13 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark
     val task = new ResultTask[String, String](0, 0,
       sc.broadcast(closureSerializer.serialize((rdd, func)).array), rdd.partitions(0), Seq(), 0)
     intercept[RuntimeException] {
-      task.run(0, 0)
+      task.run(0, 0, null)
     }
     assert(TaskContextSuite.completed === true)
   }
 
   test("all TaskCompletionListeners should be called even if some fail") {
-    val context = new TaskContextImpl(0, 0, 0, 0, null)
+    val context = new TaskContextImpl(0, 0, 0, 0, null, null)
     val listener = mock(classOf[TaskCompletionListener])
     context.addTaskCompletionListener(_ => throw new Exception("blah"))
     context.addTaskCompletionListener(listener)
diff --git a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala
index 6c9cb448e7833..db718ecabbdb9 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/hash/HashShuffleReaderSuite.scala
@@ -138,7 +138,7 @@ class HashShuffleReaderSuite extends SparkFunSuite with LocalSparkContext {
       shuffleHandle,
       reduceId,
       reduceId + 1,
-      new TaskContextImpl(0, 0, 0, 0, null),
+      new TaskContextImpl(0, 0, 0, 0, null, null),
       blockManager,
       mapOutputTracker)
 
diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
index 64f3fbdcebed9..cf8bd8ae69625 100644
--- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala
@@ -95,7 +95,7 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT
     )
 
     val iterator = new ShuffleBlockFetcherIterator(
-      new TaskContextImpl(0, 0, 0, 0, null),
+      new TaskContextImpl(0, 0, 0, 0, null, null),
       transfer,
       blockManager,
       blocksByAddress,
@@ -165,7 +165,7 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT
     val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])](
       (remoteBmId, blocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq))
 
-    val taskContext = new TaskContextImpl(0, 0, 0, 0, null)
+    val taskContext = new TaskContextImpl(0, 0, 0, 0, null, null)
     val iterator = new ShuffleBlockFetcherIterator(
       taskContext,
       transfer,
@@ -227,7 +227,7 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT
     val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])](
       (remoteBmId, blocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq))
 
-    val taskContext = new TaskContextImpl(0, 0, 0, 0, null)
+    val taskContext = new TaskContextImpl(0, 0, 0, 0, null, null)
     val iterator = new ShuffleBlockFetcherIterator(
       taskContext,
       transfer,

From 4f7f1ee378e80b33686508d56e133fc25dec5316 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Tue, 21 Jul 2015 09:54:39 -0700
Subject: [PATCH 303/549] [SPARK-4598] [WEBUI] Task table pagination for the
 Stage page

This PR adds pagination for the task table to solve the scalability issue of the stage page. Here is the initial screenshot:
<img width="1347" alt="pagination" src="https://cloud.githubusercontent.com/assets/1000778/8679669/9e63863c-2a8e-11e5-94e4-994febcd6717.png">
The task table only shows 100 tasks. There is a page navigation above the table. Users can click the page navigation or type the page number to jump to another page. The table can be sorted by clicking the headers. However, unlike previous implementation, the sorting work is done in the server now. So clicking a table column to sort needs to refresh the web page.

Author: zsxwing <zsxwing@gmail.com>

Closes #7399 from zsxwing/task-table-pagination and squashes the following commits:

144f513 [zsxwing] Display the page navigation when the page number is out of range
a3eee22 [zsxwing] Add extra space for the error message
54c5b84 [zsxwing] Reset page to 1 if the user changes the page size
c2f7f39 [zsxwing] Add a text field to let users fill the page size
bad52eb [zsxwing] Display user-friendly error messages
410586b [zsxwing] Scroll down to the tasks table if the url contains any sort column
a0746d1 [zsxwing] Use expand-dag-viz-arrow-job and expand-dag-viz-arrow-stage instead of expand-dag-viz-arrow-true and expand-dag-viz-arrow-false
b123f67 [zsxwing] Use localStorage to remember the user's actions and replay them when loading the page
894a342 [zsxwing] Show the link cursor when hovering for headers and page links and other minor fix
4d4fecf [zsxwing] Address Carson's comments
d9285f0 [zsxwing] Add comments and fix the style
74285fa [zsxwing] Merge branch 'master' into task-table-pagination
db6c859 [zsxwing] Task table pagination for the Stage page
---
 .../spark/ui/static/additional-metrics.js     |  34 +-
 .../apache/spark/ui/static/spark-dag-viz.js   |  27 +
 .../apache/spark/ui/static/timeline-view.js   |  39 +
 .../org/apache/spark/ui/PagedTable.scala      | 246 +++++
 .../org/apache/spark/ui/jobs/StagePage.scala  | 879 +++++++++++++-----
 .../org/apache/spark/ui/PagedTableSuite.scala |  99 ++
 6 files changed, 1102 insertions(+), 222 deletions(-)
 create mode 100644 core/src/main/scala/org/apache/spark/ui/PagedTable.scala
 create mode 100644 core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala

diff --git a/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js b/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js
index 0b450dc76bc38..3c8ddddf07b1e 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js
@@ -19,6 +19,9 @@
  * to be registered after the page loads. */
 $(function() {
     $("span.expand-additional-metrics").click(function(){
+        var status = window.localStorage.getItem("expand-additional-metrics") == "true";
+        status = !status;
+
         // Expand the list of additional metrics.
         var additionalMetricsDiv = $(this).parent().find('.additional-metrics');
         $(additionalMetricsDiv).toggleClass('collapsed');
@@ -26,17 +29,31 @@ $(function() {
         // Switch the class of the arrow from open to closed.
         $(this).find('.expand-additional-metrics-arrow').toggleClass('arrow-open');
         $(this).find('.expand-additional-metrics-arrow').toggleClass('arrow-closed');
+
+        window.localStorage.setItem("expand-additional-metrics", "" + status);
     });
 
+    if (window.localStorage.getItem("expand-additional-metrics") == "true") {
+        // Set it to false so that the click function can revert it
+        window.localStorage.setItem("expand-additional-metrics", "false");
+        $("span.expand-additional-metrics").trigger("click");
+    }
+
     stripeSummaryTable();
 
     $('input[type="checkbox"]').click(function() {
-        var column = "table ." + $(this).attr("name");
+        var name = $(this).attr("name")
+        var column = "table ." + name;
+        var status = window.localStorage.getItem(name) == "true";
+        status = !status;
         $(column).toggle();
         stripeSummaryTable();
+        window.localStorage.setItem(name, "" + status);
     });
 
     $("#select-all-metrics").click(function() {
+       var status = window.localStorage.getItem("select-all-metrics") == "true";
+       status = !status;
        if (this.checked) {
           // Toggle all un-checked options.
           $('input[type="checkbox"]:not(:checked)').trigger('click');
@@ -44,6 +61,21 @@ $(function() {
           // Toggle all checked options.
           $('input[type="checkbox"]:checked').trigger('click');
        }
+       window.localStorage.setItem("select-all-metrics", "" + status);
+    });
+
+    if (window.localStorage.getItem("select-all-metrics") == "true") {
+        $("#select-all-metrics").attr('checked', status);
+    }
+
+    $("span.additional-metric-title").parent().find('input[type="checkbox"]').each(function() {
+        var name = $(this).attr("name")
+        // If name is undefined, then skip it because it's the "select-all-metrics" checkbox
+        if (name && window.localStorage.getItem(name) == "true") {
+            // Set it to false so that the click function can revert it
+            window.localStorage.setItem(name, "false");
+            $(this).trigger("click")
+        }
     });
 
     // Trigger a click on the checkbox if a user clicks the label next to it.
diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
index 9fa53baaf4212..4a893bc0189aa 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js
@@ -72,6 +72,14 @@ var StagePageVizConstants = {
   rankSep: 40
 };
 
+/*
+ * Return "expand-dag-viz-arrow-job" if forJob is true.
+ * Otherwise, return "expand-dag-viz-arrow-stage".
+ */
+function expandDagVizArrowKey(forJob) {
+  return forJob ? "expand-dag-viz-arrow-job" : "expand-dag-viz-arrow-stage";
+}
+
 /*
  * Show or hide the RDD DAG visualization.
  *
@@ -79,6 +87,9 @@ var StagePageVizConstants = {
  * This is the narrow interface called from the Scala UI code.
  */
 function toggleDagViz(forJob) {
+  var status = window.localStorage.getItem(expandDagVizArrowKey(forJob)) == "true";
+  status = !status;
+
   var arrowSelector = ".expand-dag-viz-arrow";
   $(arrowSelector).toggleClass('arrow-closed');
   $(arrowSelector).toggleClass('arrow-open');
@@ -93,8 +104,24 @@ function toggleDagViz(forJob) {
     // Save the graph for later so we don't have to render it again
     graphContainer().style("display", "none");
   }
+
+  window.localStorage.setItem(expandDagVizArrowKey(forJob), "" + status);
 }
 
+$(function (){
+  if (window.localStorage.getItem(expandDagVizArrowKey(false)) == "true") {
+    // Set it to false so that the click function can revert it
+    window.localStorage.setItem(expandDagVizArrowKey(false), "false");
+    toggleDagViz(false);
+  }
+
+  if (window.localStorage.getItem(expandDagVizArrowKey(true)) == "true") {
+    // Set it to false so that the click function can revert it
+    window.localStorage.setItem(expandDagVizArrowKey(true), "false");
+    toggleDagViz(true);
+  }
+});
+
 /*
  * Render the RDD DAG visualization.
  *
diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
index ca74ef9d7e94e..f4453c71df1ea 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js
@@ -66,14 +66,27 @@ function drawApplicationTimeline(groupArray, eventObjArray, startTime) {
   setupJobEventAction();
 
   $("span.expand-application-timeline").click(function() {
+    var status = window.localStorage.getItem("expand-application-timeline") == "true";
+    status = !status;
+
     $("#application-timeline").toggleClass('collapsed');
 
     // Switch the class of the arrow from open to closed.
     $(this).find('.expand-application-timeline-arrow').toggleClass('arrow-open');
     $(this).find('.expand-application-timeline-arrow').toggleClass('arrow-closed');
+
+    window.localStorage.setItem("expand-application-timeline", "" + status);
   });
 }
 
+$(function (){
+  if (window.localStorage.getItem("expand-application-timeline") == "true") {
+    // Set it to false so that the click function can revert it
+    window.localStorage.setItem("expand-application-timeline", "false");
+    $("span.expand-application-timeline").trigger('click');
+  }
+});
+
 function drawJobTimeline(groupArray, eventObjArray, startTime) {
   var groups = new vis.DataSet(groupArray);
   var items = new vis.DataSet(eventObjArray);
@@ -125,14 +138,27 @@ function drawJobTimeline(groupArray, eventObjArray, startTime) {
   setupStageEventAction();
 
   $("span.expand-job-timeline").click(function() {
+    var status = window.localStorage.getItem("expand-job-timeline") == "true";
+    status = !status;
+
     $("#job-timeline").toggleClass('collapsed');
 
     // Switch the class of the arrow from open to closed.
     $(this).find('.expand-job-timeline-arrow').toggleClass('arrow-open');
     $(this).find('.expand-job-timeline-arrow').toggleClass('arrow-closed');
+
+    window.localStorage.setItem("expand-job-timeline", "" + status);
   });
 }
 
+$(function (){
+  if (window.localStorage.getItem("expand-job-timeline") == "true") {
+    // Set it to false so that the click function can revert it
+    window.localStorage.setItem("expand-job-timeline", "false");
+    $("span.expand-job-timeline").trigger('click');
+  }
+});
+
 function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, maxFinishTime) {
   var groups = new vis.DataSet(groupArray);
   var items = new vis.DataSet(eventObjArray);
@@ -176,14 +202,27 @@ function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, ma
   setupZoomable("#task-assignment-timeline-zoom-lock", taskTimeline);
 
   $("span.expand-task-assignment-timeline").click(function() {
+    var status = window.localStorage.getItem("expand-task-assignment-timeline") == "true";
+    status = !status;
+
     $("#task-assignment-timeline").toggleClass("collapsed");
 
      // Switch the class of the arrow from open to closed.
     $(this).find(".expand-task-assignment-timeline-arrow").toggleClass("arrow-open");
     $(this).find(".expand-task-assignment-timeline-arrow").toggleClass("arrow-closed");
+
+    window.localStorage.setItem("expand-task-assignment-timeline", "" + status);
   });
 }
 
+$(function (){
+  if (window.localStorage.getItem("expand-task-assignment-timeline") == "true") {
+    // Set it to false so that the click function can revert it
+    window.localStorage.setItem("expand-task-assignment-timeline", "false");
+    $("span.expand-task-assignment-timeline").trigger('click');
+  }
+});
+
 function setupExecutorEventAction() {
   $(".item.box.executor").each(function () {
     $(this).hover(
diff --git a/core/src/main/scala/org/apache/spark/ui/PagedTable.scala b/core/src/main/scala/org/apache/spark/ui/PagedTable.scala
new file mode 100644
index 0000000000000..17d7b39c2d951
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/ui/PagedTable.scala
@@ -0,0 +1,246 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ui
+
+import scala.xml.{Node, Unparsed}
+
+/**
+ * A data source that provides data for a page.
+ *
+ * @param pageSize the number of rows in a page
+ */
+private[ui] abstract class PagedDataSource[T](val pageSize: Int) {
+
+  if (pageSize <= 0) {
+    throw new IllegalArgumentException("Page size must be positive")
+  }
+
+  /**
+   * Return the size of all data.
+   */
+  protected def dataSize: Int
+
+  /**
+   * Slice a range of data.
+   */
+  protected def sliceData(from: Int, to: Int): Seq[T]
+
+  /**
+   * Slice the data for this page
+   */
+  def pageData(page: Int): PageData[T] = {
+    val totalPages = (dataSize + pageSize - 1) / pageSize
+    if (page <= 0 || page > totalPages) {
+      throw new IndexOutOfBoundsException(
+        s"Page $page is out of range. Please select a page number between 1 and $totalPages.")
+    }
+    val from = (page - 1) * pageSize
+    val to = dataSize.min(page * pageSize)
+    PageData(totalPages, sliceData(from, to))
+  }
+
+}
+
+/**
+ * The data returned by `PagedDataSource.pageData`, including the page number, the number of total
+ * pages and the data in this page.
+ */
+private[ui] case class PageData[T](totalPage: Int, data: Seq[T])
+
+/**
+ * A paged table that will generate a HTML table for a specified page and also the page navigation.
+ */
+private[ui] trait PagedTable[T] {
+
+  def tableId: String
+
+  def tableCssClass: String
+
+  def dataSource: PagedDataSource[T]
+
+  def headers: Seq[Node]
+
+  def row(t: T): Seq[Node]
+
+  def table(page: Int): Seq[Node] = {
+    val _dataSource = dataSource
+    try {
+      val PageData(totalPages, data) = _dataSource.pageData(page)
+      <div>
+        {pageNavigation(page, _dataSource.pageSize, totalPages)}
+        <table class={tableCssClass} id={tableId}>
+          {headers}
+          <tbody>
+            {data.map(row)}
+          </tbody>
+        </table>
+      </div>
+    } catch {
+      case e: IndexOutOfBoundsException =>
+        val PageData(totalPages, _) = _dataSource.pageData(1)
+        <div>
+          {pageNavigation(1, _dataSource.pageSize, totalPages)}
+          <div class="alert alert-error">{e.getMessage}</div>
+        </div>
+    }
+  }
+
+  /**
+   * Return a page navigation.
+   * <ul>
+   *   <li>If the totalPages is 1, the page navigation will be empty</li>
+   *   <li>
+   *     If the totalPages is more than 1, it will create a page navigation including a group of
+   *     page numbers and a form to submit the page number.
+   *   </li>
+   * </ul>
+   *
+   * Here are some examples of the page navigation:
+   * {{{
+   * << < 11 12 13* 14 15 16 17 18 19 20 > >>
+   *
+   * This is the first group, so "<<" is hidden.
+   * < 1 2* 3 4 5 6 7 8 9 10 > >>
+   *
+   * This is the first group and the first page, so "<<" and "<" are hidden.
+   * 1* 2 3 4 5 6 7 8 9 10 > >>
+   *
+   * Assume totalPages is 19. This is the last group, so ">>" is hidden.
+   * << < 11 12 13* 14 15 16 17 18 19 >
+   *
+   * Assume totalPages is 19. This is the last group and the last page, so ">>" and ">" are hidden.
+   * << < 11 12 13 14 15 16 17 18 19*
+   *
+   * * means the current page number
+   * << means jumping to the first page of the previous group.
+   * < means jumping to the previous page.
+   * >> means jumping to the first page of the next group.
+   * > means jumping to the next page.
+   * }}}
+   */
+  private[ui] def pageNavigation(page: Int, pageSize: Int, totalPages: Int): Seq[Node] = {
+    if (totalPages == 1) {
+      Nil
+    } else {
+      // A group includes all page numbers will be shown in the page navigation.
+      // The size of group is 10 means there are 10 page numbers will be shown.
+      // The first group is 1 to 10, the second is 2 to 20, and so on
+      val groupSize = 10
+      val firstGroup = 0
+      val lastGroup = (totalPages - 1) / groupSize
+      val currentGroup = (page - 1) / groupSize
+      val startPage = currentGroup * groupSize + 1
+      val endPage = totalPages.min(startPage + groupSize - 1)
+      val pageTags = (startPage to endPage).map { p =>
+        if (p == page) {
+          // The current page should be disabled so that it cannot be clicked.
+          <li class="disabled"><a href="#">{p}</a></li>
+        } else {
+          <li><a href={pageLink(p)}>{p}</a></li>
+        }
+      }
+      val (goButtonJsFuncName, goButtonJsFunc) = goButtonJavascriptFunction
+      // When clicking the "Go" button, it will call this javascript method and then call
+      // "goButtonJsFuncName"
+      val formJs =
+        s"""$$(function(){
+          |  $$( "#form-task-page" ).submit(function(event) {
+          |    var page = $$("#form-task-page-no").val()
+          |    var pageSize = $$("#form-task-page-size").val()
+          |    pageSize = pageSize ? pageSize: 100;
+          |    if (page != "") {
+          |      ${goButtonJsFuncName}(page, pageSize);
+          |    }
+          |    event.preventDefault();
+          |  });
+          |});
+        """.stripMargin
+
+      <div>
+        <div>
+          <form id="form-task-page" class="form-inline pull-right" style="margin-bottom: 0px;">
+            <label>{totalPages} Pages. Jump to</label>
+            <input type="text" id="form-task-page-no" value={page.toString} class="span1" />
+            <label>. Show </label>
+            <input type="text" id="form-task-page-size" value={pageSize.toString} class="span1" />
+            <label>tasks in a page.</label>
+            <button type="submit" class="btn">Go</button>
+          </form>
+        </div>
+        <div class="pagination" style="margin-bottom: 0px;">
+          <span style="float: left; padding-top: 4px; padding-right: 4px;">Page: </span>
+          <ul>
+            {if (currentGroup > firstGroup) {
+            <li>
+              <a href={pageLink(startPage - groupSize)} aria-label="Previous Group">
+                <span aria-hidden="true">
+                  &lt;&lt;
+                </span>
+              </a>
+            </li>
+            }}
+            {if (page > 1) {
+            <li>
+            <a href={pageLink(page - 1)} aria-label="Previous">
+              <span aria-hidden="true">
+                &lt;
+              </span>
+            </a>
+            </li>
+            }}
+            {pageTags}
+            {if (page < totalPages) {
+            <li>
+              <a href={pageLink(page + 1)} aria-label="Next">
+                <span aria-hidden="true">&gt;</span>
+              </a>
+            </li>
+            }}
+            {if (currentGroup < lastGroup) {
+            <li>
+              <a href={pageLink(startPage + groupSize)} aria-label="Next Group">
+                <span aria-hidden="true">
+                  &gt;&gt;
+                </span>
+              </a>
+            </li>
+          }}
+          </ul>
+        </div>
+        <script>
+          {Unparsed(goButtonJsFunc)}
+
+          {Unparsed(formJs)}
+        </script>
+      </div>
+    }
+  }
+
+  /**
+   * Return a link to jump to a page.
+   */
+  def pageLink(page: Int): String
+
+  /**
+   * Only the implementation knows how to create the url with a page number and the page size, so we
+   * leave this one to the implementation. The implementation should create a JavaScript method that
+   * accepts a page number along with the page size and jumps to the page. The return value is this
+   * method name and its JavaScript codes.
+   */
+  def goButtonJavascriptFunction: (String, String)
+}
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index 6e077bf3e70d5..cf04b5e59239b 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.ui.jobs
 
+import java.net.URLEncoder
 import java.util.Date
 import javax.servlet.http.HttpServletRequest
 
@@ -27,13 +28,14 @@ import org.apache.commons.lang3.StringEscapeUtils
 
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo}
-import org.apache.spark.ui.{ToolTips, WebUIPage, UIUtils}
+import org.apache.spark.ui._
 import org.apache.spark.ui.jobs.UIData._
-import org.apache.spark.ui.scope.RDDOperationGraph
 import org.apache.spark.util.{Utils, Distribution}
 
 /** Page showing statistics and task list for a given stage */
 private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
+  import StagePage._
+
   private val progressListener = parent.progressListener
   private val operationGraphListener = parent.operationGraphListener
 
@@ -74,6 +76,16 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
       val parameterAttempt = request.getParameter("attempt")
       require(parameterAttempt != null && parameterAttempt.nonEmpty, "Missing attempt parameter")
 
+      val parameterTaskPage = request.getParameter("task.page")
+      val parameterTaskSortColumn = request.getParameter("task.sort")
+      val parameterTaskSortDesc = request.getParameter("task.desc")
+      val parameterTaskPageSize = request.getParameter("task.pageSize")
+
+      val taskPage = Option(parameterTaskPage).map(_.toInt).getOrElse(1)
+      val taskSortColumn = Option(parameterTaskSortColumn).getOrElse("Index")
+      val taskSortDesc = Option(parameterTaskSortDesc).map(_.toBoolean).getOrElse(false)
+      val taskPageSize = Option(parameterTaskPageSize).map(_.toInt).getOrElse(100)
+
       // If this is set, expand the dag visualization by default
       val expandDagVizParam = request.getParameter("expandDagViz")
       val expandDagViz = expandDagVizParam != null && expandDagVizParam.toBoolean
@@ -231,52 +243,47 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         accumulableRow,
         accumulables.values.toSeq)
 
-      val taskHeadersAndCssClasses: Seq[(String, String)] =
-        Seq(
-          ("Index", ""), ("ID", ""), ("Attempt", ""), ("Status", ""), ("Locality Level", ""),
-          ("Executor ID / Host", ""), ("Launch Time", ""), ("Duration", ""),
-          ("Scheduler Delay", TaskDetailsClassNames.SCHEDULER_DELAY),
-          ("Task Deserialization Time", TaskDetailsClassNames.TASK_DESERIALIZATION_TIME),
-          ("GC Time", ""),
-          ("Result Serialization Time", TaskDetailsClassNames.RESULT_SERIALIZATION_TIME),
-          ("Getting Result Time", TaskDetailsClassNames.GETTING_RESULT_TIME)) ++
-        {if (hasAccumulators) Seq(("Accumulators", "")) else Nil} ++
-        {if (stageData.hasInput) Seq(("Input Size / Records", "")) else Nil} ++
-        {if (stageData.hasOutput) Seq(("Output Size / Records", "")) else Nil} ++
-        {if (stageData.hasShuffleRead) {
-          Seq(("Shuffle Read Blocked Time", TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME),
-            ("Shuffle Read Size / Records", ""),
-            ("Shuffle Remote Reads", TaskDetailsClassNames.SHUFFLE_READ_REMOTE_SIZE))
-        } else {
-          Nil
-        }} ++
-        {if (stageData.hasShuffleWrite) {
-          Seq(("Write Time", ""), ("Shuffle Write Size / Records", ""))
-        } else {
-          Nil
-        }} ++
-        {if (stageData.hasBytesSpilled) {
-          Seq(("Shuffle Spill (Memory)", ""), ("Shuffle Spill (Disk)", ""))
-        } else {
-          Nil
-        }} ++
-        Seq(("Errors", ""))
-
-      val unzipped = taskHeadersAndCssClasses.unzip
-
       val currentTime = System.currentTimeMillis()
-      val taskTable = UIUtils.listingTable(
-        unzipped._1,
-        taskRow(
+      val (taskTable, taskTableHTML) = try {
+        val _taskTable = new TaskPagedTable(
+          UIUtils.prependBaseUri(parent.basePath) +
+            s"/stages/stage?id=${stageId}&attempt=${stageAttemptId}",
+          tasks,
           hasAccumulators,
           stageData.hasInput,
           stageData.hasOutput,
           stageData.hasShuffleRead,
           stageData.hasShuffleWrite,
           stageData.hasBytesSpilled,
-          currentTime),
-        tasks,
-        headerClasses = unzipped._2)
+          currentTime,
+          pageSize = taskPageSize,
+          sortColumn = taskSortColumn,
+          desc = taskSortDesc
+        )
+        (_taskTable, _taskTable.table(taskPage))
+      } catch {
+        case e @ (_ : IllegalArgumentException | _ : IndexOutOfBoundsException) =>
+          (null, <div class="alert alert-error">{e.getMessage}</div>)
+      }
+
+      val jsForScrollingDownToTaskTable =
+        <script>
+          {Unparsed {
+            """
+              |$(function() {
+              |  if (/.*&task.sort=.*$/.test(location.search)) {
+              |    var topOffset = $("#tasks-section").offset().top;
+              |    $("html,body").animate({scrollTop: topOffset}, 200);
+              |  }
+              |});
+            """.stripMargin
+           }
+          }
+        </script>
+
+      val taskIdsInPage = if (taskTable == null) Set.empty[Long]
+        else taskTable.dataSource.slicedTaskIds
+
       // Excludes tasks which failed and have incomplete metrics
       val validTasks = tasks.filter(t => t.taskInfo.status == "SUCCESS" && t.taskMetrics.isDefined)
 
@@ -499,12 +506,15 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
         dagViz ++
         maybeExpandDagViz ++
         showAdditionalMetrics ++
-        makeTimeline(stageData.taskData.values.toSeq, currentTime) ++
+        makeTimeline(
+          // Only show the tasks in the table
+          stageData.taskData.values.toSeq.filter(t => taskIdsInPage.contains(t.taskInfo.taskId)),
+          currentTime) ++
         <h4>Summary Metrics for {numCompleted} Completed Tasks</h4> ++
         <div>{summaryTable.getOrElse("No tasks have reported metrics yet.")}</div> ++
         <h4>Aggregated Metrics by Executor</h4> ++ executorTable.toNodeSeq ++
         maybeAccumulableTable ++
-        <h4>Tasks</h4> ++ taskTable
+        <h4 id="tasks-section">Tasks</h4> ++ taskTableHTML ++ jsForScrollingDownToTaskTable
       UIUtils.headerSparkPage(stageHeader, content, parent, showVisualization = true)
     }
   }
@@ -679,164 +689,619 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
     </script>
   }
 
-  def taskRow(
-      hasAccumulators: Boolean,
-      hasInput: Boolean,
-      hasOutput: Boolean,
-      hasShuffleRead: Boolean,
-      hasShuffleWrite: Boolean,
-      hasBytesSpilled: Boolean,
-      currentTime: Long)(taskData: TaskUIData): Seq[Node] = {
-    taskData match { case TaskUIData(info, metrics, errorMessage) =>
-      val duration = if (info.status == "RUNNING") info.timeRunning(currentTime)
-        else metrics.map(_.executorRunTime).getOrElse(1L)
-      val formatDuration = if (info.status == "RUNNING") UIUtils.formatDuration(duration)
-        else metrics.map(m => UIUtils.formatDuration(m.executorRunTime)).getOrElse("")
-      val schedulerDelay = metrics.map(getSchedulerDelay(info, _, currentTime)).getOrElse(0L)
-      val gcTime = metrics.map(_.jvmGCTime).getOrElse(0L)
-      val taskDeserializationTime = metrics.map(_.executorDeserializeTime).getOrElse(0L)
-      val serializationTime = metrics.map(_.resultSerializationTime).getOrElse(0L)
-      val gettingResultTime = getGettingResultTime(info, currentTime)
-
-      val maybeAccumulators = info.accumulables
-      val accumulatorsReadable = maybeAccumulators.map { acc =>
-        StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update.get}")
+}
+
+private[ui] object StagePage {
+  private[ui] def getGettingResultTime(info: TaskInfo, currentTime: Long): Long = {
+    if (info.gettingResult) {
+      if (info.finished) {
+        info.finishTime - info.gettingResultTime
+      } else {
+        // The task is still fetching the result.
+        currentTime - info.gettingResultTime
       }
+    } else {
+      0L
+    }
+  }
 
-      val maybeInput = metrics.flatMap(_.inputMetrics)
-      val inputSortable = maybeInput.map(_.bytesRead.toString).getOrElse("")
-      val inputReadable = maybeInput
-        .map(m => s"${Utils.bytesToString(m.bytesRead)} (${m.readMethod.toString.toLowerCase()})")
-        .getOrElse("")
-      val inputRecords = maybeInput.map(_.recordsRead.toString).getOrElse("")
-
-      val maybeOutput = metrics.flatMap(_.outputMetrics)
-      val outputSortable = maybeOutput.map(_.bytesWritten.toString).getOrElse("")
-      val outputReadable = maybeOutput
-        .map(m => s"${Utils.bytesToString(m.bytesWritten)}")
-        .getOrElse("")
-      val outputRecords = maybeOutput.map(_.recordsWritten.toString).getOrElse("")
-
-      val maybeShuffleRead = metrics.flatMap(_.shuffleReadMetrics)
-      val shuffleReadBlockedTimeSortable = maybeShuffleRead
-        .map(_.fetchWaitTime.toString).getOrElse("")
-      val shuffleReadBlockedTimeReadable =
-        maybeShuffleRead.map(ms => UIUtils.formatDuration(ms.fetchWaitTime)).getOrElse("")
-
-      val totalShuffleBytes = maybeShuffleRead.map(_.totalBytesRead)
-      val shuffleReadSortable = totalShuffleBytes.map(_.toString).getOrElse("")
-      val shuffleReadReadable = totalShuffleBytes.map(Utils.bytesToString).getOrElse("")
-      val shuffleReadRecords = maybeShuffleRead.map(_.recordsRead.toString).getOrElse("")
-
-      val remoteShuffleBytes = maybeShuffleRead.map(_.remoteBytesRead)
-      val shuffleReadRemoteSortable = remoteShuffleBytes.map(_.toString).getOrElse("")
-      val shuffleReadRemoteReadable = remoteShuffleBytes.map(Utils.bytesToString).getOrElse("")
-
-      val maybeShuffleWrite = metrics.flatMap(_.shuffleWriteMetrics)
-      val shuffleWriteSortable = maybeShuffleWrite.map(_.shuffleBytesWritten.toString).getOrElse("")
-      val shuffleWriteReadable = maybeShuffleWrite
-        .map(m => s"${Utils.bytesToString(m.shuffleBytesWritten)}").getOrElse("")
-      val shuffleWriteRecords = maybeShuffleWrite
-        .map(_.shuffleRecordsWritten.toString).getOrElse("")
-
-      val maybeWriteTime = metrics.flatMap(_.shuffleWriteMetrics).map(_.shuffleWriteTime)
-      val writeTimeSortable = maybeWriteTime.map(_.toString).getOrElse("")
-      val writeTimeReadable = maybeWriteTime.map(t => t / (1000 * 1000)).map { ms =>
-        if (ms == 0) "" else UIUtils.formatDuration(ms)
-      }.getOrElse("")
-
-      val maybeMemoryBytesSpilled = metrics.map(_.memoryBytesSpilled)
-      val memoryBytesSpilledSortable = maybeMemoryBytesSpilled.map(_.toString).getOrElse("")
-      val memoryBytesSpilledReadable =
-        maybeMemoryBytesSpilled.map(Utils.bytesToString).getOrElse("")
-
-      val maybeDiskBytesSpilled = metrics.map(_.diskBytesSpilled)
-      val diskBytesSpilledSortable = maybeDiskBytesSpilled.map(_.toString).getOrElse("")
-      val diskBytesSpilledReadable = maybeDiskBytesSpilled.map(Utils.bytesToString).getOrElse("")
-
-      <tr>
-        <td>{info.index}</td>
-        <td>{info.taskId}</td>
-        <td sorttable_customkey={info.attempt.toString}>{
-          if (info.speculative) s"${info.attempt} (speculative)" else info.attempt.toString
-        }</td>
-        <td>{info.status}</td>
-        <td>{info.taskLocality}</td>
-        <td>{info.executorId} / {info.host}</td>
-        <td>{UIUtils.formatDate(new Date(info.launchTime))}</td>
-        <td sorttable_customkey={duration.toString}>
-          {formatDuration}
-        </td>
-        <td sorttable_customkey={schedulerDelay.toString}
-            class={TaskDetailsClassNames.SCHEDULER_DELAY}>
-          {UIUtils.formatDuration(schedulerDelay.toLong)}
-        </td>
-        <td sorttable_customkey={taskDeserializationTime.toString}
-            class={TaskDetailsClassNames.TASK_DESERIALIZATION_TIME}>
-          {UIUtils.formatDuration(taskDeserializationTime.toLong)}
-        </td>
-        <td sorttable_customkey={gcTime.toString}>
-          {if (gcTime > 0) UIUtils.formatDuration(gcTime) else ""}
-        </td>
-        <td sorttable_customkey={serializationTime.toString}
-            class={TaskDetailsClassNames.RESULT_SERIALIZATION_TIME}>
-          {UIUtils.formatDuration(serializationTime)}
-        </td>
-        <td sorttable_customkey={gettingResultTime.toString}
-            class={TaskDetailsClassNames.GETTING_RESULT_TIME}>
-          {UIUtils.formatDuration(gettingResultTime)}
-        </td>
-        {if (hasAccumulators) {
-          <td>
-            {Unparsed(accumulatorsReadable.mkString("<br/>"))}
-          </td>
-        }}
-        {if (hasInput) {
-          <td sorttable_customkey={inputSortable}>
-            {s"$inputReadable / $inputRecords"}
-          </td>
-        }}
-        {if (hasOutput) {
-          <td sorttable_customkey={outputSortable}>
-            {s"$outputReadable / $outputRecords"}
-          </td>
-        }}
+  private[ui] def getSchedulerDelay(
+      info: TaskInfo, metrics: TaskMetrics, currentTime: Long): Long = {
+    if (info.finished) {
+      val totalExecutionTime = info.finishTime - info.launchTime
+      val executorOverhead = (metrics.executorDeserializeTime +
+        metrics.resultSerializationTime)
+      math.max(
+        0,
+        totalExecutionTime - metrics.executorRunTime - executorOverhead -
+          getGettingResultTime(info, currentTime))
+    } else {
+      // The task is still running and the metrics like executorRunTime are not available.
+      0L
+    }
+  }
+}
+
+private[ui] case class TaskTableRowInputData(inputSortable: Long, inputReadable: String)
+
+private[ui] case class TaskTableRowOutputData(outputSortable: Long, outputReadable: String)
+
+private[ui] case class TaskTableRowShuffleReadData(
+    shuffleReadBlockedTimeSortable: Long,
+    shuffleReadBlockedTimeReadable: String,
+    shuffleReadSortable: Long,
+    shuffleReadReadable: String,
+    shuffleReadRemoteSortable: Long,
+    shuffleReadRemoteReadable: String)
+
+private[ui] case class TaskTableRowShuffleWriteData(
+    writeTimeSortable: Long,
+    writeTimeReadable: String,
+    shuffleWriteSortable: Long,
+    shuffleWriteReadable: String)
+
+private[ui] case class TaskTableRowBytesSpilledData(
+    memoryBytesSpilledSortable: Long,
+    memoryBytesSpilledReadable: String,
+    diskBytesSpilledSortable: Long,
+    diskBytesSpilledReadable: String)
+
+/**
+ * Contains all data that needs for sorting and generating HTML. Using this one rather than
+ * TaskUIData to avoid creating duplicate contents during sorting the data.
+ */
+private[ui] case class TaskTableRowData(
+    index: Int,
+    taskId: Long,
+    attempt: Int,
+    speculative: Boolean,
+    status: String,
+    taskLocality: String,
+    executorIdAndHost: String,
+    launchTime: Long,
+    duration: Long,
+    formatDuration: String,
+    schedulerDelay: Long,
+    taskDeserializationTime: Long,
+    gcTime: Long,
+    serializationTime: Long,
+    gettingResultTime: Long,
+    accumulators: Option[String], // HTML
+    input: Option[TaskTableRowInputData],
+    output: Option[TaskTableRowOutputData],
+    shuffleRead: Option[TaskTableRowShuffleReadData],
+    shuffleWrite: Option[TaskTableRowShuffleWriteData],
+    bytesSpilled: Option[TaskTableRowBytesSpilledData],
+    error: String)
+
+private[ui] class TaskDataSource(
+    tasks: Seq[TaskUIData],
+    hasAccumulators: Boolean,
+    hasInput: Boolean,
+    hasOutput: Boolean,
+    hasShuffleRead: Boolean,
+    hasShuffleWrite: Boolean,
+    hasBytesSpilled: Boolean,
+    currentTime: Long,
+    pageSize: Int,
+    sortColumn: String,
+    desc: Boolean) extends PagedDataSource[TaskTableRowData](pageSize) {
+  import StagePage._
+
+  // Convert TaskUIData to TaskTableRowData which contains the final contents to show in the table
+  // so that we can avoid creating duplicate contents during sorting the data
+  private val data = tasks.map(taskRow).sorted(ordering(sortColumn, desc))
+
+  private var _slicedTaskIds: Set[Long] = null
+
+  override def dataSize: Int = data.size
+
+  override def sliceData(from: Int, to: Int): Seq[TaskTableRowData] = {
+    val r = data.slice(from, to)
+    _slicedTaskIds = r.map(_.taskId).toSet
+    r
+  }
+
+  def slicedTaskIds: Set[Long] = _slicedTaskIds
+
+  private def taskRow(taskData: TaskUIData): TaskTableRowData = {
+    val TaskUIData(info, metrics, errorMessage) = taskData
+    val duration = if (info.status == "RUNNING") info.timeRunning(currentTime)
+      else metrics.map(_.executorRunTime).getOrElse(1L)
+    val formatDuration = if (info.status == "RUNNING") UIUtils.formatDuration(duration)
+      else metrics.map(m => UIUtils.formatDuration(m.executorRunTime)).getOrElse("")
+    val schedulerDelay = metrics.map(getSchedulerDelay(info, _, currentTime)).getOrElse(0L)
+    val gcTime = metrics.map(_.jvmGCTime).getOrElse(0L)
+    val taskDeserializationTime = metrics.map(_.executorDeserializeTime).getOrElse(0L)
+    val serializationTime = metrics.map(_.resultSerializationTime).getOrElse(0L)
+    val gettingResultTime = getGettingResultTime(info, currentTime)
+
+    val maybeAccumulators = info.accumulables
+    val accumulatorsReadable = maybeAccumulators.map { acc =>
+      StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update.get}")
+    }
+
+    val maybeInput = metrics.flatMap(_.inputMetrics)
+    val inputSortable = maybeInput.map(_.bytesRead).getOrElse(0L)
+    val inputReadable = maybeInput
+      .map(m => s"${Utils.bytesToString(m.bytesRead)} (${m.readMethod.toString.toLowerCase()})")
+      .getOrElse("")
+    val inputRecords = maybeInput.map(_.recordsRead.toString).getOrElse("")
+
+    val maybeOutput = metrics.flatMap(_.outputMetrics)
+    val outputSortable = maybeOutput.map(_.bytesWritten).getOrElse(0L)
+    val outputReadable = maybeOutput
+      .map(m => s"${Utils.bytesToString(m.bytesWritten)}")
+      .getOrElse("")
+    val outputRecords = maybeOutput.map(_.recordsWritten.toString).getOrElse("")
+
+    val maybeShuffleRead = metrics.flatMap(_.shuffleReadMetrics)
+    val shuffleReadBlockedTimeSortable = maybeShuffleRead.map(_.fetchWaitTime).getOrElse(0L)
+    val shuffleReadBlockedTimeReadable =
+      maybeShuffleRead.map(ms => UIUtils.formatDuration(ms.fetchWaitTime)).getOrElse("")
+
+    val totalShuffleBytes = maybeShuffleRead.map(_.totalBytesRead)
+    val shuffleReadSortable = totalShuffleBytes.getOrElse(0L)
+    val shuffleReadReadable = totalShuffleBytes.map(Utils.bytesToString).getOrElse("")
+    val shuffleReadRecords = maybeShuffleRead.map(_.recordsRead.toString).getOrElse("")
+
+    val remoteShuffleBytes = maybeShuffleRead.map(_.remoteBytesRead)
+    val shuffleReadRemoteSortable = remoteShuffleBytes.getOrElse(0L)
+    val shuffleReadRemoteReadable = remoteShuffleBytes.map(Utils.bytesToString).getOrElse("")
+
+    val maybeShuffleWrite = metrics.flatMap(_.shuffleWriteMetrics)
+    val shuffleWriteSortable = maybeShuffleWrite.map(_.shuffleBytesWritten).getOrElse(0L)
+    val shuffleWriteReadable = maybeShuffleWrite
+      .map(m => s"${Utils.bytesToString(m.shuffleBytesWritten)}").getOrElse("")
+    val shuffleWriteRecords = maybeShuffleWrite
+      .map(_.shuffleRecordsWritten.toString).getOrElse("")
+
+    val maybeWriteTime = metrics.flatMap(_.shuffleWriteMetrics).map(_.shuffleWriteTime)
+    val writeTimeSortable = maybeWriteTime.getOrElse(0L)
+    val writeTimeReadable = maybeWriteTime.map(t => t / (1000 * 1000)).map { ms =>
+      if (ms == 0) "" else UIUtils.formatDuration(ms)
+    }.getOrElse("")
+
+    val maybeMemoryBytesSpilled = metrics.map(_.memoryBytesSpilled)
+    val memoryBytesSpilledSortable = maybeMemoryBytesSpilled.getOrElse(0L)
+    val memoryBytesSpilledReadable =
+      maybeMemoryBytesSpilled.map(Utils.bytesToString).getOrElse("")
+
+    val maybeDiskBytesSpilled = metrics.map(_.diskBytesSpilled)
+    val diskBytesSpilledSortable = maybeDiskBytesSpilled.getOrElse(0L)
+    val diskBytesSpilledReadable = maybeDiskBytesSpilled.map(Utils.bytesToString).getOrElse("")
+
+    val input =
+      if (hasInput) {
+        Some(TaskTableRowInputData(inputSortable, s"$inputReadable / $inputRecords"))
+      } else {
+        None
+      }
+
+    val output =
+      if (hasOutput) {
+        Some(TaskTableRowOutputData(outputSortable, s"$outputReadable / $outputRecords"))
+      } else {
+        None
+      }
+
+    val shuffleRead =
+      if (hasShuffleRead) {
+        Some(TaskTableRowShuffleReadData(
+          shuffleReadBlockedTimeSortable,
+          shuffleReadBlockedTimeReadable,
+          shuffleReadSortable,
+          s"$shuffleReadReadable / $shuffleReadRecords",
+          shuffleReadRemoteSortable,
+          shuffleReadRemoteReadable
+        ))
+      } else {
+        None
+      }
+
+    val shuffleWrite =
+      if (hasShuffleWrite) {
+        Some(TaskTableRowShuffleWriteData(
+          writeTimeSortable,
+          writeTimeReadable,
+          shuffleWriteSortable,
+          s"$shuffleWriteReadable / $shuffleWriteRecords"
+        ))
+      } else {
+        None
+      }
+
+    val bytesSpilled =
+      if (hasBytesSpilled) {
+        Some(TaskTableRowBytesSpilledData(
+          memoryBytesSpilledSortable,
+          memoryBytesSpilledReadable,
+          diskBytesSpilledSortable,
+          diskBytesSpilledReadable
+        ))
+      } else {
+        None
+      }
+
+    TaskTableRowData(
+      info.index,
+      info.taskId,
+      info.attempt,
+      info.speculative,
+      info.status,
+      info.taskLocality.toString,
+      s"${info.executorId} / ${info.host}",
+      info.launchTime,
+      duration,
+      formatDuration,
+      schedulerDelay,
+      taskDeserializationTime,
+      gcTime,
+      serializationTime,
+      gettingResultTime,
+      if (hasAccumulators) Some(accumulatorsReadable.mkString("<br/>")) else None,
+      input,
+      output,
+      shuffleRead,
+      shuffleWrite,
+      bytesSpilled,
+      errorMessage.getOrElse("")
+    )
+  }
+
+  /**
+   * Return Ordering according to sortColumn and desc
+   */
+  private def ordering(sortColumn: String, desc: Boolean): Ordering[TaskTableRowData] = {
+    val ordering = sortColumn match {
+      case "Index" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Int.compare(x.index, y.index)
+      }
+      case "ID" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Long.compare(x.taskId, y.taskId)
+      }
+      case "Attempt" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Int.compare(x.attempt, y.attempt)
+      }
+      case "Status" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.String.compare(x.status, y.status)
+      }
+      case "Locality Level" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.String.compare(x.taskLocality, y.taskLocality)
+      }
+      case "Executor ID / Host" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.String.compare(x.executorIdAndHost, y.executorIdAndHost)
+      }
+      case "Launch Time" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Long.compare(x.launchTime, y.launchTime)
+      }
+      case "Duration" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Long.compare(x.duration, y.duration)
+      }
+      case "Scheduler Delay" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Long.compare(x.schedulerDelay, y.schedulerDelay)
+      }
+      case "Task Deserialization Time" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Long.compare(x.taskDeserializationTime, y.taskDeserializationTime)
+      }
+      case "GC Time" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Long.compare(x.gcTime, y.gcTime)
+      }
+      case "Result Serialization Time" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Long.compare(x.serializationTime, y.serializationTime)
+      }
+      case "Getting Result Time" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.Long.compare(x.gettingResultTime, y.gettingResultTime)
+      }
+      case "Accumulators" =>
+        if (hasAccumulators) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.String.compare(x.accumulators.get, y.accumulators.get)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Accumulators because of no accumulators")
+        }
+      case "Input Size / Records" =>
+        if (hasInput) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.Long.compare(x.input.get.inputSortable, y.input.get.inputSortable)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Input Size / Records because of no inputs")
+        }
+      case "Output Size / Records" =>
+        if (hasOutput) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.Long.compare(x.output.get.outputSortable, y.output.get.outputSortable)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Output Size / Records because of no outputs")
+        }
+      // ShuffleRead
+      case "Shuffle Read Blocked Time" =>
+        if (hasShuffleRead) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.Long.compare(x.shuffleRead.get.shuffleReadBlockedTimeSortable,
+                y.shuffleRead.get.shuffleReadBlockedTimeSortable)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Shuffle Read Blocked Time because of no shuffle reads")
+        }
+      case "Shuffle Read Size / Records" =>
+        if (hasShuffleRead) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.Long.compare(x.shuffleRead.get.shuffleReadSortable,
+                y.shuffleRead.get.shuffleReadSortable)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Shuffle Read Size / Records because of no shuffle reads")
+        }
+      case "Shuffle Remote Reads" =>
+        if (hasShuffleRead) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.Long.compare(x.shuffleRead.get.shuffleReadRemoteSortable,
+                y.shuffleRead.get.shuffleReadRemoteSortable)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Shuffle Remote Reads because of no shuffle reads")
+        }
+      // ShuffleWrite
+      case "Write Time" =>
+        if (hasShuffleWrite) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.Long.compare(x.shuffleWrite.get.writeTimeSortable,
+                y.shuffleWrite.get.writeTimeSortable)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Write Time because of no shuffle writes")
+        }
+      case "Shuffle Write Size / Records" =>
+        if (hasShuffleWrite) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.Long.compare(x.shuffleWrite.get.shuffleWriteSortable,
+                y.shuffleWrite.get.shuffleWriteSortable)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Shuffle Write Size / Records because of no shuffle writes")
+        }
+      // BytesSpilled
+      case "Shuffle Spill (Memory)" =>
+        if (hasBytesSpilled) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.Long.compare(x.bytesSpilled.get.memoryBytesSpilledSortable,
+                y.bytesSpilled.get.memoryBytesSpilledSortable)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Shuffle Spill (Memory) because of no spills")
+        }
+      case "Shuffle Spill (Disk)" =>
+        if (hasBytesSpilled) {
+          new Ordering[TaskTableRowData] {
+            override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+              Ordering.Long.compare(x.bytesSpilled.get.diskBytesSpilledSortable,
+                y.bytesSpilled.get.diskBytesSpilledSortable)
+          }
+        } else {
+          throw new IllegalArgumentException(
+            "Cannot sort by Shuffle Spill (Disk) because of no spills")
+        }
+      case "Errors" => new Ordering[TaskTableRowData] {
+        override def compare(x: TaskTableRowData, y: TaskTableRowData): Int =
+          Ordering.String.compare(x.error, y.error)
+      }
+      case unknownColumn => throw new IllegalArgumentException(s"Unknown column: $unknownColumn")
+    }
+    if (desc) {
+      ordering.reverse
+    } else {
+      ordering
+    }
+  }
+
+}
+
+private[ui] class TaskPagedTable(
+    basePath: String,
+    data: Seq[TaskUIData],
+    hasAccumulators: Boolean,
+    hasInput: Boolean,
+    hasOutput: Boolean,
+    hasShuffleRead: Boolean,
+    hasShuffleWrite: Boolean,
+    hasBytesSpilled: Boolean,
+    currentTime: Long,
+    pageSize: Int,
+    sortColumn: String,
+    desc: Boolean) extends PagedTable[TaskTableRowData]{
+
+  override def tableId: String = ""
+
+  override def tableCssClass: String = "table table-bordered table-condensed table-striped"
+
+  override val dataSource: TaskDataSource = new TaskDataSource(
+    data,
+    hasAccumulators,
+    hasInput,
+    hasOutput,
+    hasShuffleRead,
+    hasShuffleWrite,
+    hasBytesSpilled,
+    currentTime,
+    pageSize,
+    sortColumn,
+    desc
+  )
+
+  override def pageLink(page: Int): String = {
+    val encodedSortColumn = URLEncoder.encode(sortColumn, "UTF-8")
+    s"${basePath}&task.page=$page&task.sort=${encodedSortColumn}&task.desc=${desc}" +
+      s"&task.pageSize=${pageSize}"
+  }
+
+  override def goButtonJavascriptFunction: (String, String) = {
+    val jsFuncName = "goToTaskPage"
+    val encodedSortColumn = URLEncoder.encode(sortColumn, "UTF-8")
+    val jsFunc = s"""
+      |currentTaskPageSize = ${pageSize}
+      |function goToTaskPage(page, pageSize) {
+      |  // Set page to 1 if the page size changes
+      |  page = pageSize == currentTaskPageSize ? page : 1;
+      |  var url = "${basePath}&task.sort=${encodedSortColumn}&task.desc=${desc}" +
+      |    "&task.page=" + page + "&task.pageSize=" + pageSize;
+      |  window.location.href = url;
+      |}
+     """.stripMargin
+    (jsFuncName, jsFunc)
+  }
+
+  def headers: Seq[Node] = {
+    val taskHeadersAndCssClasses: Seq[(String, String)] =
+      Seq(
+        ("Index", ""), ("ID", ""), ("Attempt", ""), ("Status", ""), ("Locality Level", ""),
+        ("Executor ID / Host", ""), ("Launch Time", ""), ("Duration", ""),
+        ("Scheduler Delay", TaskDetailsClassNames.SCHEDULER_DELAY),
+        ("Task Deserialization Time", TaskDetailsClassNames.TASK_DESERIALIZATION_TIME),
+        ("GC Time", ""),
+        ("Result Serialization Time", TaskDetailsClassNames.RESULT_SERIALIZATION_TIME),
+        ("Getting Result Time", TaskDetailsClassNames.GETTING_RESULT_TIME)) ++
+        {if (hasAccumulators) Seq(("Accumulators", "")) else Nil} ++
+        {if (hasInput) Seq(("Input Size / Records", "")) else Nil} ++
+        {if (hasOutput) Seq(("Output Size / Records", "")) else Nil} ++
         {if (hasShuffleRead) {
-           <td sorttable_customkey={shuffleReadBlockedTimeSortable}
-             class={TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME}>
-             {shuffleReadBlockedTimeReadable}
-           </td>
-           <td sorttable_customkey={shuffleReadSortable}>
-             {s"$shuffleReadReadable / $shuffleReadRecords"}
-           </td>
-           <td sorttable_customkey={shuffleReadRemoteSortable}
-               class={TaskDetailsClassNames.SHUFFLE_READ_REMOTE_SIZE}>
-             {shuffleReadRemoteReadable}
-           </td>
-        }}
+          Seq(("Shuffle Read Blocked Time", TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME),
+            ("Shuffle Read Size / Records", ""),
+            ("Shuffle Remote Reads", TaskDetailsClassNames.SHUFFLE_READ_REMOTE_SIZE))
+        } else {
+          Nil
+        }} ++
         {if (hasShuffleWrite) {
-           <td sorttable_customkey={writeTimeSortable}>
-             {writeTimeReadable}
-           </td>
-           <td sorttable_customkey={shuffleWriteSortable}>
-             {s"$shuffleWriteReadable / $shuffleWriteRecords"}
-           </td>
-        }}
+          Seq(("Write Time", ""), ("Shuffle Write Size / Records", ""))
+        } else {
+          Nil
+        }} ++
         {if (hasBytesSpilled) {
-          <td sorttable_customkey={memoryBytesSpilledSortable}>
-            {memoryBytesSpilledReadable}
-          </td>
-          <td sorttable_customkey={diskBytesSpilledSortable}>
-            {diskBytesSpilledReadable}
-          </td>
-        }}
-        {errorMessageCell(errorMessage)}
-      </tr>
+          Seq(("Shuffle Spill (Memory)", ""), ("Shuffle Spill (Disk)", ""))
+        } else {
+          Nil
+        }} ++
+        Seq(("Errors", ""))
+
+    if (!taskHeadersAndCssClasses.map(_._1).contains(sortColumn)) {
+      new IllegalArgumentException(s"Unknown column: $sortColumn")
     }
+
+    val headerRow: Seq[Node] = {
+      taskHeadersAndCssClasses.map { case (header, cssClass) =>
+        if (header == sortColumn) {
+          val headerLink =
+            s"$basePath&task.sort=${URLEncoder.encode(header, "UTF-8")}&task.desc=${!desc}" +
+              s"&task.pageSize=${pageSize}"
+          val js = Unparsed(s"window.location.href='${headerLink}'")
+          val arrow = if (desc) "&#x25BE;" else "&#x25B4;" // UP or DOWN
+          <th class={cssClass} onclick={js} style="cursor: pointer;">
+            {header}
+            <span>&nbsp;{Unparsed(arrow)}</span>
+          </th>
+        } else {
+          val headerLink =
+            s"$basePath&task.sort=${URLEncoder.encode(header, "UTF-8")}&task.pageSize=${pageSize}"
+          val js = Unparsed(s"window.location.href='${headerLink}'")
+          <th class={cssClass} onclick={js} style="cursor: pointer;">
+            {header}
+          </th>
+        }
+      }
+    }
+    <thead>{headerRow}</thead>
+  }
+
+  def row(task: TaskTableRowData): Seq[Node] = {
+    <tr>
+      <td>{task.index}</td>
+      <td>{task.taskId}</td>
+      <td>{if (task.speculative) s"${task.attempt} (speculative)" else task.attempt.toString}</td>
+      <td>{task.status}</td>
+      <td>{task.taskLocality}</td>
+      <td>{task.executorIdAndHost}</td>
+      <td>{UIUtils.formatDate(new Date(task.launchTime))}</td>
+      <td>{task.formatDuration}</td>
+      <td class={TaskDetailsClassNames.SCHEDULER_DELAY}>
+        {UIUtils.formatDuration(task.schedulerDelay)}
+      </td>
+      <td class={TaskDetailsClassNames.TASK_DESERIALIZATION_TIME}>
+        {UIUtils.formatDuration(task.taskDeserializationTime)}
+      </td>
+      <td>
+        {if (task.gcTime > 0) UIUtils.formatDuration(task.gcTime) else ""}
+      </td>
+      <td class={TaskDetailsClassNames.RESULT_SERIALIZATION_TIME}>
+        {UIUtils.formatDuration(task.serializationTime)}
+      </td>
+      <td class={TaskDetailsClassNames.GETTING_RESULT_TIME}>
+        {UIUtils.formatDuration(task.gettingResultTime)}
+      </td>
+      {if (task.accumulators.nonEmpty) {
+        <td>{Unparsed(task.accumulators.get)}</td>
+      }}
+      {if (task.input.nonEmpty) {
+        <td>{task.input.get.inputReadable}</td>
+      }}
+      {if (task.output.nonEmpty) {
+        <td>{task.output.get.outputReadable}</td>
+      }}
+      {if (task.shuffleRead.nonEmpty) {
+        <td class={TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME}>
+          {task.shuffleRead.get.shuffleReadBlockedTimeReadable}
+        </td>
+        <td>{task.shuffleRead.get.shuffleReadReadable}</td>
+        <td class={TaskDetailsClassNames.SHUFFLE_READ_REMOTE_SIZE}>
+          {task.shuffleRead.get.shuffleReadRemoteReadable}
+        </td>
+      }}
+      {if (task.shuffleWrite.nonEmpty) {
+        <td>{task.shuffleWrite.get.writeTimeReadable}</td>
+        <td>{task.shuffleWrite.get.shuffleWriteReadable}</td>
+      }}
+      {if (task.bytesSpilled.nonEmpty) {
+        <td>{task.bytesSpilled.get.memoryBytesSpilledReadable}</td>
+        <td>{task.bytesSpilled.get.diskBytesSpilledReadable}</td>
+      }}
+      {errorMessageCell(task.error)}
+    </tr>
   }
 
-  private def errorMessageCell(errorMessage: Option[String]): Seq[Node] = {
-    val error = errorMessage.getOrElse("")
+  private def errorMessageCell(error: String): Seq[Node] = {
     val isMultiline = error.indexOf('\n') >= 0
     // Display the first line by default
     val errorSummary = StringEscapeUtils.escapeHtml4(
@@ -860,32 +1325,4 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
     }
     <td>{errorSummary}{details}</td>
   }
-
-  private def getGettingResultTime(info: TaskInfo, currentTime: Long): Long = {
-    if (info.gettingResult) {
-      if (info.finished) {
-        info.finishTime - info.gettingResultTime
-      } else {
-        // The task is still fetching the result.
-        currentTime - info.gettingResultTime
-      }
-    } else {
-      0L
-    }
-  }
-
-  private def getSchedulerDelay(info: TaskInfo, metrics: TaskMetrics, currentTime: Long): Long = {
-    if (info.finished) {
-      val totalExecutionTime = info.finishTime - info.launchTime
-      val executorOverhead = (metrics.executorDeserializeTime +
-        metrics.resultSerializationTime)
-      math.max(
-        0,
-        totalExecutionTime - metrics.executorRunTime - executorOverhead -
-          getGettingResultTime(info, currentTime))
-    } else {
-      // The task is still running and the metrics like executorRunTime are not available.
-      0L
-    }
-  }
 }
diff --git a/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala b/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala
new file mode 100644
index 0000000000000..cc76c141c53cc
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ui
+
+import scala.xml.Node
+
+import org.apache.spark.SparkFunSuite
+
+class PagedDataSourceSuite extends SparkFunSuite {
+
+  test("basic") {
+    val dataSource1 = new SeqPagedDataSource[Int](1 to 5, pageSize = 2)
+    assert(dataSource1.pageData(1) === PageData(3, (1 to 2)))
+
+    val dataSource2 = new SeqPagedDataSource[Int](1 to 5, pageSize = 2)
+    assert(dataSource2.pageData(2) === PageData(3, (3 to 4)))
+
+    val dataSource3 = new SeqPagedDataSource[Int](1 to 5, pageSize = 2)
+    assert(dataSource3.pageData(3) === PageData(3, Seq(5)))
+
+    val dataSource4 = new SeqPagedDataSource[Int](1 to 5, pageSize = 2)
+    val e1 = intercept[IndexOutOfBoundsException] {
+      dataSource4.pageData(4)
+    }
+    assert(e1.getMessage === "Page 4 is out of range. Please select a page number between 1 and 3.")
+
+    val dataSource5 = new SeqPagedDataSource[Int](1 to 5, pageSize = 2)
+    val e2 = intercept[IndexOutOfBoundsException] {
+      dataSource5.pageData(0)
+    }
+    assert(e2.getMessage === "Page 0 is out of range. Please select a page number between 1 and 3.")
+
+  }
+}
+
+class PagedTableSuite extends SparkFunSuite {
+  test("pageNavigation") {
+    // Create a fake PagedTable to test pageNavigation
+    val pagedTable = new PagedTable[Int] {
+      override def tableId: String = ""
+
+      override def tableCssClass: String = ""
+
+      override def dataSource: PagedDataSource[Int] = null
+
+      override def pageLink(page: Int): String = page.toString
+
+      override def headers: Seq[Node] = Nil
+
+      override def row(t: Int): Seq[Node] = Nil
+
+      override def goButtonJavascriptFunction: (String, String) = ("", "")
+    }
+
+    assert(pagedTable.pageNavigation(1, 10, 1) === Nil)
+    assert(
+      (pagedTable.pageNavigation(1, 10, 2).head \\ "li").map(_.text.trim) === Seq("1", "2", ">"))
+    assert(
+      (pagedTable.pageNavigation(2, 10, 2).head \\ "li").map(_.text.trim) === Seq("<", "1", "2"))
+
+    assert((pagedTable.pageNavigation(1, 10, 100).head \\ "li").map(_.text.trim) ===
+      (1 to 10).map(_.toString) ++ Seq(">", ">>"))
+    assert((pagedTable.pageNavigation(2, 10, 100).head \\ "li").map(_.text.trim) ===
+      Seq("<") ++ (1 to 10).map(_.toString) ++ Seq(">", ">>"))
+
+    assert((pagedTable.pageNavigation(100, 10, 100).head \\ "li").map(_.text.trim) ===
+      Seq("<<", "<") ++ (91 to 100).map(_.toString))
+    assert((pagedTable.pageNavigation(99, 10, 100).head \\ "li").map(_.text.trim) ===
+      Seq("<<", "<") ++ (91 to 100).map(_.toString) ++ Seq(">"))
+
+    assert((pagedTable.pageNavigation(11, 10, 100).head \\ "li").map(_.text.trim) ===
+      Seq("<<", "<") ++ (11 to 20).map(_.toString) ++ Seq(">", ">>"))
+    assert((pagedTable.pageNavigation(93, 10, 97).head \\ "li").map(_.text.trim) ===
+      Seq("<<", "<") ++ (91 to 97).map(_.toString) ++ Seq(">"))
+  }
+}
+
+private[spark] class SeqPagedDataSource[T](seq: Seq[T], pageSize: Int)
+  extends PagedDataSource[T](pageSize) {
+
+  override protected def dataSize: Int = seq.size
+
+  override protected def sliceData(from: Int, to: Int): Seq[T] = seq.slice(from, to)
+}

From d45355ee224b734727255ff278a47801f5da7e93 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Tue, 21 Jul 2015 09:55:42 -0700
Subject: [PATCH 304/549] [SPARK-5423] [CORE] Register a TaskCompletionListener
 to make sure release all resources

Make `DiskMapIterator.cleanup` idempotent and register a TaskCompletionListener to make sure call `cleanup`.

Author: zsxwing <zsxwing@gmail.com>

Closes #7529 from zsxwing/SPARK-5423 and squashes the following commits:

3e3c413 [zsxwing] Remove TODO
9556c78 [zsxwing] Fix NullPointerException for tests
3d574d9 [zsxwing] Register a TaskCompletionListener to make sure release all resources
---
 .../collection/ExternalAppendOnlyMap.scala    | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
index 1e4531ef395ae..d166037351c31 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
@@ -26,7 +26,7 @@ import scala.collection.mutable.ArrayBuffer
 
 import com.google.common.io.ByteStreams
 
-import org.apache.spark.{Logging, SparkEnv}
+import org.apache.spark.{Logging, SparkEnv, TaskContext}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.serializer.{DeserializationStream, Serializer}
 import org.apache.spark.storage.{BlockId, BlockManager}
@@ -470,14 +470,27 @@ class ExternalAppendOnlyMap[K, V, C](
       item
     }
 
-    // TODO: Ensure this gets called even if the iterator isn't drained.
     private def cleanup() {
       batchIndex = batchOffsets.length  // Prevent reading any other batch
       val ds = deserializeStream
-      deserializeStream = null
-      fileStream = null
-      ds.close()
-      file.delete()
+      if (ds != null) {
+        ds.close()
+        deserializeStream = null
+      }
+      if (fileStream != null) {
+        fileStream.close()
+        fileStream = null
+      }
+      if (file.exists()) {
+        file.delete()
+      }
+    }
+
+    val context = TaskContext.get()
+    // context is null in some tests of ExternalAppendOnlyMapSuite because these tests don't run in
+    // a TaskContext.
+    if (context != null) {
+      context.addTaskCompletionListener(context => cleanup())
     }
   }
 

From 7f072c3d5ec50c65d76bd9f28fac124fce96a89e Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Tue, 21 Jul 2015 09:58:16 -0700
Subject: [PATCH 305/549] [SPARK-9154] [SQL] codegen StringFormat

Jira: https://issues.apache.org/jira/browse/SPARK-9154

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7546 from tarekauel/SPARK-9154 and squashes the following commits:

a943d3e [Tarek Auel] [SPARK-9154] implicit input cast, added tests for null, support for null primitives
10b4de8 [Tarek Auel] [SPARK-9154][SQL] codegen removed fallback trait
cd8322b [Tarek Auel] [SPARK-9154][SQL] codegen string format
086caba [Tarek Auel] [SPARK-9154][SQL] codegen string format
---
 .../expressions/stringOperations.scala        | 42 ++++++++++++++++++-
 .../expressions/StringExpressionsSuite.scala  | 18 ++++----
 .../spark/sql/StringFunctionsSuite.scala      | 10 +++++
 3 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index fe57d17f1ec14..280ae0e546358 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -526,7 +526,7 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression)
 /**
  * Returns the input formatted according do printf-style format strings
  */
-case class StringFormat(children: Expression*) extends Expression with CodegenFallback {
+case class StringFormat(children: Expression*) extends Expression with ImplicitCastInputTypes {
 
   require(children.nonEmpty, "printf() should take at least 1 argument")
 
@@ -536,6 +536,10 @@ case class StringFormat(children: Expression*) extends Expression with CodegenFa
   private def format: Expression = children(0)
   private def args: Seq[Expression] = children.tail
 
+  override def inputTypes: Seq[AbstractDataType] =
+    children.zipWithIndex.map(x => if (x._2 == 0) StringType else AnyDataType)
+
+
   override def eval(input: InternalRow): Any = {
     val pattern = format.eval(input)
     if (pattern == null) {
@@ -551,6 +555,42 @@ case class StringFormat(children: Expression*) extends Expression with CodegenFa
     }
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val pattern = children.head.gen(ctx)
+
+    val argListGen = children.tail.map(x => (x.dataType, x.gen(ctx)))
+    val argListCode = argListGen.map(_._2.code + "\n")
+
+    val argListString = argListGen.foldLeft("")((s, v) => {
+      val nullSafeString =
+        if (ctx.boxedType(v._1) != ctx.javaType(v._1)) {
+          // Java primitives get boxed in order to allow null values.
+          s"(${v._2.isNull}) ? (${ctx.boxedType(v._1)}) null : " +
+            s"new ${ctx.boxedType(v._1)}(${v._2.primitive})"
+        } else {
+          s"(${v._2.isNull}) ? null : ${v._2.primitive}"
+        }
+      s + "," + nullSafeString
+    })
+
+    val form = ctx.freshName("formatter")
+    val formatter = classOf[java.util.Formatter].getName
+    val sb = ctx.freshName("sb")
+    val stringBuffer = classOf[StringBuffer].getName
+    s"""
+      ${pattern.code}
+      boolean ${ev.isNull} = ${pattern.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${argListCode.mkString}
+        $stringBuffer $sb = new $stringBuffer();
+        $formatter $form = new $formatter($sb, ${classOf[Locale].getName}.US);
+        $form.format(${pattern.primitive}.toString() $argListString);
+        ${ev.primitive} = UTF8String.fromString($sb.toString());
+      }
+     """
+  }
+
   override def prettyName: String = "printf"
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 96c540ab36f08..3c2d88731beb4 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -351,18 +351,16 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("FORMAT") {
-    val f = 'f.string.at(0)
-    val d1 = 'd.int.at(1)
-    val s1 = 's.int.at(2)
-
-    val row1 = create_row("aa%d%s", 12, "cc")
-    val row2 = create_row(null, 12, "cc")
-    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1)
+    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
     checkEvaluation(StringFormat(Literal("aa")), "aa", create_row(null))
-    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1)
+    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
+    checkEvaluation(StringFormat(Literal("aa%d%s"), 12, "cc"), "aa12cc")
 
-    checkEvaluation(StringFormat(f, d1, s1), "aa12cc", row1)
-    checkEvaluation(StringFormat(f, d1, s1), null, row2)
+    checkEvaluation(StringFormat(Literal.create(null, StringType), 12, "cc"), null)
+    checkEvaluation(
+      StringFormat(Literal("aa%d%s"), Literal.create(null, IntegerType), "cc"), "aanullcc")
+    checkEvaluation(
+      StringFormat(Literal("aa%d%s"), 12, Literal.create(null, StringType)), "aa12null")
   }
 
   test("INSTR") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index d1f855903ca4b..3702e73b4e74f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -132,6 +132,16 @@ class StringFunctionsSuite extends QueryTest {
     checkAnswer(
       df.selectExpr("printf(a, b, c)"),
       Row("aa123cc"))
+
+    val df2 = Seq(("aa%d%s".getBytes, 123, "cc")).toDF("a", "b", "c")
+
+    checkAnswer(
+      df2.select(formatString($"a", $"b", $"c"), formatString("aa%d%s", "b", "c")),
+      Row("aa123cc", "aa123cc"))
+
+    checkAnswer(
+      df2.selectExpr("printf(a, b, c)"),
+      Row("aa123cc"))
   }
 
   test("string instr function") {

From 89db3c0b6edcffed7e1e12c202e6827271ddba26 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 21 Jul 2015 10:31:31 -0700
Subject: [PATCH 306/549] [SPARK-5989] [MLLIB] Model save/load for LDA

Add support for saving and loading LDA both the local and distributed versions.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #6948 from MechCoder/lda_save_load and squashes the following commits:

49bcdce [MechCoder] minor style fixes
cc14054 [MechCoder] minor
4587d1d [MechCoder] Minor changes
c753122 [MechCoder] Load and save the model in private methods
2782326 [MechCoder] [SPARK-5989] Model save/load for LDA
---
 docs/mllib-clustering.md                      |  10 +-
 .../spark/mllib/clustering/LDAModel.scala     | 228 +++++++++++++++++-
 .../spark/mllib/clustering/LDASuite.scala     |  41 ++++
 3 files changed, 274 insertions(+), 5 deletions(-)

diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index 0fc7036bffeaf..bb875ae2ae6cb 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -472,7 +472,7 @@ to the algorithm. We then output the topics, represented as probability distribu
 <div data-lang="scala" markdown="1">
 
 {% highlight scala %}
-import org.apache.spark.mllib.clustering.LDA
+import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel}
 import org.apache.spark.mllib.linalg.Vectors
 
 // Load and parse the data
@@ -492,6 +492,11 @@ for (topic <- Range(0, 3)) {
   for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)); }
   println()
 }
+
+// Save and load model.
+ldaModel.save(sc, "myLDAModel")
+val sameModel = DistributedLDAModel.load(sc, "myLDAModel")
+
 {% endhighlight %}
 </div>
 
@@ -551,6 +556,9 @@ public class JavaLDAExample {
       }
       System.out.println();
     }
+
+    ldaModel.save(sc.sc(), "myLDAModel");
+    DistributedLDAModel sameModel = DistributedLDAModel.load(sc.sc(), "myLDAModel");
   }
 }
 {% endhighlight %}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 974b26924dfb8..920b57756b625 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -17,15 +17,25 @@
 
 package org.apache.spark.mllib.clustering
 
-import breeze.linalg.{DenseMatrix => BDM, normalize, sum => brzSum}
+import breeze.linalg.{DenseMatrix => BDM, normalize, sum => brzSum, DenseVector => BDV}
 
+import org.apache.hadoop.fs.Path
+
+import org.json4s.DefaultFormats
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods._
+
+import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaPairRDD
-import org.apache.spark.graphx.{VertexId, EdgeContext, Graph}
-import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix}
+import org.apache.spark.graphx.{VertexId, Edge, EdgeContext, Graph}
+import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix, DenseVector}
+import org.apache.spark.mllib.util.{Saveable, Loader}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{SQLContext, Row}
 import org.apache.spark.util.BoundedPriorityQueue
 
+
 /**
  * :: Experimental ::
  *
@@ -35,7 +45,7 @@ import org.apache.spark.util.BoundedPriorityQueue
  * including local and distributed data structures.
  */
 @Experimental
-abstract class LDAModel private[clustering] {
+abstract class LDAModel private[clustering] extends Saveable {
 
   /** Number of topics */
   def k: Int
@@ -176,6 +186,11 @@ class LocalLDAModel private[clustering] (
     }.toArray
   }
 
+  override protected def formatVersion = "1.0"
+
+  override def save(sc: SparkContext, path: String): Unit = {
+    LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix)
+  }
   // TODO
   // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
 
@@ -184,6 +199,80 @@ class LocalLDAModel private[clustering] (
 
 }
 
+@Experimental
+object LocalLDAModel extends Loader[LocalLDAModel] {
+
+  private object SaveLoadV1_0 {
+
+    val thisFormatVersion = "1.0"
+
+    val thisClassName = "org.apache.spark.mllib.clustering.LocalLDAModel"
+
+    // Store the distribution of terms of each topic and the column index in topicsMatrix
+    // as a Row in data.
+    case class Data(topic: Vector, index: Int)
+
+    def save(sc: SparkContext, path: String, topicsMatrix: Matrix): Unit = {
+      val sqlContext = SQLContext.getOrCreate(sc)
+      import sqlContext.implicits._
+
+      val k = topicsMatrix.numCols
+      val metadata = compact(render
+        (("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
+         ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows)))
+      sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
+
+      val topicsDenseMatrix = topicsMatrix.toBreeze.toDenseMatrix
+      val topics = Range(0, k).map { topicInd =>
+        Data(Vectors.dense((topicsDenseMatrix(::, topicInd).toArray)), topicInd)
+      }.toSeq
+      sc.parallelize(topics, 1).toDF().write.parquet(Loader.dataPath(path))
+    }
+
+    def load(sc: SparkContext, path: String): LocalLDAModel = {
+      val dataPath = Loader.dataPath(path)
+      val sqlContext = SQLContext.getOrCreate(sc)
+      val dataFrame = sqlContext.read.parquet(dataPath)
+
+      Loader.checkSchema[Data](dataFrame.schema)
+      val topics = dataFrame.collect()
+      val vocabSize = topics(0).getAs[Vector](0).size
+      val k = topics.size
+
+      val brzTopics = BDM.zeros[Double](vocabSize, k)
+      topics.foreach { case Row(vec: Vector, ind: Int) =>
+        brzTopics(::, ind) := vec.toBreeze
+      }
+      new LocalLDAModel(Matrices.fromBreeze(brzTopics))
+    }
+  }
+
+  override def load(sc: SparkContext, path: String): LocalLDAModel = {
+    val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path)
+    implicit val formats = DefaultFormats
+    val expectedK = (metadata \ "k").extract[Int]
+    val expectedVocabSize = (metadata \ "vocabSize").extract[Int]
+    val classNameV1_0 = SaveLoadV1_0.thisClassName
+
+    val model = (loadedClassName, loadedVersion) match {
+      case (className, "1.0") if className == classNameV1_0 =>
+        SaveLoadV1_0.load(sc, path)
+      case _ => throw new Exception(
+        s"LocalLDAModel.load did not recognize model with (className, format version):" +
+        s"($loadedClassName, $loadedVersion).  Supported:\n" +
+        s"  ($classNameV1_0, 1.0)")
+    }
+
+    val topicsMatrix = model.topicsMatrix
+    require(expectedK == topicsMatrix.numCols,
+      s"LocalLDAModel requires $expectedK topics, got ${topicsMatrix.numCols} topics")
+    require(expectedVocabSize == topicsMatrix.numRows,
+      s"LocalLDAModel requires $expectedVocabSize terms for each topic, " +
+      s"but got ${topicsMatrix.numRows}")
+    model
+  }
+}
+
 /**
  * :: Experimental ::
  *
@@ -354,4 +443,135 @@ class DistributedLDAModel private (
   // TODO:
   // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ???
 
+  override protected def formatVersion = "1.0"
+
+  override def save(sc: SparkContext, path: String): Unit = {
+    DistributedLDAModel.SaveLoadV1_0.save(
+      sc, path, graph, globalTopicTotals, k, vocabSize, docConcentration, topicConcentration,
+      iterationTimes)
+  }
+}
+
+
+@Experimental
+object DistributedLDAModel extends Loader[DistributedLDAModel] {
+
+  private object SaveLoadV1_0 {
+
+    val thisFormatVersion = "1.0"
+
+    val classNameV1_0 = "org.apache.spark.mllib.clustering.DistributedLDAModel"
+
+    // Store globalTopicTotals as a Vector.
+    case class Data(globalTopicTotals: Vector)
+
+    // Store each term and document vertex with an id and the topicWeights.
+    case class VertexData(id: Long, topicWeights: Vector)
+
+    // Store each edge with the source id, destination id and tokenCounts.
+    case class EdgeData(srcId: Long, dstId: Long, tokenCounts: Double)
+
+    def save(
+        sc: SparkContext,
+        path: String,
+        graph: Graph[LDA.TopicCounts, LDA.TokenCount],
+        globalTopicTotals: LDA.TopicCounts,
+        k: Int,
+        vocabSize: Int,
+        docConcentration: Double,
+        topicConcentration: Double,
+        iterationTimes: Array[Double]): Unit = {
+      val sqlContext = SQLContext.getOrCreate(sc)
+      import sqlContext.implicits._
+
+      val metadata = compact(render
+        (("class" -> classNameV1_0) ~ ("version" -> thisFormatVersion) ~
+         ("k" -> k) ~ ("vocabSize" -> vocabSize) ~ ("docConcentration" -> docConcentration) ~
+         ("topicConcentration" -> topicConcentration) ~
+         ("iterationTimes" -> iterationTimes.toSeq)))
+      sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
+
+      val newPath = new Path(Loader.dataPath(path), "globalTopicTotals").toUri.toString
+      sc.parallelize(Seq(Data(Vectors.fromBreeze(globalTopicTotals)))).toDF()
+        .write.parquet(newPath)
+
+      val verticesPath = new Path(Loader.dataPath(path), "topicCounts").toUri.toString
+      graph.vertices.map { case (ind, vertex) =>
+        VertexData(ind, Vectors.fromBreeze(vertex))
+      }.toDF().write.parquet(verticesPath)
+
+      val edgesPath = new Path(Loader.dataPath(path), "tokenCounts").toUri.toString
+      graph.edges.map { case Edge(srcId, dstId, prop) =>
+        EdgeData(srcId, dstId, prop)
+      }.toDF().write.parquet(edgesPath)
+    }
+
+    def load(
+        sc: SparkContext,
+        path: String,
+        vocabSize: Int,
+        docConcentration: Double,
+        topicConcentration: Double,
+        iterationTimes: Array[Double]): DistributedLDAModel = {
+      val dataPath = new Path(Loader.dataPath(path), "globalTopicTotals").toUri.toString
+      val vertexDataPath = new Path(Loader.dataPath(path), "topicCounts").toUri.toString
+      val edgeDataPath = new Path(Loader.dataPath(path), "tokenCounts").toUri.toString
+      val sqlContext = SQLContext.getOrCreate(sc)
+      val dataFrame = sqlContext.read.parquet(dataPath)
+      val vertexDataFrame = sqlContext.read.parquet(vertexDataPath)
+      val edgeDataFrame = sqlContext.read.parquet(edgeDataPath)
+
+      Loader.checkSchema[Data](dataFrame.schema)
+      Loader.checkSchema[VertexData](vertexDataFrame.schema)
+      Loader.checkSchema[EdgeData](edgeDataFrame.schema)
+      val globalTopicTotals: LDA.TopicCounts =
+        dataFrame.first().getAs[Vector](0).toBreeze.toDenseVector
+      val vertices: RDD[(VertexId, LDA.TopicCounts)] = vertexDataFrame.map {
+        case Row(ind: Long, vec: Vector) => (ind, vec.toBreeze.toDenseVector)
+      }
+
+      val edges: RDD[Edge[LDA.TokenCount]] = edgeDataFrame.map {
+        case Row(srcId: Long, dstId: Long, prop: Double) => Edge(srcId, dstId, prop)
+      }
+      val graph: Graph[LDA.TopicCounts, LDA.TokenCount] = Graph(vertices, edges)
+
+      new DistributedLDAModel(graph, globalTopicTotals, globalTopicTotals.length, vocabSize,
+        docConcentration, topicConcentration, iterationTimes)
+    }
+
+  }
+
+  override def load(sc: SparkContext, path: String): DistributedLDAModel = {
+    val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path)
+    implicit val formats = DefaultFormats
+    val expectedK = (metadata \ "k").extract[Int]
+    val vocabSize = (metadata \ "vocabSize").extract[Int]
+    val docConcentration = (metadata \ "docConcentration").extract[Double]
+    val topicConcentration = (metadata \ "topicConcentration").extract[Double]
+    val iterationTimes = (metadata \ "iterationTimes").extract[Seq[Double]]
+    val classNameV1_0 = SaveLoadV1_0.classNameV1_0
+
+    val model = (loadedClassName, loadedVersion) match {
+      case (className, "1.0") if className == classNameV1_0 => {
+        DistributedLDAModel.SaveLoadV1_0.load(
+          sc, path, vocabSize, docConcentration, topicConcentration, iterationTimes.toArray)
+      }
+      case _ => throw new Exception(
+        s"DistributedLDAModel.load did not recognize model with (className, format version):" +
+        s"($loadedClassName, $loadedVersion).  Supported: ($classNameV1_0, 1.0)")
+    }
+
+    require(model.vocabSize == vocabSize,
+      s"DistributedLDAModel requires $vocabSize vocabSize, got ${model.vocabSize} vocabSize")
+    require(model.docConcentration == docConcentration,
+      s"DistributedLDAModel requires $docConcentration docConcentration, " +
+      s"got ${model.docConcentration} docConcentration")
+    require(model.topicConcentration == topicConcentration,
+      s"DistributedLDAModel requires $topicConcentration docConcentration, " +
+      s"got ${model.topicConcentration} docConcentration")
+    require(expectedK == model.k,
+      s"DistributedLDAModel requires $expectedK topics, got ${model.k} topics")
+    model
+  }
+
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 03a8a2538b464..721a065658951 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -23,6 +23,7 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{Vector, DenseMatrix, Matrix, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
+import org.apache.spark.util.Utils
 
 class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
 
@@ -217,6 +218,46 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("model save/load") {
+    // Test for LocalLDAModel.
+    val localModel = new LocalLDAModel(tinyTopics)
+    val tempDir1 = Utils.createTempDir()
+    val path1 = tempDir1.toURI.toString
+
+    // Test for DistributedLDAModel.
+    val k = 3
+    val docConcentration = 1.2
+    val topicConcentration = 1.5
+    val lda = new LDA()
+    lda.setK(k)
+      .setDocConcentration(docConcentration)
+      .setTopicConcentration(topicConcentration)
+      .setMaxIterations(5)
+      .setSeed(12345)
+    val corpus = sc.parallelize(tinyCorpus, 2)
+    val distributedModel: DistributedLDAModel = lda.run(corpus).asInstanceOf[DistributedLDAModel]
+    val tempDir2 = Utils.createTempDir()
+    val path2 = tempDir2.toURI.toString
+
+    try {
+      localModel.save(sc, path1)
+      distributedModel.save(sc, path2)
+      val samelocalModel = LocalLDAModel.load(sc, path1)
+      assert(samelocalModel.topicsMatrix === localModel.topicsMatrix)
+      assert(samelocalModel.k === localModel.k)
+      assert(samelocalModel.vocabSize === localModel.vocabSize)
+
+      val sameDistributedModel = DistributedLDAModel.load(sc, path2)
+      assert(distributedModel.topicsMatrix === sameDistributedModel.topicsMatrix)
+      assert(distributedModel.k === sameDistributedModel.k)
+      assert(distributedModel.vocabSize === sameDistributedModel.vocabSize)
+      assert(distributedModel.iterationTimes === sameDistributedModel.iterationTimes)
+    } finally {
+      Utils.deleteRecursively(tempDir1)
+      Utils.deleteRecursively(tempDir2)
+    }
+  }
+
 }
 
 private[clustering] object LDASuite {

From 87d890cc105a7f41478433b28f53c9aa431db211 Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Tue, 21 Jul 2015 11:18:39 -0700
Subject: [PATCH 307/549] Revert "[SPARK-9154] [SQL] codegen StringFormat"

This reverts commit 7f072c3d5ec50c65d76bd9f28fac124fce96a89e.

Revert #7546

Author: Michael Armbrust <michael@databricks.com>

Closes #7570 from marmbrus/revert9154 and squashes the following commits:

ed2c32a [Michael Armbrust] Revert "[SPARK-9154] [SQL] codegen StringFormat"
---
 .../expressions/stringOperations.scala        | 42 +------------------
 .../expressions/StringExpressionsSuite.scala  | 18 ++++----
 .../spark/sql/StringFunctionsSuite.scala      | 10 -----
 3 files changed, 11 insertions(+), 59 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 280ae0e546358..fe57d17f1ec14 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -526,7 +526,7 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression)
 /**
  * Returns the input formatted according do printf-style format strings
  */
-case class StringFormat(children: Expression*) extends Expression with ImplicitCastInputTypes {
+case class StringFormat(children: Expression*) extends Expression with CodegenFallback {
 
   require(children.nonEmpty, "printf() should take at least 1 argument")
 
@@ -536,10 +536,6 @@ case class StringFormat(children: Expression*) extends Expression with ImplicitC
   private def format: Expression = children(0)
   private def args: Seq[Expression] = children.tail
 
-  override def inputTypes: Seq[AbstractDataType] =
-    children.zipWithIndex.map(x => if (x._2 == 0) StringType else AnyDataType)
-
-
   override def eval(input: InternalRow): Any = {
     val pattern = format.eval(input)
     if (pattern == null) {
@@ -555,42 +551,6 @@ case class StringFormat(children: Expression*) extends Expression with ImplicitC
     }
   }
 
-  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val pattern = children.head.gen(ctx)
-
-    val argListGen = children.tail.map(x => (x.dataType, x.gen(ctx)))
-    val argListCode = argListGen.map(_._2.code + "\n")
-
-    val argListString = argListGen.foldLeft("")((s, v) => {
-      val nullSafeString =
-        if (ctx.boxedType(v._1) != ctx.javaType(v._1)) {
-          // Java primitives get boxed in order to allow null values.
-          s"(${v._2.isNull}) ? (${ctx.boxedType(v._1)}) null : " +
-            s"new ${ctx.boxedType(v._1)}(${v._2.primitive})"
-        } else {
-          s"(${v._2.isNull}) ? null : ${v._2.primitive}"
-        }
-      s + "," + nullSafeString
-    })
-
-    val form = ctx.freshName("formatter")
-    val formatter = classOf[java.util.Formatter].getName
-    val sb = ctx.freshName("sb")
-    val stringBuffer = classOf[StringBuffer].getName
-    s"""
-      ${pattern.code}
-      boolean ${ev.isNull} = ${pattern.isNull};
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${ev.isNull}) {
-        ${argListCode.mkString}
-        $stringBuffer $sb = new $stringBuffer();
-        $formatter $form = new $formatter($sb, ${classOf[Locale].getName}.US);
-        $form.format(${pattern.primitive}.toString() $argListString);
-        ${ev.primitive} = UTF8String.fromString($sb.toString());
-      }
-     """
-  }
-
   override def prettyName: String = "printf"
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 3c2d88731beb4..96c540ab36f08 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -351,16 +351,18 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("FORMAT") {
-    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
+    val f = 'f.string.at(0)
+    val d1 = 'd.int.at(1)
+    val s1 = 's.int.at(2)
+
+    val row1 = create_row("aa%d%s", 12, "cc")
+    val row2 = create_row(null, 12, "cc")
+    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1)
     checkEvaluation(StringFormat(Literal("aa")), "aa", create_row(null))
-    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
-    checkEvaluation(StringFormat(Literal("aa%d%s"), 12, "cc"), "aa12cc")
+    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1)
 
-    checkEvaluation(StringFormat(Literal.create(null, StringType), 12, "cc"), null)
-    checkEvaluation(
-      StringFormat(Literal("aa%d%s"), Literal.create(null, IntegerType), "cc"), "aanullcc")
-    checkEvaluation(
-      StringFormat(Literal("aa%d%s"), 12, Literal.create(null, StringType)), "aa12null")
+    checkEvaluation(StringFormat(f, d1, s1), "aa12cc", row1)
+    checkEvaluation(StringFormat(f, d1, s1), null, row2)
   }
 
   test("INSTR") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 3702e73b4e74f..d1f855903ca4b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -132,16 +132,6 @@ class StringFunctionsSuite extends QueryTest {
     checkAnswer(
       df.selectExpr("printf(a, b, c)"),
       Row("aa123cc"))
-
-    val df2 = Seq(("aa%d%s".getBytes, 123, "cc")).toDF("a", "b", "c")
-
-    checkAnswer(
-      df2.select(formatString($"a", $"b", $"c"), formatString("aa%d%s", "b", "c")),
-      Row("aa123cc", "aa123cc"))
-
-    checkAnswer(
-      df2.selectExpr("printf(a, b, c)"),
-      Row("aa123cc"))
   }
 
   test("string instr function") {

From 9ba7c64decfc92853bd281e9e7bfb95211080dd4 Mon Sep 17 00:00:00 2001
From: "navis.ryu" <navis@apache.org>
Date: Tue, 21 Jul 2015 11:52:52 -0700
Subject: [PATCH 308/549] [SPARK-8357] Fix unsafe memory leak on empty inputs
 in GeneratedAggregate

This patch fixes a managed memory leak in GeneratedAggregate.  The leak occurs when the unsafe aggregation path is used to perform grouped aggregation on an empty input; in this case, GeneratedAggregate allocates an UnsafeFixedWidthAggregationMap that is never cleaned up because `next()` is never called on the aggregate result iterator.

This patch fixes this by short-circuiting on empty inputs.

This patch is an updated version of #6810.

Closes #6810.

Author: navis.ryu <navis@apache.org>
Author: Josh Rosen <joshrosen@databricks.com>

Closes #7560 from JoshRosen/SPARK-8357 and squashes the following commits:

3486ce4 [Josh Rosen] Some minor cleanup
c649310 [Josh Rosen] Revert SparkPlan change:
3c7db0f [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-8357
adc8239 [Josh Rosen] Back out Projection changes.
c5419b3 [navis.ryu] addressed comments
143e1ef [navis.ryu] fixed format & added test for CCE case
735972f [navis.ryu] used new conf apis
1a02a55 [navis.ryu] Rolled-back test-conf cleanup & fixed possible CCE & added more tests
51178e8 [navis.ryu] addressed comments
4d326b9 [navis.ryu] fixed test fails
15c5afc [navis.ryu] added a test as suggested by JoshRosen
d396589 [navis.ryu] added comments
1b07556 [navis.ryu] [SPARK-8357] [SQL] Memory leakage on unsafe aggregation path with empty input
---
 .../sql/execution/GeneratedAggregate.scala    | 14 +++++-
 .../org/apache/spark/sql/SQLQuerySuite.scala  |  9 ++++
 .../spark/sql/execution/AggregateSuite.scala  | 48 +++++++++++++++++++
 3 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index c069da016f9f0..ecde9c57139a6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -266,7 +266,18 @@ case class GeneratedAggregate(
 
       val joinedRow = new JoinedRow3
 
-      if (groupingExpressions.isEmpty) {
+      if (!iter.hasNext) {
+        // This is an empty input, so return early so that we do not allocate data structures
+        // that won't be cleaned up (see SPARK-8357).
+        if (groupingExpressions.isEmpty) {
+          // This is a global aggregate, so return an empty aggregation buffer.
+          val resultProjection = resultProjectionBuilder()
+          Iterator(resultProjection(newAggregationBuffer(EmptyRow)))
+        } else {
+          // This is a grouped aggregate, so return an empty iterator.
+          Iterator[InternalRow]()
+        }
+      } else if (groupingExpressions.isEmpty) {
         // TODO: Codegening anything other than the updateProjection is probably over kill.
         val buffer = newAggregationBuffer(EmptyRow).asInstanceOf[MutableRow]
         var currentRow: InternalRow = null
@@ -280,6 +291,7 @@ case class GeneratedAggregate(
         val resultProjection = resultProjectionBuilder()
         Iterator(resultProjection(buffer))
       } else if (unsafeEnabled) {
+        assert(iter.hasNext, "There should be at least one row for this path")
         log.info("Using Unsafe-based aggregator")
         val aggregationMap = new UnsafeFixedWidthAggregationMap(
           newAggregationBuffer,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 61d5f2061ae18..beee10173fbc4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -648,6 +648,15 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       Row(2, 1, 2, 2, 1))
   }
 
+  test("count of empty table") {
+    withTempTable("t") {
+      Seq.empty[(Int, Int)].toDF("a", "b").registerTempTable("t")
+      checkAnswer(
+        sql("select count(a) from t"),
+        Row(0))
+    }
+  }
+
   test("inner join where, one match per row") {
     checkAnswer(
       sql("SELECT * FROM upperCaseData JOIN lowerCaseData WHERE n = N"),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala
new file mode 100644
index 0000000000000..20def6bef0c17
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.SQLConf
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.test.TestSQLContext
+
+class AggregateSuite extends SparkPlanTest {
+
+  test("SPARK-8357 unsafe aggregation path should not leak memory with empty input") {
+    val codegenDefault = TestSQLContext.getConf(SQLConf.CODEGEN_ENABLED)
+    val unsafeDefault = TestSQLContext.getConf(SQLConf.UNSAFE_ENABLED)
+    try {
+      TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, true)
+      TestSQLContext.setConf(SQLConf.UNSAFE_ENABLED, true)
+      val df = Seq.empty[(Int, Int)].toDF("a", "b")
+      checkAnswer(
+        df,
+        GeneratedAggregate(
+          partial = true,
+          Seq(df.col("b").expr),
+          Seq(Alias(Count(df.col("a").expr), "cnt")()),
+          unsafeEnabled = true,
+          _: SparkPlan),
+        Seq.empty
+      )
+    } finally {
+      TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, codegenDefault)
+      TestSQLContext.setConf(SQLConf.UNSAFE_ENABLED, unsafeDefault)
+    }
+  }
+}

From 60c0ce134d90ef18852ed2c637d2f240b7f99ab9 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 21 Jul 2015 11:56:38 -0700
Subject: [PATCH 309/549] [SPARK-8906][SQL] Move all internal data source
 classes into execution.datasources.

This way, the sources package contains only public facing interfaces.

Author: Reynold Xin <rxin@databricks.com>

Closes #7565 from rxin/move-ds and squashes the following commits:

7661aff [Reynold Xin] Mima
9d5196a [Reynold Xin] Rearranged imports.
3dd7174 [Reynold Xin] [SPARK-8906][SQL] Move all internal data source classes into execution.datasources.
---
 project/MimaExcludes.scala                    | 47 +++++++++++++++++++
 .../org/apache/spark/sql/DataFrame.scala      |  2 +-
 .../apache/spark/sql/DataFrameReader.scala    |  4 +-
 .../apache/spark/sql/DataFrameWriter.scala    |  2 +-
 .../org/apache/spark/sql/SQLContext.scala     |  9 ++--
 .../spark/sql/execution/SparkStrategies.scala |  4 +-
 .../SqlNewHadoopRDD.scala                     |  9 ++--
 .../datasources}/DataSourceStrategy.scala     | 11 ++---
 .../datasources}/LogicalRelation.scala        |  7 +--
 .../datasources}/PartitioningUtils.scala      |  5 +-
 .../datasources}/commands.scala               |  5 +-
 .../datasources}/ddl.scala                    |  9 ++--
 .../datasources}/rules.scala                  | 10 ++--
 .../apache/spark/sql/parquet/newParquet.scala |  5 +-
 .../apache/spark/sql/sources/filters.scala    |  4 ++
 .../apache/spark/sql/sources/interfaces.scala |  4 +-
 .../org/apache/spark/sql/json/JsonSuite.scala |  2 +-
 .../sql/parquet/ParquetFilterSuite.scala      |  2 +-
 .../ParquetPartitionDiscoverySuite.scala      |  4 +-
 .../sources/CreateTableAsSelectSuite.scala    |  1 +
 .../sql/sources/ResolvedDataSourceSuite.scala |  1 +
 .../apache/spark/sql/hive/HiveContext.scala   |  6 +--
 .../spark/sql/hive/HiveMetastoreCatalog.scala | 11 +++--
 .../org/apache/spark/sql/hive/HiveQl.scala    |  2 +-
 .../spark/sql/hive/HiveStrategies.scala       |  2 +-
 .../spark/sql/hive/execution/commands.scala   |  1 +
 .../spark/sql/hive/orc/OrcRelation.scala      |  1 +
 .../sql/hive/MetastoreDataSourcesSuite.scala  |  2 +-
 .../hive/execution/HiveComparisonTest.scala   |  4 +-
 .../sql/hive/execution/SQLQuerySuite.scala    |  2 +-
 .../apache/spark/sql/hive/parquetSuites.scala |  2 +-
 .../sql/sources/hadoopFsRelationSuites.scala  |  6 ++-
 32 files changed, 124 insertions(+), 62 deletions(-)
 rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution}/SqlNewHadoopRDD.scala (99%)
 rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/DataSourceStrategy.scala (98%)
 rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/LogicalRelation.scala (88%)
 rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/PartitioningUtils.scala (99%)
 rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/commands.scala (99%)
 rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/ddl.scala (99%)
 rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/rules.scala (94%)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index a2595ff6c22f4..fa36629c37a35 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -104,6 +104,53 @@ object MimaExcludes {
             // SPARK-7422 add argmax for sparse vectors
             ProblemFilters.exclude[MissingMethodProblem](
               "org.apache.spark.mllib.linalg.Vector.argmax")
+          ) ++ Seq(
+            // SPARK-8906 Move all internal data source classes into execution.datasources
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.ResolvedDataSource"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PreInsertCastAndRename$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTableUsingAsSelect$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.InsertIntoDataSource$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopPartition"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitioningUtils$PartitionValues$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DefaultWriterContainer"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitioningUtils$PartitionValues"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.RefreshTable$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTempTableUsing$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitionSpec"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DynamicPartitionWriterContainer"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTableUsingAsSelect"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopRDD$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DescribeCommand$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitioningUtils$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopRDD"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PreInsertCastAndRename"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.Partition$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.LogicalRelation$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitioningUtils"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.LogicalRelation"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.Partition"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.BaseWriterContainer"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PreWriteCheck"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTableUsing"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.RefreshTable"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopRDD$NewHadoopMapPartitionsWithSplitRDD"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DataSourceStrategy$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTempTableUsing"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTempTableUsingAsSelect$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTempTableUsingAsSelect"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTableUsing$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.ResolvedDataSource$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PreWriteCheck$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.InsertIntoDataSource"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.InsertIntoHadoopFsRelation"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DDLParser"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CaseInsensitiveMap"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.InsertIntoHadoopFsRelation$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DataSourceStrategy"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopRDD$NewHadoopMapPartitionsWithSplitRDD$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitionSpec$"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DescribeCommand"),
+            ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DDLException")
           )
 
         case v if v.startsWith("1.4") =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 830fba35bb7bc..323ff17357fda 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -38,8 +38,8 @@ import org.apache.spark.sql.catalyst.plans.logical.{Filter, _}
 import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser}
 import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD}
+import org.apache.spark.sql.execution.datasources.CreateTableUsingAsSelect
 import org.apache.spark.sql.json.JacksonGenerator
-import org.apache.spark.sql.sources.CreateTableUsingAsSelect
 import org.apache.spark.sql.types._
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index f1c1ddf898986..e9d782cdcd667 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -20,16 +20,16 @@ package org.apache.spark.sql
 import java.util.Properties
 
 import org.apache.hadoop.fs.Path
-import org.apache.spark.{Logging, Partition}
 
+import org.apache.spark.{Logging, Partition}
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.execution.datasources.{ResolvedDataSource, LogicalRelation}
 import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation}
 import org.apache.spark.sql.json.JSONRelation
 import org.apache.spark.sql.parquet.ParquetRelation2
-import org.apache.spark.sql.sources.{LogicalRelation, ResolvedDataSource}
 import org.apache.spark.sql.types.StructType
 
 /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 3e7b9cd7976c3..ee0201a9d4cb2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -22,8 +22,8 @@ import java.util.Properties
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
+import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, ResolvedDataSource}
 import org.apache.spark.sql.jdbc.{JDBCWriteDetails, JdbcUtils}
-import org.apache.spark.sql.sources.{ResolvedDataSource, CreateTableUsingAsSelect}
 
 
 /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 2dda3ad1211fa..8b4528b5d52fe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -39,8 +39,9 @@ import org.apache.spark.sql.catalyst.optimizer.{DefaultOptimizer, Optimizer}
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
 import org.apache.spark.sql.catalyst.{InternalRow, ParserDialect, _}
-import org.apache.spark.sql.execution.{Filter, _}
-import org.apache.spark.sql.sources._
+import org.apache.spark.sql.execution._
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.sources.BaseRelation
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.Utils
@@ -146,11 +147,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
     new Analyzer(catalog, functionRegistry, conf) {
       override val extendedResolutionRules =
         ExtractPythonUDFs ::
-        sources.PreInsertCastAndRename ::
+        PreInsertCastAndRename ::
         Nil
 
       override val extendedCheckRules = Seq(
-        sources.PreWriteCheck(catalog)
+        datasources.PreWriteCheck(catalog)
       )
     }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 240332a80af0f..8cef7f200d2dc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution
 
+import org.apache.spark.sql.{SQLContext, Strategy, execution}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning._
@@ -25,10 +26,9 @@ import org.apache.spark.sql.catalyst.plans.logical.{BroadcastHint, LogicalPlan}
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.columnar.{InMemoryColumnarTableScan, InMemoryRelation}
 import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand}
+import org.apache.spark.sql.execution.datasources.{CreateTableUsing, CreateTempTableUsing, DescribeCommand => LogicalDescribeCommand, _}
 import org.apache.spark.sql.parquet._
-import org.apache.spark.sql.sources.{CreateTableUsing, CreateTempTableUsing, DescribeCommand => LogicalDescribeCommand, _}
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{SQLContext, Strategy, execution}
 
 private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   self: SQLContext#SparkPlanner =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala
similarity index 99%
rename from sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala
index 2bdc341021256..e1c1a6c06268f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala
@@ -15,24 +15,23 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.sources
+package org.apache.spark.sql.execution
 
 import java.text.SimpleDateFormat
 import java.util.Date
 
+import org.apache.spark.{Partition => SparkPartition, _}
 import org.apache.hadoop.conf.{Configurable, Configuration}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, FileSplit}
-import org.apache.spark.broadcast.Broadcast
-
-import org.apache.spark.{Partition => SparkPartition, _}
 import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.executor.DataReadMethod
 import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
-import org.apache.spark.rdd.{RDD, HadoopRDD}
 import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD
+import org.apache.spark.rdd.{HadoopRDD, RDD}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.{SerializableConfiguration, Utils}
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
similarity index 98%
rename from sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 70c9e06927582..2b400926177fe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -15,22 +15,21 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.sources
+package org.apache.spark.sql.execution.datasources
 
 import org.apache.spark.{Logging, TaskContext}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
-import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions
+import org.apache.spark.sql.catalyst.{InternalRow, expressions}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{StringType, StructType}
-import org.apache.spark.sql.{SaveMode, Strategy, execution, sources}
-import org.apache.spark.util.{SerializableConfiguration, Utils}
+import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _}
 import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.util.{SerializableConfiguration, Utils}
 
 /**
  * A Strategy for planning scans over data sources defined using the sources API.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
similarity index 88%
rename from sql/core/src/main/scala/org/apache/spark/sql/sources/LogicalRelation.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
index f374abffdd505..a7123dc845fa2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/LogicalRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
@@ -14,11 +14,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.spark.sql.sources
+package org.apache.spark.sql.execution.datasources
 
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeMap}
-import org.apache.spark.sql.catalyst.plans.logical.{Statistics, LeafNode, LogicalPlan}
+import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference}
+import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
+import org.apache.spark.sql.sources.BaseRelation
 
 /**
  * Used to link a [[BaseRelation]] in to a logical query plan.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
similarity index 99%
rename from sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
index 8b2a45d8e970a..6b4a359db22d1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
@@ -15,9 +15,9 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.sources
+package org.apache.spark.sql.execution.datasources
 
-import java.lang.{Double => JDouble, Float => JFloat, Integer => JInteger, Long => JLong}
+import java.lang.{Double => JDouble, Long => JLong}
 import java.math.{BigDecimal => JBigDecimal}
 
 import scala.collection.mutable.ArrayBuffer
@@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
 import org.apache.spark.sql.types._
 
+
 private[sql] case class Partition(values: InternalRow, path: String)
 
 private[sql] case class PartitionSpec(partitionColumns: StructType, partitions: Seq[Partition])
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala
similarity index 99%
rename from sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala
index 5c6ef2dc90c73..84a0441e145c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.sources
+package org.apache.spark.sql.execution.datasources
 
 import java.util.{Date, UUID}
 
@@ -24,7 +24,6 @@ import scala.collection.JavaConversions.asScalaIterator
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter, FileOutputFormat}
-
 import org.apache.spark._
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
@@ -35,9 +34,11 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.execution.RunnableCommand
+import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StringType
 import org.apache.spark.util.SerializableConfiguration
 
+
 private[sql] case class InsertIntoDataSource(
     logicalRelation: LogicalRelation,
     query: LogicalPlan,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
similarity index 99%
rename from sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
index 5a8c97c773ee6..c8033d3c0470a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
@@ -15,23 +15,22 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.sources
+package org.apache.spark.sql.execution.datasources
 
 import scala.language.{existentials, implicitConversions}
 import scala.util.matching.Regex
 
 import org.apache.hadoop.fs.Path
-
 import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
+import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext, SaveMode}
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.{AbstractSparkSQLParser, InternalRow}
 import org.apache.spark.sql.execution.RunnableCommand
+import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext, SaveMode}
 import org.apache.spark.util.Utils
 
 /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
similarity index 94%
rename from sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
index 40ee048e2653e..11bb49b8d83de 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
@@ -15,15 +15,15 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.sources
+package org.apache.spark.sql.execution.datasources
 
-import org.apache.spark.sql.{SaveMode, AnalysisException}
-import org.apache.spark.sql.catalyst.analysis.{EliminateSubQueries, Catalog}
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Alias}
+import org.apache.spark.sql.{AnalysisException, SaveMode}
+import org.apache.spark.sql.catalyst.analysis.{Catalog, EliminateSubQueries}
+import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast}
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.sources.{BaseRelation, HadoopFsRelation, InsertableRelation}
 
 /**
  * A rule to do pre-insert data type casting and field renaming. Before we insert into
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index e683eb0126004..2f9f880c70690 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -35,15 +35,18 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.apache.parquet.hadoop.util.ContextUtil
 import org.apache.parquet.schema.MessageType
 
+import org.apache.spark.{Logging, Partition => SparkPartition, SparkException}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.rdd.RDD._
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.{SqlNewHadoopPartition, SqlNewHadoopRDD}
+import org.apache.spark.sql.execution.datasources.PartitionSpec
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.util.{SerializableConfiguration, Utils}
-import org.apache.spark.{Logging, Partition => SparkPartition, SparkException}
+
 
 private[sql] class DefaultSource extends HadoopFsRelationProvider {
   override def createRelation(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala
index 24e86ca415c51..4d942e4f9287a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala
@@ -17,6 +17,10 @@
 
 package org.apache.spark.sql.sources
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// This file defines all the filters that we can push down to the data sources.
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 /**
  * A filter predicate for data sources.
  *
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 2cd8b358d81c6..7cd005b959488 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.sources
 
 import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
 import scala.util.Try
 
 import org.apache.hadoop.conf.Configuration
@@ -33,6 +32,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
 import org.apache.spark.sql.execution.RDDConversions
+import org.apache.spark.sql.execution.datasources.{PartitioningUtils, PartitionSpec, Partition}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql._
 import org.apache.spark.util.SerializableConfiguration
@@ -523,7 +523,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
     })
   }
 
-  private[sources] final def buildScan(
+  private[sql] final def buildScan(
       requiredColumns: Array[String],
       filters: Array[Filter],
       inputPaths: Array[String],
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 3475f9dd6787e..1d04513a44672 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -26,8 +26,8 @@ import org.scalactic.Tolerance._
 import org.apache.spark.sql.{QueryTest, Row, SQLConf}
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.json.InferSchema.compatibleType
-import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
index a2763c78b6450..23df102cd951d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
@@ -24,7 +24,7 @@ import org.apache.parquet.filter2.predicate.{FilterPredicate, Operators}
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
-import org.apache.spark.sql.sources.LogicalRelation
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{Column, DataFrame, QueryTest, Row, SQLConf}
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 37b0a9fbf7a4e..4f98776b91160 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -28,11 +28,11 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.sources.PartitioningUtils._
-import org.apache.spark.sql.sources.{LogicalRelation, Partition, PartitionSpec}
+import org.apache.spark.sql.execution.datasources.{LogicalRelation, PartitionSpec, Partition, PartitioningUtils}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql._
 import org.apache.spark.unsafe.types.UTF8String
+import PartitioningUtils._
 
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
index a71088430bfd5..1907e643c85dd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
@@ -22,6 +22,7 @@ import java.io.{File, IOException}
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.execution.datasources.DDLException
 import org.apache.spark.util.Utils
 
 class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala
index 296b0d6f74a0c..3cbf5467b253a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.sources
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.execution.datasources.ResolvedDataSource
 
 class ResolvedDataSourceSuite extends SparkFunSuite {
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 4684d48aff889..cec7685bb6859 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -44,9 +44,9 @@ import org.apache.spark.sql.catalyst.ParserDialect
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUDFs, SetCommand}
+import org.apache.spark.sql.execution.datasources.{PreWriteCheck, PreInsertCastAndRename, DataSourceStrategy}
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand}
-import org.apache.spark.sql.sources.DataSourceStrategy
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
@@ -384,11 +384,11 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging {
         catalog.PreInsertionCasts ::
         ExtractPythonUDFs ::
         ResolveHiveWindowFunction ::
-        sources.PreInsertCastAndRename ::
+        PreInsertCastAndRename ::
         Nil
 
       override val extendedCheckRules = Seq(
-        sources.PreWriteCheck(catalog)
+        PreWriteCheck(catalog)
       )
     }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index b15261b7914dd..0a2121c955871 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.hive
 
+import scala.collection.JavaConversions._
+
 import com.google.common.base.Objects
 import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}
 
@@ -28,6 +30,7 @@ import org.apache.hadoop.hive.ql.metadata._
 import org.apache.hadoop.hive.ql.plan.TableDesc
 
 import org.apache.spark.Logging
+import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{Catalog, MultiInstanceRelation, OverrideCatalog}
 import org.apache.spark.sql.catalyst.expressions._
@@ -35,14 +38,12 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.execution.datasources
+import org.apache.spark.sql.execution.datasources.{Partition => ParquetPartition, PartitionSpec, CreateTableUsingAsSelect, ResolvedDataSource, LogicalRelation}
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.parquet.ParquetRelation2
-import org.apache.spark.sql.sources.{CreateTableUsingAsSelect, LogicalRelation, Partition => ParquetPartition, PartitionSpec, ResolvedDataSource}
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode, sources}
 
-/* Implicit conversions */
-import scala.collection.JavaConversions._
 
 private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: HiveContext)
   extends Catalog with Logging {
@@ -278,7 +279,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
             parquetRelation.paths.toSet == pathsInMetastore.toSet &&
             logical.schema.sameType(metastoreSchema) &&
             parquetRelation.partitionSpec == partitionSpecInMetastore.getOrElse {
-              PartitionSpec(StructType(Nil), Array.empty[sources.Partition])
+              PartitionSpec(StructType(Nil), Array.empty[datasources.Partition])
             }
 
           if (useCached) {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 7fc517b646b20..f5574509b0b38 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.trees.CurrentOrigin
 import org.apache.spark.sql.execution.ExplainCommand
-import org.apache.spark.sql.sources.DescribeCommand
+import org.apache.spark.sql.execution.datasources.DescribeCommand
 import org.apache.spark.sql.hive.HiveShim._
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable, HiveScriptIOSchema}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index 9638a8201e190..a22c3292eff94 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -30,9 +30,9 @@ import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand, _}
+import org.apache.spark.sql.execution.datasources.{CreateTableUsing, CreateTableUsingAsSelect, DescribeCommand}
 import org.apache.spark.sql.hive.execution._
 import org.apache.spark.sql.parquet.ParquetRelation
-import org.apache.spark.sql.sources.{CreateTableUsing, CreateTableUsingAsSelect, DescribeCommand}
 import org.apache.spark.sql.types.StringType
 
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
index 71fa3e9c33ad9..a47f9a4feb21b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.execution.RunnableCommand
+import org.apache.spark.sql.execution.datasources.{ResolvedDataSource, LogicalRelation}
 import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 48d35a60a759b..de63ee56dd8e6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -37,6 +37,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.rdd.{HadoopRDD, RDD}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.execution.datasources.PartitionSpec
 import org.apache.spark.sql.hive.{HiveContext, HiveInspectors, HiveMetastoreTypes, HiveShim}
 import org.apache.spark.sql.sources.{Filter, _}
 import org.apache.spark.sql.types.StructType
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index d910af22c3dd1..e403f32efaf91 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -28,12 +28,12 @@ import org.apache.hadoop.mapred.InvalidInputException
 
 import org.apache.spark.Logging
 import org.apache.spark.sql._
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.hive.client.{HiveTable, ManagedTable}
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.parquet.ParquetRelation2
-import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index c9dd4c0935a72..efb04bf3d5097 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -22,11 +22,11 @@ import java.io._
 import org.scalatest.{BeforeAndAfterAll, GivenWhenThen}
 
 import org.apache.spark.{Logging, SparkFunSuite}
-import org.apache.spark.sql.sources.DescribeCommand
-import org.apache.spark.sql.execution.{SetCommand, ExplainCommand}
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util._
+import org.apache.spark.sql.execution.{SetCommand, ExplainCommand}
+import org.apache.spark.sql.execution.datasources.DescribeCommand
 import org.apache.spark.sql.hive.test.TestHive
 
 /**
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 05a1f0094e5e1..03428265422e6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -23,12 +23,12 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.DefaultParserDialect
 import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
 import org.apache.spark.sql.catalyst.errors.DialectException
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.hive.{HiveContext, HiveQLDialect, MetastoreRelation}
 import org.apache.spark.sql.parquet.ParquetRelation2
-import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.types._
 
 case class Nested1(f1: Nested2)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 9d79a4b007d66..82a8daf8b4b09 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -23,12 +23,12 @@ import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.sql._
 import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD}
+import org.apache.spark.sql.execution.datasources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.hive.execution.HiveTableScan
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.parquet.{ParquetRelation2, ParquetTableScan}
-import org.apache.spark.sql.sources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index afecf9675e11f..1cef83fd5e990 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.sql.sources
 
-import scala.collection.JavaConversions._
-
 import java.io.File
 
+import scala.collection.JavaConversions._
+
 import com.google.common.io.Files
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
@@ -31,10 +31,12 @@ import org.apache.parquet.hadoop.ParquetOutputCommitter
 import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 
+
 abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
   override lazy val sqlContext: SQLContext = TestHive
 

From c07838b5a9cdf96c0f49055ea1c397e0f0e915d2 Mon Sep 17 00:00:00 2001
From: Dennis Huo <dhuo@google.com>
Date: Tue, 21 Jul 2015 13:12:11 -0700
Subject: [PATCH 310/549] [SPARK-9206] [SQL] Fix HiveContext classloading for
 GCS connector.

IsolatedClientLoader.isSharedClass includes all of com.google.\*, presumably
for Guava, protobuf, and/or other shared Google libraries, but needs to
count com.google.cloud.\* as "hive classes" when determining which ClassLoader
to use. Otherwise, things like HiveContext.parquetFile will throw a
ClassCastException when fs.defaultFS is set to a Google Cloud Storage (gs://)
path. On StackOverflow: http://stackoverflow.com/questions/31478955

EDIT: Adding yhuai who worked on the relevant classloading isolation pieces.

Author: Dennis Huo <dhuo@google.com>

Closes #7549 from dennishuo/dhuo-fix-hivecontext-gcs and squashes the following commits:

1f8db07 [Dennis Huo] Fix HiveContext classloading for GCS connector.
---
 .../org/apache/spark/sql/hive/client/IsolatedClientLoader.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
index 3d609a66f3664..97fb98199991b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
@@ -125,7 +125,7 @@ private[hive] class IsolatedClientLoader(
     name.contains("log4j") ||
     name.startsWith("org.apache.spark.") ||
     name.startsWith("scala.") ||
-    name.startsWith("com.google") ||
+    (name.startsWith("com.google") && !name.startsWith("com.google.cloud")) ||
     name.startsWith("java.lang.") ||
     name.startsWith("java.net") ||
     sharedPrefixes.exists(name.startsWith)

From d4c7a7a3642a74ad40093c96c4bf45a62a470605 Mon Sep 17 00:00:00 2001
From: Tarek Auel <tarek.auel@googlemail.com>
Date: Tue, 21 Jul 2015 15:47:40 -0700
Subject: [PATCH 311/549] [SPARK-9154] [SQL] codegen StringFormat

Jira: https://issues.apache.org/jira/browse/SPARK-9154

fixes bug of #7546

marmbrus I can't reopen the other PR, because I didn't closed it. Can you trigger Jenkins?

Author: Tarek Auel <tarek.auel@googlemail.com>

Closes #7571 from tarekauel/SPARK-9154 and squashes the following commits:

dcae272 [Tarek Auel] [SPARK-9154][SQL] build fix
1487602 [Tarek Auel] Merge remote-tracking branch 'upstream/master' into SPARK-9154
f512c5f [Tarek Auel] [SPARK-9154][SQL] build fix
a943d3e [Tarek Auel] [SPARK-9154] implicit input cast, added tests for null, support for null primitives
10b4de8 [Tarek Auel] [SPARK-9154][SQL] codegen removed fallback trait
cd8322b [Tarek Auel] [SPARK-9154][SQL] codegen string format
086caba [Tarek Auel] [SPARK-9154][SQL] codegen string format
---
 .../expressions/stringOperations.scala        | 42 ++++++++++++++++++-
 .../expressions/StringExpressionsSuite.scala  | 18 ++++----
 .../org/apache/spark/sql/functions.scala      | 11 +++++
 .../spark/sql/StringFunctionsSuite.scala      | 10 +++++
 4 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index fe57d17f1ec14..1f18a6e9ff8a5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -526,7 +526,7 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression)
 /**
  * Returns the input formatted according do printf-style format strings
  */
-case class StringFormat(children: Expression*) extends Expression with CodegenFallback {
+case class StringFormat(children: Expression*) extends Expression with ImplicitCastInputTypes {
 
   require(children.nonEmpty, "printf() should take at least 1 argument")
 
@@ -536,6 +536,10 @@ case class StringFormat(children: Expression*) extends Expression with CodegenFa
   private def format: Expression = children(0)
   private def args: Seq[Expression] = children.tail
 
+  override def inputTypes: Seq[AbstractDataType] =
+    StringType :: List.fill(children.size - 1)(AnyDataType)
+
+
   override def eval(input: InternalRow): Any = {
     val pattern = format.eval(input)
     if (pattern == null) {
@@ -551,6 +555,42 @@ case class StringFormat(children: Expression*) extends Expression with CodegenFa
     }
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val pattern = children.head.gen(ctx)
+
+    val argListGen = children.tail.map(x => (x.dataType, x.gen(ctx)))
+    val argListCode = argListGen.map(_._2.code + "\n")
+
+    val argListString = argListGen.foldLeft("")((s, v) => {
+      val nullSafeString =
+        if (ctx.boxedType(v._1) != ctx.javaType(v._1)) {
+          // Java primitives get boxed in order to allow null values.
+          s"(${v._2.isNull}) ? (${ctx.boxedType(v._1)}) null : " +
+            s"new ${ctx.boxedType(v._1)}(${v._2.primitive})"
+        } else {
+          s"(${v._2.isNull}) ? null : ${v._2.primitive}"
+        }
+      s + "," + nullSafeString
+    })
+
+    val form = ctx.freshName("formatter")
+    val formatter = classOf[java.util.Formatter].getName
+    val sb = ctx.freshName("sb")
+    val stringBuffer = classOf[StringBuffer].getName
+    s"""
+      ${pattern.code}
+      boolean ${ev.isNull} = ${pattern.isNull};
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${argListCode.mkString}
+        $stringBuffer $sb = new $stringBuffer();
+        $formatter $form = new $formatter($sb, ${classOf[Locale].getName}.US);
+        $form.format(${pattern.primitive}.toString() $argListString);
+        ${ev.primitive} = UTF8String.fromString($sb.toString());
+      }
+     """
+  }
+
   override def prettyName: String = "printf"
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 96c540ab36f08..3c2d88731beb4 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -351,18 +351,16 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("FORMAT") {
-    val f = 'f.string.at(0)
-    val d1 = 'd.int.at(1)
-    val s1 = 's.int.at(2)
-
-    val row1 = create_row("aa%d%s", 12, "cc")
-    val row2 = create_row(null, 12, "cc")
-    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1)
+    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
     checkEvaluation(StringFormat(Literal("aa")), "aa", create_row(null))
-    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1)
+    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
+    checkEvaluation(StringFormat(Literal("aa%d%s"), 12, "cc"), "aa12cc")
 
-    checkEvaluation(StringFormat(f, d1, s1), "aa12cc", row1)
-    checkEvaluation(StringFormat(f, d1, s1), null, row2)
+    checkEvaluation(StringFormat(Literal.create(null, StringType), 12, "cc"), null)
+    checkEvaluation(
+      StringFormat(Literal("aa%d%s"), Literal.create(null, IntegerType), "cc"), "aanullcc")
+    checkEvaluation(
+      StringFormat(Literal("aa%d%s"), 12, Literal.create(null, StringType)), "aa12null")
   }
 
   test("INSTR") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index d94d7335828c5..e5ff8ae7e3179 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1741,6 +1741,17 @@ object functions {
    */
   def rtrim(e: Column): Column = StringTrimRight(e.expr)
 
+  /**
+   * Format strings in printf-style.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  @scala.annotation.varargs
+  def formatString(format: Column, arguments: Column*): Column = {
+    StringFormat((format +: arguments).map(_.expr): _*)
+  }
+
   /**
    * Format strings in printf-style.
    * NOTE: `format` is the string value of the formatter, not column name.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index d1f855903ca4b..3702e73b4e74f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -132,6 +132,16 @@ class StringFunctionsSuite extends QueryTest {
     checkAnswer(
       df.selectExpr("printf(a, b, c)"),
       Row("aa123cc"))
+
+    val df2 = Seq(("aa%d%s".getBytes, 123, "cc")).toDF("a", "b", "c")
+
+    checkAnswer(
+      df2.select(formatString($"a", $"b", $"c"), formatString("aa%d%s", "b", "c")),
+      Row("aa123cc", "aa123cc"))
+
+    checkAnswer(
+      df2.selectExpr("printf(a, b, c)"),
+      Row("aa123cc"))
   }
 
   test("string instr function") {

From a4c83cb1e4b066cd60264b6572fd3e51d160d26a Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 21 Jul 2015 19:14:07 -0700
Subject: [PATCH 312/549] [SPARK-9154][SQL] Rename formatString to
 format_string.

Also make format_string the canonical form, rather than printf.

Author: Reynold Xin <rxin@databricks.com>

Closes #7579 from rxin/format_strings and squashes the following commits:

53ee54f [Reynold Xin] Fixed unit tests.
52357e1 [Reynold Xin] Add format_string alias.
b40a42a [Reynold Xin] [SPARK-9154][SQL] Rename formatString to format_string.
---
 .../catalyst/analysis/FunctionRegistry.scala   |  3 ++-
 .../expressions/stringOperations.scala         | 13 +++++--------
 .../expressions/StringExpressionsSuite.scala   | 14 +++++++-------
 .../scala/org/apache/spark/sql/functions.scala | 18 +++---------------
 .../spark/sql/StringFunctionsSuite.scala       | 12 +-----------
 5 files changed, 18 insertions(+), 42 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index e3d8d2adf2135..9c349838c28a1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -168,7 +168,8 @@ object FunctionRegistry {
     expression[StringLocate]("locate"),
     expression[StringLPad]("lpad"),
     expression[StringTrimLeft]("ltrim"),
-    expression[StringFormat]("printf"),
+    expression[FormatString]("format_string"),
+    expression[FormatString]("printf"),
     expression[StringRPad]("rpad"),
     expression[StringRepeat]("repeat"),
     expression[StringReverse]("reverse"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 1f18a6e9ff8a5..cf187ad5a0a9f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -526,29 +526,26 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression)
 /**
  * Returns the input formatted according do printf-style format strings
  */
-case class StringFormat(children: Expression*) extends Expression with ImplicitCastInputTypes {
+case class FormatString(children: Expression*) extends Expression with ImplicitCastInputTypes {
 
-  require(children.nonEmpty, "printf() should take at least 1 argument")
+  require(children.nonEmpty, "format_string() should take at least 1 argument")
 
   override def foldable: Boolean = children.forall(_.foldable)
   override def nullable: Boolean = children(0).nullable
   override def dataType: DataType = StringType
-  private def format: Expression = children(0)
-  private def args: Seq[Expression] = children.tail
 
   override def inputTypes: Seq[AbstractDataType] =
     StringType :: List.fill(children.size - 1)(AnyDataType)
 
-
   override def eval(input: InternalRow): Any = {
-    val pattern = format.eval(input)
+    val pattern = children(0).eval(input)
     if (pattern == null) {
       null
     } else {
       val sb = new StringBuffer()
       val formatter = new java.util.Formatter(sb, Locale.US)
 
-      val arglist = args.map(_.eval(input).asInstanceOf[AnyRef])
+      val arglist = children.tail.map(_.eval(input).asInstanceOf[AnyRef])
       formatter.format(pattern.asInstanceOf[UTF8String].toString, arglist: _*)
 
       UTF8String.fromString(sb.toString)
@@ -591,7 +588,7 @@ case class StringFormat(children: Expression*) extends Expression with ImplicitC
      """
   }
 
-  override def prettyName: String = "printf"
+  override def prettyName: String = "format_string"
 }
 
 /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 3c2d88731beb4..3d294fda5d103 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -351,16 +351,16 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("FORMAT") {
-    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
-    checkEvaluation(StringFormat(Literal("aa")), "aa", create_row(null))
-    checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
-    checkEvaluation(StringFormat(Literal("aa%d%s"), 12, "cc"), "aa12cc")
+    checkEvaluation(FormatString(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
+    checkEvaluation(FormatString(Literal("aa")), "aa", create_row(null))
+    checkEvaluation(FormatString(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a")
+    checkEvaluation(FormatString(Literal("aa%d%s"), 12, "cc"), "aa12cc")
 
-    checkEvaluation(StringFormat(Literal.create(null, StringType), 12, "cc"), null)
+    checkEvaluation(FormatString(Literal.create(null, StringType), 12, "cc"), null)
     checkEvaluation(
-      StringFormat(Literal("aa%d%s"), Literal.create(null, IntegerType), "cc"), "aanullcc")
+      FormatString(Literal("aa%d%s"), Literal.create(null, IntegerType), "cc"), "aanullcc")
     checkEvaluation(
-      StringFormat(Literal("aa%d%s"), 12, Literal.create(null, StringType)), "aa12null")
+      FormatString(Literal("aa%d%s"), 12, Literal.create(null, StringType)), "aa12null")
   }
 
   test("INSTR") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index e5ff8ae7e3179..28159cbd5ab96 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1742,26 +1742,14 @@ object functions {
   def rtrim(e: Column): Column = StringTrimRight(e.expr)
 
   /**
-   * Format strings in printf-style.
+   * Formats the arguments in printf-style and returns the result as a string column.
    *
    * @group string_funcs
    * @since 1.5.0
    */
   @scala.annotation.varargs
-  def formatString(format: Column, arguments: Column*): Column = {
-    StringFormat((format +: arguments).map(_.expr): _*)
-  }
-
-  /**
-   * Format strings in printf-style.
-   * NOTE: `format` is the string value of the formatter, not column name.
-   *
-   * @group string_funcs
-   * @since 1.5.0
-   */
-  @scala.annotation.varargs
-  def formatString(format: String, arguNames: String*): Column = {
-    StringFormat(lit(format).expr +: arguNames.map(Column(_).expr): _*)
+  def format_string(format: String, arguments: Column*): Column = {
+    FormatString((lit(format) +: arguments).map(_.expr): _*)
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 3702e73b4e74f..0f9c986f649a1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -126,22 +126,12 @@ class StringFunctionsSuite extends QueryTest {
     val df = Seq(("aa%d%s", 123, "cc")).toDF("a", "b", "c")
 
     checkAnswer(
-      df.select(formatString("aa%d%s", "b", "c")),
+      df.select(format_string("aa%d%s", $"b", $"c")),
       Row("aa123cc"))
 
     checkAnswer(
       df.selectExpr("printf(a, b, c)"),
       Row("aa123cc"))
-
-    val df2 = Seq(("aa%d%s".getBytes, 123, "cc")).toDF("a", "b", "c")
-
-    checkAnswer(
-      df2.select(formatString($"a", $"b", $"c"), formatString("aa%d%s", "b", "c")),
-      Row("aa123cc", "aa123cc"))
-
-    checkAnswer(
-      df2.selectExpr("printf(a, b, c)"),
-      Row("aa123cc"))
   }
 
   test("string instr function") {

From 63f4bcc73f5a09c1790cc3c333f08b18609de6a4 Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA <yuu.ishikawa@gmail.com>
Date: Tue, 21 Jul 2015 22:50:27 -0700
Subject: [PATCH 313/549] [SPARK-9121] [SPARKR] Get rid of the warnings about
 `no visible global function definition` in SparkR

[[SPARK-9121] Get rid of the warnings about `no visible global function definition` in SparkR - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9121)

## The Result of `dev/lint-r`
[The result of lint-r for SPARK-9121 at the revision:1ddd0f2f1688560f88470e312b72af04364e2d49 when I have sent a PR](https://gist.github.com/yu-iskw/6f55953425901725edf6)

Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com>

Closes #7567 from yu-iskw/SPARK-9121 and squashes the following commits:

c8cfd63 [Yu ISHIKAWA] Fix the typo
b1f19ed [Yu ISHIKAWA] Add a validate statement for local SparkR
1a03987 [Yu ISHIKAWA] Load the `testthat` package in `dev/lint-r.R`, instead of using the full path of function.
3a5e0ab [Yu ISHIKAWA] [SPARK-9121][SparkR] Get rid of the warnings about `no visible global function definition` in SparkR
---
 dev/lint-r.R | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/dev/lint-r.R b/dev/lint-r.R
index dcb1a184291e1..48bd6246096ae 100644
--- a/dev/lint-r.R
+++ b/dev/lint-r.R
@@ -15,15 +15,21 @@
 # limitations under the License.
 #
 
+argv <- commandArgs(TRUE)
+SPARK_ROOT_DIR <- as.character(argv[1])
+
 # Installs lintr from Github.
 # NOTE: The CRAN's version is too old to adapt to our rules.
 if ("lintr" %in% row.names(installed.packages())  == FALSE) {
   devtools::install_github("jimhester/lintr")
 }
-library(lintr)
 
-argv <- commandArgs(TRUE)
-SPARK_ROOT_DIR <- as.character(argv[1])
+library(lintr)
+library(methods)
+library(testthat)
+if (! library(SparkR, lib.loc = file.path(SPARK_ROOT_DIR, "R", "lib"), logical.return = TRUE)) {
+  stop("You should install SparkR in a local directory with `R/install-dev.sh`.")
+}
 
 path.to.package <- file.path(SPARK_ROOT_DIR, "R", "pkg")
 lint_package(path.to.package, cache = FALSE)

From f4785f5b82c57bce41d3dc26ed9e3c9e794c7558 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Tue, 21 Jul 2015 23:00:13 -0700
Subject: [PATCH 314/549] [SPARK-9232] [SQL] Duplicate code in JSONRelation

Author: Andrew Or <andrew@databricks.com>

Closes #7576 from andrewor14/clean-up-json-relation and squashes the following commits:

ea80803 [Andrew Or] Clean up duplicate code
---
 .../apache/spark/sql/json/JSONRelation.scala  | 50 ++++++++-----------
 1 file changed, 21 insertions(+), 29 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
index 25802d054ac00..922794ac9aac5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.json
 
 import java.io.IOException
 
-import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.{FileSystem, Path}
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.AnalysisException
@@ -87,20 +87,7 @@ private[sql] class DefaultSource
         case SaveMode.Append =>
           sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}")
         case SaveMode.Overwrite => {
-          var success: Boolean = false
-          try {
-            success = fs.delete(filesystemPath, true)
-          } catch {
-            case e: IOException =>
-              throw new IOException(
-                s"Unable to clear output directory ${filesystemPath.toString} prior"
-                  + s" to writing to JSON table:\n${e.toString}")
-          }
-          if (!success) {
-            throw new IOException(
-              s"Unable to clear output directory ${filesystemPath.toString} prior"
-                + s" to writing to JSON table.")
-          }
+          JSONRelation.delete(filesystemPath, fs)
           true
         }
         case SaveMode.ErrorIfExists =>
@@ -195,20 +182,7 @@ private[sql] class JSONRelation(
 
     if (overwrite) {
       if (fs.exists(filesystemPath)) {
-        var success: Boolean = false
-        try {
-          success = fs.delete(filesystemPath, true)
-        } catch {
-          case e: IOException =>
-            throw new IOException(
-              s"Unable to clear output directory ${filesystemPath.toString} prior"
-                + s" to writing to JSON table:\n${e.toString}")
-        }
-        if (!success) {
-          throw new IOException(
-            s"Unable to clear output directory ${filesystemPath.toString} prior"
-              + s" to writing to JSON table.")
-        }
+        JSONRelation.delete(filesystemPath, fs)
       }
       // Write the data.
       data.toJSON.saveAsTextFile(filesystemPath.toString)
@@ -228,3 +202,21 @@ private[sql] class JSONRelation(
     case _ => false
   }
 }
+
+private object JSONRelation {
+
+  /** Delete the specified directory to overwrite it with new JSON data. */
+  def delete(dir: Path, fs: FileSystem): Unit = {
+    var success: Boolean = false
+    val failMessage = s"Unable to clear output directory $dir prior to writing to JSON table"
+    try {
+      success = fs.delete(dir, true /* recursive */)
+    } catch {
+      case e: IOException =>
+        throw new IOException(s"$failMessage\n${e.toString}")
+    }
+    if (!success) {
+      throw new IOException(failMessage)
+    }
+  }
+}

From c03299a18b4e076cabb4b7833a1e7632c5c0dabe Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Tue, 21 Jul 2015 23:26:11 -0700
Subject: [PATCH 315/549] [SPARK-4233] [SPARK-4367] [SPARK-3947] [SPARK-3056]
 [SQL] Aggregation Improvement

This is the first PR for the aggregation improvement, which is tracked by https://issues.apache.org/jira/browse/SPARK-4366 (umbrella JIRA). This PR contains work for its subtasks, SPARK-3056, SPARK-3947, SPARK-4233, and SPARK-4367.

This PR introduces a new code path for evaluating aggregate functions. This code path is guarded by `spark.sql.useAggregate2` and by default the value of this flag is true.

This new code path contains:
* A new aggregate function interface (`AggregateFunction2`) and 7 built-int aggregate functions based on this new interface (`AVG`, `COUNT`, `FIRST`, `LAST`, `MAX`, `MIN`, `SUM`)
* A UDAF interface (`UserDefinedAggregateFunction`) based on the new code path and two example UDAFs (`MyDoubleAvg` and `MyDoubleSum`).
* A sort-based aggregate operator (`Aggregate2Sort`) for the new aggregate function interface .
* A sort-based aggregate operator (`FinalAndCompleteAggregate2Sort`) for distinct aggregations (for distinct aggregations the query plan will use `Aggregate2Sort` and `FinalAndCompleteAggregate2Sort` together).

With this change, `spark.sql.useAggregate2` is `true`, the flow of compiling an aggregation query is:
1. Our analyzer looks up functions and returns aggregate functions built based on the old aggregate function interface.
2. When our planner is compiling the physical plan, it tries try to convert all aggregate functions to the ones built based on the new interface. The planner will fallback to the old code path if any of the following two conditions is true:
* code-gen is disabled.
* there is any function that cannot be converted (right now, Hive UDAFs).
* the schema of grouping expressions contain any complex data type.
* There are multiple distinct columns.

Right now, the new code path handles a single distinct column in the query (you can have multiple aggregate functions using that distinct column). For a query having a aggregate function with DISTINCT and regular aggregate functions, the generated plan will do partial aggregations for those regular aggregate function.

Thanks chenghao-intel for his initial work on it.

Author: Yin Huai <yhuai@databricks.com>
Author: Michael Armbrust <michael@databricks.com>

Closes #7458 from yhuai/UDAF and squashes the following commits:

7865f5e [Yin Huai] Put the catalyst expression in the comment of the generated code for it.
b04d6c8 [Yin Huai] Remove unnecessary change.
f1d5901 [Yin Huai] Merge remote-tracking branch 'upstream/master' into UDAF
35b0520 [Yin Huai] Use semanticEquals to replace grouping expressions in the output of the aggregate operator.
3b43b24 [Yin Huai] bug fix.
00eb298 [Yin Huai] Make it compile.
a3ca551 [Yin Huai] Merge remote-tracking branch 'upstream/master' into UDAF
e0afca3 [Yin Huai] Gracefully fallback to old aggregation code path.
8a8ac4a [Yin Huai] Merge remote-tracking branch 'upstream/master' into UDAF
88c7d4d [Yin Huai] Enable spark.sql.useAggregate2 by default for testing purpose.
dc96fd1 [Yin Huai] Many updates:
85c9c4b [Yin Huai] newline.
43de3de [Yin Huai] Merge remote-tracking branch 'upstream/master' into UDAF
c3614d7 [Yin Huai] Handle single distinct column.
68b8ee9 [Yin Huai] Support single distinct column set. WIP
3013579 [Yin Huai] Format.
d678aee [Yin Huai] Remove AggregateExpressionSuite.scala since our built-in aggregate functions will be based on AlgebraicAggregate and we need to have another way to test it.
e243ca6 [Yin Huai] Add aggregation iterators.
a101960 [Yin Huai] Change MyJavaUDAF to MyDoubleSum.
594cdf5 [Yin Huai] Change existing AggregateExpression to AggregateExpression1 and add an AggregateExpression as the common interface for both AggregateExpression1 and AggregateExpression2.
380880f [Yin Huai] Merge remote-tracking branch 'upstream/master' into UDAF
0a827b3 [Yin Huai] Add comments and doc. Move some classes to the right places.
a19fea6 [Yin Huai] Add UDAF interface.
262d4c4 [Yin Huai] Make it compile.
b2e358e [Yin Huai] Merge remote-tracking branch 'upstream/master' into UDAF
6edb5ac [Yin Huai] Format update.
70b169c [Yin Huai] Remove groupOrdering.
4721936 [Yin Huai] Add CheckAggregateFunction to extendedCheckRules.
d821a34 [Yin Huai] Cleanup.
32aea9c [Yin Huai] Merge remote-tracking branch 'upstream/master' into UDAF
5b46d41 [Yin Huai] Bug fix.
aff9534 [Yin Huai] Make Aggregate2Sort work with both algebraic AggregateFunctions and non-algebraic AggregateFunctions.
2857b55 [Yin Huai] Merge remote-tracking branch 'upstream/master' into UDAF
4435f20 [Yin Huai] Add ConvertAggregateFunction to HiveContext's analyzer.
1b490ed [Michael Armbrust] make hive test
8cfa6a9 [Michael Armbrust] add test
1b0bb3f [Yin Huai] Do not bind references in AlgebraicAggregate and use code gen for all places.
072209f [Yin Huai] Bug fix: Handle expressions in grouping columns that are not attribute references.
f7d9e54 [Michael Armbrust] Merge remote-tracking branch 'apache/master' into UDAF
39ee975 [Yin Huai] Code cleanup: Remove unnecesary AttributeReferences.
b7720ba [Yin Huai] Add an analysis rule to convert aggregate function to the new version.
5c00f3f [Michael Armbrust] First draft of codegen
6bbc6ba [Michael Armbrust] now with correct answers\!
f7996d0 [Michael Armbrust] Add AlgebraicAggregate
dded1c5 [Yin Huai] wip
---
 .../apache/spark/sql/catalyst/SqlParser.scala |   3 +-
 .../sql/catalyst/analysis/Analyzer.scala      |  24 +-
 .../sql/catalyst/analysis/CheckAnalysis.scala |   1 +
 .../sql/catalyst/analysis/unresolved.scala    |   5 +-
 .../catalyst/expressions/BoundAttribute.scala |   2 +-
 .../sql/catalyst/expressions/Expression.scala |   3 +-
 .../expressions/aggregate/functions.scala     | 292 +++++++
 .../expressions/aggregate/interfaces.scala    | 206 +++++
 .../sql/catalyst/expressions/aggregates.scala | 100 +--
 .../codegen/GenerateMutableProjection.scala   |  21 +-
 .../sql/catalyst/planning/patterns.scala      |   4 +-
 .../plans/logical/basicOperators.scala        |   1 +
 .../scala/org/apache/spark/sql/SQLConf.scala  |   5 +
 .../org/apache/spark/sql/SQLContext.scala     |   4 +
 .../apache/spark/sql/UDAFRegistration.scala   |  35 +
 .../spark/sql/execution/Aggregate.scala       |  12 +-
 .../apache/spark/sql/execution/Exchange.scala |  11 +-
 .../sql/execution/GeneratedAggregate.scala    |   2 +-
 .../spark/sql/execution/SparkStrategies.scala | 100 ++-
 .../aggregate/aggregateOperators.scala        | 173 ++++
 .../aggregate/sortBasedIterators.scala        | 749 ++++++++++++++++++
 .../spark/sql/execution/aggregate/utils.scala | 364 +++++++++
 .../sql/expressions/aggregate/udaf.scala      | 280 +++++++
 .../org/apache/spark/sql/functions.scala      |   4 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  |   4 +-
 .../spark/sql/execution/PlannerSuite.scala    |  26 +-
 .../HiveWindowFunctionQuerySuite.scala        |   1 +
 .../SortMergeCompatibilitySuite.scala         |   7 +
 .../apache/spark/sql/hive/HiveContext.scala   |   1 +
 .../org/apache/spark/sql/hive/HiveQl.scala    |   7 +-
 .../org/apache/spark/sql/hive/hiveUDFs.scala  |   8 +-
 .../spark/sql/hive/aggregate/MyDoubleAvg.java | 107 +++
 .../spark/sql/hive/aggregate/MyDoubleSum.java | 100 +++
 ...f_unhex-0-50131c0ba7b7a6b65c789a5a8497bada |   1 +
 ...f_unhex-1-11eb3cc5216d5446f4165007203acc47 |   1 +
 ...f_unhex-2-a660886085b8651852b9b77934848ae4 |  14 +
 ...f_unhex-3-4b2cf4050af229fde91ab53fd9f3af3e |   1 +
 ...f_unhex-4-7d3e094f139892ecef17de3fd63ca3c3 |   1 +
 .../execution/AggregationQuerySuite.scala     | 507 ++++++++++++
 39 files changed, 3087 insertions(+), 100 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/aggregateOperators.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
 create mode 100644 sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java
 create mode 100644 sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java
 create mode 100644 sql/hive/src/test/resources/golden/udf_unhex-0-50131c0ba7b7a6b65c789a5a8497bada
 create mode 100644 sql/hive/src/test/resources/golden/udf_unhex-1-11eb3cc5216d5446f4165007203acc47
 create mode 100644 sql/hive/src/test/resources/golden/udf_unhex-2-a660886085b8651852b9b77934848ae4
 create mode 100644 sql/hive/src/test/resources/golden/udf_unhex-3-4b2cf4050af229fde91ab53fd9f3af3e
 create mode 100644 sql/hive/src/test/resources/golden/udf_unhex-4-7d3e094f139892ecef17de3fd63ca3c3
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index d4ef04c2294a2..c04bd6cd85187 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -266,11 +266,12 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
       }
     }
     | ident ~ ("(" ~> repsep(expression, ",")) <~ ")" ^^
-      { case udfName ~ exprs => UnresolvedFunction(udfName, exprs) }
+      { case udfName ~ exprs => UnresolvedFunction(udfName, exprs, isDistinct = false) }
     | ident ~ ("(" ~ DISTINCT ~> repsep(expression, ",")) <~ ")" ^^ { case udfName ~ exprs =>
       lexical.normalizeKeyword(udfName) match {
         case "sum" => SumDistinct(exprs.head)
         case "count" => CountDistinct(exprs)
+        case name => UnresolvedFunction(name, exprs, isDistinct = true)
         case _ => throw new AnalysisException(s"function $udfName does not support DISTINCT")
       }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index e58f3f64947f3..8cadbc57e87e1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.analysis
 
 import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, AggregateExpression2, AggregateFunction2}
 import org.apache.spark.sql.catalyst.{SimpleCatalystConf, CatalystConf}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
@@ -277,7 +278,7 @@ class Analyzer(
         Project(
           projectList.flatMap {
             case s: Star => s.expand(child.output, resolver)
-            case UnresolvedAlias(f @ UnresolvedFunction(_, args)) if containsStar(args) =>
+            case UnresolvedAlias(f @ UnresolvedFunction(_, args, _)) if containsStar(args) =>
               val expandedArgs = args.flatMap {
                 case s: Star => s.expand(child.output, resolver)
                 case o => o :: Nil
@@ -517,9 +518,26 @@ class Analyzer(
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
       case q: LogicalPlan =>
         q transformExpressions {
-          case u @ UnresolvedFunction(name, children) =>
+          case u @ UnresolvedFunction(name, children, isDistinct) =>
             withPosition(u) {
-              registry.lookupFunction(name, children)
+              registry.lookupFunction(name, children) match {
+                // We get an aggregate function built based on AggregateFunction2 interface.
+                // So, we wrap it in AggregateExpression2.
+                case agg2: AggregateFunction2 => AggregateExpression2(agg2, Complete, isDistinct)
+                // Currently, our old aggregate function interface supports SUM(DISTINCT ...)
+                // and COUTN(DISTINCT ...).
+                case sumDistinct: SumDistinct => sumDistinct
+                case countDistinct: CountDistinct => countDistinct
+                // DISTINCT is not meaningful with Max and Min.
+                case max: Max if isDistinct => max
+                case min: Min if isDistinct => min
+                // For other aggregate functions, DISTINCT keyword is not supported for now.
+                // Once we converted to the new code path, we will allow using DISTINCT keyword.
+                case other if isDistinct =>
+                  failAnalysis(s"$name does not support DISTINCT keyword.")
+                // If it does not have DISTINCT keyword, we will return it as is.
+                case other => other
+              }
             }
         }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index c7f9713344c50..c203fcecf20fb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.analysis
 
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression2
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index 0daee1990a6e0..03da45b09f928 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -73,7 +73,10 @@ object UnresolvedAttribute {
   def quoted(name: String): UnresolvedAttribute = new UnresolvedAttribute(Seq(name))
 }
 
-case class UnresolvedFunction(name: String, children: Seq[Expression])
+case class UnresolvedFunction(
+    name: String,
+    children: Seq[Expression],
+    isDistinct: Boolean)
   extends Expression with Unevaluable {
 
   override def dataType: DataType = throw new UnresolvedException(this, "dataType")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index b09aea03318da..b10a3c877434b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types._
 case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
   extends LeafExpression with NamedExpression {
 
-  override def toString: String = s"input[$ordinal]"
+  override def toString: String = s"input[$ordinal, $dataType]"
 
   override def eval(input: InternalRow): Any = input(ordinal)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index aada25276adb7..29ae47e842ddb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -96,7 +96,8 @@ abstract class Expression extends TreeNode[Expression] {
     val primitive = ctx.freshName("primitive")
     val ve = GeneratedExpressionCode("", isNull, primitive)
     ve.code = genCode(ctx, ve)
-    ve
+    // Add `this` in the comment.
+    ve.copy(s"/* $this */\n" + ve.code)
   }
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala
new file mode 100644
index 0000000000000..b924af4cc84d8
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.aggregate
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types._
+
+case class Average(child: Expression) extends AlgebraicAggregate {
+
+  override def children: Seq[Expression] = child :: Nil
+
+  override def nullable: Boolean = true
+
+  // Return data type.
+  override def dataType: DataType = resultType
+
+  // Expected input data type.
+  // TODO: Once we remove the old code path, we can use our analyzer to cast NullType
+  // to the default data type of the NumericType.
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(NumericType, NullType))
+
+  private val resultType = child.dataType match {
+    case DecimalType.Fixed(precision, scale) =>
+      DecimalType(precision + 4, scale + 4)
+    case DecimalType.Unlimited => DecimalType.Unlimited
+    case _ => DoubleType
+  }
+
+  private val sumDataType = child.dataType match {
+    case _ @ DecimalType() => DecimalType.Unlimited
+    case _ => DoubleType
+  }
+
+  private val currentSum = AttributeReference("currentSum", sumDataType)()
+  private val currentCount = AttributeReference("currentCount", LongType)()
+
+  override val bufferAttributes = currentSum :: currentCount :: Nil
+
+  override val initialValues = Seq(
+    /* currentSum = */ Cast(Literal(0), sumDataType),
+    /* currentCount = */ Literal(0L)
+  )
+
+  override val updateExpressions = Seq(
+    /* currentSum = */
+    Add(
+      currentSum,
+      Coalesce(Cast(child, sumDataType) :: Cast(Literal(0), sumDataType) :: Nil)),
+    /* currentCount = */ If(IsNull(child), currentCount, currentCount + 1L)
+  )
+
+  override val mergeExpressions = Seq(
+    /* currentSum = */ currentSum.left + currentSum.right,
+    /* currentCount = */ currentCount.left + currentCount.right
+  )
+
+  // If all input are nulls, currentCount will be 0 and we will get null after the division.
+  override val evaluateExpression = Cast(currentSum, resultType) / Cast(currentCount, resultType)
+}
+
+case class Count(child: Expression) extends AlgebraicAggregate {
+  override def children: Seq[Expression] = child :: Nil
+
+  override def nullable: Boolean = false
+
+  // Return data type.
+  override def dataType: DataType = LongType
+
+  // Expected input data type.
+  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)
+
+  private val currentCount = AttributeReference("currentCount", LongType)()
+
+  override val bufferAttributes = currentCount :: Nil
+
+  override val initialValues = Seq(
+    /* currentCount = */ Literal(0L)
+  )
+
+  override val updateExpressions = Seq(
+    /* currentCount = */ If(IsNull(child), currentCount, currentCount + 1L)
+  )
+
+  override val mergeExpressions = Seq(
+    /* currentCount = */ currentCount.left + currentCount.right
+  )
+
+  override val evaluateExpression = Cast(currentCount, LongType)
+}
+
+case class First(child: Expression) extends AlgebraicAggregate {
+
+  override def children: Seq[Expression] = child :: Nil
+
+  override def nullable: Boolean = true
+
+  // First is not a deterministic function.
+  override def deterministic: Boolean = false
+
+  // Return data type.
+  override def dataType: DataType = child.dataType
+
+  // Expected input data type.
+  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)
+
+  private val first = AttributeReference("first", child.dataType)()
+
+  override val bufferAttributes = first :: Nil
+
+  override val initialValues = Seq(
+    /* first = */ Literal.create(null, child.dataType)
+  )
+
+  override val updateExpressions = Seq(
+    /* first = */ If(IsNull(first), child, first)
+  )
+
+  override val mergeExpressions = Seq(
+    /* first = */ If(IsNull(first.left), first.right, first.left)
+  )
+
+  override val evaluateExpression = first
+}
+
+case class Last(child: Expression) extends AlgebraicAggregate {
+
+  override def children: Seq[Expression] = child :: Nil
+
+  override def nullable: Boolean = true
+
+  // Last is not a deterministic function.
+  override def deterministic: Boolean = false
+
+  // Return data type.
+  override def dataType: DataType = child.dataType
+
+  // Expected input data type.
+  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)
+
+  private val last = AttributeReference("last", child.dataType)()
+
+  override val bufferAttributes = last :: Nil
+
+  override val initialValues = Seq(
+    /* last = */ Literal.create(null, child.dataType)
+  )
+
+  override val updateExpressions = Seq(
+    /* last = */ If(IsNull(child), last, child)
+  )
+
+  override val mergeExpressions = Seq(
+    /* last = */ If(IsNull(last.right), last.left, last.right)
+  )
+
+  override val evaluateExpression = last
+}
+
+case class Max(child: Expression) extends AlgebraicAggregate {
+
+  override def children: Seq[Expression] = child :: Nil
+
+  override def nullable: Boolean = true
+
+  // Return data type.
+  override def dataType: DataType = child.dataType
+
+  // Expected input data type.
+  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)
+
+  private val max = AttributeReference("max", child.dataType)()
+
+  override val bufferAttributes = max :: Nil
+
+  override val initialValues = Seq(
+    /* max = */ Literal.create(null, child.dataType)
+  )
+
+  override val updateExpressions = Seq(
+    /* max = */ If(IsNull(child), max, If(IsNull(max), child, Greatest(Seq(max, child))))
+  )
+
+  override val mergeExpressions = {
+    val greatest = Greatest(Seq(max.left, max.right))
+    Seq(
+      /* max = */ If(IsNull(max.right), max.left, If(IsNull(max.left), max.right, greatest))
+    )
+  }
+
+  override val evaluateExpression = max
+}
+
+case class Min(child: Expression) extends AlgebraicAggregate {
+
+  override def children: Seq[Expression] = child :: Nil
+
+  override def nullable: Boolean = true
+
+  // Return data type.
+  override def dataType: DataType = child.dataType
+
+  // Expected input data type.
+  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)
+
+  private val min = AttributeReference("min", child.dataType)()
+
+  override val bufferAttributes = min :: Nil
+
+  override val initialValues = Seq(
+    /* min = */ Literal.create(null, child.dataType)
+  )
+
+  override val updateExpressions = Seq(
+    /* min = */ If(IsNull(child), min, If(IsNull(min), child, Least(Seq(min, child))))
+  )
+
+  override val mergeExpressions = {
+    val least = Least(Seq(min.left, min.right))
+    Seq(
+      /* min = */ If(IsNull(min.right), min.left, If(IsNull(min.left), min.right, least))
+    )
+  }
+
+  override val evaluateExpression = min
+}
+
+case class Sum(child: Expression) extends AlgebraicAggregate {
+
+  override def children: Seq[Expression] = child :: Nil
+
+  override def nullable: Boolean = true
+
+  // Return data type.
+  override def dataType: DataType = resultType
+
+  // Expected input data type.
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(TypeCollection(LongType, DoubleType, DecimalType, NullType))
+
+  private val resultType = child.dataType match {
+    case DecimalType.Fixed(precision, scale) =>
+      DecimalType(precision + 4, scale + 4)
+    case DecimalType.Unlimited => DecimalType.Unlimited
+    case _ => child.dataType
+  }
+
+  private val sumDataType = child.dataType match {
+    case _ @ DecimalType() => DecimalType.Unlimited
+    case _ => child.dataType
+  }
+
+  private val currentSum = AttributeReference("currentSum", sumDataType)()
+
+  private val zero = Cast(Literal(0), sumDataType)
+
+  override val bufferAttributes = currentSum :: Nil
+
+  override val initialValues = Seq(
+    /* currentSum = */ Literal.create(null, sumDataType)
+  )
+
+  override val updateExpressions = Seq(
+    /* currentSum = */
+    Coalesce(Seq(Add(Coalesce(Seq(currentSum, zero)), Cast(child, sumDataType)), currentSum))
+  )
+
+  override val mergeExpressions = {
+    val add = Add(Coalesce(Seq(currentSum.left, zero)), Cast(currentSum.right, sumDataType))
+    Seq(
+      /* currentSum = */
+      Coalesce(Seq(add, currentSum.left))
+    )
+  }
+
+  override val evaluateExpression = Cast(currentSum, resultType)
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
new file mode 100644
index 0000000000000..577ede73cb01f
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.aggregate
+
+import org.apache.spark.sql.catalyst.errors.TreeNodeException
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.types._
+
+/** The mode of an [[AggregateFunction1]]. */
+private[sql] sealed trait AggregateMode
+
+/**
+ * An [[AggregateFunction1]] with [[Partial]] mode is used for partial aggregation.
+ * This function updates the given aggregation buffer with the original input of this
+ * function. When it has processed all input rows, the aggregation buffer is returned.
+ */
+private[sql] case object Partial extends AggregateMode
+
+/**
+ * An [[AggregateFunction1]] with [[PartialMerge]] mode is used to merge aggregation buffers
+ * containing intermediate results for this function.
+ * This function updates the given aggregation buffer by merging multiple aggregation buffers.
+ * When it has processed all input rows, the aggregation buffer is returned.
+ */
+private[sql] case object PartialMerge extends AggregateMode
+
+/**
+ * An [[AggregateFunction1]] with [[PartialMerge]] mode is used to merge aggregation buffers
+ * containing intermediate results for this function and the generate final result.
+ * This function updates the given aggregation buffer by merging multiple aggregation buffers.
+ * When it has processed all input rows, the final result of this function is returned.
+ */
+private[sql] case object Final extends AggregateMode
+
+/**
+ * An [[AggregateFunction2]] with [[Partial]] mode is used to evaluate this function directly
+ * from original input rows without any partial aggregation.
+ * This function updates the given aggregation buffer with the original input of this
+ * function. When it has processed all input rows, the final result of this function is returned.
+ */
+private[sql] case object Complete extends AggregateMode
+
+/**
+ * A place holder expressions used in code-gen, it does not change the corresponding value
+ * in the row.
+ */
+private[sql] case object NoOp extends Expression with Unevaluable {
+  override def nullable: Boolean = true
+  override def eval(input: InternalRow): Any = {
+    throw new TreeNodeException(
+      this, s"No function to evaluate expression. type: ${this.nodeName}")
+  }
+  override def dataType: DataType = NullType
+  override def children: Seq[Expression] = Nil
+}
+
+/**
+ * A container for an [[AggregateFunction2]] with its [[AggregateMode]] and a field
+ * (`isDistinct`) indicating if DISTINCT keyword is specified for this function.
+ * @param aggregateFunction
+ * @param mode
+ * @param isDistinct
+ */
+private[sql] case class AggregateExpression2(
+    aggregateFunction: AggregateFunction2,
+    mode: AggregateMode,
+    isDistinct: Boolean) extends AggregateExpression {
+
+  override def children: Seq[Expression] = aggregateFunction :: Nil
+  override def dataType: DataType = aggregateFunction.dataType
+  override def foldable: Boolean = false
+  override def nullable: Boolean = aggregateFunction.nullable
+
+  override def references: AttributeSet = {
+    val childReferemces = mode match {
+      case Partial | Complete => aggregateFunction.references.toSeq
+      case PartialMerge | Final => aggregateFunction.bufferAttributes
+    }
+
+    AttributeSet(childReferemces)
+  }
+
+  override def toString: String = s"(${aggregateFunction}2,mode=$mode,isDistinct=$isDistinct)"
+}
+
+abstract class AggregateFunction2
+  extends Expression with ImplicitCastInputTypes {
+
+  self: Product =>
+
+  /** An aggregate function is not foldable. */
+  override def foldable: Boolean = false
+
+  /**
+   * The offset of this function's buffer in the underlying buffer shared with other functions.
+   */
+  var bufferOffset: Int = 0
+
+  /** The schema of the aggregation buffer. */
+  def bufferSchema: StructType
+
+  /** Attributes of fields in bufferSchema. */
+  def bufferAttributes: Seq[AttributeReference]
+
+  /** Clones bufferAttributes. */
+  def cloneBufferAttributes: Seq[Attribute]
+
+  /**
+   * Initializes its aggregation buffer located in `buffer`.
+   * It will use bufferOffset to find the starting point of
+   * its buffer in the given `buffer` shared with other functions.
+   */
+  def initialize(buffer: MutableRow): Unit
+
+  /**
+   * Updates its aggregation buffer located in `buffer` based on the given `input`.
+   * It will use bufferOffset to find the starting point of its buffer in the given `buffer`
+   * shared with other functions.
+   */
+  def update(buffer: MutableRow, input: InternalRow): Unit
+
+  /**
+   * Updates its aggregation buffer located in `buffer1` by combining intermediate results
+   * in the current buffer and intermediate results from another buffer `buffer2`.
+   * It will use bufferOffset to find the starting point of its buffer in the given `buffer1`
+   * and `buffer2`.
+   */
+  def merge(buffer1: MutableRow, buffer2: InternalRow): Unit
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
+    throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
+}
+
+/**
+ * A helper class for aggregate functions that can be implemented in terms of catalyst expressions.
+ */
+abstract class AlgebraicAggregate extends AggregateFunction2 with Serializable {
+  self: Product =>
+
+  val initialValues: Seq[Expression]
+  val updateExpressions: Seq[Expression]
+  val mergeExpressions: Seq[Expression]
+  val evaluateExpression: Expression
+
+  override lazy val cloneBufferAttributes = bufferAttributes.map(_.newInstance())
+
+  /**
+   * A helper class for representing an attribute used in merging two
+   * aggregation buffers. When merging two buffers, `bufferLeft` and `bufferRight`,
+   * we merge buffer values and then update bufferLeft. A [[RichAttribute]]
+   * of an [[AttributeReference]] `a` has two functions `left` and `right`,
+   * which represent `a` in `bufferLeft` and `bufferRight`, respectively.
+   * @param a
+   */
+  implicit class RichAttribute(a: AttributeReference) {
+    /** Represents this attribute at the mutable buffer side. */
+    def left: AttributeReference = a
+
+    /** Represents this attribute at the input buffer side (the data value is read-only). */
+    def right: AttributeReference = cloneBufferAttributes(bufferAttributes.indexOf(a))
+  }
+
+  /** An AlgebraicAggregate's bufferSchema is derived from bufferAttributes. */
+  override def bufferSchema: StructType = StructType.fromAttributes(bufferAttributes)
+
+  override def initialize(buffer: MutableRow): Unit = {
+    var i = 0
+    while (i < bufferAttributes.size) {
+      buffer(i + bufferOffset) = initialValues(i).eval()
+      i += 1
+    }
+  }
+
+  override def update(buffer: MutableRow, input: InternalRow): Unit = {
+    throw new UnsupportedOperationException(
+      "AlgebraicAggregate's update should not be called directly")
+  }
+
+  override def merge(buffer1: MutableRow, buffer2: InternalRow): Unit = {
+    throw new UnsupportedOperationException(
+      "AlgebraicAggregate's merge should not be called directly")
+  }
+
+  override def eval(buffer: InternalRow): Any = {
+    throw new UnsupportedOperationException(
+      "AlgebraicAggregate's eval should not be called directly")
+  }
+}
+
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index d705a1286065c..e07c920a41d0a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -27,7 +27,9 @@ import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
 
 
-trait AggregateExpression extends Expression with Unevaluable {
+trait AggregateExpression extends Expression with Unevaluable
+
+trait AggregateExpression1 extends AggregateExpression {
 
   /**
    * Aggregate expressions should not be foldable.
@@ -38,7 +40,7 @@ trait AggregateExpression extends Expression with Unevaluable {
    * Creates a new instance that can be used to compute this aggregate expression for a group
    * of input rows/
    */
-  def newInstance(): AggregateFunction
+  def newInstance(): AggregateFunction1
 }
 
 /**
@@ -54,10 +56,10 @@ case class SplitEvaluation(
     partialEvaluations: Seq[NamedExpression])
 
 /**
- * An [[AggregateExpression]] that can be partially computed without seeing all relevant tuples.
+ * An [[AggregateExpression1]] that can be partially computed without seeing all relevant tuples.
  * These partial evaluations can then be combined to compute the actual answer.
  */
-trait PartialAggregate extends AggregateExpression {
+trait PartialAggregate1 extends AggregateExpression1 {
 
   /**
    * Returns a [[SplitEvaluation]] that computes this aggregation using partial aggregation.
@@ -67,13 +69,13 @@ trait PartialAggregate extends AggregateExpression {
 
 /**
  * A specific implementation of an aggregate function. Used to wrap a generic
- * [[AggregateExpression]] with an algorithm that will be used to compute one specific result.
+ * [[AggregateExpression1]] with an algorithm that will be used to compute one specific result.
  */
-abstract class AggregateFunction
-  extends LeafExpression with AggregateExpression with Serializable {
+abstract class AggregateFunction1
+  extends LeafExpression with AggregateExpression1 with Serializable {
 
   /** Base should return the generic aggregate expression that this function is computing */
-  val base: AggregateExpression
+  val base: AggregateExpression1
 
   override def nullable: Boolean = base.nullable
   override def dataType: DataType = base.dataType
@@ -81,12 +83,12 @@ abstract class AggregateFunction
   def update(input: InternalRow): Unit
 
   // Do we really need this?
-  override def newInstance(): AggregateFunction = {
+  override def newInstance(): AggregateFunction1 = {
     makeCopy(productIterator.map { case a: AnyRef => a }.toArray)
   }
 }
 
-case class Min(child: Expression) extends UnaryExpression with PartialAggregate {
+case class Min(child: Expression) extends UnaryExpression with PartialAggregate1 {
 
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
@@ -102,7 +104,7 @@ case class Min(child: Expression) extends UnaryExpression with PartialAggregate
     TypeUtils.checkForOrderingExpr(child.dataType, "function min")
 }
 
-case class MinFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
+case class MinFunction(expr: Expression, base: AggregateExpression1) extends AggregateFunction1 {
   def this() = this(null, null) // Required for serialization.
 
   val currentMin: MutableLiteral = MutableLiteral(null, expr.dataType)
@@ -119,7 +121,7 @@ case class MinFunction(expr: Expression, base: AggregateExpression) extends Aggr
   override def eval(input: InternalRow): Any = currentMin.value
 }
 
-case class Max(child: Expression) extends UnaryExpression with PartialAggregate {
+case class Max(child: Expression) extends UnaryExpression with PartialAggregate1 {
 
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
@@ -135,7 +137,7 @@ case class Max(child: Expression) extends UnaryExpression with PartialAggregate
     TypeUtils.checkForOrderingExpr(child.dataType, "function max")
 }
 
-case class MaxFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
+case class MaxFunction(expr: Expression, base: AggregateExpression1) extends AggregateFunction1 {
   def this() = this(null, null) // Required for serialization.
 
   val currentMax: MutableLiteral = MutableLiteral(null, expr.dataType)
@@ -152,7 +154,7 @@ case class MaxFunction(expr: Expression, base: AggregateExpression) extends Aggr
   override def eval(input: InternalRow): Any = currentMax.value
 }
 
-case class Count(child: Expression) extends UnaryExpression with PartialAggregate {
+case class Count(child: Expression) extends UnaryExpression with PartialAggregate1 {
 
   override def nullable: Boolean = false
   override def dataType: LongType.type = LongType
@@ -165,7 +167,7 @@ case class Count(child: Expression) extends UnaryExpression with PartialAggregat
   override def newInstance(): CountFunction = new CountFunction(child, this)
 }
 
-case class CountFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
+case class CountFunction(expr: Expression, base: AggregateExpression1) extends AggregateFunction1 {
   def this() = this(null, null) // Required for serialization.
 
   var count: Long = _
@@ -180,7 +182,7 @@ case class CountFunction(expr: Expression, base: AggregateExpression) extends Ag
   override def eval(input: InternalRow): Any = count
 }
 
-case class CountDistinct(expressions: Seq[Expression]) extends PartialAggregate {
+case class CountDistinct(expressions: Seq[Expression]) extends PartialAggregate1 {
   def this() = this(null)
 
   override def children: Seq[Expression] = expressions
@@ -200,8 +202,8 @@ case class CountDistinct(expressions: Seq[Expression]) extends PartialAggregate
 
 case class CountDistinctFunction(
     @transient expr: Seq[Expression],
-    @transient base: AggregateExpression)
-  extends AggregateFunction {
+    @transient base: AggregateExpression1)
+  extends AggregateFunction1 {
 
   def this() = this(null, null) // Required for serialization.
 
@@ -220,7 +222,7 @@ case class CountDistinctFunction(
   override def eval(input: InternalRow): Any = seen.size.toLong
 }
 
-case class CollectHashSet(expressions: Seq[Expression]) extends AggregateExpression {
+case class CollectHashSet(expressions: Seq[Expression]) extends AggregateExpression1 {
   def this() = this(null)
 
   override def children: Seq[Expression] = expressions
@@ -233,8 +235,8 @@ case class CollectHashSet(expressions: Seq[Expression]) extends AggregateExpress
 
 case class CollectHashSetFunction(
     @transient expr: Seq[Expression],
-    @transient base: AggregateExpression)
-  extends AggregateFunction {
+    @transient base: AggregateExpression1)
+  extends AggregateFunction1 {
 
   def this() = this(null, null) // Required for serialization.
 
@@ -255,7 +257,7 @@ case class CollectHashSetFunction(
   }
 }
 
-case class CombineSetsAndCount(inputSet: Expression) extends AggregateExpression {
+case class CombineSetsAndCount(inputSet: Expression) extends AggregateExpression1 {
   def this() = this(null)
 
   override def children: Seq[Expression] = inputSet :: Nil
@@ -269,8 +271,8 @@ case class CombineSetsAndCount(inputSet: Expression) extends AggregateExpression
 
 case class CombineSetsAndCountFunction(
     @transient inputSet: Expression,
-    @transient base: AggregateExpression)
-  extends AggregateFunction {
+    @transient base: AggregateExpression1)
+  extends AggregateFunction1 {
 
   def this() = this(null, null) // Required for serialization.
 
@@ -305,7 +307,7 @@ private[sql] case object HyperLogLogUDT extends UserDefinedType[HyperLogLog] {
 }
 
 case class ApproxCountDistinctPartition(child: Expression, relativeSD: Double)
-  extends UnaryExpression with AggregateExpression {
+  extends UnaryExpression with AggregateExpression1 {
 
   override def nullable: Boolean = false
   override def dataType: DataType = HyperLogLogUDT
@@ -317,9 +319,9 @@ case class ApproxCountDistinctPartition(child: Expression, relativeSD: Double)
 
 case class ApproxCountDistinctPartitionFunction(
     expr: Expression,
-    base: AggregateExpression,
+    base: AggregateExpression1,
     relativeSD: Double)
-  extends AggregateFunction {
+  extends AggregateFunction1 {
   def this() = this(null, null, 0) // Required for serialization.
 
   private val hyperLogLog = new HyperLogLog(relativeSD)
@@ -335,7 +337,7 @@ case class ApproxCountDistinctPartitionFunction(
 }
 
 case class ApproxCountDistinctMerge(child: Expression, relativeSD: Double)
-  extends UnaryExpression with AggregateExpression {
+  extends UnaryExpression with AggregateExpression1 {
 
   override def nullable: Boolean = false
   override def dataType: LongType.type = LongType
@@ -347,9 +349,9 @@ case class ApproxCountDistinctMerge(child: Expression, relativeSD: Double)
 
 case class ApproxCountDistinctMergeFunction(
     expr: Expression,
-    base: AggregateExpression,
+    base: AggregateExpression1,
     relativeSD: Double)
-  extends AggregateFunction {
+  extends AggregateFunction1 {
   def this() = this(null, null, 0) // Required for serialization.
 
   private val hyperLogLog = new HyperLogLog(relativeSD)
@@ -363,7 +365,7 @@ case class ApproxCountDistinctMergeFunction(
 }
 
 case class ApproxCountDistinct(child: Expression, relativeSD: Double = 0.05)
-  extends UnaryExpression with PartialAggregate {
+  extends UnaryExpression with PartialAggregate1 {
 
   override def nullable: Boolean = false
   override def dataType: LongType.type = LongType
@@ -381,7 +383,7 @@ case class ApproxCountDistinct(child: Expression, relativeSD: Double = 0.05)
   override def newInstance(): CountDistinctFunction = new CountDistinctFunction(child :: Nil, this)
 }
 
-case class Average(child: Expression) extends UnaryExpression with PartialAggregate {
+case class Average(child: Expression) extends UnaryExpression with PartialAggregate1 {
 
   override def prettyName: String = "avg"
 
@@ -427,8 +429,8 @@ case class Average(child: Expression) extends UnaryExpression with PartialAggreg
     TypeUtils.checkForNumericExpr(child.dataType, "function average")
 }
 
-case class AverageFunction(expr: Expression, base: AggregateExpression)
-  extends AggregateFunction {
+case class AverageFunction(expr: Expression, base: AggregateExpression1)
+  extends AggregateFunction1 {
 
   def this() = this(null, null) // Required for serialization.
 
@@ -474,7 +476,7 @@ case class AverageFunction(expr: Expression, base: AggregateExpression)
   }
 }
 
-case class Sum(child: Expression) extends UnaryExpression with PartialAggregate {
+case class Sum(child: Expression) extends UnaryExpression with PartialAggregate1 {
 
   override def nullable: Boolean = true
 
@@ -509,7 +511,7 @@ case class Sum(child: Expression) extends UnaryExpression with PartialAggregate
     TypeUtils.checkForNumericExpr(child.dataType, "function sum")
 }
 
-case class SumFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
+case class SumFunction(expr: Expression, base: AggregateExpression1) extends AggregateFunction1 {
   def this() = this(null, null) // Required for serialization.
 
   private val calcType =
@@ -554,7 +556,7 @@ case class SumFunction(expr: Expression, base: AggregateExpression) extends Aggr
  *          <-- null         <-- no data
  * null     <-- null         <-- no data
  */
-case class CombineSum(child: Expression) extends AggregateExpression {
+case class CombineSum(child: Expression) extends AggregateExpression1 {
   def this() = this(null)
 
   override def children: Seq[Expression] = child :: Nil
@@ -564,8 +566,8 @@ case class CombineSum(child: Expression) extends AggregateExpression {
   override def newInstance(): CombineSumFunction = new CombineSumFunction(child, this)
 }
 
-case class CombineSumFunction(expr: Expression, base: AggregateExpression)
-  extends AggregateFunction {
+case class CombineSumFunction(expr: Expression, base: AggregateExpression1)
+  extends AggregateFunction1 {
 
   def this() = this(null, null) // Required for serialization.
 
@@ -601,7 +603,7 @@ case class CombineSumFunction(expr: Expression, base: AggregateExpression)
   }
 }
 
-case class SumDistinct(child: Expression) extends UnaryExpression with PartialAggregate {
+case class SumDistinct(child: Expression) extends UnaryExpression with PartialAggregate1 {
 
   def this() = this(null)
   override def nullable: Boolean = true
@@ -627,8 +629,8 @@ case class SumDistinct(child: Expression) extends UnaryExpression with PartialAg
     TypeUtils.checkForNumericExpr(child.dataType, "function sumDistinct")
 }
 
-case class SumDistinctFunction(expr: Expression, base: AggregateExpression)
-  extends AggregateFunction {
+case class SumDistinctFunction(expr: Expression, base: AggregateExpression1)
+  extends AggregateFunction1 {
 
   def this() = this(null, null) // Required for serialization.
 
@@ -653,7 +655,7 @@ case class SumDistinctFunction(expr: Expression, base: AggregateExpression)
   }
 }
 
-case class CombineSetsAndSum(inputSet: Expression, base: Expression) extends AggregateExpression {
+case class CombineSetsAndSum(inputSet: Expression, base: Expression) extends AggregateExpression1 {
   def this() = this(null, null)
 
   override def children: Seq[Expression] = inputSet :: Nil
@@ -667,8 +669,8 @@ case class CombineSetsAndSum(inputSet: Expression, base: Expression) extends Agg
 
 case class CombineSetsAndSumFunction(
     @transient inputSet: Expression,
-    @transient base: AggregateExpression)
-  extends AggregateFunction {
+    @transient base: AggregateExpression1)
+  extends AggregateFunction1 {
 
   def this() = this(null, null) // Required for serialization.
 
@@ -695,7 +697,7 @@ case class CombineSetsAndSumFunction(
   }
 }
 
-case class First(child: Expression) extends UnaryExpression with PartialAggregate {
+case class First(child: Expression) extends UnaryExpression with PartialAggregate1 {
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
   override def toString: String = s"FIRST($child)"
@@ -709,7 +711,7 @@ case class First(child: Expression) extends UnaryExpression with PartialAggregat
   override def newInstance(): FirstFunction = new FirstFunction(child, this)
 }
 
-case class FirstFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
+case class FirstFunction(expr: Expression, base: AggregateExpression1) extends AggregateFunction1 {
   def this() = this(null, null) // Required for serialization.
 
   var result: Any = null
@@ -723,7 +725,7 @@ case class FirstFunction(expr: Expression, base: AggregateExpression) extends Ag
   override def eval(input: InternalRow): Any = result
 }
 
-case class Last(child: Expression) extends UnaryExpression with PartialAggregate {
+case class Last(child: Expression) extends UnaryExpression with PartialAggregate1 {
   override def references: AttributeSet = child.references
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType
@@ -738,7 +740,7 @@ case class Last(child: Expression) extends UnaryExpression with PartialAggregate
   override def newInstance(): LastFunction = new LastFunction(child, this)
 }
 
-case class LastFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
+case class LastFunction(expr: Expression, base: AggregateExpression1) extends AggregateFunction1 {
   def this() = this(null, null) // Required for serialization.
 
   var result: Any = null
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index 03b4b3c216f49..d838268f46956 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp
 
 import scala.collection.mutable.ArrayBuffer
 
@@ -38,15 +39,17 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
 
   protected def create(expressions: Seq[Expression]): (() => MutableProjection) = {
     val ctx = newCodeGenContext()
-    val projectionCode = expressions.zipWithIndex.map { case (e, i) =>
-      val evaluationCode = e.gen(ctx)
-      evaluationCode.code +
-        s"""
-          if(${evaluationCode.isNull})
-            mutableRow.setNullAt($i);
-          else
-            ${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)};
-        """
+    val projectionCode = expressions.zipWithIndex.map {
+      case (NoOp, _) => ""
+      case (e, i) =>
+        val evaluationCode = e.gen(ctx)
+        evaluationCode.code +
+          s"""
+            if(${evaluationCode.isNull})
+              mutableRow.setNullAt($i);
+            else
+              ${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)};
+          """
     }
     // collect projections into blocks as function has 64kb codesize limit in JVM
     val projectionBlocks = new ArrayBuffer[String]()
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
index 179a348d5baac..b8e3b0d53a505 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
@@ -129,10 +129,10 @@ object PartialAggregation {
     case logical.Aggregate(groupingExpressions, aggregateExpressions, child) =>
       // Collect all aggregate expressions.
       val allAggregates =
-        aggregateExpressions.flatMap(_ collect { case a: AggregateExpression => a})
+        aggregateExpressions.flatMap(_ collect { case a: AggregateExpression1 => a})
       // Collect all aggregate expressions that can be computed partially.
       val partialAggregates =
-        aggregateExpressions.flatMap(_ collect { case p: PartialAggregate => p})
+        aggregateExpressions.flatMap(_ collect { case p: PartialAggregate1 => p})
 
       // Only do partial aggregation if supported by all aggregate expressions.
       if (allAggregates.size == partialAggregates.size) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index 986c315b3173a..6aefa9f67556a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.plans.logical
 
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression2
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 78c780bdc5797..1474b170ba896 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -402,6 +402,9 @@ private[spark] object SQLConf {
     defaultValue = Some(true),
     isPublic = false)
 
+  val USE_SQL_AGGREGATE2 = booleanConf("spark.sql.useAggregate2",
+    defaultValue = Some(true), doc = "<TODO>")
+
   val USE_SQL_SERIALIZER2 = booleanConf(
     "spark.sql.useSerializer2",
     defaultValue = Some(true), isPublic = false)
@@ -473,6 +476,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
 
   private[spark] def unsafeEnabled: Boolean = getConf(UNSAFE_ENABLED)
 
+  private[spark] def useSqlAggregate2: Boolean = getConf(USE_SQL_AGGREGATE2)
+
   private[spark] def useSqlSerializer2: Boolean = getConf(USE_SQL_SERIALIZER2)
 
   private[spark] def autoBroadcastJoinThreshold: Int = getConf(AUTO_BROADCASTJOIN_THRESHOLD)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 8b4528b5d52fe..49bfe74b680af 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -285,6 +285,9 @@ class SQLContext(@transient val sparkContext: SparkContext)
   @transient
   val udf: UDFRegistration = new UDFRegistration(this)
 
+  @transient
+  val udaf: UDAFRegistration = new UDAFRegistration(this)
+
   /**
    * Returns true if the table is currently cached in-memory.
    * @group cachemgmt
@@ -863,6 +866,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
       DDLStrategy ::
       TakeOrderedAndProject ::
       HashAggregation ::
+      Aggregation ::
       LeftSemiJoin ::
       HashJoin ::
       InMemoryScans ::
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala
new file mode 100644
index 0000000000000..5b872f5e3eecd
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.expressions.{Expression}
+import org.apache.spark.sql.expressions.aggregate.{ScalaUDAF, UserDefinedAggregateFunction}
+
+class UDAFRegistration private[sql] (sqlContext: SQLContext) extends Logging {
+
+  private val functionRegistry = sqlContext.functionRegistry
+
+  def register(
+      name: String,
+      func: UserDefinedAggregateFunction): UserDefinedAggregateFunction = {
+    def builder(children: Seq[Expression]) = ScalaUDAF(children, func)
+    functionRegistry.registerFunction(name, builder)
+    func
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
index 3cd60a2aa55ed..c2c945321db95 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
@@ -68,14 +68,14 @@ case class Aggregate(
    *                        output.
    */
   case class ComputedAggregate(
-      unbound: AggregateExpression,
-      aggregate: AggregateExpression,
+      unbound: AggregateExpression1,
+      aggregate: AggregateExpression1,
       resultAttribute: AttributeReference)
 
   /** A list of aggregates that need to be computed for each group. */
   private[this] val computedAggregates = aggregateExpressions.flatMap { agg =>
     agg.collect {
-      case a: AggregateExpression =>
+      case a: AggregateExpression1 =>
         ComputedAggregate(
           a,
           BindReferences.bindReference(a, child.output),
@@ -87,8 +87,8 @@ case class Aggregate(
   private[this] val computedSchema = computedAggregates.map(_.resultAttribute)
 
   /** Creates a new aggregate buffer for a group. */
-  private[this] def newAggregateBuffer(): Array[AggregateFunction] = {
-    val buffer = new Array[AggregateFunction](computedAggregates.length)
+  private[this] def newAggregateBuffer(): Array[AggregateFunction1] = {
+    val buffer = new Array[AggregateFunction1](computedAggregates.length)
     var i = 0
     while (i < computedAggregates.length) {
       buffer(i) = computedAggregates(i).aggregate.newInstance()
@@ -146,7 +146,7 @@ case class Aggregate(
       }
     } else {
       child.execute().mapPartitions { iter =>
-        val hashTable = new HashMap[InternalRow, Array[AggregateFunction]]
+        val hashTable = new HashMap[InternalRow, Array[AggregateFunction1]]
         val groupingProjection = new InterpretedMutableProjection(groupingExpressions, child.output)
 
         var currentRow: InternalRow = null
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index 2750053594f99..d31e265a293e9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -247,8 +247,15 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
         }
 
         def addSortIfNecessary(child: SparkPlan): SparkPlan = {
-          if (rowOrdering.nonEmpty && child.outputOrdering != rowOrdering) {
-            sqlContext.planner.BasicOperators.getSortOperator(rowOrdering, global = false, child)
+
+          if (rowOrdering.nonEmpty) {
+            // If child.outputOrdering is [a, b] and rowOrdering is [a], we do not need to sort.
+            val minSize = Seq(rowOrdering.size, child.outputOrdering.size).min
+            if (minSize == 0 || rowOrdering.take(minSize) != child.outputOrdering.take(minSize)) {
+              sqlContext.planner.BasicOperators.getSortOperator(rowOrdering, global = false, child)
+            } else {
+              child
+            }
           } else {
             child
           }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index ecde9c57139a6..0e63f2fe29cb3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -69,7 +69,7 @@ case class GeneratedAggregate(
 
   protected override def doExecute(): RDD[InternalRow] = {
     val aggregatesToCompute = aggregateExpressions.flatMap { a =>
-      a.collect { case agg: AggregateExpression => agg}
+      a.collect { case agg: AggregateExpression1 => agg}
     }
 
     // If you add any new function support, please add tests in org.apache.spark.sql.SQLQuerySuite
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 8cef7f200d2dc..f54aa2027f6a6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import org.apache.spark.sql.{SQLContext, Strategy, execution}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression2}
 import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical.{BroadcastHint, LogicalPlan}
@@ -148,7 +149,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
              if canBeCodeGened(
                   allAggregates(partialComputation) ++
                   allAggregates(rewrittenAggregateExpressions)) &&
-               codegenEnabled =>
+               codegenEnabled &&
+               !canBeConvertedToNewAggregation(plan) =>
           execution.GeneratedAggregate(
             partial = false,
             namedGroupingAttributes,
@@ -167,7 +169,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
              rewrittenAggregateExpressions,
              groupingExpressions,
              partialComputation,
-             child) =>
+             child) if !canBeConvertedToNewAggregation(plan) =>
         execution.Aggregate(
           partial = false,
           namedGroupingAttributes,
@@ -181,7 +183,14 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case _ => Nil
     }
 
-    def canBeCodeGened(aggs: Seq[AggregateExpression]): Boolean = !aggs.exists {
+    def canBeConvertedToNewAggregation(plan: LogicalPlan): Boolean = {
+      aggregate.Utils.tryConvert(
+        plan,
+        sqlContext.conf.useSqlAggregate2,
+        sqlContext.conf.codegenEnabled).isDefined
+    }
+
+    def canBeCodeGened(aggs: Seq[AggregateExpression1]): Boolean = !aggs.exists {
       case _: CombineSum | _: Sum | _: Count | _: Max | _: Min |  _: CombineSetsAndCount => false
       // The generated set implementation is pretty limited ATM.
       case CollectHashSet(exprs) if exprs.size == 1  &&
@@ -189,10 +198,74 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case _ => true
     }
 
-    def allAggregates(exprs: Seq[Expression]): Seq[AggregateExpression] =
-      exprs.flatMap(_.collect { case a: AggregateExpression => a })
+    def allAggregates(exprs: Seq[Expression]): Seq[AggregateExpression1] =
+      exprs.flatMap(_.collect { case a: AggregateExpression1 => a })
   }
 
+  /**
+   * Used to plan the aggregate operator for expressions based on the AggregateFunction2 interface.
+   */
+  object Aggregation extends Strategy {
+    def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+      case p: logical.Aggregate =>
+        val converted =
+          aggregate.Utils.tryConvert(
+            p,
+            sqlContext.conf.useSqlAggregate2,
+            sqlContext.conf.codegenEnabled)
+        converted match {
+          case None => Nil // Cannot convert to new aggregation code path.
+          case Some(logical.Aggregate(groupingExpressions, resultExpressions, child)) =>
+            // Extracts all distinct aggregate expressions from the resultExpressions.
+            val aggregateExpressions = resultExpressions.flatMap { expr =>
+              expr.collect {
+                case agg: AggregateExpression2 => agg
+              }
+            }.toSet.toSeq
+            // For those distinct aggregate expressions, we create a map from the
+            // aggregate function to the corresponding attribute of the function.
+            val aggregateFunctionMap = aggregateExpressions.map { agg =>
+              val aggregateFunction = agg.aggregateFunction
+              (aggregateFunction, agg.isDistinct) ->
+                Alias(aggregateFunction, aggregateFunction.toString)().toAttribute
+            }.toMap
+
+            val (functionsWithDistinct, functionsWithoutDistinct) =
+              aggregateExpressions.partition(_.isDistinct)
+            if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) {
+              // This is a sanity check. We should not reach here when we have multiple distinct
+              // column sets (aggregate.NewAggregation will not match).
+              sys.error(
+                "Multiple distinct column sets are not supported by the new aggregation" +
+                  "code path.")
+            }
+
+            val aggregateOperator =
+              if (functionsWithDistinct.isEmpty) {
+                aggregate.Utils.planAggregateWithoutDistinct(
+                  groupingExpressions,
+                  aggregateExpressions,
+                  aggregateFunctionMap,
+                  resultExpressions,
+                  planLater(child))
+              } else {
+                aggregate.Utils.planAggregateWithOneDistinct(
+                  groupingExpressions,
+                  functionsWithDistinct,
+                  functionsWithoutDistinct,
+                  aggregateFunctionMap,
+                  resultExpressions,
+                  planLater(child))
+              }
+
+            aggregateOperator
+        }
+
+      case _ => Nil
+    }
+  }
+
+
   object BroadcastNestedLoopJoin extends Strategy {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case logical.Join(left, right, joinType, condition) =>
@@ -336,8 +409,21 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         execution.Filter(condition, planLater(child)) :: Nil
       case e @ logical.Expand(_, _, _, child) =>
         execution.Expand(e.projections, e.output, planLater(child)) :: Nil
-      case logical.Aggregate(group, agg, child) =>
-        execution.Aggregate(partial = false, group, agg, planLater(child)) :: Nil
+      case a @ logical.Aggregate(group, agg, child) => {
+        val useNewAggregation =
+          aggregate.Utils.tryConvert(
+            a,
+            sqlContext.conf.useSqlAggregate2,
+            sqlContext.conf.codegenEnabled).isDefined
+        if (useNewAggregation) {
+          // If this logical.Aggregate can be planned to use new aggregation code path
+          // (i.e. it can be planned by the Strategy Aggregation), we will not use the old
+          // aggregation code path.
+          Nil
+        } else {
+          execution.Aggregate(partial = false, group, agg, planLater(child)) :: Nil
+        }
+      }
       case logical.Window(projectList, windowExpressions, spec, child) =>
         execution.Window(projectList, windowExpressions, spec, planLater(child)) :: Nil
       case logical.Sample(lb, ub, withReplacement, seed, child) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/aggregateOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/aggregateOperators.scala
new file mode 100644
index 0000000000000..0c9082897f390
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/aggregateOperators.scala
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.aggregate
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.errors._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate._
+import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, UnspecifiedDistribution}
+import org.apache.spark.sql.execution.{SparkPlan, UnaryNode}
+
+case class Aggregate2Sort(
+    requiredChildDistributionExpressions: Option[Seq[Expression]],
+    groupingExpressions: Seq[NamedExpression],
+    aggregateExpressions: Seq[AggregateExpression2],
+    aggregateAttributes: Seq[Attribute],
+    resultExpressions: Seq[NamedExpression],
+    child: SparkPlan)
+  extends UnaryNode {
+
+  override def canProcessUnsafeRows: Boolean = true
+
+  override def references: AttributeSet = {
+    val referencesInResults =
+      AttributeSet(resultExpressions.flatMap(_.references)) -- AttributeSet(aggregateAttributes)
+
+    AttributeSet(
+      groupingExpressions.flatMap(_.references) ++
+      aggregateExpressions.flatMap(_.references) ++
+      referencesInResults)
+  }
+
+  override def requiredChildDistribution: List[Distribution] = {
+    requiredChildDistributionExpressions match {
+      case Some(exprs) if exprs.length == 0 => AllTuples :: Nil
+      case Some(exprs) if exprs.length > 0 => ClusteredDistribution(exprs) :: Nil
+      case None => UnspecifiedDistribution :: Nil
+    }
+  }
+
+  override def requiredChildOrdering: Seq[Seq[SortOrder]] = {
+    // TODO: We should not sort the input rows if they are just in reversed order.
+    groupingExpressions.map(SortOrder(_, Ascending)) :: Nil
+  }
+
+  override def outputOrdering: Seq[SortOrder] = {
+    // It is possible that the child.outputOrdering starts with the required
+    // ordering expressions (e.g. we require [a] as the sort expression and the
+    // child's outputOrdering is [a, b]). We can only guarantee the output rows
+    // are sorted by values of groupingExpressions.
+    groupingExpressions.map(SortOrder(_, Ascending))
+  }
+
+  override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)
+
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
+    child.execute().mapPartitions { iter =>
+      if (aggregateExpressions.length == 0) {
+        new GroupingIterator(
+          groupingExpressions,
+          resultExpressions,
+          newMutableProjection,
+          child.output,
+          iter)
+      } else {
+        val aggregationIterator: SortAggregationIterator = {
+          aggregateExpressions.map(_.mode).distinct.toList match {
+            case Partial :: Nil =>
+              new PartialSortAggregationIterator(
+                groupingExpressions,
+                aggregateExpressions,
+                newMutableProjection,
+                child.output,
+                iter)
+            case PartialMerge :: Nil =>
+              new PartialMergeSortAggregationIterator(
+                groupingExpressions,
+                aggregateExpressions,
+                newMutableProjection,
+                child.output,
+                iter)
+            case Final :: Nil =>
+              new FinalSortAggregationIterator(
+                groupingExpressions,
+                aggregateExpressions,
+                aggregateAttributes,
+                resultExpressions,
+                newMutableProjection,
+                child.output,
+                iter)
+            case other =>
+              sys.error(
+                s"Could not evaluate ${aggregateExpressions} because we do not support evaluate " +
+                  s"modes $other in this operator.")
+          }
+        }
+
+        aggregationIterator
+      }
+    }
+  }
+}
+
+case class FinalAndCompleteAggregate2Sort(
+    previousGroupingExpressions: Seq[NamedExpression],
+    groupingExpressions: Seq[NamedExpression],
+    finalAggregateExpressions: Seq[AggregateExpression2],
+    finalAggregateAttributes: Seq[Attribute],
+    completeAggregateExpressions: Seq[AggregateExpression2],
+    completeAggregateAttributes: Seq[Attribute],
+    resultExpressions: Seq[NamedExpression],
+    child: SparkPlan)
+  extends UnaryNode {
+  override def references: AttributeSet = {
+    val referencesInResults =
+      AttributeSet(resultExpressions.flatMap(_.references)) --
+        AttributeSet(finalAggregateExpressions) --
+        AttributeSet(completeAggregateExpressions)
+
+    AttributeSet(
+      groupingExpressions.flatMap(_.references) ++
+        finalAggregateExpressions.flatMap(_.references) ++
+        completeAggregateExpressions.flatMap(_.references) ++
+        referencesInResults)
+  }
+
+  override def requiredChildDistribution: List[Distribution] = {
+    if (groupingExpressions.isEmpty) {
+      AllTuples :: Nil
+    } else {
+      ClusteredDistribution(groupingExpressions) :: Nil
+    }
+  }
+
+  override def requiredChildOrdering: Seq[Seq[SortOrder]] =
+    groupingExpressions.map(SortOrder(_, Ascending)) :: Nil
+
+  override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)
+
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
+    child.execute().mapPartitions { iter =>
+
+      new FinalAndCompleteSortAggregationIterator(
+        previousGroupingExpressions.length,
+        groupingExpressions,
+        finalAggregateExpressions,
+        finalAggregateAttributes,
+        completeAggregateExpressions,
+        completeAggregateAttributes,
+        resultExpressions,
+        newMutableProjection,
+        child.output,
+        iter)
+    }
+  }
+
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
new file mode 100644
index 0000000000000..ce1cbdc9cb090
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
@@ -0,0 +1,749 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.aggregate
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate._
+import org.apache.spark.sql.types.NullType
+
+import scala.collection.mutable.ArrayBuffer
+
+/**
+ * An iterator used to evaluate aggregate functions. It assumes that input rows
+ * are already grouped by values of `groupingExpressions`.
+ */
+private[sql] abstract class SortAggregationIterator(
+    groupingExpressions: Seq[NamedExpression],
+    aggregateExpressions: Seq[AggregateExpression2],
+    newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection),
+    inputAttributes: Seq[Attribute],
+    inputIter: Iterator[InternalRow])
+  extends Iterator[InternalRow] {
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Static fields for this iterator
+  ///////////////////////////////////////////////////////////////////////////
+
+  protected val aggregateFunctions: Array[AggregateFunction2] = {
+    var bufferOffset = initialBufferOffset
+    val functions = new Array[AggregateFunction2](aggregateExpressions.length)
+    var i = 0
+    while (i < aggregateExpressions.length) {
+      val func = aggregateExpressions(i).aggregateFunction
+      val funcWithBoundReferences = aggregateExpressions(i).mode match {
+        case Partial | Complete if !func.isInstanceOf[AlgebraicAggregate] =>
+          // We need to create BoundReferences if the function is not an
+          // AlgebraicAggregate (it does not support code-gen) and the mode of
+          // this function is Partial or Complete because we will call eval of this
+          // function's children in the update method of this aggregate function.
+          // Those eval calls require BoundReferences to work.
+          BindReferences.bindReference(func, inputAttributes)
+        case _ => func
+      }
+      // Set bufferOffset for this function. It is important that setting bufferOffset
+      // happens after all potential bindReference operations because bindReference
+      // will create a new instance of the function.
+      funcWithBoundReferences.bufferOffset = bufferOffset
+      bufferOffset += funcWithBoundReferences.bufferSchema.length
+      functions(i) = funcWithBoundReferences
+      i += 1
+    }
+    functions
+  }
+
+  // All non-algebraic aggregate functions.
+  protected val nonAlgebraicAggregateFunctions: Array[AggregateFunction2] = {
+    aggregateFunctions.collect {
+      case func: AggregateFunction2 if !func.isInstanceOf[AlgebraicAggregate] => func
+    }.toArray
+  }
+
+  // Positions of those non-algebraic aggregate functions in aggregateFunctions.
+  // For example, we have func1, func2, func3, func4 in aggregateFunctions, and
+  // func2 and func3 are non-algebraic aggregate functions.
+  // nonAlgebraicAggregateFunctionPositions will be [1, 2].
+  protected val nonAlgebraicAggregateFunctionPositions: Array[Int] = {
+    val positions = new ArrayBuffer[Int]()
+    var i = 0
+    while (i < aggregateFunctions.length) {
+      aggregateFunctions(i) match {
+        case agg: AlgebraicAggregate =>
+        case _ => positions += i
+      }
+      i += 1
+    }
+    positions.toArray
+  }
+
+  // This is used to project expressions for the grouping expressions.
+  protected val groupGenerator =
+    newMutableProjection(groupingExpressions, inputAttributes)()
+
+  // The underlying buffer shared by all aggregate functions.
+  protected val buffer: MutableRow = {
+    // The number of elements of the underlying buffer of this operator.
+    // All aggregate functions are sharing this underlying buffer and they find their
+    // buffer values through bufferOffset.
+    var size = initialBufferOffset
+    var i = 0
+    while (i < aggregateFunctions.length) {
+      size += aggregateFunctions(i).bufferSchema.length
+      i += 1
+    }
+    new GenericMutableRow(size)
+  }
+
+  protected val joinedRow = new JoinedRow4
+
+  protected val placeholderExpressions = Seq.fill(initialBufferOffset)(NoOp)
+
+  // This projection is used to initialize buffer values for all AlgebraicAggregates.
+  protected val algebraicInitialProjection = {
+    val initExpressions = placeholderExpressions ++ aggregateFunctions.flatMap {
+      case ae: AlgebraicAggregate => ae.initialValues
+      case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
+    }
+    newMutableProjection(initExpressions, Nil)().target(buffer)
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Mutable states
+  ///////////////////////////////////////////////////////////////////////////
+
+  // The partition key of the current partition.
+  protected var currentGroupingKey: InternalRow = _
+  // The partition key of next partition.
+  protected var nextGroupingKey: InternalRow = _
+  // The first row of next partition.
+  protected var firstRowInNextGroup: InternalRow = _
+  // Indicates if we has new group of rows to process.
+  protected var hasNewGroup: Boolean = true
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Private methods
+  ///////////////////////////////////////////////////////////////////////////
+
+  /** Initializes buffer values for all aggregate functions. */
+  protected def initializeBuffer(): Unit = {
+    algebraicInitialProjection(EmptyRow)
+    var i = 0
+    while (i < nonAlgebraicAggregateFunctions.length) {
+      nonAlgebraicAggregateFunctions(i).initialize(buffer)
+      i += 1
+    }
+  }
+
+  protected def initialize(): Unit = {
+    if (inputIter.hasNext) {
+      initializeBuffer()
+      val currentRow = inputIter.next().copy()
+      // partitionGenerator is a mutable projection. Since we need to track nextGroupingKey,
+      // we are making a copy at here.
+      nextGroupingKey = groupGenerator(currentRow).copy()
+      firstRowInNextGroup = currentRow
+    } else {
+      // This iter is an empty one.
+      hasNewGroup = false
+    }
+  }
+
+  /** Processes rows in the current group. It will stop when it find a new group. */
+  private def processCurrentGroup(): Unit = {
+    currentGroupingKey = nextGroupingKey
+    // Now, we will start to find all rows belonging to this group.
+    // We create a variable to track if we see the next group.
+    var findNextPartition = false
+    // firstRowInNextGroup is the first row of this group. We first process it.
+    processRow(firstRowInNextGroup)
+    // The search will stop when we see the next group or there is no
+    // input row left in the iter.
+    while (inputIter.hasNext && !findNextPartition) {
+      val currentRow = inputIter.next()
+      // Get the grouping key based on the grouping expressions.
+      // For the below compare method, we do not need to make a copy of groupingKey.
+      val groupingKey = groupGenerator(currentRow)
+      // Check if the current row belongs the current input row.
+      currentGroupingKey.equals(groupingKey)
+
+      if (currentGroupingKey == groupingKey) {
+        processRow(currentRow)
+      } else {
+        // We find a new group.
+        findNextPartition = true
+        nextGroupingKey = groupingKey.copy()
+        firstRowInNextGroup = currentRow.copy()
+      }
+    }
+    // We have not seen a new group. It means that there is no new row in the input
+    // iter. The current group is the last group of the iter.
+    if (!findNextPartition) {
+      hasNewGroup = false
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Public methods
+  ///////////////////////////////////////////////////////////////////////////
+
+  override final def hasNext: Boolean = hasNewGroup
+
+  override final def next(): InternalRow = {
+    if (hasNext) {
+      // Process the current group.
+      processCurrentGroup()
+      // Generate output row for the current group.
+      val outputRow = generateOutput()
+      // Initilize buffer values for the next group.
+      initializeBuffer()
+
+      outputRow
+    } else {
+      // no more result
+      throw new NoSuchElementException
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Methods that need to be implemented
+  ///////////////////////////////////////////////////////////////////////////
+
+  protected def initialBufferOffset: Int
+
+  protected def processRow(row: InternalRow): Unit
+
+  protected def generateOutput(): InternalRow
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Initialize this iterator
+  ///////////////////////////////////////////////////////////////////////////
+
+  initialize()
+}
+
+/**
+ * An iterator only used to group input rows according to values of `groupingExpressions`.
+ * It assumes that input rows are already grouped by values of `groupingExpressions`.
+ */
+class GroupingIterator(
+    groupingExpressions: Seq[NamedExpression],
+    resultExpressions: Seq[NamedExpression],
+    newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection),
+    inputAttributes: Seq[Attribute],
+    inputIter: Iterator[InternalRow])
+  extends SortAggregationIterator(
+    groupingExpressions,
+    Nil,
+    newMutableProjection,
+    inputAttributes,
+    inputIter) {
+
+  private val resultProjection =
+    newMutableProjection(resultExpressions, groupingExpressions.map(_.toAttribute))()
+
+  override protected def initialBufferOffset: Int = 0
+
+  override protected def processRow(row: InternalRow): Unit = {
+    // Since we only do grouping, there is nothing to do at here.
+  }
+
+  override protected def generateOutput(): InternalRow = {
+    resultProjection(currentGroupingKey)
+  }
+}
+
+/**
+ * An iterator used to do partial aggregations (for those aggregate functions with mode Partial).
+ * It assumes that input rows are already grouped by values of `groupingExpressions`.
+ * The format of its output rows is:
+ * |groupingExpr1|...|groupingExprN|aggregationBuffer1|...|aggregationBufferN|
+ */
+class PartialSortAggregationIterator(
+    groupingExpressions: Seq[NamedExpression],
+    aggregateExpressions: Seq[AggregateExpression2],
+    newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection),
+    inputAttributes: Seq[Attribute],
+    inputIter: Iterator[InternalRow])
+  extends SortAggregationIterator(
+    groupingExpressions,
+    aggregateExpressions,
+    newMutableProjection,
+    inputAttributes,
+    inputIter) {
+
+  // This projection is used to update buffer values for all AlgebraicAggregates.
+  private val algebraicUpdateProjection = {
+    val bufferSchema = aggregateFunctions.flatMap {
+      case ae: AlgebraicAggregate => ae.bufferAttributes
+      case agg: AggregateFunction2 => agg.bufferAttributes
+    }
+    val updateExpressions = aggregateFunctions.flatMap {
+      case ae: AlgebraicAggregate => ae.updateExpressions
+      case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
+    }
+    newMutableProjection(updateExpressions, bufferSchema ++ inputAttributes)().target(buffer)
+  }
+
+  override protected def initialBufferOffset: Int = 0
+
+  override protected def processRow(row: InternalRow): Unit = {
+    // Process all algebraic aggregate functions.
+    algebraicUpdateProjection(joinedRow(buffer, row))
+    // Process all non-algebraic aggregate functions.
+    var i = 0
+    while (i < nonAlgebraicAggregateFunctions.length) {
+      nonAlgebraicAggregateFunctions(i).update(buffer, row)
+      i += 1
+    }
+  }
+
+  override protected def generateOutput(): InternalRow = {
+    // We just output the grouping expressions and the underlying buffer.
+    joinedRow(currentGroupingKey, buffer).copy()
+  }
+}
+
+/**
+ * An iterator used to do partial merge aggregations (for those aggregate functions with mode
+ * PartialMerge). It assumes that input rows are already grouped by values of
+ * `groupingExpressions`.
+ * The format of its input rows is:
+ * |groupingExpr1|...|groupingExprN|aggregationBuffer1|...|aggregationBufferN|
+ *
+ * The format of its internal buffer is:
+ * |placeholder1|...|placeholderN|aggregationBuffer1|...|aggregationBufferN|
+ * Every placeholder is for a grouping expression.
+ * The actual buffers are stored after placeholderN.
+ * The reason that we have placeholders at here is to make our underlying buffer have the same
+ * length with a input row.
+ *
+ * The format of its output rows is:
+ * |groupingExpr1|...|groupingExprN|aggregationBuffer1|...|aggregationBufferN|
+ */
+class PartialMergeSortAggregationIterator(
+    groupingExpressions: Seq[NamedExpression],
+    aggregateExpressions: Seq[AggregateExpression2],
+    newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection),
+    inputAttributes: Seq[Attribute],
+    inputIter: Iterator[InternalRow])
+  extends SortAggregationIterator(
+    groupingExpressions,
+    aggregateExpressions,
+    newMutableProjection,
+    inputAttributes,
+    inputIter) {
+
+  private val placeholderAttribtues =
+    Seq.fill(initialBufferOffset)(AttributeReference("placeholder", NullType)())
+
+  // This projection is used to merge buffer values for all AlgebraicAggregates.
+  private val algebraicMergeProjection = {
+    val bufferSchemata =
+      placeholderAttribtues ++ aggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.bufferAttributes
+        case agg: AggregateFunction2 => agg.bufferAttributes
+      } ++ placeholderAttribtues ++ aggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
+        case agg: AggregateFunction2 => agg.cloneBufferAttributes
+      }
+    val mergeExpressions = placeholderExpressions ++ aggregateFunctions.flatMap {
+      case ae: AlgebraicAggregate => ae.mergeExpressions
+      case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
+    }
+
+    newMutableProjection(mergeExpressions, bufferSchemata)()
+  }
+
+  // This projection is used to extract aggregation buffers from the underlying buffer.
+  // We need it because the underlying buffer has placeholders at its beginning.
+  private val extractsBufferValues = {
+    val expressions = aggregateFunctions.flatMap {
+      case agg => agg.bufferAttributes
+    }
+
+    newMutableProjection(expressions, inputAttributes)()
+  }
+
+  override protected def initialBufferOffset: Int = groupingExpressions.length
+
+  override protected def processRow(row: InternalRow): Unit = {
+    // Process all algebraic aggregate functions.
+    algebraicMergeProjection.target(buffer)(joinedRow(buffer, row))
+    // Process all non-algebraic aggregate functions.
+    var i = 0
+    while (i < nonAlgebraicAggregateFunctions.length) {
+      nonAlgebraicAggregateFunctions(i).merge(buffer, row)
+      i += 1
+    }
+  }
+
+  override protected def generateOutput(): InternalRow = {
+    // We output grouping expressions and aggregation buffers.
+    joinedRow(currentGroupingKey, extractsBufferValues(buffer))
+  }
+}
+
+/**
+ * An iterator used to do final aggregations (for those aggregate functions with mode
+ * Final). It assumes that input rows are already grouped by values of
+ * `groupingExpressions`.
+ * The format of its input rows is:
+ * |groupingExpr1|...|groupingExprN|aggregationBuffer1|...|aggregationBufferN|
+ *
+ * The format of its internal buffer is:
+ * |placeholder1|...|placeholder N|aggregationBuffer1|...|aggregationBufferN|
+ * Every placeholder is for a grouping expression.
+ * The actual buffers are stored after placeholderN.
+ * The reason that we have placeholders at here is to make our underlying buffer have the same
+ * length with a input row.
+ *
+ * The format of its output rows is represented by the schema of `resultExpressions`.
+ */
+class FinalSortAggregationIterator(
+    groupingExpressions: Seq[NamedExpression],
+    aggregateExpressions: Seq[AggregateExpression2],
+    aggregateAttributes: Seq[Attribute],
+    resultExpressions: Seq[NamedExpression],
+    newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection),
+    inputAttributes: Seq[Attribute],
+    inputIter: Iterator[InternalRow])
+  extends SortAggregationIterator(
+    groupingExpressions,
+    aggregateExpressions,
+    newMutableProjection,
+    inputAttributes,
+    inputIter) {
+
+  // The result of aggregate functions.
+  private val aggregateResult: MutableRow = new GenericMutableRow(aggregateAttributes.length)
+
+  // The projection used to generate the output rows of this operator.
+  // This is only used when we are generating final results of aggregate functions.
+  private val resultProjection =
+    newMutableProjection(
+      resultExpressions, groupingExpressions.map(_.toAttribute) ++ aggregateAttributes)()
+
+  private val offsetAttributes =
+    Seq.fill(initialBufferOffset)(AttributeReference("placeholder", NullType)())
+
+  // This projection is used to merge buffer values for all AlgebraicAggregates.
+  private val algebraicMergeProjection = {
+    val bufferSchemata =
+      offsetAttributes ++ aggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.bufferAttributes
+        case agg: AggregateFunction2 => agg.bufferAttributes
+      } ++ offsetAttributes ++ aggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
+        case agg: AggregateFunction2 => agg.cloneBufferAttributes
+      }
+    val mergeExpressions = placeholderExpressions ++ aggregateFunctions.flatMap {
+      case ae: AlgebraicAggregate => ae.mergeExpressions
+      case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
+    }
+
+    newMutableProjection(mergeExpressions, bufferSchemata)()
+  }
+
+  // This projection is used to evaluate all AlgebraicAggregates.
+  private val algebraicEvalProjection = {
+    val bufferSchemata =
+      offsetAttributes ++ aggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.bufferAttributes
+        case agg: AggregateFunction2 => agg.bufferAttributes
+      } ++ offsetAttributes ++ aggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
+        case agg: AggregateFunction2 => agg.cloneBufferAttributes
+      }
+    val evalExpressions = aggregateFunctions.map {
+      case ae: AlgebraicAggregate => ae.evaluateExpression
+      case agg: AggregateFunction2 => NoOp
+    }
+
+    newMutableProjection(evalExpressions, bufferSchemata)()
+  }
+
+  override protected def initialBufferOffset: Int = groupingExpressions.length
+
+  override def initialize(): Unit = {
+    if (inputIter.hasNext) {
+      initializeBuffer()
+      val currentRow = inputIter.next().copy()
+      // partitionGenerator is a mutable projection. Since we need to track nextGroupingKey,
+      // we are making a copy at here.
+      nextGroupingKey = groupGenerator(currentRow).copy()
+      firstRowInNextGroup = currentRow
+    } else {
+      if (groupingExpressions.isEmpty) {
+        // If there is no grouping expression, we need to generate a single row as the output.
+        initializeBuffer()
+        // Right now, the buffer only contains initial buffer values. Because
+        // merging two buffers with initial values will generate a row that
+        // still store initial values. We set the currentRow as the copy of the current buffer.
+        val currentRow = buffer.copy()
+        nextGroupingKey = groupGenerator(currentRow).copy()
+        firstRowInNextGroup = currentRow
+      } else {
+        // This iter is an empty one.
+        hasNewGroup = false
+      }
+    }
+  }
+
+  override protected def processRow(row: InternalRow): Unit = {
+    // Process all algebraic aggregate functions.
+    algebraicMergeProjection.target(buffer)(joinedRow(buffer, row))
+    // Process all non-algebraic aggregate functions.
+    var i = 0
+    while (i < nonAlgebraicAggregateFunctions.length) {
+      nonAlgebraicAggregateFunctions(i).merge(buffer, row)
+      i += 1
+    }
+  }
+
+  override protected def generateOutput(): InternalRow = {
+    // Generate results for all algebraic aggregate functions.
+    algebraicEvalProjection.target(aggregateResult)(buffer)
+    // Generate results for all non-algebraic aggregate functions.
+    var i = 0
+    while (i < nonAlgebraicAggregateFunctions.length) {
+      aggregateResult.update(
+        nonAlgebraicAggregateFunctionPositions(i),
+        nonAlgebraicAggregateFunctions(i).eval(buffer))
+      i += 1
+    }
+    resultProjection(joinedRow(currentGroupingKey, aggregateResult))
+  }
+}
+
+/**
+ * An iterator used to do both final aggregations (for those aggregate functions with mode
+ * Final) and complete aggregations (for those aggregate functions with mode Complete).
+ * It assumes that input rows are already grouped by values of `groupingExpressions`.
+ * The format of its input rows is:
+ * |groupingExpr1|...|groupingExprN|col1|...|colM|aggregationBuffer1|...|aggregationBufferN|
+ * col1 to colM are columns used by aggregate functions with Complete mode.
+ * aggregationBuffer1 to aggregationBufferN are buffers used by aggregate functions with
+ * Final mode.
+ *
+ * The format of its internal buffer is:
+ * |placeholder1|...|placeholder(N+M)|aggregationBuffer1|...|aggregationBuffer(N+M)|
+ * The first N placeholders represent slots of grouping expressions.
+ * Then, next M placeholders represent slots of col1 to colM.
+ * For aggregation buffers, first N aggregation buffers are used by N aggregate functions with
+ * mode Final. Then, the last M aggregation buffers are used by M aggregate functions with mode
+ * Complete. The reason that we have placeholders at here is to make our underlying buffer
+ * have the same length with a input row.
+ *
+ * The format of its output rows is represented by the schema of `resultExpressions`.
+ */
+class FinalAndCompleteSortAggregationIterator(
+    override protected val initialBufferOffset: Int,
+    groupingExpressions: Seq[NamedExpression],
+    finalAggregateExpressions: Seq[AggregateExpression2],
+    finalAggregateAttributes: Seq[Attribute],
+    completeAggregateExpressions: Seq[AggregateExpression2],
+    completeAggregateAttributes: Seq[Attribute],
+    resultExpressions: Seq[NamedExpression],
+    newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection),
+    inputAttributes: Seq[Attribute],
+    inputIter: Iterator[InternalRow])
+  extends SortAggregationIterator(
+    groupingExpressions,
+    // TODO: document the ordering
+    finalAggregateExpressions ++ completeAggregateExpressions,
+    newMutableProjection,
+    inputAttributes,
+    inputIter) {
+
+  // The result of aggregate functions.
+  private val aggregateResult: MutableRow =
+    new GenericMutableRow(completeAggregateAttributes.length + finalAggregateAttributes.length)
+
+  // The projection used to generate the output rows of this operator.
+  // This is only used when we are generating final results of aggregate functions.
+  private val resultProjection = {
+    val inputSchema =
+      groupingExpressions.map(_.toAttribute) ++
+        finalAggregateAttributes ++
+        completeAggregateAttributes
+    newMutableProjection(resultExpressions, inputSchema)()
+  }
+
+  private val offsetAttributes =
+    Seq.fill(initialBufferOffset)(AttributeReference("placeholder", NullType)())
+
+  // All aggregate functions with mode Final.
+  private val finalAggregateFunctions: Array[AggregateFunction2] = {
+    val functions = new Array[AggregateFunction2](finalAggregateExpressions.length)
+    var i = 0
+    while (i < finalAggregateExpressions.length) {
+      functions(i) = aggregateFunctions(i)
+      i += 1
+    }
+    functions
+  }
+
+  // All non-algebraic aggregate functions with mode Final.
+  private val finalNonAlgebraicAggregateFunctions: Array[AggregateFunction2] = {
+    finalAggregateFunctions.collect {
+      case func: AggregateFunction2 if !func.isInstanceOf[AlgebraicAggregate] => func
+    }.toArray
+  }
+
+  // All aggregate functions with mode Complete.
+  private val completeAggregateFunctions: Array[AggregateFunction2] = {
+    val functions = new Array[AggregateFunction2](completeAggregateExpressions.length)
+    var i = 0
+    while (i < completeAggregateExpressions.length) {
+      functions(i) = aggregateFunctions(finalAggregateFunctions.length + i)
+      i += 1
+    }
+    functions
+  }
+
+  // All non-algebraic aggregate functions with mode Complete.
+  private val completeNonAlgebraicAggregateFunctions: Array[AggregateFunction2] = {
+    completeAggregateFunctions.collect {
+      case func: AggregateFunction2 if !func.isInstanceOf[AlgebraicAggregate] => func
+    }.toArray
+  }
+
+  // This projection is used to merge buffer values for all AlgebraicAggregates with mode
+  // Final.
+  private val finalAlgebraicMergeProjection = {
+    val numCompleteOffsetAttributes =
+      completeAggregateFunctions.map(_.bufferAttributes.length).sum
+    val completeOffsetAttributes =
+      Seq.fill(numCompleteOffsetAttributes)(AttributeReference("placeholder", NullType)())
+    val completeOffsetExpressions = Seq.fill(numCompleteOffsetAttributes)(NoOp)
+
+    val bufferSchemata =
+      offsetAttributes ++ finalAggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.bufferAttributes
+        case agg: AggregateFunction2 => agg.bufferAttributes
+      } ++ completeOffsetAttributes ++ offsetAttributes ++ finalAggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
+        case agg: AggregateFunction2 => agg.cloneBufferAttributes
+      } ++ completeOffsetAttributes
+    val mergeExpressions =
+      placeholderExpressions ++ finalAggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.mergeExpressions
+        case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
+      } ++ completeOffsetExpressions
+
+    newMutableProjection(mergeExpressions, bufferSchemata)()
+  }
+
+  // This projection is used to update buffer values for all AlgebraicAggregates with mode
+  // Complete.
+  private val completeAlgebraicUpdateProjection = {
+    val numFinalOffsetAttributes = finalAggregateFunctions.map(_.bufferAttributes.length).sum
+    val finalOffsetAttributes =
+      Seq.fill(numFinalOffsetAttributes)(AttributeReference("placeholder", NullType)())
+    val finalOffsetExpressions = Seq.fill(numFinalOffsetAttributes)(NoOp)
+
+    val bufferSchema =
+      offsetAttributes ++ finalOffsetAttributes ++ completeAggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.bufferAttributes
+        case agg: AggregateFunction2 => agg.bufferAttributes
+      }
+    val updateExpressions =
+      placeholderExpressions ++ finalOffsetExpressions ++ completeAggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.updateExpressions
+        case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
+      }
+    newMutableProjection(updateExpressions, bufferSchema ++ inputAttributes)().target(buffer)
+  }
+
+  // This projection is used to evaluate all AlgebraicAggregates.
+  private val algebraicEvalProjection = {
+    val bufferSchemata =
+      offsetAttributes ++ aggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.bufferAttributes
+        case agg: AggregateFunction2 => agg.bufferAttributes
+      } ++ offsetAttributes ++ aggregateFunctions.flatMap {
+        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
+        case agg: AggregateFunction2 => agg.cloneBufferAttributes
+      }
+    val evalExpressions = aggregateFunctions.map {
+      case ae: AlgebraicAggregate => ae.evaluateExpression
+      case agg: AggregateFunction2 => NoOp
+    }
+
+    newMutableProjection(evalExpressions, bufferSchemata)()
+  }
+
+  override def initialize(): Unit = {
+    if (inputIter.hasNext) {
+      initializeBuffer()
+      val currentRow = inputIter.next().copy()
+      // partitionGenerator is a mutable projection. Since we need to track nextGroupingKey,
+      // we are making a copy at here.
+      nextGroupingKey = groupGenerator(currentRow).copy()
+      firstRowInNextGroup = currentRow
+    } else {
+      if (groupingExpressions.isEmpty) {
+        // If there is no grouping expression, we need to generate a single row as the output.
+        initializeBuffer()
+        // Right now, the buffer only contains initial buffer values. Because
+        // merging two buffers with initial values will generate a row that
+        // still store initial values. We set the currentRow as the copy of the current buffer.
+        val currentRow = buffer.copy()
+        nextGroupingKey = groupGenerator(currentRow).copy()
+        firstRowInNextGroup = currentRow
+      } else {
+        // This iter is an empty one.
+        hasNewGroup = false
+      }
+    }
+  }
+
+  override protected def processRow(row: InternalRow): Unit = {
+    val input = joinedRow(buffer, row)
+    // For all aggregate functions with mode Complete, update buffers.
+    completeAlgebraicUpdateProjection(input)
+    var i = 0
+    while (i < completeNonAlgebraicAggregateFunctions.length) {
+      completeNonAlgebraicAggregateFunctions(i).update(buffer, row)
+      i += 1
+    }
+
+    // For all aggregate functions with mode Final, merge buffers.
+    finalAlgebraicMergeProjection.target(buffer)(input)
+    i = 0
+    while (i < finalNonAlgebraicAggregateFunctions.length) {
+      finalNonAlgebraicAggregateFunctions(i).merge(buffer, row)
+      i += 1
+    }
+  }
+
+  override protected def generateOutput(): InternalRow = {
+    // Generate results for all algebraic aggregate functions.
+    algebraicEvalProjection.target(aggregateResult)(buffer)
+    // Generate results for all non-algebraic aggregate functions.
+    var i = 0
+    while (i < nonAlgebraicAggregateFunctions.length) {
+      aggregateResult.update(
+        nonAlgebraicAggregateFunctionPositions(i),
+        nonAlgebraicAggregateFunctions(i).eval(buffer))
+      i += 1
+    }
+
+    resultProjection(joinedRow(currentGroupingKey, aggregateResult))
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
new file mode 100644
index 0000000000000..1cb27710e0480
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
@@ -0,0 +1,364 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.aggregate
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate._
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.types.{StructType, MapType, ArrayType}
+
+/**
+ * Utility functions used by the query planner to convert our plan to new aggregation code path.
+ */
+object Utils {
+  // Right now, we do not support complex types in the grouping key schema.
+  private def supportsGroupingKeySchema(aggregate: Aggregate): Boolean = {
+    val hasComplexTypes = aggregate.groupingExpressions.map(_.dataType).exists {
+      case array: ArrayType => true
+      case map: MapType => true
+      case struct: StructType => true
+      case _ => false
+    }
+
+    !hasComplexTypes
+  }
+
+  private def tryConvert(plan: LogicalPlan): Option[Aggregate] = plan match {
+    case p: Aggregate if supportsGroupingKeySchema(p) =>
+      val converted = p.transformExpressionsDown {
+        case expressions.Average(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Average(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Count(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Count(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        // We do not support multiple COUNT DISTINCT columns for now.
+        case expressions.CountDistinct(children) if children.length == 1 =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Count(children.head),
+            mode = aggregate.Complete,
+            isDistinct = true)
+
+        case expressions.First(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.First(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Last(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Last(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Max(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Max(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Min(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Min(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Sum(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Sum(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.SumDistinct(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Sum(child),
+            mode = aggregate.Complete,
+            isDistinct = true)
+      }
+      // Check if there is any expressions.AggregateExpression1 left.
+      // If so, we cannot convert this plan.
+      val hasAggregateExpression1 = converted.aggregateExpressions.exists { expr =>
+        // For every expressions, check if it contains AggregateExpression1.
+        expr.find {
+          case agg: expressions.AggregateExpression1 => true
+          case other => false
+        }.isDefined
+      }
+
+      // Check if there are multiple distinct columns.
+      val aggregateExpressions = converted.aggregateExpressions.flatMap { expr =>
+        expr.collect {
+          case agg: AggregateExpression2 => agg
+        }
+      }.toSet.toSeq
+      val functionsWithDistinct = aggregateExpressions.filter(_.isDistinct)
+      val hasMultipleDistinctColumnSets =
+        if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) {
+          true
+        } else {
+          false
+        }
+
+      if (!hasAggregateExpression1 && !hasMultipleDistinctColumnSets) Some(converted) else None
+
+    case other => None
+  }
+
+  private def checkInvalidAggregateFunction2(aggregate: Aggregate): Unit = {
+    // If the plan cannot be converted, we will do a final round check to if the original
+    // logical.Aggregate contains both AggregateExpression1 and AggregateExpression2. If so,
+    // we need to throw an exception.
+    val aggregateFunction2s = aggregate.aggregateExpressions.flatMap { expr =>
+      expr.collect {
+        case agg: AggregateExpression2 => agg.aggregateFunction
+      }
+    }.distinct
+    if (aggregateFunction2s.nonEmpty) {
+      // For functions implemented based on the new interface, prepare a list of function names.
+      val invalidFunctions = {
+        if (aggregateFunction2s.length > 1) {
+          s"${aggregateFunction2s.tail.map(_.nodeName).mkString(",")} " +
+            s"and ${aggregateFunction2s.head.nodeName} are"
+        } else {
+          s"${aggregateFunction2s.head.nodeName} is"
+        }
+      }
+      val errorMessage =
+        s"${invalidFunctions} implemented based on the new Aggregate Function " +
+          s"interface and it cannot be used with functions implemented based on " +
+          s"the old Aggregate Function interface."
+      throw new AnalysisException(errorMessage)
+    }
+  }
+
+  def tryConvert(
+      plan: LogicalPlan,
+      useNewAggregation: Boolean,
+      codeGenEnabled: Boolean): Option[Aggregate] = plan match {
+    case p: Aggregate if useNewAggregation && codeGenEnabled =>
+      val converted = tryConvert(p)
+      if (converted.isDefined) {
+        converted
+      } else {
+        checkInvalidAggregateFunction2(p)
+        None
+      }
+    case p: Aggregate =>
+      checkInvalidAggregateFunction2(p)
+      None
+    case other => None
+  }
+
+  def planAggregateWithoutDistinct(
+      groupingExpressions: Seq[Expression],
+      aggregateExpressions: Seq[AggregateExpression2],
+      aggregateFunctionMap: Map[(AggregateFunction2, Boolean), Attribute],
+      resultExpressions: Seq[NamedExpression],
+      child: SparkPlan): Seq[SparkPlan] = {
+    // 1. Create an Aggregate Operator for partial aggregations.
+    val namedGroupingExpressions = groupingExpressions.map {
+      case ne: NamedExpression => ne -> ne
+      // If the expression is not a NamedExpressions, we add an alias.
+      // So, when we generate the result of the operator, the Aggregate Operator
+      // can directly get the Seq of attributes representing the grouping expressions.
+      case other =>
+        val withAlias = Alias(other, other.toString)()
+        other -> withAlias
+    }
+    val groupExpressionMap = namedGroupingExpressions.toMap
+    val namedGroupingAttributes = namedGroupingExpressions.map(_._2.toAttribute)
+    val partialAggregateExpressions = aggregateExpressions.map {
+      case AggregateExpression2(aggregateFunction, mode, isDistinct) =>
+        AggregateExpression2(aggregateFunction, Partial, isDistinct)
+    }
+    val partialAggregateAttributes = partialAggregateExpressions.flatMap { agg =>
+      agg.aggregateFunction.bufferAttributes
+    }
+    val partialAggregate =
+      Aggregate2Sort(
+        None: Option[Seq[Expression]],
+        namedGroupingExpressions.map(_._2),
+        partialAggregateExpressions,
+        partialAggregateAttributes,
+        namedGroupingAttributes ++ partialAggregateAttributes,
+        child)
+
+    // 2. Create an Aggregate Operator for final aggregations.
+    val finalAggregateExpressions = aggregateExpressions.map {
+      case AggregateExpression2(aggregateFunction, mode, isDistinct) =>
+        AggregateExpression2(aggregateFunction, Final, isDistinct)
+    }
+    val finalAggregateAttributes =
+      finalAggregateExpressions.map {
+        expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)
+      }
+    val rewrittenResultExpressions = resultExpressions.map { expr =>
+      expr.transformDown {
+        case agg: AggregateExpression2 =>
+          aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct).toAttribute
+        case expression =>
+          // We do not rely on the equality check at here since attributes may
+          // different cosmetically. Instead, we use semanticEquals.
+          groupExpressionMap.collectFirst {
+            case (expr, ne) if expr semanticEquals expression => ne.toAttribute
+          }.getOrElse(expression)
+      }.asInstanceOf[NamedExpression]
+    }
+    val finalAggregate = Aggregate2Sort(
+      Some(namedGroupingAttributes),
+      namedGroupingAttributes,
+      finalAggregateExpressions,
+      finalAggregateAttributes,
+      rewrittenResultExpressions,
+      partialAggregate)
+
+    finalAggregate :: Nil
+  }
+
+  def planAggregateWithOneDistinct(
+      groupingExpressions: Seq[Expression],
+      functionsWithDistinct: Seq[AggregateExpression2],
+      functionsWithoutDistinct: Seq[AggregateExpression2],
+      aggregateFunctionMap: Map[(AggregateFunction2, Boolean), Attribute],
+      resultExpressions: Seq[NamedExpression],
+      child: SparkPlan): Seq[SparkPlan] = {
+
+    // 1. Create an Aggregate Operator for partial aggregations.
+    // The grouping expressions are original groupingExpressions and
+    // distinct columns. For example, for avg(distinct value) ... group by key
+    // the grouping expressions of this Aggregate Operator will be [key, value].
+    val namedGroupingExpressions = groupingExpressions.map {
+      case ne: NamedExpression => ne -> ne
+      // If the expression is not a NamedExpressions, we add an alias.
+      // So, when we generate the result of the operator, the Aggregate Operator
+      // can directly get the Seq of attributes representing the grouping expressions.
+      case other =>
+        val withAlias = Alias(other, other.toString)()
+        other -> withAlias
+    }
+    val groupExpressionMap = namedGroupingExpressions.toMap
+    val namedGroupingAttributes = namedGroupingExpressions.map(_._2.toAttribute)
+
+    // It is safe to call head at here since functionsWithDistinct has at least one
+    // AggregateExpression2.
+    val distinctColumnExpressions =
+      functionsWithDistinct.head.aggregateFunction.children
+    val namedDistinctColumnExpressions = distinctColumnExpressions.map {
+      case ne: NamedExpression => ne -> ne
+      case other =>
+        val withAlias = Alias(other, other.toString)()
+        other -> withAlias
+    }
+    val distinctColumnExpressionMap = namedDistinctColumnExpressions.toMap
+    val distinctColumnAttributes = namedDistinctColumnExpressions.map(_._2.toAttribute)
+
+    val partialAggregateExpressions = functionsWithoutDistinct.map {
+      case AggregateExpression2(aggregateFunction, mode, _) =>
+        AggregateExpression2(aggregateFunction, Partial, false)
+    }
+    val partialAggregateAttributes = partialAggregateExpressions.flatMap { agg =>
+      agg.aggregateFunction.bufferAttributes
+    }
+    val partialAggregate =
+      Aggregate2Sort(
+        None: Option[Seq[Expression]],
+        (namedGroupingExpressions ++ namedDistinctColumnExpressions).map(_._2),
+        partialAggregateExpressions,
+        partialAggregateAttributes,
+        namedGroupingAttributes ++ distinctColumnAttributes ++ partialAggregateAttributes,
+        child)
+
+    // 2. Create an Aggregate Operator for partial merge aggregations.
+    val partialMergeAggregateExpressions = functionsWithoutDistinct.map {
+      case AggregateExpression2(aggregateFunction, mode, _) =>
+        AggregateExpression2(aggregateFunction, PartialMerge, false)
+    }
+    val partialMergeAggregateAttributes =
+      partialMergeAggregateExpressions.map {
+        expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)
+      }
+    val partialMergeAggregate =
+      Aggregate2Sort(
+        Some(namedGroupingAttributes),
+        namedGroupingAttributes ++ distinctColumnAttributes,
+        partialMergeAggregateExpressions,
+        partialMergeAggregateAttributes,
+        namedGroupingAttributes ++ distinctColumnAttributes ++ partialMergeAggregateAttributes,
+        partialAggregate)
+
+    // 3. Create an Aggregate Operator for partial merge aggregations.
+    val finalAggregateExpressions = functionsWithoutDistinct.map {
+      case AggregateExpression2(aggregateFunction, mode, _) =>
+        AggregateExpression2(aggregateFunction, Final, false)
+    }
+    val finalAggregateAttributes =
+      finalAggregateExpressions.map {
+        expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)
+      }
+    val (completeAggregateExpressions, completeAggregateAttributes) = functionsWithDistinct.map {
+      // Children of an AggregateFunction with DISTINCT keyword has already
+      // been evaluated. At here, we need to replace original children
+      // to AttributeReferences.
+      case agg @ AggregateExpression2(aggregateFunction, mode, isDistinct) =>
+        val rewrittenAggregateFunction = aggregateFunction.transformDown {
+          case expr if distinctColumnExpressionMap.contains(expr) =>
+            distinctColumnExpressionMap(expr).toAttribute
+        }.asInstanceOf[AggregateFunction2]
+        // We rewrite the aggregate function to a non-distinct aggregation because
+        // its input will have distinct arguments.
+        val rewrittenAggregateExpression =
+          AggregateExpression2(rewrittenAggregateFunction, Complete, false)
+
+        val aggregateFunctionAttribute = aggregateFunctionMap(agg.aggregateFunction, isDistinct)
+        (rewrittenAggregateExpression -> aggregateFunctionAttribute)
+    }.unzip
+
+    val rewrittenResultExpressions = resultExpressions.map { expr =>
+      expr.transform {
+        case agg: AggregateExpression2 =>
+          aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct).toAttribute
+        case expression =>
+          // We do not rely on the equality check at here since attributes may
+          // different cosmetically. Instead, we use semanticEquals.
+          groupExpressionMap.collectFirst {
+            case (expr, ne) if expr semanticEquals expression => ne.toAttribute
+          }.getOrElse(expression)
+      }.asInstanceOf[NamedExpression]
+    }
+    val finalAndCompleteAggregate = FinalAndCompleteAggregate2Sort(
+      namedGroupingAttributes ++ distinctColumnAttributes,
+      namedGroupingAttributes,
+      finalAggregateExpressions,
+      finalAggregateAttributes,
+      completeAggregateExpressions,
+      completeAggregateAttributes,
+      rewrittenResultExpressions,
+      partialMergeAggregate)
+
+    finalAndCompleteAggregate :: Nil
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
new file mode 100644
index 0000000000000..6c49a906c848a
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.expressions.aggregate
+
+import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
+import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction2
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.Row
+
+/**
+ * The abstract class for implementing user-defined aggregate function.
+ */
+abstract class UserDefinedAggregateFunction extends Serializable {
+
+  /**
+   * A [[StructType]] represents data types of input arguments of this aggregate function.
+   * For example, if a [[UserDefinedAggregateFunction]] expects two input arguments
+   * with type of [[DoubleType]] and [[LongType]], the returned [[StructType]] will look like
+   *
+   * ```
+   *   StructType(Seq(StructField("doubleInput", DoubleType), StructField("longInput", LongType)))
+   * ```
+   *
+   * The name of a field of this [[StructType]] is only used to identify the corresponding
+   * input argument. Users can choose names to identify the input arguments.
+   */
+  def inputSchema: StructType
+
+  /**
+   * A [[StructType]] represents data types of values in the aggregation buffer.
+   * For example, if a [[UserDefinedAggregateFunction]]'s buffer has two values
+   * (i.e. two intermediate values) with type of [[DoubleType]] and [[LongType]],
+   * the returned [[StructType]] will look like
+   *
+   * ```
+   *   StructType(Seq(StructField("doubleInput", DoubleType), StructField("longInput", LongType)))
+   * ```
+   *
+   * The name of a field of this [[StructType]] is only used to identify the corresponding
+   * buffer value. Users can choose names to identify the input arguments.
+   */
+  def bufferSchema: StructType
+
+  /**
+   * The [[DataType]] of the returned value of this [[UserDefinedAggregateFunction]].
+   */
+  def returnDataType: DataType
+
+  /** Indicates if this function is deterministic. */
+  def deterministic: Boolean
+
+  /**
+   *  Initializes the given aggregation buffer. Initial values set by this method should satisfy
+   *  the condition that when merging two buffers with initial values, the new buffer should
+   *  still store initial values.
+   */
+  def initialize(buffer: MutableAggregationBuffer): Unit
+
+  /** Updates the given aggregation buffer `buffer` with new input data from `input`. */
+  def update(buffer: MutableAggregationBuffer, input: Row): Unit
+
+  /** Merges two aggregation buffers and stores the updated buffer values back in `buffer1`. */
+  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit
+
+  /**
+   * Calculates the final result of this [[UserDefinedAggregateFunction]] based on the given
+   * aggregation buffer.
+   */
+  def evaluate(buffer: Row): Any
+}
+
+private[sql] abstract class AggregationBuffer(
+    toCatalystConverters: Array[Any => Any],
+    toScalaConverters: Array[Any => Any],
+    bufferOffset: Int)
+  extends Row {
+
+  override def length: Int = toCatalystConverters.length
+
+  protected val offsets: Array[Int] = {
+    val newOffsets = new Array[Int](length)
+    var i = 0
+    while (i < newOffsets.length) {
+      newOffsets(i) = bufferOffset + i
+      i += 1
+    }
+    newOffsets
+  }
+}
+
+/**
+ * A Mutable [[Row]] representing an mutable aggregation buffer.
+ */
+class MutableAggregationBuffer private[sql] (
+    toCatalystConverters: Array[Any => Any],
+    toScalaConverters: Array[Any => Any],
+    bufferOffset: Int,
+    var underlyingBuffer: MutableRow)
+  extends AggregationBuffer(toCatalystConverters, toScalaConverters, bufferOffset) {
+
+  override def get(i: Int): Any = {
+    if (i >= length || i < 0) {
+      throw new IllegalArgumentException(
+        s"Could not access ${i}th value in this buffer because it only has $length values.")
+    }
+    toScalaConverters(i)(underlyingBuffer(offsets(i)))
+  }
+
+  def update(i: Int, value: Any): Unit = {
+    if (i >= length || i < 0) {
+      throw new IllegalArgumentException(
+        s"Could not update ${i}th value in this buffer because it only has $length values.")
+    }
+    underlyingBuffer.update(offsets(i), toCatalystConverters(i)(value))
+  }
+
+  override def copy(): MutableAggregationBuffer = {
+    new MutableAggregationBuffer(
+      toCatalystConverters,
+      toScalaConverters,
+      bufferOffset,
+      underlyingBuffer)
+  }
+}
+
+/**
+ * A [[Row]] representing an immutable aggregation buffer.
+ */
+class InputAggregationBuffer private[sql] (
+    toCatalystConverters: Array[Any => Any],
+    toScalaConverters: Array[Any => Any],
+    bufferOffset: Int,
+    var underlyingInputBuffer: Row)
+  extends AggregationBuffer(toCatalystConverters, toScalaConverters, bufferOffset) {
+
+  override def get(i: Int): Any = {
+    if (i >= length || i < 0) {
+      throw new IllegalArgumentException(
+        s"Could not access ${i}th value in this buffer because it only has $length values.")
+    }
+    toScalaConverters(i)(underlyingInputBuffer(offsets(i)))
+  }
+
+  override def copy(): InputAggregationBuffer = {
+    new InputAggregationBuffer(
+      toCatalystConverters,
+      toScalaConverters,
+      bufferOffset,
+      underlyingInputBuffer)
+  }
+}
+
+/**
+ * The internal wrapper used to hook a [[UserDefinedAggregateFunction]] `udaf` in the
+ * internal aggregation code path.
+ * @param children
+ * @param udaf
+ */
+case class ScalaUDAF(
+    children: Seq[Expression],
+    udaf: UserDefinedAggregateFunction)
+  extends AggregateFunction2 with Logging {
+
+  require(
+    children.length == udaf.inputSchema.length,
+    s"$udaf only accepts ${udaf.inputSchema.length} arguments, " +
+      s"but ${children.length} are provided.")
+
+  override def nullable: Boolean = true
+
+  override def dataType: DataType = udaf.returnDataType
+
+  override def deterministic: Boolean = udaf.deterministic
+
+  override val inputTypes: Seq[DataType] = udaf.inputSchema.map(_.dataType)
+
+  override val bufferSchema: StructType = udaf.bufferSchema
+
+  override val bufferAttributes: Seq[AttributeReference] = bufferSchema.toAttributes
+
+  override lazy val cloneBufferAttributes = bufferAttributes.map(_.newInstance())
+
+  val childrenSchema: StructType = {
+    val inputFields = children.zipWithIndex.map {
+      case (child, index) =>
+        StructField(s"input$index", child.dataType, child.nullable, Metadata.empty)
+    }
+    StructType(inputFields)
+  }
+
+  lazy val inputProjection = {
+    val inputAttributes = childrenSchema.toAttributes
+    log.debug(
+      s"Creating MutableProj: $children, inputSchema: $inputAttributes.")
+    try {
+      GenerateMutableProjection.generate(children, inputAttributes)()
+    } catch {
+      case e: Exception =>
+        log.error("Failed to generate mutable projection, fallback to interpreted", e)
+        new InterpretedMutableProjection(children, inputAttributes)
+    }
+  }
+
+  val inputToScalaConverters: Any => Any =
+    CatalystTypeConverters.createToScalaConverter(childrenSchema)
+
+  val bufferValuesToCatalystConverters: Array[Any => Any] = bufferSchema.fields.map { field =>
+    CatalystTypeConverters.createToCatalystConverter(field.dataType)
+  }
+
+  val bufferValuesToScalaConverters: Array[Any => Any] = bufferSchema.fields.map { field =>
+    CatalystTypeConverters.createToScalaConverter(field.dataType)
+  }
+
+  lazy val inputAggregateBuffer: InputAggregationBuffer =
+    new InputAggregationBuffer(
+      bufferValuesToCatalystConverters,
+      bufferValuesToScalaConverters,
+      bufferOffset,
+      null)
+
+  lazy val mutableAggregateBuffer: MutableAggregationBuffer =
+    new MutableAggregationBuffer(
+      bufferValuesToCatalystConverters,
+      bufferValuesToScalaConverters,
+      bufferOffset,
+      null)
+
+
+  override def initialize(buffer: MutableRow): Unit = {
+    mutableAggregateBuffer.underlyingBuffer = buffer
+
+    udaf.initialize(mutableAggregateBuffer)
+  }
+
+  override def update(buffer: MutableRow, input: InternalRow): Unit = {
+    mutableAggregateBuffer.underlyingBuffer = buffer
+
+    udaf.update(
+      mutableAggregateBuffer,
+      inputToScalaConverters(inputProjection(input)).asInstanceOf[Row])
+  }
+
+  override def merge(buffer1: MutableRow, buffer2: InternalRow): Unit = {
+    mutableAggregateBuffer.underlyingBuffer = buffer1
+    inputAggregateBuffer.underlyingInputBuffer = buffer2
+
+    udaf.merge(mutableAggregateBuffer, inputAggregateBuffer)
+  }
+
+  override def eval(buffer: InternalRow = null): Any = {
+    inputAggregateBuffer.underlyingInputBuffer = buffer
+
+    udaf.evaluate(inputAggregateBuffer)
+  }
+
+  override def toString: String = {
+    s"""${udaf.getClass.getSimpleName}(${children.mkString(",")})"""
+  }
+
+  override def nodeName: String = udaf.getClass.getSimpleName
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 28159cbd5ab96..bfeecbe8b2ab5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2420,7 +2420,7 @@ object functions {
    * @since 1.5.0
    */
   def callUDF(udfName: String, cols: Column*): Column = {
-    UnresolvedFunction(udfName, cols.map(_.expr))
+    UnresolvedFunction(udfName, cols.map(_.expr), isDistinct = false)
   }
 
   /**
@@ -2449,7 +2449,7 @@ object functions {
       exprs(i) = cols(i).expr
       i += 1
     }
-    UnresolvedFunction(udfName, exprs)
+    UnresolvedFunction(udfName, exprs, isDistinct = false)
   }
 
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index beee10173fbc4..ab8dce603c117 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -23,6 +23,7 @@ import java.sql.Timestamp
 
 import org.apache.spark.sql.catalyst.DefaultParserDialect
 import org.apache.spark.sql.catalyst.errors.DialectException
+import org.apache.spark.sql.execution.aggregate.Aggregate2Sort
 import org.apache.spark.sql.execution.GeneratedAggregate
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.TestData._
@@ -204,6 +205,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       var hasGeneratedAgg = false
       df.queryExecution.executedPlan.foreach {
         case generatedAgg: GeneratedAggregate => hasGeneratedAgg = true
+        case newAggregate: Aggregate2Sort => hasGeneratedAgg = true
         case _ =>
       }
       if (!hasGeneratedAgg) {
@@ -285,7 +287,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       // Aggregate with Code generation handling all null values
       testCodeGen(
         "SELECT  sum('a'), avg('a'), count(null) FROM testData",
-        Row(0, null, 0) :: Nil)
+        Row(null, null, 0) :: Nil)
     } finally {
       sqlContext.dropTempTable("testData3x")
       sqlContext.setConf(SQLConf.CODEGEN_ENABLED, originalValue)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 3dd24130af81a..3d71deb13e884 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.catalyst.plans._
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, ShuffledHashJoin}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.test.TestSQLContext._
@@ -30,6 +31,20 @@ import org.apache.spark.sql.{Row, SQLConf, execution}
 
 
 class PlannerSuite extends SparkFunSuite {
+  private def testPartialAggregationPlan(query: LogicalPlan): Unit = {
+    val plannedOption = HashAggregation(query).headOption.orElse(Aggregation(query).headOption)
+    val planned =
+      plannedOption.getOrElse(
+        fail(s"Could query play aggregation query $query. Is it an aggregation query?"))
+    val aggregations = planned.collect { case n if n.nodeName contains "Aggregate" => n }
+
+    // For the new aggregation code path, there will be three aggregate operator for
+    // distinct aggregations.
+    assert(
+      aggregations.size == 2 || aggregations.size == 3,
+      s"The plan of query $query does not have partial aggregations.")
+  }
+
   test("unions are collapsed") {
     val query = testData.unionAll(testData).unionAll(testData).logicalPlan
     val planned = BasicOperators(query).head
@@ -42,23 +57,18 @@ class PlannerSuite extends SparkFunSuite {
 
   test("count is partially aggregated") {
     val query = testData.groupBy('value).agg(count('key)).queryExecution.analyzed
-    val planned = HashAggregation(query).head
-    val aggregations = planned.collect { case n if n.nodeName contains "Aggregate" => n }
-
-    assert(aggregations.size === 2)
+    testPartialAggregationPlan(query)
   }
 
   test("count distinct is partially aggregated") {
     val query = testData.groupBy('value).agg(countDistinct('key)).queryExecution.analyzed
-    val planned = HashAggregation(query)
-    assert(planned.nonEmpty)
+    testPartialAggregationPlan(query)
   }
 
   test("mixed aggregates are partially aggregated") {
     val query =
       testData.groupBy('value).agg(count('value), countDistinct('key)).queryExecution.analyzed
-    val planned = HashAggregation(query)
-    assert(planned.nonEmpty)
+    testPartialAggregationPlan(query)
   }
 
   test("sizeInBytes estimation of limit operator for broadcast hash join optimization") {
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
index 31a49a3683338..24a758f53170a 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
@@ -833,6 +833,7 @@ abstract class HiveWindowFunctionQueryFileBaseSuite
     "windowing_adjust_rowcontainer_sz"
   )
 
+  // Only run those query tests in the realWhileList (do not try other ignored query files).
   override def testCases: Seq[(String, File)] = super.testCases.filter {
     case (name, _) => realWhiteList.contains(name)
   }
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala
index f458567e5d7ea..1fe4fe9629c02 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.hive.execution
 
+import java.io.File
+
 import org.apache.spark.sql.SQLConf
 import org.apache.spark.sql.hive.test.TestHive
 
@@ -159,4 +161,9 @@ class SortMergeCompatibilitySuite extends HiveCompatibilitySuite {
     "join_reorder4",
     "join_star"
   )
+
+  // Only run those query tests in the realWhileList (do not try other ignored query files).
+  override def testCases: Seq[(String, File)] = super.testCases.filter {
+    case (name, _) => realWhiteList.contains(name)
+  }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index cec7685bb6859..4cdb83c5116f9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -451,6 +451,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging {
       DataSinks,
       Scripts,
       HashAggregation,
+      Aggregation,
       LeftSemiJoin,
       HashJoin,
       BasicOperators,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index f5574509b0b38..8518e333e8058 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -1464,9 +1464,12 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
 
     /* UDFs - Must be last otherwise will preempt built in functions */
     case Token("TOK_FUNCTION", Token(name, Nil) :: args) =>
-      UnresolvedFunction(name, args.map(nodeToExpr))
+      UnresolvedFunction(name, args.map(nodeToExpr), isDistinct = false)
+    // Aggregate function with DISTINCT keyword.
+    case Token("TOK_FUNCTIONDI", Token(name, Nil) :: args) =>
+      UnresolvedFunction(name, args.map(nodeToExpr), isDistinct = true)
     case Token("TOK_FUNCTIONSTAR", Token(name, Nil) :: args) =>
-      UnresolvedFunction(name, UnresolvedStar(None) :: Nil)
+      UnresolvedFunction(name, UnresolvedStar(None) :: Nil, isDistinct = false)
 
     /* Literals */
     case Token("TOK_NULL", Nil) => Literal.create(null, NullType)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
index 4d23c7035c03d..3259b50acc765 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
@@ -409,7 +409,7 @@ private[hive] case class HiveWindowFunction(
 
 private[hive] case class HiveGenericUDAF(
     funcWrapper: HiveFunctionWrapper,
-    children: Seq[Expression]) extends AggregateExpression
+    children: Seq[Expression]) extends AggregateExpression1
   with HiveInspectors {
 
   type UDFType = AbstractGenericUDAFResolver
@@ -441,7 +441,7 @@ private[hive] case class HiveGenericUDAF(
 /** It is used as a wrapper for the hive functions which uses UDAF interface */
 private[hive] case class HiveUDAF(
     funcWrapper: HiveFunctionWrapper,
-    children: Seq[Expression]) extends AggregateExpression
+    children: Seq[Expression]) extends AggregateExpression1
   with HiveInspectors {
 
   type UDFType = UDAF
@@ -550,9 +550,9 @@ private[hive] case class HiveGenericUDTF(
 private[hive] case class HiveUDAFFunction(
     funcWrapper: HiveFunctionWrapper,
     exprs: Seq[Expression],
-    base: AggregateExpression,
+    base: AggregateExpression1,
     isUDAFBridgeRequired: Boolean = false)
-  extends AggregateFunction
+  extends AggregateFunction1
   with HiveInspectors {
 
   def this() = this(null, null, null)
diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java
new file mode 100644
index 0000000000000..5c9d0e97a99c6
--- /dev/null
+++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package test.org.apache.spark.sql.hive.aggregate;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.expressions.aggregate.MutableAggregationBuffer;
+import org.apache.spark.sql.expressions.aggregate.UserDefinedAggregateFunction;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+public class MyDoubleAvg extends UserDefinedAggregateFunction {
+
+  private StructType _inputDataType;
+
+  private StructType _bufferSchema;
+
+  private DataType _returnDataType;
+
+  public MyDoubleAvg() {
+    List<StructField> inputfields = new ArrayList<StructField>();
+    inputfields.add(DataTypes.createStructField("inputDouble", DataTypes.DoubleType, true));
+    _inputDataType = DataTypes.createStructType(inputfields);
+
+    List<StructField> bufferFields = new ArrayList<StructField>();
+    bufferFields.add(DataTypes.createStructField("bufferSum", DataTypes.DoubleType, true));
+    bufferFields.add(DataTypes.createStructField("bufferCount", DataTypes.LongType, true));
+    _bufferSchema = DataTypes.createStructType(bufferFields);
+
+    _returnDataType = DataTypes.DoubleType;
+  }
+
+  @Override public StructType inputSchema() {
+    return _inputDataType;
+  }
+
+  @Override public StructType bufferSchema() {
+    return _bufferSchema;
+  }
+
+  @Override public DataType returnDataType() {
+    return _returnDataType;
+  }
+
+  @Override public boolean deterministic() {
+    return true;
+  }
+
+  @Override public void initialize(MutableAggregationBuffer buffer) {
+    buffer.update(0, null);
+    buffer.update(1, 0L);
+  }
+
+  @Override public void update(MutableAggregationBuffer buffer, Row input) {
+    if (!input.isNullAt(0)) {
+      if (buffer.isNullAt(0)) {
+        buffer.update(0, input.getDouble(0));
+        buffer.update(1, 1L);
+      } else {
+        Double newValue = input.getDouble(0) + buffer.getDouble(0);
+        buffer.update(0, newValue);
+        buffer.update(1, buffer.getLong(1) + 1L);
+      }
+    }
+  }
+
+  @Override public void merge(MutableAggregationBuffer buffer1, Row buffer2) {
+    if (!buffer2.isNullAt(0)) {
+      if (buffer1.isNullAt(0)) {
+        buffer1.update(0, buffer2.getDouble(0));
+        buffer1.update(1, buffer2.getLong(1));
+      } else {
+        Double newValue = buffer2.getDouble(0) + buffer1.getDouble(0);
+        buffer1.update(0, newValue);
+        buffer1.update(1, buffer1.getLong(1) + buffer2.getLong(1));
+      }
+    }
+  }
+
+  @Override public Object evaluate(Row buffer) {
+    if (buffer.isNullAt(0)) {
+      return null;
+    } else {
+      return buffer.getDouble(0) / buffer.getLong(1) + 100.0;
+    }
+  }
+}
+
diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java
new file mode 100644
index 0000000000000..1d4587a27c787
--- /dev/null
+++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package test.org.apache.spark.sql.hive.aggregate;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.spark.sql.expressions.aggregate.MutableAggregationBuffer;
+import org.apache.spark.sql.expressions.aggregate.UserDefinedAggregateFunction;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.Row;
+
+public class MyDoubleSum extends UserDefinedAggregateFunction {
+
+  private StructType _inputDataType;
+
+  private StructType _bufferSchema;
+
+  private DataType _returnDataType;
+
+  public MyDoubleSum() {
+    List<StructField> inputfields = new ArrayList<StructField>();
+    inputfields.add(DataTypes.createStructField("inputDouble", DataTypes.DoubleType, true));
+    _inputDataType = DataTypes.createStructType(inputfields);
+
+    List<StructField> bufferFields = new ArrayList<StructField>();
+    bufferFields.add(DataTypes.createStructField("bufferDouble", DataTypes.DoubleType, true));
+    _bufferSchema = DataTypes.createStructType(bufferFields);
+
+    _returnDataType = DataTypes.DoubleType;
+  }
+
+  @Override public StructType inputSchema() {
+    return _inputDataType;
+  }
+
+  @Override public StructType bufferSchema() {
+    return _bufferSchema;
+  }
+
+  @Override public DataType returnDataType() {
+    return _returnDataType;
+  }
+
+  @Override public boolean deterministic() {
+    return true;
+  }
+
+  @Override public void initialize(MutableAggregationBuffer buffer) {
+    buffer.update(0, null);
+  }
+
+  @Override public void update(MutableAggregationBuffer buffer, Row input) {
+    if (!input.isNullAt(0)) {
+      if (buffer.isNullAt(0)) {
+        buffer.update(0, input.getDouble(0));
+      } else {
+        Double newValue = input.getDouble(0) + buffer.getDouble(0);
+        buffer.update(0, newValue);
+      }
+    }
+  }
+
+  @Override public void merge(MutableAggregationBuffer buffer1, Row buffer2) {
+    if (!buffer2.isNullAt(0)) {
+      if (buffer1.isNullAt(0)) {
+        buffer1.update(0, buffer2.getDouble(0));
+      } else {
+        Double newValue = buffer2.getDouble(0) + buffer1.getDouble(0);
+        buffer1.update(0, newValue);
+      }
+    }
+  }
+
+  @Override public Object evaluate(Row buffer) {
+    if (buffer.isNullAt(0)) {
+      return null;
+    } else {
+      return buffer.getDouble(0);
+    }
+  }
+}
diff --git a/sql/hive/src/test/resources/golden/udf_unhex-0-50131c0ba7b7a6b65c789a5a8497bada b/sql/hive/src/test/resources/golden/udf_unhex-0-50131c0ba7b7a6b65c789a5a8497bada
new file mode 100644
index 0000000000000..573541ac9702d
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/udf_unhex-0-50131c0ba7b7a6b65c789a5a8497bada
@@ -0,0 +1 @@
+0
diff --git a/sql/hive/src/test/resources/golden/udf_unhex-1-11eb3cc5216d5446f4165007203acc47 b/sql/hive/src/test/resources/golden/udf_unhex-1-11eb3cc5216d5446f4165007203acc47
new file mode 100644
index 0000000000000..44b2a42cc26c5
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/udf_unhex-1-11eb3cc5216d5446f4165007203acc47
@@ -0,0 +1 @@
+unhex(str) - Converts hexadecimal argument to binary
diff --git a/sql/hive/src/test/resources/golden/udf_unhex-2-a660886085b8651852b9b77934848ae4 b/sql/hive/src/test/resources/golden/udf_unhex-2-a660886085b8651852b9b77934848ae4
new file mode 100644
index 0000000000000..97af3b812a429
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/udf_unhex-2-a660886085b8651852b9b77934848ae4
@@ -0,0 +1,14 @@
+unhex(str) - Converts hexadecimal argument to binary
+Performs the inverse operation of HEX(str). That is, it interprets
+each pair of hexadecimal digits in the argument as a number and
+converts it to the byte representation of the number. The
+resulting characters are returned as a binary string.
+
+Example:
+> SELECT DECODE(UNHEX('4D7953514C'), 'UTF-8') from src limit 1;
+'MySQL'
+
+The characters in the argument string must be legal hexadecimal
+digits: '0' .. '9', 'A' .. 'F', 'a' .. 'f'. If UNHEX() encounters
+any nonhexadecimal digits in the argument, it returns NULL. Also,
+if there are an odd number of characters a leading 0 is appended.
diff --git a/sql/hive/src/test/resources/golden/udf_unhex-3-4b2cf4050af229fde91ab53fd9f3af3e b/sql/hive/src/test/resources/golden/udf_unhex-3-4b2cf4050af229fde91ab53fd9f3af3e
new file mode 100644
index 0000000000000..b4a6f2b692227
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/udf_unhex-3-4b2cf4050af229fde91ab53fd9f3af3e
@@ -0,0 +1 @@
+MySQL	1267	a	-4	
diff --git a/sql/hive/src/test/resources/golden/udf_unhex-4-7d3e094f139892ecef17de3fd63ca3c3 b/sql/hive/src/test/resources/golden/udf_unhex-4-7d3e094f139892ecef17de3fd63ca3c3
new file mode 100644
index 0000000000000..3a67adaf0a9a8
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/udf_unhex-4-7d3e094f139892ecef17de3fd63ca3c3
@@ -0,0 +1 @@
+NULL	NULL	NULL
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
new file mode 100644
index 0000000000000..0375eb79add95
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
@@ -0,0 +1,507 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution
+
+import org.apache.spark.sql.execution.aggregate.Aggregate2Sort
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
+import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
+import org.scalatest.BeforeAndAfterAll
+import test.org.apache.spark.sql.hive.aggregate.{MyDoubleAvg, MyDoubleSum}
+
+class AggregationQuerySuite extends QueryTest with SQLTestUtils with BeforeAndAfterAll {
+
+  override val sqlContext = TestHive
+  import sqlContext.implicits._
+
+  var originalUseAggregate2: Boolean = _
+
+  override def beforeAll(): Unit = {
+    originalUseAggregate2 = sqlContext.conf.useSqlAggregate2
+    sqlContext.sql("set spark.sql.useAggregate2=true")
+    val data1 = Seq[(Integer, Integer)](
+      (1, 10),
+      (null, -60),
+      (1, 20),
+      (1, 30),
+      (2, 0),
+      (null, -10),
+      (2, -1),
+      (2, null),
+      (2, null),
+      (null, 100),
+      (3, null),
+      (null, null),
+      (3, null)).toDF("key", "value")
+    data1.write.saveAsTable("agg1")
+
+    val data2 = Seq[(Integer, Integer, Integer)](
+      (1, 10, -10),
+      (null, -60, 60),
+      (1, 30, -30),
+      (1, 30, 30),
+      (2, 1, 1),
+      (null, -10, 10),
+      (2, -1, null),
+      (2, 1, 1),
+      (2, null, 1),
+      (null, 100, -10),
+      (3, null, 3),
+      (null, null, null),
+      (3, null, null)).toDF("key", "value1", "value2")
+    data2.write.saveAsTable("agg2")
+
+    val emptyDF = sqlContext.createDataFrame(
+      sqlContext.sparkContext.emptyRDD[Row],
+      StructType(StructField("key", StringType) :: StructField("value", IntegerType) :: Nil))
+    emptyDF.registerTempTable("emptyTable")
+
+    // Register UDAFs
+    sqlContext.udaf.register("mydoublesum", new MyDoubleSum)
+    sqlContext.udaf.register("mydoubleavg", new MyDoubleAvg)
+  }
+
+  override def afterAll(): Unit = {
+    sqlContext.sql("DROP TABLE IF EXISTS agg1")
+    sqlContext.sql("DROP TABLE IF EXISTS agg2")
+    sqlContext.dropTempTable("emptyTable")
+    sqlContext.sql(s"set spark.sql.useAggregate2=$originalUseAggregate2")
+  }
+
+  test("empty table") {
+    // If there is no GROUP BY clause and the table is empty, we will generate a single row.
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  AVG(value),
+          |  COUNT(*),
+          |  COUNT(key),
+          |  COUNT(value),
+          |  FIRST(key),
+          |  LAST(value),
+          |  MAX(key),
+          |  MIN(value),
+          |  SUM(key)
+          |FROM emptyTable
+        """.stripMargin),
+      Row(null, 0, 0, 0, null, null, null, null, null) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  AVG(value),
+          |  COUNT(*),
+          |  COUNT(key),
+          |  COUNT(value),
+          |  FIRST(key),
+          |  LAST(value),
+          |  MAX(key),
+          |  MIN(value),
+          |  SUM(key),
+          |  COUNT(DISTINCT value)
+          |FROM emptyTable
+        """.stripMargin),
+      Row(null, 0, 0, 0, null, null, null, null, null, 0) :: Nil)
+
+    // If there is a GROUP BY clause and the table is empty, there is no output.
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  AVG(value),
+          |  COUNT(*),
+          |  COUNT(value),
+          |  FIRST(value),
+          |  LAST(value),
+          |  MAX(value),
+          |  MIN(value),
+          |  SUM(value),
+          |  COUNT(DISTINCT value)
+          |FROM emptyTable
+          |GROUP BY key
+        """.stripMargin),
+      Nil)
+  }
+
+  test("only do grouping") {
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT key
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin),
+      Row(1) :: Row(2) :: Row(3) :: Row(null) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT DISTINCT value1, key
+          |FROM agg2
+        """.stripMargin),
+      Row(10, 1) ::
+        Row(-60, null) ::
+        Row(30, 1) ::
+        Row(1, 2) ::
+        Row(-10, null) ::
+        Row(-1, 2) ::
+        Row(null, 2) ::
+        Row(100, null) ::
+        Row(null, 3) ::
+        Row(null, null) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT value1, key
+          |FROM agg2
+          |GROUP BY key, value1
+        """.stripMargin),
+      Row(10, 1) ::
+        Row(-60, null) ::
+        Row(30, 1) ::
+        Row(1, 2) ::
+        Row(-10, null) ::
+        Row(-1, 2) ::
+        Row(null, 2) ::
+        Row(100, null) ::
+        Row(null, 3) ::
+        Row(null, null) :: Nil)
+  }
+
+  test("case in-sensitive resolution") {
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT avg(value), kEY - 100
+          |FROM agg1
+          |GROUP BY Key - 100
+        """.stripMargin),
+      Row(20.0, -99) :: Row(-0.5, -98) :: Row(null, -97) :: Row(10.0, null) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT sum(distinct value1), kEY - 100, count(distinct value1)
+          |FROM agg2
+          |GROUP BY Key - 100
+        """.stripMargin),
+      Row(40, -99, 2) :: Row(0, -98, 2) :: Row(null, -97, 0) :: Row(30, null, 3) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT valUe * key - 100
+          |FROM agg1
+          |GROUP BY vAlue * keY - 100
+        """.stripMargin),
+      Row(-90) ::
+        Row(-80) ::
+        Row(-70) ::
+        Row(-100) ::
+        Row(-102) ::
+        Row(null) :: Nil)
+  }
+
+  test("test average no key in output") {
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT avg(value)
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin),
+      Row(-0.5) :: Row(20.0) :: Row(null) :: Row(10.0) :: Nil)
+  }
+
+  test("test average") {
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT key, avg(value)
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin),
+      Row(1, 20.0) :: Row(2, -0.5) :: Row(3, null) :: Row(null, 10.0) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT avg(value), key
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin),
+      Row(20.0, 1) :: Row(-0.5, 2) :: Row(null, 3) :: Row(10.0, null) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT avg(value) + 1.5, key + 10
+          |FROM agg1
+          |GROUP BY key + 10
+        """.stripMargin),
+      Row(21.5, 11) :: Row(1.0, 12) :: Row(null, 13) :: Row(11.5, null) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT avg(value) FROM agg1
+        """.stripMargin),
+      Row(11.125) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT avg(null)
+        """.stripMargin),
+      Row(null) :: Nil)
+  }
+
+  test("udaf") {
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  key,
+          |  mydoublesum(value + 1.5 * key),
+          |  mydoubleavg(value),
+          |  avg(value - key),
+          |  mydoublesum(value - 1.5 * key),
+          |  avg(value)
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin),
+      Row(1, 64.5, 120.0, 19.0, 55.5, 20.0) ::
+        Row(2, 5.0, 99.5, -2.5, -7.0, -0.5) ::
+        Row(3, null, null, null, null, null) ::
+        Row(null, null, 110.0, null, null, 10.0) :: Nil)
+  }
+
+  test("non-AlgebraicAggregate aggreguate function") {
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT mydoublesum(value), key
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin),
+      Row(60.0, 1) :: Row(-1.0, 2) :: Row(null, 3) :: Row(30.0, null) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT mydoublesum(value) FROM agg1
+        """.stripMargin),
+      Row(89.0) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT mydoublesum(null)
+        """.stripMargin),
+      Row(null) :: Nil)
+  }
+
+  test("non-AlgebraicAggregate and AlgebraicAggregate aggreguate function") {
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT mydoublesum(value), key, avg(value)
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin),
+      Row(60.0, 1, 20.0) ::
+        Row(-1.0, 2, -0.5) ::
+        Row(null, 3, null) ::
+        Row(30.0, null, 10.0) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  mydoublesum(value + 1.5 * key),
+          |  avg(value - key),
+          |  key,
+          |  mydoublesum(value - 1.5 * key),
+          |  avg(value)
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin),
+      Row(64.5, 19.0, 1, 55.5, 20.0) ::
+        Row(5.0, -2.5, 2, -7.0, -0.5) ::
+        Row(null, null, 3, null, null) ::
+        Row(null, null, null, null, 10.0) :: Nil)
+  }
+
+  test("single distinct column set") {
+    // DISTINCT is not meaningful with Max and Min, so we just ignore the DISTINCT keyword.
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  min(distinct value1),
+          |  sum(distinct value1),
+          |  avg(value1),
+          |  avg(value2),
+          |  max(distinct value1)
+          |FROM agg2
+        """.stripMargin),
+      Row(-60, 70.0, 101.0/9.0, 5.6, 100.0))
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  mydoubleavg(distinct value1),
+          |  avg(value1),
+          |  avg(value2),
+          |  key,
+          |  mydoubleavg(value1 - 1),
+          |  mydoubleavg(distinct value1) * 0.1,
+          |  avg(value1 + value2)
+          |FROM agg2
+          |GROUP BY key
+        """.stripMargin),
+      Row(120.0, 70.0/3.0, -10.0/3.0, 1, 67.0/3.0 + 100.0, 12.0, 20.0) ::
+        Row(100.0, 1.0/3.0, 1.0, 2, -2.0/3.0 + 100.0, 10.0, 2.0) ::
+        Row(null, null, 3.0, 3, null, null, null) ::
+        Row(110.0, 10.0, 20.0, null, 109.0, 11.0, 30.0) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  key,
+          |  mydoubleavg(distinct value1),
+          |  mydoublesum(value2),
+          |  mydoublesum(distinct value1),
+          |  mydoubleavg(distinct value1),
+          |  mydoubleavg(value1)
+          |FROM agg2
+          |GROUP BY key
+        """.stripMargin),
+      Row(1, 120.0, -10.0, 40.0, 120.0, 70.0/3.0 + 100.0) ::
+        Row(2, 100.0, 3.0, 0.0, 100.0, 1.0/3.0 + 100.0) ::
+        Row(3, null, 3.0, null, null, null) ::
+        Row(null, 110.0, 60.0, 30.0, 110.0, 110.0) :: Nil)
+  }
+
+  test("test count") {
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  count(value2),
+          |  value1,
+          |  count(*),
+          |  count(1),
+          |  key
+          |FROM agg2
+          |GROUP BY key, value1
+        """.stripMargin),
+      Row(1, 10, 1, 1, 1) ::
+        Row(1, -60, 1, 1, null) ::
+        Row(2, 30, 2, 2, 1) ::
+        Row(2, 1, 2, 2, 2) ::
+        Row(1, -10, 1, 1, null) ::
+        Row(0, -1, 1, 1, 2) ::
+        Row(1, null, 1, 1, 2) ::
+        Row(1, 100, 1, 1, null) ::
+        Row(1, null, 2, 2, 3) ::
+        Row(0, null, 1, 1, null) :: Nil)
+
+    checkAnswer(
+      sqlContext.sql(
+        """
+          |SELECT
+          |  count(value2),
+          |  value1,
+          |  count(*),
+          |  count(1),
+          |  key,
+          |  count(DISTINCT abs(value2))
+          |FROM agg2
+          |GROUP BY key, value1
+        """.stripMargin),
+      Row(1, 10, 1, 1, 1, 1) ::
+        Row(1, -60, 1, 1, null, 1) ::
+        Row(2, 30, 2, 2, 1, 1) ::
+        Row(2, 1, 2, 2, 2, 1) ::
+        Row(1, -10, 1, 1, null, 1) ::
+        Row(0, -1, 1, 1, 2, 0) ::
+        Row(1, null, 1, 1, 2, 1) ::
+        Row(1, 100, 1, 1, null, 1) ::
+        Row(1, null, 2, 2, 3, 1) ::
+        Row(0, null, 1, 1, null, 0) :: Nil)
+  }
+
+  test("error handling") {
+    sqlContext.sql(s"set spark.sql.useAggregate2=false")
+    var errorMessage = intercept[AnalysisException] {
+      sqlContext.sql(
+        """
+          |SELECT
+          |  key,
+          |  sum(value + 1.5 * key),
+          |  mydoublesum(value),
+          |  mydoubleavg(value)
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin).collect()
+    }.getMessage
+    assert(errorMessage.contains("implemented based on the new Aggregate Function interface"))
+
+    // TODO: once we support Hive UDAF in the new interface,
+    // we can remove the following two tests.
+    sqlContext.sql(s"set spark.sql.useAggregate2=true")
+    errorMessage = intercept[AnalysisException] {
+      sqlContext.sql(
+        """
+          |SELECT
+          |  key,
+          |  mydoublesum(value + 1.5 * key),
+          |  stddev_samp(value)
+          |FROM agg1
+          |GROUP BY key
+        """.stripMargin).collect()
+    }.getMessage
+    assert(errorMessage.contains("implemented based on the new Aggregate Function interface"))
+
+    // This will fall back to the old aggregate
+    val newAggregateOperators = sqlContext.sql(
+      """
+        |SELECT
+        |  key,
+        |  sum(value + 1.5 * key),
+        |  stddev_samp(value)
+        |FROM agg1
+        |GROUP BY key
+      """.stripMargin).queryExecution.executedPlan.collect {
+      case agg: Aggregate2Sort => agg
+    }
+    val message =
+      "We should fallback to the old aggregation code path if there is any aggregate function " +
+        "that cannot be converted to the new interface."
+    assert(newAggregateOperators.isEmpty, message)
+
+    sqlContext.sql(s"set spark.sql.useAggregate2=true")
+  }
+}

From b55a36bc30a628d76baa721d38789fc219eccc27 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Wed, 22 Jul 2015 09:32:42 -0700
Subject: [PATCH 316/549] [SPARK-9254] [BUILD] [HOTFIX] sbt-launch-lib.bash
 should support HTTP/HTTPS redirection

Target file(s) can be hosted on CDN nodes. HTTP/HTTPS redirection must be supported to download these files.

Author: Cheng Lian <lian@databricks.com>

Closes #7597 from liancheng/spark-9254 and squashes the following commits:

fd266ca [Cheng Lian] Uses `--fail' to make curl return non-zero value and remove garbage output when the download fails
a7cbfb3 [Cheng Lian] Supports HTTP/HTTPS redirection
---
 build/sbt-launch-lib.bash | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/build/sbt-launch-lib.bash b/build/sbt-launch-lib.bash
index 504be48b358fa..7930a38b9674a 100755
--- a/build/sbt-launch-lib.bash
+++ b/build/sbt-launch-lib.bash
@@ -51,9 +51,13 @@ acquire_sbt_jar () {
     printf "Attempting to fetch sbt\n"
     JAR_DL="${JAR}.part"
     if [ $(command -v curl) ]; then
-      (curl --silent ${URL1} > "${JAR_DL}" || curl --silent ${URL2} > "${JAR_DL}") && mv "${JAR_DL}" "${JAR}"
+      (curl --fail --location --silent ${URL1} > "${JAR_DL}" ||\
+        (rm -f "${JAR_DL}" && curl --fail --location --silent ${URL2} > "${JAR_DL}")) &&\
+        mv "${JAR_DL}" "${JAR}"
     elif [ $(command -v wget) ]; then
-      (wget --quiet ${URL1} -O "${JAR_DL}" || wget --quiet ${URL2} -O "${JAR_DL}") && mv "${JAR_DL}" "${JAR}"
+      (wget --quiet ${URL1} -O "${JAR_DL}" ||\
+        (rm -f "${JAR_DL}" && wget --quiet ${URL2} -O "${JAR_DL}")) &&\
+        mv "${JAR_DL}" "${JAR}"
     else
       printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
       exit -1

From 76520955fddbda87a5c53d0a394dedc91dce67e8 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Wed, 22 Jul 2015 11:45:51 -0700
Subject: [PATCH 317/549] [SPARK-9082] [SQL] Filter using non-deterministic
 expressions should not be pushed down

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7446 from cloud-fan/filter and squashes the following commits:

330021e [Wenchen Fan] add exists to tree node
2cab68c [Wenchen Fan] more enhance
949be07 [Wenchen Fan] push down part of predicate if possible
3912f84 [Wenchen Fan] address comments
8ce15ca [Wenchen Fan] fix bug
557158e [Wenchen Fan] Filter using non-deterministic expressions should not be pushed down
---
 .../sql/catalyst/optimizer/Optimizer.scala    | 50 +++++++++++++++----
 .../optimizer/FilterPushdownSuite.scala       | 45 ++++++++++++++++-
 2 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index e42f0b9a247e3..d2db3dd3d078e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -541,20 +541,50 @@ object SimplifyFilters extends Rule[LogicalPlan] {
  *
  * This heuristic is valid assuming the expression evaluation cost is minimal.
  */
-object PushPredicateThroughProject extends Rule[LogicalPlan] {
+object PushPredicateThroughProject extends Rule[LogicalPlan] with PredicateHelper {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     case filter @ Filter(condition, project @ Project(fields, grandChild)) =>
-      val sourceAliases = fields.collect { case a @ Alias(c, _) =>
-        (a.toAttribute: Attribute) -> c
-      }.toMap
-      project.copy(child = filter.copy(
-        replaceAlias(condition, sourceAliases),
-        grandChild))
+      // Create a map of Aliases to their values from the child projection.
+      // e.g., 'SELECT a + b AS c, d ...' produces Map(c -> a + b).
+      val aliasMap = AttributeMap(fields.collect {
+        case a: Alias => (a.toAttribute, a.child)
+      })
+
+      // Split the condition into small conditions by `And`, so that we can push down part of this
+      // condition without nondeterministic expressions.
+      val andConditions = splitConjunctivePredicates(condition)
+      val nondeterministicConditions = andConditions.filter(hasNondeterministic(_, aliasMap))
+
+      // If there is no nondeterministic conditions, push down the whole condition.
+      if (nondeterministicConditions.isEmpty) {
+        project.copy(child = Filter(replaceAlias(condition, aliasMap), grandChild))
+      } else {
+        // If they are all nondeterministic conditions, leave it un-changed.
+        if (nondeterministicConditions.length == andConditions.length) {
+          filter
+        } else {
+          val deterministicConditions = andConditions.filterNot(hasNondeterministic(_, aliasMap))
+          // Push down the small conditions without nondeterministic expressions.
+          val pushedCondition = deterministicConditions.map(replaceAlias(_, aliasMap)).reduce(And)
+          Filter(nondeterministicConditions.reduce(And),
+            project.copy(child = Filter(pushedCondition, grandChild)))
+        }
+      }
+  }
+
+  private def hasNondeterministic(
+      condition: Expression,
+      sourceAliases: AttributeMap[Expression]) = {
+    condition.collect {
+      case a: Attribute if sourceAliases.contains(a) => sourceAliases(a)
+    }.exists(!_.deterministic)
   }
 
-  private def replaceAlias(condition: Expression, sourceAliases: Map[Attribute, Expression]) = {
-    condition transform {
-      case a: AttributeReference => sourceAliases.getOrElse(a, a)
+  // Substitute any attributes that are produced by the child projection, so that we safely
+  // eliminate it.
+  private def replaceAlias(condition: Expression, sourceAliases: AttributeMap[Expression]) = {
+    condition.transform {
+      case a: Attribute => sourceAliases.getOrElse(a, a)
     }
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
index dc28b3ffb59ee..0f1fde2fb0f67 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.optimizer
 
 import org.apache.spark.sql.catalyst.analysis
 import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
-import org.apache.spark.sql.catalyst.expressions.{SortOrder, Ascending, Count, Explode}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.plans.{LeftSemi, PlanTest, LeftOuter, RightOuter}
 import org.apache.spark.sql.catalyst.rules._
@@ -146,6 +146,49 @@ class FilterPushdownSuite extends PlanTest {
     comparePlans(optimized, correctAnswer)
   }
 
+  test("nondeterministic: can't push down filter through project") {
+    val originalQuery = testRelation
+      .select(Rand(10).as('rand), 'a)
+      .where('rand > 5 || 'a > 5)
+      .analyze
+
+    val optimized = Optimize.execute(originalQuery)
+
+    comparePlans(optimized, originalQuery)
+  }
+
+  test("nondeterministic: push down part of filter through project") {
+    val originalQuery = testRelation
+      .select(Rand(10).as('rand), 'a)
+      .where('rand > 5 && 'a > 5)
+      .analyze
+
+    val optimized = Optimize.execute(originalQuery)
+
+    val correctAnswer = testRelation
+      .where('a > 5)
+      .select(Rand(10).as('rand), 'a)
+      .where('rand > 5)
+      .analyze
+
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("nondeterministic: push down filter through project") {
+    val originalQuery = testRelation
+      .select(Rand(10).as('rand), 'a)
+      .where('a > 5 && 'a < 10)
+      .analyze
+
+    val optimized = Optimize.execute(originalQuery)
+    val correctAnswer = testRelation
+      .where('a > 5 && 'a < 10)
+      .select(Rand(10).as('rand), 'a)
+      .analyze
+
+    comparePlans(optimized, correctAnswer)
+  }
+
   test("filters: combines filters") {
     val originalQuery = testRelation
       .select('a)

From 86f80e2b4759e574fe3eb91695f81b644db87242 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Wed, 22 Jul 2015 12:19:59 -0700
Subject: [PATCH 318/549] [SPARK-9165] [SQL] codegen for CreateArray,
 CreateStruct and CreateNamedStruct

JIRA: https://issues.apache.org/jira/browse/SPARK-9165

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7537 from yjshen/array_struct_codegen and squashes the following commits:

3a6dce6 [Yijie Shen] use infix notion in createArray test
5e90f0a [Yijie Shen] resolve comments: classOf
39cefb8 [Yijie Shen] codegen for createArray createStruct & createNamedStruct
---
 .../expressions/complexTypeCreator.scala      | 65 +++++++++++++++++--
 .../expressions/ComplexTypeSuite.scala        | 16 +++++
 2 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index f9fd04c02aaef..20b1eaab8e303 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -17,16 +17,18 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import scala.collection.mutable
+
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
 /**
  * Returns an Array containing the evaluation of all children expressions.
  */
-case class CreateArray(children: Seq[Expression]) extends Expression with CodegenFallback {
+case class CreateArray(children: Seq[Expression]) extends Expression {
 
   override def foldable: Boolean = children.forall(_.foldable)
 
@@ -45,14 +47,31 @@ case class CreateArray(children: Seq[Expression]) extends Expression with Codege
     children.map(_.eval(input))
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val arraySeqClass = classOf[mutable.ArraySeq[Any]].getName
+    s"""
+      boolean ${ev.isNull} = false;
+      $arraySeqClass<Object> ${ev.primitive} = new $arraySeqClass<Object>(${children.size});
+    """ +
+      children.zipWithIndex.map { case (e, i) =>
+        val eval = e.gen(ctx)
+        eval.code + s"""
+          if (${eval.isNull}) {
+            ${ev.primitive}.update($i, null);
+          } else {
+            ${ev.primitive}.update($i, ${eval.primitive});
+          }
+         """
+      }.mkString("\n")
+  }
+
   override def prettyName: String = "array"
 }
 
 /**
  * Returns a Row containing the evaluation of all children expressions.
- * TODO: [[CreateStruct]] does not support codegen.
  */
-case class CreateStruct(children: Seq[Expression]) extends Expression with CodegenFallback {
+case class CreateStruct(children: Seq[Expression]) extends Expression {
 
   override def foldable: Boolean = children.forall(_.foldable)
 
@@ -76,6 +95,24 @@ case class CreateStruct(children: Seq[Expression]) extends Expression with Codeg
     InternalRow(children.map(_.eval(input)): _*)
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val rowClass = classOf[GenericMutableRow].getName
+    s"""
+      boolean ${ev.isNull} = false;
+      final $rowClass ${ev.primitive} = new $rowClass(${children.size});
+    """ +
+      children.zipWithIndex.map { case (e, i) =>
+        val eval = e.gen(ctx)
+        eval.code + s"""
+        if (${eval.isNull}) {
+          ${ev.primitive}.update($i, null);
+        } else {
+          ${ev.primitive}.update($i, ${eval.primitive});
+        }
+       """
+      }.mkString("\n")
+  }
+
   override def prettyName: String = "struct"
 }
 
@@ -84,7 +121,7 @@ case class CreateStruct(children: Seq[Expression]) extends Expression with Codeg
  *
  * @param children Seq(name1, val1, name2, val2, ...)
  */
-case class CreateNamedStruct(children: Seq[Expression]) extends Expression with CodegenFallback {
+case class CreateNamedStruct(children: Seq[Expression]) extends Expression {
 
   private lazy val (nameExprs, valExprs) =
     children.grouped(2).map { case Seq(name, value) => (name, value) }.toList.unzip
@@ -122,5 +159,23 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression with
     InternalRow(valExprs.map(_.eval(input)): _*)
   }
 
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val rowClass = classOf[GenericMutableRow].getName
+    s"""
+      boolean ${ev.isNull} = false;
+      final $rowClass ${ev.primitive} = new $rowClass(${valExprs.size});
+    """ +
+      valExprs.zipWithIndex.map { case (e, i) =>
+        val eval = e.gen(ctx)
+        eval.code + s"""
+        if (${eval.isNull}) {
+          ${ev.primitive}.update($i, null);
+        } else {
+          ${ev.primitive}.update($i, ${eval.primitive});
+        }
+       """
+      }.mkString("\n")
+  }
+
   override def prettyName: String = "named_struct"
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index e3042143632aa..a8aee8f634e03 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -117,6 +117,22 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(getArrayStructFields(nullArrayStruct, "a"), null)
   }
 
+  test("CreateArray") {
+    val intSeq = Seq(5, 10, 15, 20, 25)
+    val longSeq = intSeq.map(_.toLong)
+    val strSeq = intSeq.map(_.toString)
+    checkEvaluation(CreateArray(intSeq.map(Literal(_))), intSeq, EmptyRow)
+    checkEvaluation(CreateArray(longSeq.map(Literal(_))), longSeq, EmptyRow)
+    checkEvaluation(CreateArray(strSeq.map(Literal(_))), strSeq, EmptyRow)
+
+    val intWithNull = intSeq.map(Literal(_)) :+ Literal.create(null, IntegerType)
+    val longWithNull = longSeq.map(Literal(_)) :+ Literal.create(null, LongType)
+    val strWithNull = strSeq.map(Literal(_)) :+ Literal.create(null, StringType)
+    checkEvaluation(CreateArray(intWithNull), intSeq :+ null, EmptyRow)
+    checkEvaluation(CreateArray(longWithNull), longSeq :+ null, EmptyRow)
+    checkEvaluation(CreateArray(strWithNull), strSeq :+ null, EmptyRow)
+  }
+
   test("CreateStruct") {
     val row = create_row(1, 2, 3)
     val c1 = 'a.int.at(0)

From e0b7ba59a1ace9b78a1ad6f3f07fe153db20b52c Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Wed, 22 Jul 2015 13:02:43 -0700
Subject: [PATCH 319/549] [SPARK-9024] Unsafe
 HashJoin/HashOuterJoin/HashSemiJoin

This PR introduce unsafe version (using UnsafeRow) of HashJoin, HashOuterJoin and HashSemiJoin, including the broadcast one and shuffle one (except FullOuterJoin, which is better to be implemented using SortMergeJoin).

It use HashMap to store UnsafeRow right now, will change to use BytesToBytesMap for better performance (in another PR).

Author: Davies Liu <davies@databricks.com>

Closes #7480 from davies/unsafe_join and squashes the following commits:

6294b1e [Davies Liu] fix projection
10583f1 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_join
dede020 [Davies Liu] fix test
84c9807 [Davies Liu] address comments
a05b4f6 [Davies Liu] support UnsafeRow in LeftSemiJoinBNL and BroadcastNestedLoopJoin
611d2ed [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_join
9481ae8 [Davies Liu] return UnsafeRow after join()
ca2b40f [Davies Liu] revert unrelated change
68f5cd9 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_join
0f4380d [Davies Liu] ada a comment
69e38f5 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_join
1a40f02 [Davies Liu] refactor
ab1690f [Davies Liu] address comments
60371f2 [Davies Liu] use UnsafeRow in SemiJoin
a6c0b7d [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_join
184b852 [Davies Liu] fix style
6acbb11 [Davies Liu] fix tests
95d0762 [Davies Liu] remove println
bea4a50 [Davies Liu] Unsafe HashJoin
---
 .../sql/catalyst/expressions/UnsafeRow.java   | 50 ++++++++++-
 .../execution/UnsafeExternalRowSorter.java    | 10 +--
 .../catalyst/expressions/BoundAttribute.scala | 19 ++++-
 .../sql/catalyst/expressions/Projection.scala | 34 +++++++-
 .../execution/joins/BroadcastHashJoin.scala   |  2 +-
 .../joins/BroadcastHashOuterJoin.scala        | 32 ++-----
 .../joins/BroadcastLeftSemiJoinHash.scala     |  5 +-
 .../joins/BroadcastNestedLoopJoin.scala       | 37 +++++---
 .../spark/sql/execution/joins/HashJoin.scala  | 43 ++++++++--
 .../sql/execution/joins/HashOuterJoin.scala   | 82 +++++++++++++++---
 .../sql/execution/joins/HashSemiJoin.scala    | 74 ++++++++++------
 .../sql/execution/joins/HashedRelation.scala  | 85 ++++++++++++++++++-
 .../sql/execution/joins/LeftSemiJoinBNL.scala |  3 +
 .../execution/joins/LeftSemiJoinHash.scala    |  4 +-
 .../execution/joins/ShuffledHashJoin.scala    |  2 +-
 .../joins/ShuffledHashOuterJoin.scala         | 13 +--
 .../sql/execution/rowFormatConverters.scala   | 21 +++--
 .../org/apache/spark/sql/UnsafeRowSuite.scala |  4 +-
 .../execution/joins/HashedRelationSuite.scala | 49 ++++++++---
 .../spark/unsafe/hash/Murmur3_x86_32.java     | 10 ++-
 20 files changed, 444 insertions(+), 135 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 6ce03a48e9538..7f08bf7b742dc 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -20,10 +20,11 @@
 import java.io.IOException;
 import java.io.OutputStream;
 
-import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.util.ObjectPool;
 import org.apache.spark.unsafe.PlatformDependent;
+import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.bitset.BitSetMethods;
+import org.apache.spark.unsafe.hash.Murmur3_x86_32;
 import org.apache.spark.unsafe.types.UTF8String;
 
 
@@ -354,7 +355,7 @@ public double getDouble(int i) {
    * This method is only supported on UnsafeRows that do not use ObjectPools.
    */
   @Override
-  public InternalRow copy() {
+  public UnsafeRow copy() {
     if (pool != null) {
       throw new UnsupportedOperationException(
         "Copy is not supported for UnsafeRows that use object pools");
@@ -404,8 +405,51 @@ public void writeToStream(OutputStream out, byte[] writeBuffer) throws IOExcepti
     }
   }
 
+  @Override
+  public int hashCode() {
+    return Murmur3_x86_32.hashUnsafeWords(baseObject, baseOffset, sizeInBytes, 42);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (other instanceof UnsafeRow) {
+      UnsafeRow o = (UnsafeRow) other;
+      return (sizeInBytes == o.sizeInBytes) &&
+        ByteArrayMethods.arrayEquals(baseObject, baseOffset, o.baseObject, o.baseOffset,
+          sizeInBytes);
+    }
+    return false;
+  }
+
+  /**
+   * Returns the underlying bytes for this UnsafeRow.
+   */
+  public byte[] getBytes() {
+    if (baseObject instanceof byte[] && baseOffset == PlatformDependent.BYTE_ARRAY_OFFSET
+        && (((byte[]) baseObject).length == sizeInBytes)) {
+      return (byte[]) baseObject;
+    } else {
+      byte[] bytes = new byte[sizeInBytes];
+      PlatformDependent.copyMemory(baseObject, baseOffset, bytes,
+        PlatformDependent.BYTE_ARRAY_OFFSET, sizeInBytes);
+      return bytes;
+    }
+  }
+
+  // This is for debugging
+  @Override
+  public String toString() {
+    StringBuilder build = new StringBuilder("[");
+    for (int i = 0; i < sizeInBytes; i += 8) {
+      build.append(PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + i));
+      build.append(',');
+    }
+    build.append(']');
+    return build.toString();
+  }
+
   @Override
   public boolean anyNull() {
-    return BitSetMethods.anySet(baseObject, baseOffset, bitSetWidthInBytes);
+    return BitSetMethods.anySet(baseObject, baseOffset, bitSetWidthInBytes / 8);
   }
 }
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
index d1d81c87bb052..39fd6e1bc6d13 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
@@ -28,11 +28,10 @@
 import org.apache.spark.TaskContext;
 import org.apache.spark.sql.AbstractScalaRowIterator;
 import org.apache.spark.sql.catalyst.InternalRow;
-import org.apache.spark.sql.catalyst.expressions.UnsafeColumnWriter;
 import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
 import org.apache.spark.sql.catalyst.util.ObjectPool;
-import org.apache.spark.sql.types.*;
+import org.apache.spark.sql.types.StructType;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.util.collection.unsafe.sort.PrefixComparator;
 import org.apache.spark.util.collection.unsafe.sort.RecordComparator;
@@ -176,12 +175,7 @@ public Iterator<InternalRow> sort(Iterator<InternalRow> inputIterator) throws IO
    */
   public static boolean supportsSchema(StructType schema) {
     // TODO: add spilling note to explain why we do this for now:
-    for (StructField field : schema.fields()) {
-      if (!UnsafeColumnWriter.canEmbed(field.dataType())) {
-        return false;
-      }
-    }
-    return true;
+    return UnsafeProjection.canSupport(schema);
   }
 
   private static final class RowComparator extends RecordComparator {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index b10a3c877434b..4a13b687bf4ce 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -21,7 +21,6 @@ import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors.attachTree
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
-import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.types._
 
 /**
@@ -34,7 +33,23 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
 
   override def toString: String = s"input[$ordinal, $dataType]"
 
-  override def eval(input: InternalRow): Any = input(ordinal)
+  // Use special getter for primitive types (for UnsafeRow)
+  override def eval(input: InternalRow): Any = {
+    if (input.isNullAt(ordinal)) {
+      null
+    } else {
+      dataType match {
+        case BooleanType => input.getBoolean(ordinal)
+        case ByteType => input.getByte(ordinal)
+        case ShortType => input.getShort(ordinal)
+        case IntegerType | DateType => input.getInt(ordinal)
+        case LongType | TimestampType => input.getLong(ordinal)
+        case FloatType => input.getFloat(ordinal)
+        case DoubleType => input.getDouble(ordinal)
+        case _ => input.get(ordinal)
+      }
+    }
+  }
 
   override def name: String = s"i[$ordinal]"
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index 24b01ea55110e..69758e653eba0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -83,12 +83,42 @@ abstract class UnsafeProjection extends Projection {
 }
 
 object UnsafeProjection {
+
+  /*
+   * Returns whether UnsafeProjection can support given StructType, Array[DataType] or
+   * Seq[Expression].
+   */
+  def canSupport(schema: StructType): Boolean = canSupport(schema.fields.map(_.dataType))
+  def canSupport(types: Array[DataType]): Boolean = types.forall(UnsafeColumnWriter.canEmbed(_))
+  def canSupport(exprs: Seq[Expression]): Boolean = canSupport(exprs.map(_.dataType).toArray)
+
+  /**
+   * Returns an UnsafeProjection for given StructType.
+   */
   def create(schema: StructType): UnsafeProjection = create(schema.fields.map(_.dataType))
 
-  def create(fields: Seq[DataType]): UnsafeProjection = {
+  /**
+   * Returns an UnsafeProjection for given Array of DataTypes.
+   */
+  def create(fields: Array[DataType]): UnsafeProjection = {
     val exprs = fields.zipWithIndex.map(x => new BoundReference(x._2, x._1, true))
+    create(exprs)
+  }
+
+  /**
+   * Returns an UnsafeProjection for given sequence of Expressions (bounded).
+   */
+  def create(exprs: Seq[Expression]): UnsafeProjection = {
     GenerateUnsafeProjection.generate(exprs)
   }
+
+  /**
+   * Returns an UnsafeProjection for given sequence of Expressions, which will be bound to
+   * `inputSchema`.
+   */
+  def create(exprs: Seq[Expression], inputSchema: Seq[Attribute]): UnsafeProjection = {
+    create(exprs.map(BindReferences.bindReference(_, inputSchema)))
+  }
 }
 
 /**
@@ -96,6 +126,8 @@ object UnsafeProjection {
  */
 case class FromUnsafeProjection(fields: Seq[DataType]) extends Projection {
 
+  def this(schema: StructType) = this(schema.fields.map(_.dataType))
+
   private[this] val expressions = fields.zipWithIndex.map { case (dt, idx) =>
     new BoundReference(idx, dt, true)
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
index 7ffdce60d2955..abaa4a6ce86a2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
@@ -62,7 +62,7 @@ case class BroadcastHashJoin(
   private val broadcastFuture = future {
     // Note that we use .execute().collect() because we don't want to convert data to Scala types
     val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect()
-    val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length)
+    val hashed = buildHashRelation(input.iterator)
     sparkContext.broadcast(hashed)
   }(BroadcastHashJoin.broadcastHashJoinExecutionContext)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
index ab757fc7de6cd..c9d1a880f4ef4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
@@ -17,6 +17,9 @@
 
 package org.apache.spark.sql.execution.joins
 
+import scala.concurrent._
+import scala.concurrent.duration._
+
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
@@ -26,10 +29,6 @@ import org.apache.spark.sql.catalyst.plans.{JoinType, LeftOuter, RightOuter}
 import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
 import org.apache.spark.util.ThreadUtils
 
-import scala.collection.JavaConversions._
-import scala.concurrent._
-import scala.concurrent.duration._
-
 /**
  * :: DeveloperApi ::
  * Performs a outer hash join for two child relations.  When the output RDD of this operator is
@@ -58,28 +57,11 @@ case class BroadcastHashOuterJoin(
   override def requiredChildDistribution: Seq[Distribution] =
     UnspecifiedDistribution :: UnspecifiedDistribution :: Nil
 
-  private[this] lazy val (buildPlan, streamedPlan) = joinType match {
-    case RightOuter => (left, right)
-    case LeftOuter => (right, left)
-    case x =>
-      throw new IllegalArgumentException(
-        s"BroadcastHashOuterJoin should not take $x as the JoinType")
-  }
-
-  private[this] lazy val (buildKeys, streamedKeys) = joinType match {
-    case RightOuter => (leftKeys, rightKeys)
-    case LeftOuter => (rightKeys, leftKeys)
-    case x =>
-      throw new IllegalArgumentException(
-        s"BroadcastHashOuterJoin should not take $x as the JoinType")
-  }
-
   @transient
   private val broadcastFuture = future {
     // Note that we use .execute().collect() because we don't want to convert data to Scala types
     val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect()
-    // buildHashTable uses code-generated rows as keys, which are not serializable
-    val hashed = buildHashTable(input.iterator, newProjection(buildKeys, buildPlan.output))
+    val hashed = buildHashRelation(input.iterator)
     sparkContext.broadcast(hashed)
   }(BroadcastHashOuterJoin.broadcastHashOuterJoinExecutionContext)
 
@@ -89,21 +71,21 @@ case class BroadcastHashOuterJoin(
     streamedPlan.execute().mapPartitions { streamedIter =>
       val joinedRow = new JoinedRow()
       val hashTable = broadcastRelation.value
-      val keyGenerator = newProjection(streamedKeys, streamedPlan.output)
+      val keyGenerator = streamedKeyGenerator
 
       joinType match {
         case LeftOuter =>
           streamedIter.flatMap(currentRow => {
             val rowKey = keyGenerator(currentRow)
             joinedRow.withLeft(currentRow)
-            leftOuterIterator(rowKey, joinedRow, hashTable.getOrElse(rowKey, EMPTY_LIST))
+            leftOuterIterator(rowKey, joinedRow, hashTable.get(rowKey))
           })
 
         case RightOuter =>
           streamedIter.flatMap(currentRow => {
             val rowKey = keyGenerator(currentRow)
             joinedRow.withRight(currentRow)
-            rightOuterIterator(rowKey, hashTable.getOrElse(rowKey, EMPTY_LIST), joinedRow)
+            rightOuterIterator(rowKey, hashTable.get(rowKey), joinedRow)
           })
 
         case x =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
index 2750f58b005ac..f71c0ce352904 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
@@ -40,15 +40,14 @@ case class BroadcastLeftSemiJoinHash(
     val buildIter = right.execute().map(_.copy()).collect().toIterator
 
     if (condition.isEmpty) {
-      // rowKey may be not serializable (from codegen)
-      val hashSet = buildKeyHashSet(buildIter, copy = true)
+      val hashSet = buildKeyHashSet(buildIter)
       val broadcastedRelation = sparkContext.broadcast(hashSet)
 
       left.execute().mapPartitions { streamIter =>
         hashSemiJoin(streamIter, broadcastedRelation.value)
       }
     } else {
-      val hashRelation = HashedRelation(buildIter, rightKeyGenerator)
+      val hashRelation = buildHashRelation(buildIter)
       val broadcastedRelation = sparkContext.broadcast(hashRelation)
 
       left.execute().mapPartitions { streamIter =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
index 60b4266fad8b1..700636966f8be 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
@@ -44,6 +44,19 @@ case class BroadcastNestedLoopJoin(
     case BuildLeft => (right, left)
   }
 
+  override def outputsUnsafeRows: Boolean = left.outputsUnsafeRows || right.outputsUnsafeRows
+  override def canProcessUnsafeRows: Boolean = true
+
+  @transient private[this] lazy val resultProjection: Projection = {
+    if (outputsUnsafeRows) {
+      UnsafeProjection.create(schema)
+    } else {
+      new Projection {
+        override def apply(r: InternalRow): InternalRow = r
+      }
+    }
+  }
+
   override def outputPartitioning: Partitioning = streamed.outputPartitioning
 
   override def output: Seq[Attribute] = {
@@ -74,6 +87,7 @@ case class BroadcastNestedLoopJoin(
       val includedBroadcastTuples =
         new scala.collection.mutable.BitSet(broadcastedRelation.value.size)
       val joinedRow = new JoinedRow
+
       val leftNulls = new GenericMutableRow(left.output.size)
       val rightNulls = new GenericMutableRow(right.output.size)
 
@@ -86,11 +100,11 @@ case class BroadcastNestedLoopJoin(
           val broadcastedRow = broadcastedRelation.value(i)
           buildSide match {
             case BuildRight if boundCondition(joinedRow(streamedRow, broadcastedRow)) =>
-              matchedRows += joinedRow(streamedRow, broadcastedRow).copy()
+              matchedRows += resultProjection(joinedRow(streamedRow, broadcastedRow)).copy()
               streamRowMatched = true
               includedBroadcastTuples += i
             case BuildLeft if boundCondition(joinedRow(broadcastedRow, streamedRow)) =>
-              matchedRows += joinedRow(broadcastedRow, streamedRow).copy()
+              matchedRows += resultProjection(joinedRow(broadcastedRow, streamedRow)).copy()
               streamRowMatched = true
               includedBroadcastTuples += i
             case _ =>
@@ -100,9 +114,9 @@ case class BroadcastNestedLoopJoin(
 
         (streamRowMatched, joinType, buildSide) match {
           case (false, LeftOuter | FullOuter, BuildRight) =>
-            matchedRows += joinedRow(streamedRow, rightNulls).copy()
+            matchedRows += resultProjection(joinedRow(streamedRow, rightNulls)).copy()
           case (false, RightOuter | FullOuter, BuildLeft) =>
-            matchedRows += joinedRow(leftNulls, streamedRow).copy()
+            matchedRows += resultProjection(joinedRow(leftNulls, streamedRow)).copy()
           case _ =>
         }
       }
@@ -110,12 +124,9 @@ case class BroadcastNestedLoopJoin(
     }
 
     val includedBroadcastTuples = matchesOrStreamedRowsWithNulls.map(_._2)
-    val allIncludedBroadcastTuples =
-      if (includedBroadcastTuples.count == 0) {
-        new scala.collection.mutable.BitSet(broadcastedRelation.value.size)
-      } else {
-        includedBroadcastTuples.reduce(_ ++ _)
-      }
+    val allIncludedBroadcastTuples = includedBroadcastTuples.fold(
+      new scala.collection.mutable.BitSet(broadcastedRelation.value.size)
+    )(_ ++ _)
 
     val leftNulls = new GenericMutableRow(left.output.size)
     val rightNulls = new GenericMutableRow(right.output.size)
@@ -127,8 +138,10 @@ case class BroadcastNestedLoopJoin(
       while (i < rel.length) {
         if (!allIncludedBroadcastTuples.contains(i)) {
           (joinType, buildSide) match {
-            case (RightOuter | FullOuter, BuildRight) => buf += new JoinedRow(leftNulls, rel(i))
-            case (LeftOuter | FullOuter, BuildLeft) => buf += new JoinedRow(rel(i), rightNulls)
+            case (RightOuter | FullOuter, BuildRight) =>
+              buf += resultProjection(new JoinedRow(leftNulls, rel(i)))
+            case (LeftOuter | FullOuter, BuildLeft) =>
+              buf += resultProjection(new JoinedRow(rel(i), rightNulls))
             case _ =>
           }
         }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
index ff85ea3f6a410..ae34409bcfcca 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
@@ -44,11 +44,20 @@ trait HashJoin {
 
   override def output: Seq[Attribute] = left.output ++ right.output
 
-  @transient protected lazy val buildSideKeyGenerator: Projection =
-    newProjection(buildKeys, buildPlan.output)
+  protected[this] def supportUnsafe: Boolean = {
+    (self.codegenEnabled && UnsafeProjection.canSupport(buildKeys)
+      && UnsafeProjection.canSupport(self.schema))
+  }
+
+  override def outputsUnsafeRows: Boolean = supportUnsafe
+  override def canProcessUnsafeRows: Boolean = supportUnsafe
 
-  @transient protected lazy val streamSideKeyGenerator: () => MutableProjection =
-    newMutableProjection(streamedKeys, streamedPlan.output)
+  @transient protected lazy val streamSideKeyGenerator: Projection =
+    if (supportUnsafe) {
+      UnsafeProjection.create(streamedKeys, streamedPlan.output)
+    } else {
+      newMutableProjection(streamedKeys, streamedPlan.output)()
+    }
 
   protected def hashJoin(
       streamIter: Iterator[InternalRow],
@@ -61,8 +70,17 @@ trait HashJoin {
 
       // Mutable per row objects.
       private[this] val joinRow = new JoinedRow2
+      private[this] val resultProjection: Projection = {
+        if (supportUnsafe) {
+          UnsafeProjection.create(self.schema)
+        } else {
+          new Projection {
+            override def apply(r: InternalRow): InternalRow = r
+          }
+        }
+      }
 
-      private[this] val joinKeys = streamSideKeyGenerator()
+      private[this] val joinKeys = streamSideKeyGenerator
 
       override final def hasNext: Boolean =
         (currentMatchPosition != -1 && currentMatchPosition < currentHashMatches.size) ||
@@ -74,7 +92,7 @@ trait HashJoin {
           case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow)
         }
         currentMatchPosition += 1
-        ret
+        resultProjection(ret)
       }
 
       /**
@@ -89,8 +107,9 @@ trait HashJoin {
 
         while (currentHashMatches == null && streamIter.hasNext) {
           currentStreamedRow = streamIter.next()
-          if (!joinKeys(currentStreamedRow).anyNull) {
-            currentHashMatches = hashedRelation.get(joinKeys.currentValue)
+          val key = joinKeys(currentStreamedRow)
+          if (!key.anyNull) {
+            currentHashMatches = hashedRelation.get(key)
           }
         }
 
@@ -103,4 +122,12 @@ trait HashJoin {
       }
     }
   }
+
+  protected[this] def buildHashRelation(buildIter: Iterator[InternalRow]): HashedRelation = {
+    if (supportUnsafe) {
+      UnsafeHashedRelation(buildIter, buildKeys, buildPlan)
+    } else {
+      HashedRelation(buildIter, newProjection(buildKeys, buildPlan.output))
+    }
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
index 74a7db7761758..6bf2f82954046 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
@@ -23,7 +23,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
-import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter}
+import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.util.collection.CompactBuffer
 
@@ -38,7 +38,7 @@ trait HashOuterJoin {
   val left: SparkPlan
   val right: SparkPlan
 
-override def outputPartitioning: Partitioning = joinType match {
+  override def outputPartitioning: Partitioning = joinType match {
     case LeftOuter => left.outputPartitioning
     case RightOuter => right.outputPartitioning
     case FullOuter => UnknownPartitioning(left.outputPartitioning.numPartitions)
@@ -59,6 +59,49 @@ override def outputPartitioning: Partitioning = joinType match {
     }
   }
 
+  protected[this] lazy val (buildPlan, streamedPlan) = joinType match {
+    case RightOuter => (left, right)
+    case LeftOuter => (right, left)
+    case x =>
+      throw new IllegalArgumentException(
+        s"HashOuterJoin should not take $x as the JoinType")
+  }
+
+  protected[this] lazy val (buildKeys, streamedKeys) = joinType match {
+    case RightOuter => (leftKeys, rightKeys)
+    case LeftOuter => (rightKeys, leftKeys)
+    case x =>
+      throw new IllegalArgumentException(
+        s"HashOuterJoin should not take $x as the JoinType")
+  }
+
+  protected[this] def supportUnsafe: Boolean = {
+    (self.codegenEnabled && joinType != FullOuter
+      && UnsafeProjection.canSupport(buildKeys)
+      && UnsafeProjection.canSupport(self.schema))
+  }
+
+  override def outputsUnsafeRows: Boolean = supportUnsafe
+  override def canProcessUnsafeRows: Boolean = supportUnsafe
+
+  protected[this] def streamedKeyGenerator(): Projection = {
+    if (supportUnsafe) {
+      UnsafeProjection.create(streamedKeys, streamedPlan.output)
+    } else {
+      newProjection(streamedKeys, streamedPlan.output)
+    }
+  }
+
+  @transient private[this] lazy val resultProjection: Projection = {
+    if (supportUnsafe) {
+      UnsafeProjection.create(self.schema)
+    } else {
+      new Projection {
+        override def apply(r: InternalRow): InternalRow = r
+      }
+    }
+  }
+
   @transient private[this] lazy val DUMMY_LIST = CompactBuffer[InternalRow](null)
   @transient protected[this] lazy val EMPTY_LIST = CompactBuffer[InternalRow]()
 
@@ -76,16 +119,20 @@ override def outputPartitioning: Partitioning = joinType match {
       rightIter: Iterable[InternalRow]): Iterator[InternalRow] = {
     val ret: Iterable[InternalRow] = {
       if (!key.anyNull) {
-        val temp = rightIter.collect {
-          case r if boundCondition(joinedRow.withRight(r)) => joinedRow.copy()
+        val temp = if (rightIter != null) {
+          rightIter.collect {
+            case r if boundCondition(joinedRow.withRight(r)) => resultProjection(joinedRow).copy()
+          }
+        } else {
+          List.empty
         }
         if (temp.isEmpty) {
-          joinedRow.withRight(rightNullRow).copy :: Nil
+          resultProjection(joinedRow.withRight(rightNullRow)).copy :: Nil
         } else {
           temp
         }
       } else {
-        joinedRow.withRight(rightNullRow).copy :: Nil
+        resultProjection(joinedRow.withRight(rightNullRow)).copy :: Nil
       }
     }
     ret.iterator
@@ -97,17 +144,21 @@ override def outputPartitioning: Partitioning = joinType match {
       joinedRow: JoinedRow): Iterator[InternalRow] = {
     val ret: Iterable[InternalRow] = {
       if (!key.anyNull) {
-        val temp = leftIter.collect {
-          case l if boundCondition(joinedRow.withLeft(l)) =>
-            joinedRow.copy()
+        val temp = if (leftIter != null) {
+          leftIter.collect {
+            case l if boundCondition(joinedRow.withLeft(l)) =>
+              resultProjection(joinedRow).copy()
+          }
+        } else {
+          List.empty
         }
         if (temp.isEmpty) {
-          joinedRow.withLeft(leftNullRow).copy :: Nil
+          resultProjection(joinedRow.withLeft(leftNullRow)).copy :: Nil
         } else {
           temp
         }
       } else {
-        joinedRow.withLeft(leftNullRow).copy :: Nil
+        resultProjection(joinedRow.withLeft(leftNullRow)).copy :: Nil
       }
     }
     ret.iterator
@@ -159,6 +210,7 @@ override def outputPartitioning: Partitioning = joinType match {
     }
   }
 
+  // This is only used by FullOuter
   protected[this] def buildHashTable(
       iter: Iterator[InternalRow],
       keyGenerator: Projection): JavaHashMap[InternalRow, CompactBuffer[InternalRow]] = {
@@ -178,4 +230,12 @@ override def outputPartitioning: Partitioning = joinType match {
 
     hashTable
   }
+
+  protected[this] def buildHashRelation(buildIter: Iterator[InternalRow]): HashedRelation = {
+    if (supportUnsafe) {
+      UnsafeHashedRelation(buildIter, buildKeys, buildPlan)
+    } else {
+      HashedRelation(buildIter, newProjection(buildKeys, buildPlan.output))
+    }
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala
index 1b983bc3a90f9..7f49264d40354 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala
@@ -32,34 +32,45 @@ trait HashSemiJoin {
 
   override def output: Seq[Attribute] = left.output
 
-  @transient protected lazy val rightKeyGenerator: Projection =
-    newProjection(rightKeys, right.output)
+  protected[this] def supportUnsafe: Boolean = {
+    (self.codegenEnabled && UnsafeProjection.canSupport(leftKeys)
+      && UnsafeProjection.canSupport(rightKeys)
+      && UnsafeProjection.canSupport(left.schema))
+  }
+
+  override def outputsUnsafeRows: Boolean = right.outputsUnsafeRows
+  override def canProcessUnsafeRows: Boolean = supportUnsafe
+
+  @transient protected lazy val leftKeyGenerator: Projection =
+    if (supportUnsafe) {
+      UnsafeProjection.create(leftKeys, left.output)
+    } else {
+      newMutableProjection(leftKeys, left.output)()
+    }
 
-  @transient protected lazy val leftKeyGenerator: () => MutableProjection =
-    newMutableProjection(leftKeys, left.output)
+  @transient protected lazy val rightKeyGenerator: Projection =
+    if (supportUnsafe) {
+      UnsafeProjection.create(rightKeys, right.output)
+    } else {
+      newMutableProjection(rightKeys, right.output)()
+    }
 
   @transient private lazy val boundCondition =
     newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output)
 
-  protected def buildKeyHashSet(
-      buildIter: Iterator[InternalRow],
-      copy: Boolean): java.util.Set[InternalRow] = {
+  protected def buildKeyHashSet(buildIter: Iterator[InternalRow]): java.util.Set[InternalRow] = {
     val hashSet = new java.util.HashSet[InternalRow]()
     var currentRow: InternalRow = null
 
     // Create a Hash set of buildKeys
+    val rightKey = rightKeyGenerator
     while (buildIter.hasNext) {
       currentRow = buildIter.next()
-      val rowKey = rightKeyGenerator(currentRow)
+      val rowKey = rightKey(currentRow)
       if (!rowKey.anyNull) {
         val keyExists = hashSet.contains(rowKey)
         if (!keyExists) {
-          if (copy) {
-            hashSet.add(rowKey.copy())
-          } else {
-            // rowKey may be not serializable (from codegen)
-            hashSet.add(rowKey)
-          }
+          hashSet.add(rowKey.copy())
         }
       }
     }
@@ -67,25 +78,34 @@ trait HashSemiJoin {
   }
 
   protected def hashSemiJoin(
-      streamIter: Iterator[InternalRow],
-      hashedRelation: HashedRelation): Iterator[InternalRow] = {
-    val joinKeys = leftKeyGenerator()
-    val joinedRow = new JoinedRow
+    streamIter: Iterator[InternalRow],
+    hashSet: java.util.Set[InternalRow]): Iterator[InternalRow] = {
+    val joinKeys = leftKeyGenerator
     streamIter.filter(current => {
-      lazy val rowBuffer = hashedRelation.get(joinKeys.currentValue)
-      !joinKeys(current).anyNull && rowBuffer != null && rowBuffer.exists {
-        (build: InternalRow) => boundCondition(joinedRow(current, build))
-      }
+      val key = joinKeys(current)
+      !key.anyNull && hashSet.contains(key)
     })
   }
 
+  protected def buildHashRelation(buildIter: Iterator[InternalRow]): HashedRelation = {
+    if (supportUnsafe) {
+      UnsafeHashedRelation(buildIter, rightKeys, right)
+    } else {
+      HashedRelation(buildIter, newProjection(rightKeys, right.output))
+    }
+  }
+
   protected def hashSemiJoin(
       streamIter: Iterator[InternalRow],
-      hashSet: java.util.Set[InternalRow]): Iterator[InternalRow] = {
-    val joinKeys = leftKeyGenerator()
+      hashedRelation: HashedRelation): Iterator[InternalRow] = {
+    val joinKeys = leftKeyGenerator
     val joinedRow = new JoinedRow
-    streamIter.filter(current => {
-      !joinKeys(current.copy()).anyNull && hashSet.contains(joinKeys.currentValue)
-    })
+    streamIter.filter { current =>
+      val key = joinKeys(current)
+      lazy val rowBuffer = hashedRelation.get(key)
+      !key.anyNull && rowBuffer != null && rowBuffer.exists {
+        (row: InternalRow) => boundCondition(joinedRow(current, row))
+      }
+    }
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index 6b51f5d4151d3..8d5731afd59b8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -17,12 +17,13 @@
 
 package org.apache.spark.sql.execution.joins
 
-import java.io.{ObjectInput, ObjectOutput, Externalizable}
+import java.io.{Externalizable, ObjectInput, ObjectOutput}
 import java.util.{HashMap => JavaHashMap}
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Projection
-import org.apache.spark.sql.execution.SparkSqlSerializer
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.execution.{SparkPlan, SparkSqlSerializer}
+import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.collection.CompactBuffer
 
 
@@ -98,7 +99,6 @@ final class UniqueKeyHashedRelation(private var hashTable: JavaHashMap[InternalR
   }
 }
 
-
 // TODO(rxin): a version of [[HashedRelation]] backed by arrays for consecutive integer keys.
 
 
@@ -148,3 +148,80 @@ private[joins] object HashedRelation {
     }
   }
 }
+
+
+/**
+ * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a
+ * sequence of values.
+ *
+ * TODO(davies): use BytesToBytesMap
+ */
+private[joins] final class UnsafeHashedRelation(
+    private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]])
+  extends HashedRelation with Externalizable {
+
+  def this() = this(null)  // Needed for serialization
+
+  override def get(key: InternalRow): CompactBuffer[InternalRow] = {
+    val unsafeKey = key.asInstanceOf[UnsafeRow]
+    // Thanks to type eraser
+    hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]]
+  }
+
+  override def writeExternal(out: ObjectOutput): Unit = {
+    writeBytes(out, SparkSqlSerializer.serialize(hashTable))
+  }
+
+  override def readExternal(in: ObjectInput): Unit = {
+    hashTable = SparkSqlSerializer.deserialize(readBytes(in))
+  }
+}
+
+private[joins] object UnsafeHashedRelation {
+
+  def apply(
+      input: Iterator[InternalRow],
+      buildKeys: Seq[Expression],
+      buildPlan: SparkPlan,
+      sizeEstimate: Int = 64): HashedRelation = {
+    val boundedKeys = buildKeys.map(BindReferences.bindReference(_, buildPlan.output))
+    apply(input, boundedKeys, buildPlan.schema, sizeEstimate)
+  }
+
+  // Used for tests
+  def apply(
+      input: Iterator[InternalRow],
+      buildKeys: Seq[Expression],
+      rowSchema: StructType,
+      sizeEstimate: Int): HashedRelation = {
+
+    // TODO: Use BytesToBytesMap.
+    val hashTable = new JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]](sizeEstimate)
+    val toUnsafe = UnsafeProjection.create(rowSchema)
+    val keyGenerator = UnsafeProjection.create(buildKeys)
+
+    // Create a mapping of buildKeys -> rows
+    while (input.hasNext) {
+      val currentRow = input.next()
+      val unsafeRow = if (currentRow.isInstanceOf[UnsafeRow]) {
+        currentRow.asInstanceOf[UnsafeRow]
+      } else {
+        toUnsafe(currentRow)
+      }
+      val rowKey = keyGenerator(unsafeRow)
+      if (!rowKey.anyNull) {
+        val existingMatchList = hashTable.get(rowKey)
+        val matchList = if (existingMatchList == null) {
+          val newMatchList = new CompactBuffer[UnsafeRow]()
+          hashTable.put(rowKey.copy(), newMatchList)
+          newMatchList
+        } else {
+          existingMatchList
+        }
+        matchList += unsafeRow.copy()
+      }
+    }
+
+    new UnsafeHashedRelation(hashTable)
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala
index db5be9f453674..4443455ef11fe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala
@@ -39,6 +39,9 @@ case class LeftSemiJoinBNL(
 
   override def output: Seq[Attribute] = left.output
 
+  override def outputsUnsafeRows: Boolean = streamed.outputsUnsafeRows
+  override def canProcessUnsafeRows: Boolean = true
+
   /** The Streamed Relation */
   override def left: SparkPlan = streamed
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
index 9eaac817d9268..874712a4e739f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
@@ -43,10 +43,10 @@ case class LeftSemiJoinHash(
   protected override def doExecute(): RDD[InternalRow] = {
     right.execute().zipPartitions(left.execute()) { (buildIter, streamIter) =>
       if (condition.isEmpty) {
-        val hashSet = buildKeyHashSet(buildIter, copy = false)
+        val hashSet = buildKeyHashSet(buildIter)
         hashSemiJoin(streamIter, hashSet)
       } else {
-        val hashRelation = HashedRelation(buildIter, rightKeyGenerator)
+        val hashRelation = buildHashRelation(buildIter)
         hashSemiJoin(streamIter, hashRelation)
       }
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala
index 5439e10a60b2a..948d0ccebceb0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala
@@ -45,7 +45,7 @@ case class ShuffledHashJoin(
 
   protected override def doExecute(): RDD[InternalRow] = {
     buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) =>
-      val hashed = HashedRelation(buildIter, buildSideKeyGenerator)
+      val hashed = buildHashRelation(buildIter)
       hashJoin(streamIter, hashed)
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
index ab0a6ad56acde..f54f1edd38ec8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
@@ -50,24 +50,25 @@ case class ShuffledHashOuterJoin(
       // TODO this probably can be replaced by external sort (sort merged join?)
       joinType match {
         case LeftOuter =>
-          val rightHashTable = buildHashTable(rightIter, newProjection(rightKeys, right.output))
-          val keyGenerator = newProjection(leftKeys, left.output)
+          val hashed = buildHashRelation(rightIter)
+          val keyGenerator = streamedKeyGenerator()
           leftIter.flatMap( currentRow => {
             val rowKey = keyGenerator(currentRow)
             joinedRow.withLeft(currentRow)
-            leftOuterIterator(rowKey, joinedRow, rightHashTable.getOrElse(rowKey, EMPTY_LIST))
+            leftOuterIterator(rowKey, joinedRow, hashed.get(rowKey))
           })
 
         case RightOuter =>
-          val leftHashTable = buildHashTable(leftIter, newProjection(leftKeys, left.output))
-          val keyGenerator = newProjection(rightKeys, right.output)
+          val hashed = buildHashRelation(leftIter)
+          val keyGenerator = streamedKeyGenerator()
           rightIter.flatMap ( currentRow => {
             val rowKey = keyGenerator(currentRow)
             joinedRow.withRight(currentRow)
-            rightOuterIterator(rowKey, leftHashTable.getOrElse(rowKey, EMPTY_LIST), joinedRow)
+            rightOuterIterator(rowKey, hashed.get(rowKey), joinedRow)
           })
 
         case FullOuter =>
+          // TODO(davies): use UnsafeRow
           val leftHashTable = buildHashTable(leftIter, newProjection(leftKeys, left.output))
           val rightHashTable = buildHashTable(rightIter, newProjection(rightKeys, right.output))
           (leftHashTable.keySet ++ rightHashTable.keySet).iterator.flatMap { key =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala
index 421d510e6782d..29f3beb3cb3c8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala
@@ -29,6 +29,9 @@ import org.apache.spark.sql.catalyst.rules.Rule
  */
 @DeveloperApi
 case class ConvertToUnsafe(child: SparkPlan) extends UnaryNode {
+
+  require(UnsafeProjection.canSupport(child.schema), s"Cannot convert ${child.schema} to Unsafe")
+
   override def output: Seq[Attribute] = child.output
   override def outputsUnsafeRows: Boolean = true
   override def canProcessUnsafeRows: Boolean = false
@@ -93,11 +96,19 @@ private[sql] object EnsureRowFormats extends Rule[SparkPlan] {
       }
     case operator: SparkPlan if handlesBothSafeAndUnsafeRows(operator) =>
       if (operator.children.map(_.outputsUnsafeRows).toSet.size != 1) {
-        // If this operator's children produce both unsafe and safe rows, then convert everything
-        // to unsafe rows
-        operator.withNewChildren {
-          operator.children.map {
-            c => if (!c.outputsUnsafeRows) ConvertToUnsafe(c) else c
+        // If this operator's children produce both unsafe and safe rows,
+        // convert everything unsafe rows if all the schema of them are support by UnsafeRow
+        if (operator.children.forall(c => UnsafeProjection.canSupport(c.schema))) {
+          operator.withNewChildren {
+            operator.children.map {
+              c => if (!c.outputsUnsafeRows) ConvertToUnsafe(c) else c
+            }
+          }
+        } else {
+          operator.withNewChildren {
+            operator.children.map {
+              c => if (c.outputsUnsafeRows) ConvertToSafe(c) else c
+            }
           }
         }
       } else {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
index 3854dc1b7a3d1..d36e2639376e7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
@@ -22,7 +22,7 @@ import java.io.ByteArrayOutputStream
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection}
-import org.apache.spark.sql.types.{IntegerType, StringType}
+import org.apache.spark.sql.types.{DataType, IntegerType, StringType}
 import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.memory.MemoryAllocator
 import org.apache.spark.unsafe.types.UTF8String
@@ -31,7 +31,7 @@ class UnsafeRowSuite extends SparkFunSuite {
   test("writeToStream") {
     val row = InternalRow.apply(UTF8String.fromString("hello"), UTF8String.fromString("world"), 123)
     val arrayBackedUnsafeRow: UnsafeRow =
-      UnsafeProjection.create(Seq(StringType, StringType, IntegerType)).apply(row)
+      UnsafeProjection.create(Array[DataType](StringType, StringType, IntegerType)).apply(row)
     assert(arrayBackedUnsafeRow.getBaseObject.isInstanceOf[Array[Byte]])
     val bytesFromArrayBackedRow: Array[Byte] = {
       val baos = new ByteArrayOutputStream()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
index 9d9858b1c6151..9dd2220f0967e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
@@ -19,7 +19,9 @@ package org.apache.spark.sql.execution.joins
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Projection
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.execution.SparkSqlSerializer
+import org.apache.spark.sql.types.{StructField, StructType, IntegerType}
 import org.apache.spark.util.collection.CompactBuffer
 
 
@@ -35,13 +37,13 @@ class HashedRelationSuite extends SparkFunSuite {
     val hashed = HashedRelation(data.iterator, keyProjection)
     assert(hashed.isInstanceOf[GeneralHashedRelation])
 
-    assert(hashed.get(data(0)) == CompactBuffer[InternalRow](data(0)))
-    assert(hashed.get(data(1)) == CompactBuffer[InternalRow](data(1)))
+    assert(hashed.get(data(0)) === CompactBuffer[InternalRow](data(0)))
+    assert(hashed.get(data(1)) === CompactBuffer[InternalRow](data(1)))
     assert(hashed.get(InternalRow(10)) === null)
 
     val data2 = CompactBuffer[InternalRow](data(2))
     data2 += data(2)
-    assert(hashed.get(data(2)) == data2)
+    assert(hashed.get(data(2)) === data2)
   }
 
   test("UniqueKeyHashedRelation") {
@@ -49,15 +51,40 @@ class HashedRelationSuite extends SparkFunSuite {
     val hashed = HashedRelation(data.iterator, keyProjection)
     assert(hashed.isInstanceOf[UniqueKeyHashedRelation])
 
-    assert(hashed.get(data(0)) == CompactBuffer[InternalRow](data(0)))
-    assert(hashed.get(data(1)) == CompactBuffer[InternalRow](data(1)))
-    assert(hashed.get(data(2)) == CompactBuffer[InternalRow](data(2)))
+    assert(hashed.get(data(0)) === CompactBuffer[InternalRow](data(0)))
+    assert(hashed.get(data(1)) === CompactBuffer[InternalRow](data(1)))
+    assert(hashed.get(data(2)) === CompactBuffer[InternalRow](data(2)))
     assert(hashed.get(InternalRow(10)) === null)
 
     val uniqHashed = hashed.asInstanceOf[UniqueKeyHashedRelation]
-    assert(uniqHashed.getValue(data(0)) == data(0))
-    assert(uniqHashed.getValue(data(1)) == data(1))
-    assert(uniqHashed.getValue(data(2)) == data(2))
-    assert(uniqHashed.getValue(InternalRow(10)) == null)
+    assert(uniqHashed.getValue(data(0)) === data(0))
+    assert(uniqHashed.getValue(data(1)) === data(1))
+    assert(uniqHashed.getValue(data(2)) === data(2))
+    assert(uniqHashed.getValue(InternalRow(10)) === null)
+  }
+
+  test("UnsafeHashedRelation") {
+    val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2))
+    val buildKey = Seq(BoundReference(0, IntegerType, false))
+    val schema = StructType(StructField("a", IntegerType, true) :: Nil)
+    val hashed = UnsafeHashedRelation(data.iterator, buildKey, schema, 1)
+    assert(hashed.isInstanceOf[UnsafeHashedRelation])
+
+    val toUnsafeKey = UnsafeProjection.create(schema)
+    val unsafeData = data.map(toUnsafeKey(_).copy()).toArray
+    assert(hashed.get(unsafeData(0)) === CompactBuffer[InternalRow](unsafeData(0)))
+    assert(hashed.get(unsafeData(1)) === CompactBuffer[InternalRow](unsafeData(1)))
+    assert(hashed.get(toUnsafeKey(InternalRow(10))) === null)
+
+    val data2 = CompactBuffer[InternalRow](unsafeData(2).copy())
+    data2 += unsafeData(2).copy()
+    assert(hashed.get(unsafeData(2)) === data2)
+
+    val hashed2 = SparkSqlSerializer.deserialize(SparkSqlSerializer.serialize(hashed))
+      .asInstanceOf[UnsafeHashedRelation]
+    assert(hashed2.get(unsafeData(0)) === CompactBuffer[InternalRow](unsafeData(0)))
+    assert(hashed2.get(unsafeData(1)) === CompactBuffer[InternalRow](unsafeData(1)))
+    assert(hashed2.get(toUnsafeKey(InternalRow(10))) === null)
+    assert(hashed2.get(unsafeData(2)) === data2)
   }
 }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
index 85cd02469adb7..61f483ced3217 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
@@ -44,12 +44,16 @@ public int hashInt(int input) {
     return fmix(h1, 4);
   }
 
-  public int hashUnsafeWords(Object baseObject, long baseOffset, int lengthInBytes) {
+  public int hashUnsafeWords(Object base, long offset, int lengthInBytes) {
+    return hashUnsafeWords(base, offset, lengthInBytes, seed);
+  }
+
+  public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed) {
     // This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` method.
     assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 (word-aligned)";
     int h1 = seed;
-    for (int offset = 0; offset < lengthInBytes; offset += 4) {
-      int halfWord = PlatformDependent.UNSAFE.getInt(baseObject, baseOffset + offset);
+    for (int i = 0; i < lengthInBytes; i += 4) {
+      int halfWord = PlatformDependent.UNSAFE.getInt(base, offset + i);
       int k1 = mixK1(halfWord);
       h1 = mixH1(h1, k1);
     }

From 8486cd853104255b4eb013860bba793eef4e74e7 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Wed, 22 Jul 2015 13:06:01 -0700
Subject: [PATCH 320/549] [SPARK-9224] [MLLIB] OnlineLDA Performance
 Improvements

In-place updates, reduce number of transposes, and vectorize operations in OnlineLDA implementation.

Author: Feynman Liang <fliang@databricks.com>

Closes #7454 from feynmanliang/OnlineLDA-perf-improvements and squashes the following commits:

78b0f5a [Feynman Liang] Make in-place variables vals, fix BLAS error
7f62a55 [Feynman Liang] --amend
c62cb1e [Feynman Liang] Outer product for stats, revert Range slicing
aead650 [Feynman Liang] Range slice, in-place update, reduce transposes
---
 .../spark/mllib/clustering/LDAOptimizer.scala | 59 +++++++++----------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 8e5154b902d1d..b960ae6c0708d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -19,15 +19,15 @@ package org.apache.spark.mllib.clustering
 
 import java.util.Random
 
-import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, sum, normalize, kron}
-import breeze.numerics.{digamma, exp, abs}
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum}
+import breeze.numerics.{abs, digamma, exp}
 import breeze.stats.distributions.{Gamma, RandBasis}
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.impl.GraphImpl
 import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer
-import org.apache.spark.mllib.linalg.{Matrices, SparseVector, DenseVector, Vector}
+import org.apache.spark.mllib.linalg.{DenseVector, Matrices, SparseVector, Vector}
 import org.apache.spark.rdd.RDD
 
 /**
@@ -370,7 +370,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     iteration += 1
     val k = this.k
     val vocabSize = this.vocabSize
-    val Elogbeta = dirichletExpectation(lambda)
+    val Elogbeta = dirichletExpectation(lambda).t
     val expElogbeta = exp(Elogbeta)
     val alpha = this.alpha
     val gammaShape = this.gammaShape
@@ -385,41 +385,36 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
           case v => throw new IllegalArgumentException("Online LDA does not support vector type "
             + v.getClass)
         }
+        if (!ids.isEmpty) {
+
+          // Initialize the variational distribution q(theta|gamma) for the mini-batch
+          val gammad: BDV[Double] =
+            new Gamma(gammaShape, 1.0 / gammaShape).samplesVector(k) // K
+          val expElogthetad: BDV[Double] = exp(digamma(gammad) - digamma(sum(gammad))) // K
+          val expElogbetad: BDM[Double] = expElogbeta(ids, ::).toDenseMatrix // ids * K
+
+          val phinorm: BDV[Double] = expElogbetad * expElogthetad :+ 1e-100 // ids
+          var meanchange = 1D
+          val ctsVector = new BDV[Double](cts) // ids
+
+          // Iterate between gamma and phi until convergence
+          while (meanchange > 1e-3) {
+            val lastgamma = gammad.copy
+            //        K                  K * ids               ids
+            gammad := (expElogthetad :* (expElogbetad.t * (ctsVector :/ phinorm))) :+ alpha
+            expElogthetad := exp(digamma(gammad) - digamma(sum(gammad)))
+            phinorm := expElogbetad * expElogthetad :+ 1e-100
+            meanchange = sum(abs(gammad - lastgamma)) / k
+          }
 
-        // Initialize the variational distribution q(theta|gamma) for the mini-batch
-        var gammad = new Gamma(gammaShape, 1.0 / gammaShape).samplesVector(k).t // 1 * K
-        var Elogthetad = digamma(gammad) - digamma(sum(gammad))     // 1 * K
-        var expElogthetad = exp(Elogthetad)                         // 1 * K
-        val expElogbetad = expElogbeta(::, ids).toDenseMatrix       // K * ids
-
-        var phinorm = expElogthetad * expElogbetad + 1e-100         // 1 * ids
-        var meanchange = 1D
-        val ctsVector = new BDV[Double](cts).t                      // 1 * ids
-
-        // Iterate between gamma and phi until convergence
-        while (meanchange > 1e-3) {
-          val lastgamma = gammad
-          //        1*K                  1 * ids               ids * k
-          gammad = (expElogthetad :* ((ctsVector / phinorm) * expElogbetad.t)) + alpha
-          Elogthetad = digamma(gammad) - digamma(sum(gammad))
-          expElogthetad = exp(Elogthetad)
-          phinorm = expElogthetad * expElogbetad + 1e-100
-          meanchange = sum(abs(gammad - lastgamma)) / k
-        }
-
-        val m1 = expElogthetad.t
-        val m2 = (ctsVector / phinorm).t.toDenseVector
-        var i = 0
-        while (i < ids.size) {
-          stat(::, ids(i)) := stat(::, ids(i)) + m1 * m2(i)
-          i += 1
+          stat(::, ids) := expElogthetad.asDenseMatrix.t * (ctsVector :/ phinorm).asDenseMatrix
         }
       }
       Iterator(stat)
     }
 
     val statsSum: BDM[Double] = stats.reduce(_ += _)
-    val batchResult = statsSum :* expElogbeta
+    val batchResult = statsSum :* expElogbeta.t
 
     // Note that this is an optimization to avoid batch.count
     update(batchResult, iteration, (miniBatchFraction * corpusSize).ceil.toInt)

From cf21d05f8b5fae52b118fb8846f43d6fda1aea41 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Wed, 22 Jul 2015 13:28:09 -0700
Subject: [PATCH 321/549] [SPARK-4366] [SQL] [Follow-up] Fix SqlParser
 compiling warning.

Author: Yin Huai <yhuai@databricks.com>

Closes #7588 from yhuai/SPARK-4366-update1 and squashes the following commits:

25f5f36 [Yin Huai] Fix SqlParser Warning.
---
 .../main/scala/org/apache/spark/sql/catalyst/SqlParser.scala   | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index c04bd6cd85187..29cfc064da89a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -271,8 +271,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
       lexical.normalizeKeyword(udfName) match {
         case "sum" => SumDistinct(exprs.head)
         case "count" => CountDistinct(exprs)
-        case name => UnresolvedFunction(name, exprs, isDistinct = true)
-        case _ => throw new AnalysisException(s"function $udfName does not support DISTINCT")
+        case _ => UnresolvedFunction(udfName, exprs, isDistinct = true)
       }
     }
     | APPROXIMATE ~> ident ~ ("(" ~ DISTINCT ~> expression <~ ")") ^^ { case udfName ~ exp =>

From 1aca9c13c144fa336af6afcfa666128bf77c49d4 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Wed, 22 Jul 2015 15:07:05 -0700
Subject: [PATCH 322/549] [SPARK-8536] [MLLIB] Generalize OnlineLDAOptimizer to
 asymmetric document-topic Dirichlet priors

Modify `LDA` to take asymmetric document-topic prior distributions and `OnlineLDAOptimizer` to use the asymmetric prior during variational inference.

This PR only generalizes `OnlineLDAOptimizer` and the associated `LocalLDAModel`; `EMLDAOptimizer` and `DistributedLDAModel` still only support symmetric `alpha` (checked during `EMLDAOptimizer.initialize`).

Author: Feynman Liang <fliang@databricks.com>

Closes #7575 from feynmanliang/SPARK-8536-LDA-asymmetric-priors and squashes the following commits:

af8fbb7 [Feynman Liang] Fix merge errors
ef5821d [Feynman Liang] Merge remote-tracking branch 'apache/master' into SPARK-8536-LDA-asymmetric-priors
58f1d7b [Feynman Liang] Fix from review feedback
a6dcf70 [Feynman Liang] Change docConcentration interface and move LDAOptimizer validation to initialize, add sad path tests
72038ff [Feynman Liang] Add tests referenced against gensim
d4284fa [Feynman Liang] Generalize OnlineLDA to asymmetric priors, no tests
---
 .../apache/spark/mllib/clustering/LDA.scala   | 49 +++++++----
 .../spark/mllib/clustering/LDAOptimizer.scala | 27 ++++--
 .../spark/mllib/clustering/LDASuite.scala     | 82 +++++++++++++++++--
 3 files changed, 126 insertions(+), 32 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index a410547a72fda..ab124e6d77c5e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -23,11 +23,10 @@ import org.apache.spark.Logging
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaPairRDD
 import org.apache.spark.graphx._
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.Utils
 
-
 /**
  * :: Experimental ::
  *
@@ -49,14 +48,15 @@ import org.apache.spark.util.Utils
 class LDA private (
     private var k: Int,
     private var maxIterations: Int,
-    private var docConcentration: Double,
+    private var docConcentration: Vector,
     private var topicConcentration: Double,
     private var seed: Long,
     private var checkpointInterval: Int,
     private var ldaOptimizer: LDAOptimizer) extends Logging {
 
-  def this() = this(k = 10, maxIterations = 20, docConcentration = -1, topicConcentration = -1,
-    seed = Utils.random.nextLong(), checkpointInterval = 10, ldaOptimizer = new EMLDAOptimizer)
+  def this() = this(k = 10, maxIterations = 20, docConcentration = Vectors.dense(-1),
+    topicConcentration = -1, seed = Utils.random.nextLong(), checkpointInterval = 10,
+    ldaOptimizer = new EMLDAOptimizer)
 
   /**
    * Number of topics to infer.  I.e., the number of soft cluster centers.
@@ -77,37 +77,50 @@ class LDA private (
    * Concentration parameter (commonly named "alpha") for the prior placed on documents'
    * distributions over topics ("theta").
    *
-   * This is the parameter to a symmetric Dirichlet distribution.
+   * This is the parameter to a Dirichlet distribution.
    */
-  def getDocConcentration: Double = this.docConcentration
+  def getDocConcentration: Vector = this.docConcentration
 
   /**
    * Concentration parameter (commonly named "alpha") for the prior placed on documents'
    * distributions over topics ("theta").
    *
-   * This is the parameter to a symmetric Dirichlet distribution, where larger values
-   * mean more smoothing (more regularization).
+   * This is the parameter to a Dirichlet distribution, where larger values mean more smoothing
+   * (more regularization).
    *
-   * If set to -1, then docConcentration is set automatically.
-   *  (default = -1 = automatic)
+   * If set to a singleton vector Vector(-1), then docConcentration is set automatically. If set to
+   * singleton vector Vector(t) where t != -1, then t is replicated to a vector of length k during
+   * [[LDAOptimizer.initialize()]]. Otherwise, the [[docConcentration]] vector must be length k.
+   * (default = Vector(-1) = automatic)
    *
    * Optimizer-specific parameter settings:
    *  - EM
-   *     - Value should be > 1.0
-   *     - default = (50 / k) + 1, where 50/k is common in LDA libraries and +1 follows
-   *       Asuncion et al. (2009), who recommend a +1 adjustment for EM.
+   *     - Currently only supports symmetric distributions, so all values in the vector should be
+   *       the same.
+   *     - Values should be > 1.0
+   *     - default = uniformly (50 / k) + 1, where 50/k is common in LDA libraries and +1 follows
+   *       from Asuncion et al. (2009), who recommend a +1 adjustment for EM.
    *  - Online
-   *     - Value should be >= 0
-   *     - default = (1.0 / k), following the implementation from
+   *     - Values should be >= 0
+   *     - default = uniformly (1.0 / k), following the implementation from
    *       [[https://github.com/Blei-Lab/onlineldavb]].
    */
-  def setDocConcentration(docConcentration: Double): this.type = {
+  def setDocConcentration(docConcentration: Vector): this.type = {
     this.docConcentration = docConcentration
     this
   }
 
+  /** Replicates Double to create a symmetric prior */
+  def setDocConcentration(docConcentration: Double): this.type = {
+    this.docConcentration = Vectors.dense(docConcentration)
+    this
+  }
+
   /** Alias for [[getDocConcentration]] */
-  def getAlpha: Double = getDocConcentration
+  def getAlpha: Vector = getDocConcentration
+
+  /** Alias for [[setDocConcentration()]] */
+  def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha)
 
   /** Alias for [[setDocConcentration()]] */
   def setAlpha(alpha: Double): this.type = setDocConcentration(alpha)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index b960ae6c0708d..f4170a3d98dd8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -27,7 +27,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.impl.GraphImpl
 import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer
-import org.apache.spark.mllib.linalg.{DenseVector, Matrices, SparseVector, Vector}
+import org.apache.spark.mllib.linalg.{DenseVector, Matrices, SparseVector, Vector, Vectors}
 import org.apache.spark.rdd.RDD
 
 /**
@@ -95,8 +95,11 @@ final class EMLDAOptimizer extends LDAOptimizer {
    * Compute bipartite term/doc graph.
    */
   override private[clustering] def initialize(docs: RDD[(Long, Vector)], lda: LDA): LDAOptimizer = {
+    val docConcentration = lda.getDocConcentration(0)
+    require({
+      lda.getDocConcentration.toArray.forall(_ == docConcentration)
+    }, "EMLDAOptimizer currently only supports symmetric document-topic priors")
 
-    val docConcentration = lda.getDocConcentration
     val topicConcentration = lda.getTopicConcentration
     val k = lda.getK
 
@@ -229,10 +232,10 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   private var vocabSize: Int = 0
 
   /** alias for docConcentration */
-  private var alpha: Double = 0
+  private var alpha: Vector = Vectors.dense(0)
 
   /** (private[clustering] for debugging)  Get docConcentration */
-  private[clustering] def getAlpha: Double = alpha
+  private[clustering] def getAlpha: Vector = alpha
 
   /** alias for topicConcentration */
   private var eta: Double = 0
@@ -343,7 +346,19 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     this.k = lda.getK
     this.corpusSize = docs.count()
     this.vocabSize = docs.first()._2.size
-    this.alpha = if (lda.getDocConcentration == -1) 1.0 / k else lda.getDocConcentration
+    this.alpha = if (lda.getDocConcentration.size == 1) {
+      if (lda.getDocConcentration(0) == -1) Vectors.dense(Array.fill(k)(1.0 / k))
+      else {
+        require(lda.getDocConcentration(0) >= 0, s"all entries in alpha must be >=0, got: $alpha")
+        Vectors.dense(Array.fill(k)(lda.getDocConcentration(0)))
+      }
+    } else {
+      require(lda.getDocConcentration.size == k, s"alpha must have length k, got: $alpha")
+      lda.getDocConcentration.foreachActive { case (_, x) =>
+        require(x >= 0, s"all entries in alpha must be >= 0, got: $alpha")
+      }
+      lda.getDocConcentration
+    }
     this.eta = if (lda.getTopicConcentration == -1) 1.0 / k else lda.getTopicConcentration
     this.randomGenerator = new Random(lda.getSeed)
 
@@ -372,7 +387,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     val vocabSize = this.vocabSize
     val Elogbeta = dirichletExpectation(lambda).t
     val expElogbeta = exp(Elogbeta)
-    val alpha = this.alpha
+    val alpha = this.alpha.toBreeze
     val gammaShape = this.gammaShape
 
     val stats: RDD[BDM[Double]] = batch.mapPartitions { docs =>
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 721a065658951..da70d9bd7c790 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering
 import breeze.linalg.{DenseMatrix => BDM}
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.mllib.linalg.{Vector, DenseMatrix, Matrix, Vectors}
+import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
@@ -132,22 +132,38 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("setter alias") {
     val lda = new LDA().setAlpha(2.0).setBeta(3.0)
-    assert(lda.getAlpha === 2.0)
-    assert(lda.getDocConcentration === 2.0)
+    assert(lda.getAlpha.toArray.forall(_ === 2.0))
+    assert(lda.getDocConcentration.toArray.forall(_ === 2.0))
     assert(lda.getBeta === 3.0)
     assert(lda.getTopicConcentration === 3.0)
   }
 
+  test("initializing with alpha length != k or 1 fails") {
+    intercept[IllegalArgumentException] {
+      val lda = new LDA().setK(2).setAlpha(Vectors.dense(1, 2, 3, 4))
+      val corpus = sc.parallelize(tinyCorpus, 2)
+      lda.run(corpus)
+    }
+  }
+
+  test("initializing with elements in alpha < 0 fails") {
+    intercept[IllegalArgumentException] {
+      val lda = new LDA().setK(4).setAlpha(Vectors.dense(-1, 2, 3, 4))
+      val corpus = sc.parallelize(tinyCorpus, 2)
+      lda.run(corpus)
+    }
+  }
+
   test("OnlineLDAOptimizer initialization") {
     val lda = new LDA().setK(2)
     val corpus = sc.parallelize(tinyCorpus, 2)
     val op = new OnlineLDAOptimizer().initialize(corpus, lda)
     op.setKappa(0.9876).setMiniBatchFraction(0.123).setTau0(567)
-    assert(op.getAlpha == 0.5) // default 1.0 / k
-    assert(op.getEta == 0.5)   // default 1.0 / k
-    assert(op.getKappa == 0.9876)
-    assert(op.getMiniBatchFraction == 0.123)
-    assert(op.getTau0 == 567)
+    assert(op.getAlpha.toArray.forall(_ === 0.5)) // default 1.0 / k
+    assert(op.getEta === 0.5)   // default 1.0 / k
+    assert(op.getKappa === 0.9876)
+    assert(op.getMiniBatchFraction === 0.123)
+    assert(op.getTau0 === 567)
   }
 
   test("OnlineLDAOptimizer one iteration") {
@@ -218,6 +234,56 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("OnlineLDAOptimizer with asymmetric prior") {
+    def toydata: Array[(Long, Vector)] = Array(
+      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
+      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
+      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
+      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
+      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
+      Vectors.sparse(6, Array(4, 5), Array(1, 1))
+    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
+
+    val docs = sc.parallelize(toydata)
+    val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
+      .setGammaShape(1e10)
+    val lda = new LDA().setK(2)
+      .setDocConcentration(Vectors.dense(0.00001, 0.1))
+      .setTopicConcentration(0.01)
+      .setMaxIterations(100)
+      .setOptimizer(op)
+      .setSeed(12345)
+
+    val ldaModel = lda.run(docs)
+    val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10)
+    val topics = topicIndices.map { case (terms, termWeights) =>
+      terms.zip(termWeights)
+    }
+
+    /* Verify results with Python:
+
+       import numpy as np
+       from gensim import models
+       corpus = [
+           [(0, 1.0), (1, 1.0)],
+           [(1, 1.0), (2, 1.0)],
+           [(0, 1.0), (2, 1.0)],
+           [(3, 1.0), (4, 1.0)],
+           [(3, 1.0), (5, 1.0)],
+           [(4, 1.0), (5, 1.0)]]
+       np.random.seed(10)
+       lda = models.ldamodel.LdaModel(
+           corpus=corpus, alpha=np.array([0.00001, 0.1]), num_topics=2, update_every=0, passes=100)
+       lda.print_topics()
+
+       > ['0.167*0 + 0.167*1 + 0.167*2 + 0.167*3 + 0.167*4 + 0.167*5',
+          '0.167*0 + 0.167*1 + 0.167*2 + 0.167*4 + 0.167*3 + 0.167*5']
+     */
+    topics.foreach { topic =>
+      assert(topic.forall { case (_, p) => p ~= 0.167 absTol 0.05 })
+    }
+  }
+
   test("model save/load") {
     // Test for LocalLDAModel.
     val localModel = new LocalLDAModel(tinyTopics)

From fe26584a1f5b472fb2e87aa7259aec822a619a3b Mon Sep 17 00:00:00 2001
From: Matei Zaharia <matei@databricks.com>
Date: Wed, 22 Jul 2015 15:28:09 -0700
Subject: [PATCH 323/549] [SPARK-9244] Increase some memory defaults

There are a few memory limits that people hit often and that we could
make higher, especially now that memory sizes have grown.

- spark.akka.frameSize: This defaults at 10 but is often hit for map
  output statuses in large shuffles. This memory is not fully allocated
  up-front, so we can just make this larger and still not affect jobs
  that never sent a status that large. We increase it to 128.

- spark.executor.memory: Defaults at 512m, which is really small. We
  increase it to 1g.

Author: Matei Zaharia <matei@databricks.com>

Closes #7586 from mateiz/configs and squashes the following commits:

ce0038a [Matei Zaharia] [SPARK-9244] Increase some memory defaults
---
 .../scala/org/apache/spark/SparkContext.scala |  2 +-
 .../org/apache/spark/util/AkkaUtils.scala     |  2 +-
 .../java/org/apache/spark/JavaAPISuite.java   |  2 +-
 .../apache/spark/ContextCleanerSuite.scala    |  4 ++--
 .../org/apache/spark/DistributedSuite.scala   | 16 +++++++--------
 .../scala/org/apache/spark/DriverSuite.scala  |  2 +-
 .../spark/ExternalShuffleServiceSuite.scala   |  2 +-
 .../org/apache/spark/FileServerSuite.scala    |  6 +++---
 .../apache/spark/JobCancellationSuite.scala   |  4 ++--
 .../scala/org/apache/spark/ShuffleSuite.scala | 20 +++++++++----------
 .../SparkContextSchedulerCreationSuite.scala  |  2 +-
 .../spark/broadcast/BroadcastSuite.scala      |  8 ++++----
 .../spark/deploy/LogUrlsStandaloneSuite.scala |  4 ++--
 .../spark/deploy/SparkSubmitSuite.scala       |  4 ++--
 .../CoarseGrainedSchedulerBackendSuite.scala  |  2 +-
 .../scheduler/EventLoggingListenerSuite.scala |  2 +-
 .../spark/scheduler/ReplayListenerSuite.scala |  2 +-
 .../SparkListenerWithClusterSuite.scala       |  2 +-
 .../KryoSerializerDistributedSuite.scala      |  2 +-
 .../ExternalAppendOnlyMapSuite.scala          | 10 +++++-----
 .../util/collection/ExternalSorterSuite.scala | 14 ++++++-------
 docs/configuration.md                         | 16 +++++++--------
 .../mllib/util/LocalClusterSparkContext.scala |  2 +-
 python/pyspark/tests.py                       |  6 +++---
 .../org/apache/spark/repl/ReplSuite.scala     | 10 +++++-----
 .../org/apache/spark/repl/ReplSuite.scala     |  8 ++++----
 .../spark/sql/hive/HiveSparkSubmitSuite.scala |  4 ++--
 27 files changed, 78 insertions(+), 80 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index d00c012d80560..4976e5eb49468 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -471,7 +471,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       .orElse(Option(System.getenv("SPARK_MEM"))
       .map(warnSparkMem))
       .map(Utils.memoryStringToMb)
-      .getOrElse(512)
+      .getOrElse(1024)
 
     // Convert java options to env vars as a work around
     // since we can't set env vars directly in sbt.
diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
index c179833e5b06a..78e7ddc27d1c7 100644
--- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
@@ -128,7 +128,7 @@ private[spark] object AkkaUtils extends Logging {
 
   /** Returns the configured max frame size for Akka messages in bytes. */
   def maxFrameSizeBytes(conf: SparkConf): Int = {
-    val frameSizeInMB = conf.getInt("spark.akka.frameSize", 10)
+    val frameSizeInMB = conf.getInt("spark.akka.frameSize", 128)
     if (frameSizeInMB > AKKA_MAX_FRAME_SIZE_IN_MB) {
       throw new IllegalArgumentException(
         s"spark.akka.frameSize should not be greater than $AKKA_MAX_FRAME_SIZE_IN_MB MB")
diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java
index 1b04a3b1cff0e..e948ca33471a4 100644
--- a/core/src/test/java/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java
@@ -1783,7 +1783,7 @@ public void testGuavaOptional() {
     // Stop the context created in setUp() and start a local-cluster one, to force usage of the
     // assembly.
     sc.stop();
-    JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,512]", "JavaAPISuite");
+    JavaSparkContext localCluster = new JavaSparkContext("local-cluster[1,1,1024]", "JavaAPISuite");
     try {
       JavaRDD<Integer> rdd1 = localCluster.parallelize(Arrays.asList(1, 2, null), 3);
       JavaRDD<Optional<Integer>> rdd2 = rdd1.map(
diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index 501fe186bfd7c..26858ef2774fc 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -292,7 +292,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
     sc.stop()
 
     val conf2 = new SparkConf()
-      .setMaster("local-cluster[2, 1, 512]")
+      .setMaster("local-cluster[2, 1, 1024]")
       .setAppName("ContextCleanerSuite")
       .set("spark.cleaner.referenceTracking.blocking", "true")
       .set("spark.cleaner.referenceTracking.blocking.shuffle", "true")
@@ -370,7 +370,7 @@ class SortShuffleContextCleanerSuite extends ContextCleanerSuiteBase(classOf[Sor
     sc.stop()
 
     val conf2 = new SparkConf()
-      .setMaster("local-cluster[2, 1, 512]")
+      .setMaster("local-cluster[2, 1, 1024]")
       .setAppName("ContextCleanerSuite")
       .set("spark.cleaner.referenceTracking.blocking", "true")
       .set("spark.cleaner.referenceTracking.blocking.shuffle", "true")
diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala
index 2300bcff4f118..600c1403b0344 100644
--- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala
+++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala
@@ -29,7 +29,7 @@ class NotSerializableExn(val notSer: NotSerializableClass) extends Throwable() {
 
 class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContext {
 
-  val clusterUrl = "local-cluster[2,1,512]"
+  val clusterUrl = "local-cluster[2,1,1024]"
 
   test("task throws not serializable exception") {
     // Ensures that executors do not crash when an exn is not serializable. If executors crash,
@@ -40,7 +40,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex
     val numSlaves = 3
     val numPartitions = 10
 
-    sc = new SparkContext("local-cluster[%s,1,512]".format(numSlaves), "test")
+    sc = new SparkContext("local-cluster[%s,1,1024]".format(numSlaves), "test")
     val data = sc.parallelize(1 to 100, numPartitions).
       map(x => throw new NotSerializableExn(new NotSerializableClass))
     intercept[SparkException] {
@@ -50,16 +50,16 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex
   }
 
   test("local-cluster format") {
-    sc = new SparkContext("local-cluster[2,1,512]", "test")
+    sc = new SparkContext("local-cluster[2,1,1024]", "test")
     assert(sc.parallelize(1 to 2, 2).count() == 2)
     resetSparkContext()
-    sc = new SparkContext("local-cluster[2 , 1 , 512]", "test")
+    sc = new SparkContext("local-cluster[2 , 1 , 1024]", "test")
     assert(sc.parallelize(1 to 2, 2).count() == 2)
     resetSparkContext()
-    sc = new SparkContext("local-cluster[2, 1, 512]", "test")
+    sc = new SparkContext("local-cluster[2, 1, 1024]", "test")
     assert(sc.parallelize(1 to 2, 2).count() == 2)
     resetSparkContext()
-    sc = new SparkContext("local-cluster[ 2, 1, 512 ]", "test")
+    sc = new SparkContext("local-cluster[ 2, 1, 1024 ]", "test")
     assert(sc.parallelize(1 to 2, 2).count() == 2)
     resetSparkContext()
   }
@@ -276,7 +276,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex
     DistributedSuite.amMaster = true
     // Using more than two nodes so we don't have a symmetric communication pattern and might
     // cache a partially correct list of peers.
-    sc = new SparkContext("local-cluster[3,1,512]", "test")
+    sc = new SparkContext("local-cluster[3,1,1024]", "test")
     for (i <- 1 to 3) {
       val data = sc.parallelize(Seq(true, false, false, false), 4)
       data.persist(StorageLevel.MEMORY_ONLY_2)
@@ -294,7 +294,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex
 
   test("unpersist RDDs") {
     DistributedSuite.amMaster = true
-    sc = new SparkContext("local-cluster[3,1,512]", "test")
+    sc = new SparkContext("local-cluster[3,1,1024]", "test")
     val data = sc.parallelize(Seq(true, false, false, false), 4)
     data.persist(StorageLevel.MEMORY_ONLY_2)
     data.count
diff --git a/core/src/test/scala/org/apache/spark/DriverSuite.scala b/core/src/test/scala/org/apache/spark/DriverSuite.scala
index b2262033ca238..454b7e607a51b 100644
--- a/core/src/test/scala/org/apache/spark/DriverSuite.scala
+++ b/core/src/test/scala/org/apache/spark/DriverSuite.scala
@@ -29,7 +29,7 @@ class DriverSuite extends SparkFunSuite with Timeouts {
 
   ignore("driver should exit after finishing without cleanup (SPARK-530)") {
     val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!"))
-    val masters = Table("master", "local", "local-cluster[2,1,512]")
+    val masters = Table("master", "local", "local-cluster[2,1,1024]")
     forAll(masters) { (master: String) =>
       val process = Utils.executeCommand(
         Seq(s"$sparkHome/bin/spark-class", "org.apache.spark.DriverWithoutCleanup", master),
diff --git a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala
index 140012226fdbb..c38d70252add1 100644
--- a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala
@@ -51,7 +51,7 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll {
 
   // This test ensures that the external shuffle service is actually in use for the other tests.
   test("using external shuffle service") {
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     sc.env.blockManager.externalShuffleServiceEnabled should equal(true)
     sc.env.blockManager.shuffleClient.getClass should equal(classOf[ExternalShuffleClient])
 
diff --git a/core/src/test/scala/org/apache/spark/FileServerSuite.scala b/core/src/test/scala/org/apache/spark/FileServerSuite.scala
index 876418aa13029..1255e71af6c0b 100644
--- a/core/src/test/scala/org/apache/spark/FileServerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FileServerSuite.scala
@@ -139,7 +139,7 @@ class FileServerSuite extends SparkFunSuite with LocalSparkContext {
   }
 
   test("Distributing files on a standalone cluster") {
-    sc = new SparkContext("local-cluster[1,1,512]", "test", newConf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", newConf)
     sc.addFile(tmpFile.toString)
     val testData = Array((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
     val result = sc.parallelize(testData).reduceByKey {
@@ -153,7 +153,7 @@ class FileServerSuite extends SparkFunSuite with LocalSparkContext {
   }
 
   test ("Dynamically adding JARS on a standalone cluster") {
-    sc = new SparkContext("local-cluster[1,1,512]", "test", newConf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", newConf)
     sc.addJar(tmpJarUrl)
     val testData = Array((1, 1))
     sc.parallelize(testData).foreach { x =>
@@ -164,7 +164,7 @@ class FileServerSuite extends SparkFunSuite with LocalSparkContext {
   }
 
   test ("Dynamically adding JARS on a standalone cluster using local: URL") {
-    sc = new SparkContext("local-cluster[1,1,512]", "test", newConf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", newConf)
     sc.addJar(tmpJarUrl.replace("file", "local"))
     val testData = Array((1, 1))
     sc.parallelize(testData).foreach { x =>
diff --git a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala
index 340a9e327107e..1168eb0b802f2 100644
--- a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala
@@ -64,7 +64,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft
 
   test("cluster mode, FIFO scheduler") {
     val conf = new SparkConf().set("spark.scheduler.mode", "FIFO")
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     testCount()
     testTake()
     // Make sure we can still launch tasks.
@@ -75,7 +75,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft
     val conf = new SparkConf().set("spark.scheduler.mode", "FAIR")
     val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile()
     conf.set("spark.scheduler.allocation.file", xmlPath)
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     testCount()
     testTake()
     // Make sure we can still launch tasks.
diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
index b68102bfb949f..d91b799ecfc08 100644
--- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala
@@ -47,7 +47,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
   }
 
   test("shuffle non-zero block size") {
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     val NUM_BLOCKS = 3
 
     val a = sc.parallelize(1 to 10, 2)
@@ -73,7 +73,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
 
   test("shuffle serializer") {
     // Use a local cluster with 2 processes to make sure there are both local and remote blocks
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     val a = sc.parallelize(1 to 10, 2)
     val b = a.map { x =>
       (x, new NonJavaSerializableClass(x * 2))
@@ -89,7 +89,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
 
   test("zero sized blocks") {
     // Use a local cluster with 2 processes to make sure there are both local and remote blocks
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
 
     // 201 partitions (greater than "spark.shuffle.sort.bypassMergeThreshold") from 4 keys
     val NUM_BLOCKS = 201
@@ -116,7 +116,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
 
   test("zero sized blocks without kryo") {
     // Use a local cluster with 2 processes to make sure there are both local and remote blocks
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
 
     // 201 partitions (greater than "spark.shuffle.sort.bypassMergeThreshold") from 4 keys
     val NUM_BLOCKS = 201
@@ -141,7 +141,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
 
   test("shuffle on mutable pairs") {
     // Use a local cluster with 2 processes to make sure there are both local and remote blocks
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     def p[T1, T2](_1: T1, _2: T2): MutablePair[T1, T2] = MutablePair(_1, _2)
     val data = Array(p(1, 1), p(1, 2), p(1, 3), p(2, 1))
     val pairs: RDD[MutablePair[Int, Int]] = sc.parallelize(data, 2)
@@ -154,7 +154,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
   test("sorting on mutable pairs") {
     // This is not in SortingSuite because of the local cluster setup.
     // Use a local cluster with 2 processes to make sure there are both local and remote blocks
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     def p[T1, T2](_1: T1, _2: T2): MutablePair[T1, T2] = MutablePair(_1, _2)
     val data = Array(p(1, 11), p(3, 33), p(100, 100), p(2, 22))
     val pairs: RDD[MutablePair[Int, Int]] = sc.parallelize(data, 2)
@@ -168,7 +168,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
 
   test("cogroup using mutable pairs") {
     // Use a local cluster with 2 processes to make sure there are both local and remote blocks
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     def p[T1, T2](_1: T1, _2: T2): MutablePair[T1, T2] = MutablePair(_1, _2)
     val data1 = Seq(p(1, 1), p(1, 2), p(1, 3), p(2, 1))
     val data2 = Seq(p(1, "11"), p(1, "12"), p(2, "22"), p(3, "3"))
@@ -195,7 +195,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
 
   test("subtract mutable pairs") {
     // Use a local cluster with 2 processes to make sure there are both local and remote blocks
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     def p[T1, T2](_1: T1, _2: T2): MutablePair[T1, T2] = MutablePair(_1, _2)
     val data1 = Seq(p(1, 1), p(1, 2), p(1, 3), p(2, 1), p(3, 33))
     val data2 = Seq(p(1, "11"), p(1, "12"), p(2, "22"))
@@ -210,7 +210,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
   test("sort with Java non serializable class - Kryo") {
     // Use a local cluster with 2 processes to make sure there are both local and remote blocks
     val myConf = conf.clone().set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
-    sc = new SparkContext("local-cluster[2,1,512]", "test", myConf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", myConf)
     val a = sc.parallelize(1 to 10, 2)
     val b = a.map { x =>
       (new NonJavaSerializableClass(x), x)
@@ -223,7 +223,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC
 
   test("sort with Java non serializable class - Java") {
     // Use a local cluster with 2 processes to make sure there are both local and remote blocks
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     val a = sc.parallelize(1 to 10, 2)
     val b = a.map { x =>
       (new NonJavaSerializableClass(x), x)
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala
index dba46f101c580..e5a14a69ef05f 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala
@@ -123,7 +123,7 @@ class SparkContextSchedulerCreationSuite
   }
 
   test("local-cluster") {
-    createTaskScheduler("local-cluster[3, 14, 512]").backend match {
+    createTaskScheduler("local-cluster[3, 14, 1024]").backend match {
       case s: SparkDeploySchedulerBackend => // OK
       case _ => fail()
     }
diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
index c054c718075f8..48e74f06f79b1 100644
--- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
+++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala
@@ -69,7 +69,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext {
     val conf = httpConf.clone
     conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
     conf.set("spark.broadcast.compress", "true")
-    sc = new SparkContext("local-cluster[%d, 1, 512]".format(numSlaves), "test", conf)
+    sc = new SparkContext("local-cluster[%d, 1, 1024]".format(numSlaves), "test", conf)
     val list = List[Int](1, 2, 3, 4)
     val broadcast = sc.broadcast(list)
     val results = sc.parallelize(1 to numSlaves).map(x => (x, broadcast.value.sum))
@@ -97,7 +97,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext {
     val conf = torrentConf.clone
     conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
     conf.set("spark.broadcast.compress", "true")
-    sc = new SparkContext("local-cluster[%d, 1, 512]".format(numSlaves), "test", conf)
+    sc = new SparkContext("local-cluster[%d, 1, 1024]".format(numSlaves), "test", conf)
     val list = List[Int](1, 2, 3, 4)
     val broadcast = sc.broadcast(list)
     val results = sc.parallelize(1 to numSlaves).map(x => (x, broadcast.value.sum))
@@ -125,7 +125,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext {
   test("Test Lazy Broadcast variables with TorrentBroadcast") {
     val numSlaves = 2
     val conf = torrentConf.clone
-    sc = new SparkContext("local-cluster[%d, 1, 512]".format(numSlaves), "test", conf)
+    sc = new SparkContext("local-cluster[%d, 1, 1024]".format(numSlaves), "test", conf)
     val rdd = sc.parallelize(1 to numSlaves)
 
     val results = new DummyBroadcastClass(rdd).doSomething()
@@ -308,7 +308,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext {
 
     sc = if (distributed) {
       val _sc =
-        new SparkContext("local-cluster[%d, 1, 512]".format(numSlaves), "test", broadcastConf)
+        new SparkContext("local-cluster[%d, 1, 1024]".format(numSlaves), "test", broadcastConf)
       // Wait until all salves are up
       _sc.jobProgressListener.waitUntilExecutorsUp(numSlaves, 10000)
       _sc
diff --git a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala
index ddc92814c0acf..cbd2aee10c0e2 100644
--- a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala
@@ -33,7 +33,7 @@ class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext {
   private val WAIT_TIMEOUT_MILLIS = 10000
 
   test("verify that correct log urls get propagated from workers") {
-    sc = new SparkContext("local-cluster[2,1,512]", "test")
+    sc = new SparkContext("local-cluster[2,1,1024]", "test")
 
     val listener = new SaveExecutorInfo
     sc.addSparkListener(listener)
@@ -66,7 +66,7 @@ class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext {
     }
     val conf = new MySparkConf().set(
       "spark.extraListeners", classOf[SaveExecutorInfo].getName)
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
 
     // Trigger a job so that executors get added
     sc.parallelize(1 to 100, 4).map(_.toString).count()
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
index 343d28eef8359..aa78bfe30974c 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -337,7 +337,7 @@ class SparkSubmitSuite
     val args = Seq(
       "--class", JarCreationTest.getClass.getName.stripSuffix("$"),
       "--name", "testApp",
-      "--master", "local-cluster[2,1,512]",
+      "--master", "local-cluster[2,1,1024]",
       "--jars", jarsString,
       unusedJar.toString, "SparkSubmitClassA", "SparkSubmitClassB")
     runSparkSubmit(args)
@@ -352,7 +352,7 @@ class SparkSubmitSuite
       val args = Seq(
         "--class", JarCreationTest.getClass.getName.stripSuffix("$"),
         "--name", "testApp",
-        "--master", "local-cluster[2,1,512]",
+        "--master", "local-cluster[2,1,1024]",
         "--packages", Seq(main, dep).mkString(","),
         "--repositories", repo,
         "--conf", "spark.ui.enabled=false",
diff --git a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala
index 34145691153ce..eef6aafa624ee 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala
@@ -26,7 +26,7 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo
     val conf = new SparkConf
     conf.set("spark.akka.frameSize", "1")
     conf.set("spark.default.parallelism", "1")
-    sc = new SparkContext("local-cluster[2 , 1 , 512]", "test", conf)
+    sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf)
     val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf)
     val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize))
     val larger = sc.parallelize(Seq(buffer))
diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
index f681f21b6205e..5cb2d4225d281 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
@@ -180,7 +180,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
     // into SPARK-6688.
     val conf = getLoggingConf(testDirPath, compressionCodec)
       .set("spark.hadoop.fs.defaultFS", "unsupported://example.com")
-    val sc = new SparkContext("local-cluster[2,2,512]", "test", conf)
+    val sc = new SparkContext("local-cluster[2,2,1024]", "test", conf)
     assert(sc.eventLogger.isDefined)
     val eventLogger = sc.eventLogger.get
     val eventLogPath = eventLogger.logPath
diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
index 4e3defb43a021..103fc19369c97 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala
@@ -102,7 +102,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter {
     fileSystem.mkdirs(logDirPath)
 
     val conf = EventLoggingListenerSuite.getLoggingConf(logDirPath, codecName)
-    val sc = new SparkContext("local-cluster[2,1,512]", "Test replay", conf)
+    val sc = new SparkContext("local-cluster[2,1,1024]", "Test replay", conf)
 
     // Run a few jobs
     sc.parallelize(1 to 100, 1).count()
diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
index d97fba00976d2..d1e23ed527ff1 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala
@@ -34,7 +34,7 @@ class SparkListenerWithClusterSuite extends SparkFunSuite with LocalSparkContext
   val WAIT_TIMEOUT_MILLIS = 10000
 
   before {
-    sc = new SparkContext("local-cluster[2,1,512]", "SparkListenerSuite")
+    sc = new SparkContext("local-cluster[2,1,1024]", "SparkListenerSuite")
   }
 
   test("SparkListener sends executor added message") {
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala
index 353b97469cd11..935a091f14f9b 100644
--- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala
+++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala
@@ -35,7 +35,7 @@ class KryoSerializerDistributedSuite extends SparkFunSuite {
     val jar = TestUtils.createJarWithClasses(List(AppJarRegistrator.customClassName))
     conf.setJars(List(jar.getPath))
 
-    val sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    val sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
     val original = Thread.currentThread.getContextClassLoader
     val loader = new java.net.URLClassLoader(Array(jar), Utils.getContextOrSparkClassLoader)
     SparkEnv.get.serializer.setDefaultClassLoader(loader)
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
index 79eba61a87251..9c362f0de7076 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
@@ -244,7 +244,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext {
   private def testSimpleSpilling(codec: Option[String] = None): Unit = {
     val conf = createSparkConf(loadDefaults = true, codec)  // Load defaults for Spark home
     conf.set("spark.shuffle.memoryFraction", "0.001")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
 
     // reduceByKey - should spill ~8 times
     val rddA = sc.parallelize(0 until 100000).map(i => (i/2, i))
@@ -292,7 +292,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext {
   test("spilling with hash collisions") {
     val conf = createSparkConf(loadDefaults = true)
     conf.set("spark.shuffle.memoryFraction", "0.001")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
     val map = createExternalMap[String]
 
     val collisionPairs = Seq(
@@ -341,7 +341,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext {
   test("spilling with many hash collisions") {
     val conf = createSparkConf(loadDefaults = true)
     conf.set("spark.shuffle.memoryFraction", "0.0001")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
     val map = new ExternalAppendOnlyMap[FixedHashObject, Int, Int](_ => 1, _ + _, _ + _)
 
     // Insert 10 copies each of lots of objects whose hash codes are either 0 or 1. This causes
@@ -366,7 +366,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext {
   test("spilling with hash collisions using the Int.MaxValue key") {
     val conf = createSparkConf(loadDefaults = true)
     conf.set("spark.shuffle.memoryFraction", "0.001")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
     val map = createExternalMap[Int]
 
     (1 to 100000).foreach { i => map.insert(i, i) }
@@ -383,7 +383,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext {
   test("spilling with null keys and values") {
     val conf = createSparkConf(loadDefaults = true)
     conf.set("spark.shuffle.memoryFraction", "0.001")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
     val map = createExternalMap[Int]
 
     map.insertAll((1 to 100000).iterator.map(i => (i, i)))
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
index 9cefa612f5491..986cd8623d145 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala
@@ -176,7 +176,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext {
   def testSpillingInLocalCluster(conf: SparkConf) {
     conf.set("spark.shuffle.memoryFraction", "0.001")
     conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
 
     // reduceByKey - should spill ~8 times
     val rddA = sc.parallelize(0 until 100000).map(i => (i/2, i))
@@ -254,7 +254,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext {
   def spillingInLocalClusterWithManyReduceTasks(conf: SparkConf) {
     conf.set("spark.shuffle.memoryFraction", "0.001")
     conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager")
-    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)
 
     // reduceByKey - should spill ~4 times per executor
     val rddA = sc.parallelize(0 until 100000).map(i => (i/2, i))
@@ -554,7 +554,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext {
   test("spilling with hash collisions") {
     val conf = createSparkConf(true, false)
     conf.set("spark.shuffle.memoryFraction", "0.001")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
 
     def createCombiner(i: String): ArrayBuffer[String] = ArrayBuffer[String](i)
     def mergeValue(buffer: ArrayBuffer[String], i: String): ArrayBuffer[String] = buffer += i
@@ -611,7 +611,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext {
   test("spilling with many hash collisions") {
     val conf = createSparkConf(true, false)
     conf.set("spark.shuffle.memoryFraction", "0.0001")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
 
     val agg = new Aggregator[FixedHashObject, Int, Int](_ => 1, _ + _, _ + _)
     val sorter = new ExternalSorter[FixedHashObject, Int, Int](Some(agg), None, None, None)
@@ -634,7 +634,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext {
   test("spilling with hash collisions using the Int.MaxValue key") {
     val conf = createSparkConf(true, false)
     conf.set("spark.shuffle.memoryFraction", "0.001")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
 
     def createCombiner(i: Int): ArrayBuffer[Int] = ArrayBuffer[Int](i)
     def mergeValue(buffer: ArrayBuffer[Int], i: Int): ArrayBuffer[Int] = buffer += i
@@ -658,7 +658,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext {
   test("spilling with null keys and values") {
     val conf = createSparkConf(true, false)
     conf.set("spark.shuffle.memoryFraction", "0.001")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
 
     def createCombiner(i: String): ArrayBuffer[String] = ArrayBuffer[String](i)
     def mergeValue(buffer: ArrayBuffer[String], i: String): ArrayBuffer[String] = buffer += i
@@ -695,7 +695,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext {
   def sortWithoutBreakingSortingContracts(conf: SparkConf) {
     conf.set("spark.shuffle.memoryFraction", "0.01")
     conf.set("spark.shuffle.manager", "sort")
-    sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+    sc = new SparkContext("local-cluster[1,1,1024]", "test", conf)
 
     // Using wrongOrdering to show integer overflow introduced exception.
     val rand = new Random(100L)
diff --git a/docs/configuration.md b/docs/configuration.md
index 8a186ee51c1ca..fea259204ae68 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -31,7 +31,6 @@ which can help detect bugs that only exist when we run in a distributed context.
 val conf = new SparkConf()
              .setMaster("local[2]")
              .setAppName("CountingSheep")
-             .set("spark.executor.memory", "1g")
 val sc = new SparkContext(conf)
 {% endhighlight %}
 
@@ -84,7 +83,7 @@ Running `./bin/spark-submit --help` will show the entire list of these options.
 each line consists of a key and a value separated by whitespace. For example:
 
     spark.master            spark://5.6.7.8:7077
-    spark.executor.memory   512m
+    spark.executor.memory   4g
     spark.eventLog.enabled  true
     spark.serializer        org.apache.spark.serializer.KryoSerializer
 
@@ -150,10 +149,9 @@ of the most common options to set are:
 </tr>
 <tr>
   <td><code>spark.executor.memory</code></td>
-  <td>512m</td>
+  <td>1g</td>
   <td>
-    Amount of memory to use per executor process, in the same format as JVM memory strings
-    (e.g. <code>512m</code>, <code>2g</code>).
+    Amount of memory to use per executor process (e.g. <code>2g</code>, <code>8g</code>).
   </td>
 </tr>
 <tr>
@@ -886,11 +884,11 @@ Apart from these, the following properties are also available, and may be useful
 </tr>
 <tr>
   <td><code>spark.akka.frameSize</code></td>
-  <td>10</td>
+  <td>128</td>
   <td>
-    Maximum message size to allow in "control plane" communication (for serialized tasks and task
-    results), in MB. Increase this if your tasks need to send back large results to the driver
-    (e.g. using <code>collect()</code> on a large dataset).
+    Maximum message size to allow in "control plane" communication; generally only applies to map
+    output size information sent between executors and the driver. Increase this if you are running
+    jobs with many thousands of map and reduce tasks and see messages about the frame size.
   </td>
 </tr>
 <tr>
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/LocalClusterSparkContext.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/LocalClusterSparkContext.scala
index 5e9101cdd3804..525ab68c7921a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/LocalClusterSparkContext.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/LocalClusterSparkContext.scala
@@ -26,7 +26,7 @@ trait LocalClusterSparkContext extends BeforeAndAfterAll { self: Suite =>
 
   override def beforeAll() {
     val conf = new SparkConf()
-      .setMaster("local-cluster[2, 1, 512]")
+      .setMaster("local-cluster[2, 1, 1024]")
       .setAppName("test-cluster")
       .set("spark.akka.frameSize", "1") // set to 1MB to detect direct serialization of data
     sc = new SparkContext(conf)
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 5be9937cb04b2..8bfed074c9052 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -1823,7 +1823,7 @@ def test_module_dependency_on_cluster(self):
             |    return x + 1
             """)
         proc = subprocess.Popen([self.sparkSubmit, "--py-files", zip, "--master",
-                                "local-cluster[1,1,512]", script],
+                                "local-cluster[1,1,1024]", script],
                                 stdout=subprocess.PIPE)
         out, err = proc.communicate()
         self.assertEqual(0, proc.returncode)
@@ -1857,7 +1857,7 @@ def test_package_dependency_on_cluster(self):
         self.create_spark_package("a:mylib:0.1")
         proc = subprocess.Popen([self.sparkSubmit, "--packages", "a:mylib:0.1", "--repositories",
                                  "file:" + self.programDir, "--master",
-                                 "local-cluster[1,1,512]", script], stdout=subprocess.PIPE)
+                                 "local-cluster[1,1,1024]", script], stdout=subprocess.PIPE)
         out, err = proc.communicate()
         self.assertEqual(0, proc.returncode)
         self.assertIn("[2, 3, 4]", out.decode('utf-8'))
@@ -1876,7 +1876,7 @@ def test_single_script_on_cluster(self):
         # this will fail if you have different spark.executor.memory
         # in conf/spark-defaults.conf
         proc = subprocess.Popen(
-            [self.sparkSubmit, "--master", "local-cluster[1,1,512]", script],
+            [self.sparkSubmit, "--master", "local-cluster[1,1,1024]", script],
             stdout=subprocess.PIPE)
         out, err = proc.communicate()
         self.assertEqual(0, proc.returncode)
diff --git a/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala
index f150fec7db945..5674dcd669bee 100644
--- a/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala
+++ b/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala
@@ -211,7 +211,7 @@ class ReplSuite extends SparkFunSuite {
   }
 
   test("local-cluster mode") {
-    val output = runInterpreter("local-cluster[1,1,512]",
+    val output = runInterpreter("local-cluster[1,1,1024]",
       """
         |var v = 7
         |def getV() = v
@@ -233,7 +233,7 @@ class ReplSuite extends SparkFunSuite {
   }
 
   test("SPARK-1199 two instances of same class don't type check.") {
-    val output = runInterpreter("local-cluster[1,1,512]",
+    val output = runInterpreter("local-cluster[1,1,1024]",
       """
         |case class Sum(exp: String, exp2: String)
         |val a = Sum("A", "B")
@@ -256,7 +256,7 @@ class ReplSuite extends SparkFunSuite {
 
   test("SPARK-2576 importing SQLContext.implicits._") {
     // We need to use local-cluster to test this case.
-    val output = runInterpreter("local-cluster[1,1,512]",
+    val output = runInterpreter("local-cluster[1,1,1024]",
       """
         |val sqlContext = new org.apache.spark.sql.SQLContext(sc)
         |import sqlContext.implicits._
@@ -325,9 +325,9 @@ class ReplSuite extends SparkFunSuite {
     assertDoesNotContain("Exception", output)
     assertContains("ret: Array[Foo] = Array(Foo(1),", output)
   }
-  
+
   test("collecting objects of class defined in repl - shuffling") {
-    val output = runInterpreter("local-cluster[1,1,512]",
+    val output = runInterpreter("local-cluster[1,1,1024]",
       """
         |case class Foo(i: Int)
         |val list = List((1, Foo(1)), (1, Foo(2)))
diff --git a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala
index e1cee97de32bc..bf8997998e00d 100644
--- a/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala
+++ b/repl/scala-2.11/src/test/scala/org/apache/spark/repl/ReplSuite.scala
@@ -209,7 +209,7 @@ class ReplSuite extends SparkFunSuite {
   }
 
   test("local-cluster mode") {
-    val output = runInterpreter("local-cluster[1,1,512]",
+    val output = runInterpreter("local-cluster[1,1,1024]",
       """
         |var v = 7
         |def getV() = v
@@ -231,7 +231,7 @@ class ReplSuite extends SparkFunSuite {
   }
 
   test("SPARK-1199 two instances of same class don't type check.") {
-    val output = runInterpreter("local-cluster[1,1,512]",
+    val output = runInterpreter("local-cluster[1,1,1024]",
       """
         |case class Sum(exp: String, exp2: String)
         |val a = Sum("A", "B")
@@ -254,7 +254,7 @@ class ReplSuite extends SparkFunSuite {
 
   test("SPARK-2576 importing SQLContext.createDataFrame.") {
     // We need to use local-cluster to test this case.
-    val output = runInterpreter("local-cluster[1,1,512]",
+    val output = runInterpreter("local-cluster[1,1,1024]",
       """
         |val sqlContext = new org.apache.spark.sql.SQLContext(sc)
         |import sqlContext.implicits._
@@ -314,7 +314,7 @@ class ReplSuite extends SparkFunSuite {
   }
 
   test("collecting objects of class defined in repl - shuffling") {
-    val output = runInterpreter("local-cluster[1,1,512]",
+    val output = runInterpreter("local-cluster[1,1,1024]",
       """
         |case class Foo(i: Int)
         |val list = List((1, Foo(1)), (1, Foo(2)))
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
index bee2ecbedb244..72b35959a491b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
@@ -53,7 +53,7 @@ class HiveSparkSubmitSuite
     val args = Seq(
       "--class", SparkSubmitClassLoaderTest.getClass.getName.stripSuffix("$"),
       "--name", "SparkSubmitClassLoaderTest",
-      "--master", "local-cluster[2,1,512]",
+      "--master", "local-cluster[2,1,1024]",
       "--jars", jarsString,
       unusedJar.toString, "SparkSubmitClassA", "SparkSubmitClassB")
     runSparkSubmit(args)
@@ -64,7 +64,7 @@ class HiveSparkSubmitSuite
     val args = Seq(
       "--class", SparkSQLConfTest.getClass.getName.stripSuffix("$"),
       "--name", "SparkSQLConfTest",
-      "--master", "local-cluster[2,1,512]",
+      "--master", "local-cluster[2,1,1024]",
       unusedJar.toString)
     runSparkSubmit(args)
   }

From 798dff7b4baa952c609725b852bcb6a9c9e5a317 Mon Sep 17 00:00:00 2001
From: Iulian Dragos <jaguarul@gmail.com>
Date: Wed, 22 Jul 2015 15:54:08 -0700
Subject: [PATCH 324/549] [SPARK-8975] [STREAMING] Adds a mechanism to send a
 new rate from the driver to the block generator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First step for [SPARK-7398](https://issues.apache.org/jira/browse/SPARK-7398).

tdas huitseeker

Author: Iulian Dragos <jaguarul@gmail.com>
Author: François Garillot <francois@garillot.net>

Closes #7471 from dragos/topic/streaming-bp/dynamic-rate and squashes the following commits:

8941cf9 [Iulian Dragos] Renames and other nitpicks.
162d9e5 [Iulian Dragos] Use Reflection for accessing truly private `executor` method and use the listener bus to know when receivers have registered (`onStart` is called before receivers have registered, leading to flaky behavior).
210f495 [Iulian Dragos] Revert "Added a few tests that measure the receiver’s rate."
0c51959 [Iulian Dragos] Added a few tests that measure the receiver’s rate.
261a051 [Iulian Dragos] - removed field to hold the current rate limit in rate limiter - made rate limit a Long and default to Long.MaxValue (consequence of the above) - removed custom `waitUntil` and replaced it by `eventually`
cd1397d [Iulian Dragos] Add a test for the propagation of a new rate limit from driver to receivers.
6369b30 [Iulian Dragos] Merge pull request #15 from huitseeker/SPARK-8975
d15de42 [François Garillot] [SPARK-8975][Streaming] Adds Ratelimiter unit tests w.r.t. spark.streaming.receiver.maxRate
4721c7d [François Garillot] [SPARK-8975][Streaming] Add a mechanism to send a new rate from the driver to the block generator
---
 .../streaming/receiver/RateLimiter.scala      | 30 +++++++--
 .../spark/streaming/receiver/Receiver.scala   |  2 +-
 .../streaming/receiver/ReceiverMessage.scala  |  3 +-
 .../receiver/ReceiverSupervisor.scala         |  3 +
 .../receiver/ReceiverSupervisorImpl.scala     |  6 ++
 .../streaming/scheduler/ReceiverTracker.scala |  9 ++-
 .../streaming/receiver/RateLimiterSuite.scala | 46 ++++++++++++++
 .../scheduler/ReceiverTrackerSuite.scala      | 62 +++++++++++++++++++
 8 files changed, 153 insertions(+), 8 deletions(-)
 create mode 100644 streaming/src/test/scala/org/apache/spark/streaming/receiver/RateLimiterSuite.scala

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala
index 8df542b367d27..f663def4c0511 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala
@@ -34,12 +34,32 @@ import org.apache.spark.{Logging, SparkConf}
   */
 private[receiver] abstract class RateLimiter(conf: SparkConf) extends Logging {
 
-  private val desiredRate = conf.getInt("spark.streaming.receiver.maxRate", 0)
-  private lazy val rateLimiter = GuavaRateLimiter.create(desiredRate)
+  // treated as an upper limit
+  private val maxRateLimit = conf.getLong("spark.streaming.receiver.maxRate", Long.MaxValue)
+  private lazy val rateLimiter = GuavaRateLimiter.create(maxRateLimit.toDouble)
 
   def waitToPush() {
-    if (desiredRate > 0) {
-      rateLimiter.acquire()
-    }
+    rateLimiter.acquire()
   }
+
+  /**
+   * Return the current rate limit. If no limit has been set so far, it returns {{{Long.MaxValue}}}.
+   */
+  def getCurrentLimit: Long =
+    rateLimiter.getRate.toLong
+
+  /**
+   * Set the rate limit to `newRate`. The new rate will not exceed the maximum rate configured by
+   * {{{spark.streaming.receiver.maxRate}}}, even if `newRate` is higher than that.
+   *
+   * @param newRate A new rate in events per second. It has no effect if it's 0 or negative.
+   */
+  private[receiver] def updateRate(newRate: Long): Unit =
+    if (newRate > 0) {
+      if (maxRateLimit > 0) {
+        rateLimiter.setRate(newRate.min(maxRateLimit))
+      } else {
+        rateLimiter.setRate(newRate)
+      }
+    }
 }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala
index 5b5a3fe648602..7504fa44d9fae 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala
@@ -271,7 +271,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable
   }
 
   /** Get the attached executor. */
-  private def executor = {
+  private def executor: ReceiverSupervisor = {
     assert(executor_ != null, "Executor has not been attached to this receiver")
     executor_
   }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala
index 7bf3c33319491..1eb55affaa9d0 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala
@@ -23,4 +23,5 @@ import org.apache.spark.streaming.Time
 private[streaming] sealed trait ReceiverMessage extends Serializable
 private[streaming] object StopReceiver extends ReceiverMessage
 private[streaming] case class CleanupOldBlocks(threshTime: Time) extends ReceiverMessage
-
+private[streaming] case class UpdateRateLimit(elementsPerSecond: Long)
+                   extends ReceiverMessage
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
index 6467029a277b2..a7c220f426ecf 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
@@ -59,6 +59,9 @@ private[streaming] abstract class ReceiverSupervisor(
   /** Time between a receiver is stopped and started again */
   private val defaultRestartDelay = conf.getInt("spark.streaming.receiverRestartDelay", 2000)
 
+  /** The current maximum rate limit for this receiver. */
+  private[streaming] def getCurrentRateLimit: Option[Long] = None
+
   /** Exception associated with the stopping of the receiver */
   @volatile protected var stoppingError: Throwable = null
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
index f6ba66b3ae036..2f6841ee8879c 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
@@ -77,6 +77,9 @@ private[streaming] class ReceiverSupervisorImpl(
         case CleanupOldBlocks(threshTime) =>
           logDebug("Received delete old batch signal")
           cleanupOldBlocks(threshTime)
+        case UpdateRateLimit(eps) =>
+          logInfo(s"Received a new rate limit: $eps.")
+          blockGenerator.updateRate(eps)
       }
     })
 
@@ -98,6 +101,9 @@ private[streaming] class ReceiverSupervisorImpl(
     }
   }, streamId, env.conf)
 
+  override private[streaming] def getCurrentRateLimit: Option[Long] =
+    Some(blockGenerator.getCurrentLimit)
+
   /** Push a single record of received data into block generator. */
   def pushSingle(data: Any) {
     blockGenerator.addData(data)
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
index 6910d81d9866e..9cc6ffcd12f61 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
@@ -26,7 +26,7 @@ import org.apache.spark.{Logging, SparkEnv, SparkException}
 import org.apache.spark.rpc._
 import org.apache.spark.streaming.{StreamingContext, Time}
 import org.apache.spark.streaming.receiver.{CleanupOldBlocks, Receiver, ReceiverSupervisorImpl,
-  StopReceiver}
+  StopReceiver, UpdateRateLimit}
 import org.apache.spark.util.SerializableConfiguration
 
 /**
@@ -226,6 +226,13 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
     logError(s"Deregistered receiver for stream $streamId: $messageWithError")
   }
 
+  /** Update a receiver's maximum ingestion rate */
+  def sendRateUpdate(streamUID: Int, newRate: Long): Unit = {
+    for (info <- receiverInfo.get(streamUID); eP <- Option(info.endpoint)) {
+      eP.send(UpdateRateLimit(newRate))
+    }
+  }
+
   /** Add new blocks for the given stream */
   private def addBlock(receivedBlockInfo: ReceivedBlockInfo): Boolean = {
     receivedBlockTracker.addBlock(receivedBlockInfo)
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/receiver/RateLimiterSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/receiver/RateLimiterSuite.scala
new file mode 100644
index 0000000000000..c6330eb3673fb
--- /dev/null
+++ b/streaming/src/test/scala/org/apache/spark/streaming/receiver/RateLimiterSuite.scala
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.receiver
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkFunSuite
+
+/** Testsuite for testing the network receiver behavior */
+class RateLimiterSuite extends SparkFunSuite {
+
+  test("rate limiter initializes even without a maxRate set") {
+    val conf = new SparkConf()
+    val rateLimiter = new RateLimiter(conf){}
+    rateLimiter.updateRate(105)
+    assert(rateLimiter.getCurrentLimit == 105)
+  }
+
+  test("rate limiter updates when below maxRate") {
+    val conf = new SparkConf().set("spark.streaming.receiver.maxRate", "110")
+    val rateLimiter = new RateLimiter(conf){}
+    rateLimiter.updateRate(105)
+    assert(rateLimiter.getCurrentLimit == 105)
+  }
+
+  test("rate limiter stays below maxRate despite large updates") {
+    val conf = new SparkConf().set("spark.streaming.receiver.maxRate", "100")
+    val rateLimiter = new RateLimiter(conf){}
+    rateLimiter.updateRate(105)
+    assert(rateLimiter.getCurrentLimit === 100)
+  }
+}
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
index a6e783861dbe6..aadb7231757b8 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
@@ -17,11 +17,17 @@
 
 package org.apache.spark.streaming.scheduler
 
+import org.scalatest.concurrent.Eventually._
+import org.scalatest.concurrent.Timeouts
+import org.scalatest.time.SpanSugar._
 import org.apache.spark.streaming._
 import org.apache.spark.SparkConf
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.receiver._
 import org.apache.spark.util.Utils
+import org.apache.spark.streaming.dstream.InputDStream
+import scala.reflect.ClassTag
+import org.apache.spark.streaming.dstream.ReceiverInputDStream
 
 /** Testsuite for receiver scheduling */
 class ReceiverTrackerSuite extends TestSuiteBase {
@@ -72,8 +78,64 @@ class ReceiverTrackerSuite extends TestSuiteBase {
     assert(locations(0).length === 1)
     assert(locations(3).length === 1)
   }
+
+  test("Receiver tracker - propagates rate limit") {
+    object ReceiverStartedWaiter extends StreamingListener {
+      @volatile
+      var started = false
+
+      override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = {
+        started = true
+      }
+    }
+
+    ssc.addStreamingListener(ReceiverStartedWaiter)
+    ssc.scheduler.listenerBus.start(ssc.sc)
+
+    val newRateLimit = 100L
+    val inputDStream = new RateLimitInputDStream(ssc)
+    val tracker = new ReceiverTracker(ssc)
+    tracker.start()
+
+    // we wait until the Receiver has registered with the tracker,
+    // otherwise our rate update is lost
+    eventually(timeout(5 seconds)) {
+      assert(ReceiverStartedWaiter.started)
+    }
+    tracker.sendRateUpdate(inputDStream.id, newRateLimit)
+    // this is an async message, we need to wait a bit for it to be processed
+    eventually(timeout(3 seconds)) {
+      assert(inputDStream.getCurrentRateLimit.get === newRateLimit)
+    }
+  }
 }
 
+/** An input DStream with a hard-coded receiver that gives access to internals for testing. */
+private class RateLimitInputDStream(@transient ssc_ : StreamingContext)
+  extends ReceiverInputDStream[Int](ssc_) {
+
+  override def getReceiver(): DummyReceiver = SingletonDummyReceiver
+
+  def getCurrentRateLimit: Option[Long] = {
+    invokeExecutorMethod.getCurrentRateLimit
+  }
+
+  private def invokeExecutorMethod: ReceiverSupervisor = {
+    val c = classOf[Receiver[_]]
+    val ex = c.getDeclaredMethod("executor")
+    ex.setAccessible(true)
+    ex.invoke(SingletonDummyReceiver).asInstanceOf[ReceiverSupervisor]
+  }
+}
+
+/**
+ * A Receiver as an object so we can read its rate limit.
+ *
+ * @note It's necessary to be a top-level object, or else serialization would create another
+ *       one on the executor side and we won't be able to read its rate limit.
+ */
+private object SingletonDummyReceiver extends DummyReceiver
+
 /**
  * Dummy receiver implementation
  */

From 430cd7815dc7875edd126af4b90752ba8a380cf2 Mon Sep 17 00:00:00 2001
From: Kenichi Maehashi <webmaster@kenichimaehashi.com>
Date: Wed, 22 Jul 2015 16:15:44 -0700
Subject: [PATCH 325/549] [SPARK-9180] fix spark-shell to accept --name option

This patch fixes [[SPARK-9180]](https://issues.apache.org/jira/browse/SPARK-9180).
Users can now set the app name of spark-shell using `spark-shell --name "whatever"`.

Author: Kenichi Maehashi <webmaster@kenichimaehashi.com>

Closes #7512 from kmaehashi/fix-spark-shell-app-name and squashes the following commits:

e24991a [Kenichi Maehashi] use setIfMissing instead of setAppName
18aa4ad [Kenichi Maehashi] fix spark-shell to accept --name option
---
 bin/spark-shell                                               | 4 ++--
 bin/spark-shell2.cmd                                          | 2 +-
 .../src/main/scala/org/apache/spark/repl/SparkILoop.scala     | 2 +-
 .../src/main/scala/org/apache/spark/repl/Main.scala           | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/bin/spark-shell b/bin/spark-shell
index a6dc863d83fc6..00ab7afd118b5 100755
--- a/bin/spark-shell
+++ b/bin/spark-shell
@@ -47,11 +47,11 @@ function main() {
     # (see https://github.com/sbt/sbt/issues/562).
     stty -icanon min 1 -echo > /dev/null 2>&1
     export SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Djline.terminal=unix"
-    "$FWDIR"/bin/spark-submit --class org.apache.spark.repl.Main "$@"
+    "$FWDIR"/bin/spark-submit --class org.apache.spark.repl.Main --name "Spark shell" "$@"
     stty icanon echo > /dev/null 2>&1
   else
     export SPARK_SUBMIT_OPTS
-    "$FWDIR"/bin/spark-submit --class org.apache.spark.repl.Main "$@"
+    "$FWDIR"/bin/spark-submit --class org.apache.spark.repl.Main --name "Spark shell" "$@"
   fi
 }
 
diff --git a/bin/spark-shell2.cmd b/bin/spark-shell2.cmd
index 251309d67f860..b9b0f510d7f5d 100644
--- a/bin/spark-shell2.cmd
+++ b/bin/spark-shell2.cmd
@@ -32,4 +32,4 @@ if "x%SPARK_SUBMIT_OPTS%"=="x" (
 set SPARK_SUBMIT_OPTS="%SPARK_SUBMIT_OPTS% -Dscala.usejavacp=true"
 
 :run_shell
-%SPARK_HOME%\bin\spark-submit2.cmd --class org.apache.spark.repl.Main %*
+%SPARK_HOME%\bin\spark-submit2.cmd --class org.apache.spark.repl.Main --name "Spark shell" %*
diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index 8f7f9074d3f03..8130868fe1487 100644
--- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -1008,9 +1008,9 @@ class SparkILoop(
     val jars = SparkILoop.getAddedJars
     val conf = new SparkConf()
       .setMaster(getMaster())
-      .setAppName("Spark shell")
       .setJars(jars)
       .set("spark.repl.class.uri", intp.classServerUri)
+      .setIfMissing("spark.app.name", "Spark shell")
     if (execUri != null) {
       conf.set("spark.executor.uri", execUri)
     }
diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala
index eed4a379afa60..be31eb2eda546 100644
--- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala
+++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala
@@ -65,9 +65,9 @@ object Main extends Logging {
     val jars = getAddedJars
     val conf = new SparkConf()
       .setMaster(getMaster)
-      .setAppName("Spark shell")
       .setJars(jars)
       .set("spark.repl.class.uri", classServer.uri)
+      .setIfMissing("spark.app.name", "Spark shell")
     logInfo("Spark class server started at " + classServer.uri)
     if (execUri != null) {
       conf.set("spark.executor.uri", execUri)

From 5307c9d3f7a35c0276b72e743e3a62a44d2bd0f5 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Wed, 22 Jul 2015 17:22:12 -0700
Subject: [PATCH 326/549] [SPARK-9223] [PYSPARK] [MLLIB] Support model
 save/load in LDA

Since save / load has been merged in LDA, it takes no time to write the wrappers in Python as well.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7587 from MechCoder/python_lda_save_load and squashes the following commits:

c8e4ea7 [MechCoder] [SPARK-9223] [PySpark] Support model save/load in LDA
---
 python/pyspark/mllib/clustering.py | 43 +++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 8a92f6911c24b..58ad99d46e23b 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -20,6 +20,7 @@
 
 if sys.version > '3':
     xrange = range
+    basestring = str
 
 from math import exp, log
 
@@ -579,7 +580,7 @@ class LDAModel(JavaModelWrapper):
     Blei, Ng, and Jordan.  "Latent Dirichlet Allocation."  JMLR, 2003.
 
     >>> from pyspark.mllib.linalg import Vectors
-    >>> from numpy.testing import assert_almost_equal
+    >>> from numpy.testing import assert_almost_equal, assert_equal
     >>> data = [
     ...     [1, Vectors.dense([0.0, 1.0])],
     ...     [2, SparseVector(2, {0: 1.0})],
@@ -591,6 +592,19 @@ class LDAModel(JavaModelWrapper):
     >>> topics = model.topicsMatrix()
     >>> topics_expect = array([[0.5,  0.5], [0.5, 0.5]])
     >>> assert_almost_equal(topics, topics_expect, 1)
+
+    >>> import os, tempfile
+    >>> from shutil import rmtree
+    >>> path = tempfile.mkdtemp()
+    >>> model.save(sc, path)
+    >>> sameModel = LDAModel.load(sc, path)
+    >>> assert_equal(sameModel.topicsMatrix(), model.topicsMatrix())
+    >>> sameModel.vocabSize() == model.vocabSize()
+    True
+    >>> try:
+    ...     rmtree(path)
+    ... except OSError:
+    ...     pass
     """
 
     def topicsMatrix(self):
@@ -601,6 +615,33 @@ def vocabSize(self):
         """Vocabulary size (number of terms or terms in the vocabulary)"""
         return self.call("vocabSize")
 
+    def save(self, sc, path):
+        """Save the LDAModel on to disk.
+
+        :param sc: SparkContext
+        :param path: str, path to where the model needs to be stored.
+        """
+        if not isinstance(sc, SparkContext):
+            raise TypeError("sc should be a SparkContext, got type %s" % type(sc))
+        if not isinstance(path, basestring):
+            raise TypeError("path should be a basestring, got type %s" % type(path))
+        self._java_model.save(sc._jsc.sc(), path)
+
+    @classmethod
+    def load(cls, sc, path):
+        """Load the LDAModel from disk.
+
+        :param sc: SparkContext
+        :param path: str, path to where the model is stored.
+        """
+        if not isinstance(sc, SparkContext):
+            raise TypeError("sc should be a SparkContext, got type %s" % type(sc))
+        if not isinstance(path, basestring):
+            raise TypeError("path should be a basestring, got type %s" % type(path))
+        java_model = sc._jvm.org.apache.spark.mllib.clustering.DistributedLDAModel.load(
+            sc._jsc.sc(), path)
+        return cls(java_model)
+
 
 class LDA(object):
 

From a721ee52705100dbd7852f80f92cde4375517e48 Mon Sep 17 00:00:00 2001
From: martinzapletal <zapletal-martin@email.cz>
Date: Wed, 22 Jul 2015 17:35:05 -0700
Subject: [PATCH 327/549] [SPARK-8484] [ML] Added TrainValidationSplit for
 hyper-parameter tuning.

- [X] Added TrainValidationSplit for hyper-parameter tuning. It randomly splits the input dataset into train and validation and use evaluation metric on the validation set to select the best model. It should be similar to CrossValidator, but simpler and less expensive.
- [X] Simplified replacement of https://github.com/apache/spark/pull/6996

Author: martinzapletal <zapletal-martin@email.cz>

Closes #7337 from zapletal-martin/SPARK-8484-TrainValidationSplit and squashes the following commits:

cafc949 [martinzapletal] Review comments https://github.com/apache/spark/pull/7337.
511b398 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-8484-TrainValidationSplit
f4fc9c4 [martinzapletal] SPARK-8484 Resolved feedback to https://github.com/apache/spark/pull/7337
00c4f5a [martinzapletal] SPARK-8484. Styling.
d699506 [martinzapletal] SPARK-8484. Styling.
93ed2ee [martinzapletal] Styling.
3bc1853 [martinzapletal] SPARK-8484. Styling.
2aa6f43 [martinzapletal] SPARK-8484. Added TrainValidationSplit for hyper-parameter tuning. It randomly splits the input dataset into train and validation and use evaluation metric on the validation set to select the best model.
21662eb [martinzapletal] SPARK-8484. Added TrainValidationSplit for hyper-parameter tuning. It randomly splits the input dataset into train and validation and use evaluation metric on the validation set to select the best model.
---
 .../spark/ml/tuning/CrossValidator.scala      |  33 +---
 .../ml/tuning/TrainValidationSplit.scala      | 168 ++++++++++++++++++
 .../spark/ml/tuning/ValidatorParams.scala     |  60 +++++++
 .../ml/tuning/TrainValidationSplitSuite.scala | 139 +++++++++++++++
 4 files changed, 368 insertions(+), 32 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/tuning/ValidatorParams.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index e2444ab65b43b..f979319cc4b58 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -32,38 +32,7 @@ import org.apache.spark.sql.types.StructType
 /**
  * Params for [[CrossValidator]] and [[CrossValidatorModel]].
  */
-private[ml] trait CrossValidatorParams extends Params {
-
-  /**
-   * param for the estimator to be cross-validated
-   * @group param
-   */
-  val estimator: Param[Estimator[_]] = new Param(this, "estimator", "estimator for selection")
-
-  /** @group getParam */
-  def getEstimator: Estimator[_] = $(estimator)
-
-  /**
-   * param for estimator param maps
-   * @group param
-   */
-  val estimatorParamMaps: Param[Array[ParamMap]] =
-    new Param(this, "estimatorParamMaps", "param maps for the estimator")
-
-  /** @group getParam */
-  def getEstimatorParamMaps: Array[ParamMap] = $(estimatorParamMaps)
-
-  /**
-   * param for the evaluator used to select hyper-parameters that maximize the cross-validated
-   * metric
-   * @group param
-   */
-  val evaluator: Param[Evaluator] = new Param(this, "evaluator",
-    "evaluator used to select hyper-parameters that maximize the cross-validated metric")
-
-  /** @group getParam */
-  def getEvaluator: Evaluator = $(evaluator)
-
+private[ml] trait CrossValidatorParams extends ValidatorParams {
   /**
    * Param for number of folds for cross validation.  Must be >= 2.
    * Default: 3
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala
new file mode 100644
index 0000000000000..c0edc730b6fd6
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tuning
+
+import org.apache.spark.Logging
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.evaluation.Evaluator
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.param.{DoubleParam, ParamMap, ParamValidators}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Params for [[TrainValidationSplit]] and [[TrainValidationSplitModel]].
+ */
+private[ml] trait TrainValidationSplitParams extends ValidatorParams {
+  /**
+   * Param for ratio between train and validation data. Must be between 0 and 1.
+   * Default: 0.75
+   * @group param
+   */
+  val trainRatio: DoubleParam = new DoubleParam(this, "trainRatio",
+    "ratio between training set and validation set (>= 0 && <= 1)", ParamValidators.inRange(0, 1))
+
+  /** @group getParam */
+  def getTrainRatio: Double = $(trainRatio)
+
+  setDefault(trainRatio -> 0.75)
+}
+
+/**
+ * :: Experimental ::
+ * Validation for hyper-parameter tuning.
+ * Randomly splits the input dataset into train and validation sets,
+ * and uses evaluation metric on the validation set to select the best model.
+ * Similar to [[CrossValidator]], but only splits the set once.
+ */
+@Experimental
+class TrainValidationSplit(override val uid: String) extends Estimator[TrainValidationSplitModel]
+  with TrainValidationSplitParams with Logging {
+
+  def this() = this(Identifiable.randomUID("tvs"))
+
+  /** @group setParam */
+  def setEstimator(value: Estimator[_]): this.type = set(estimator, value)
+
+  /** @group setParam */
+  def setEstimatorParamMaps(value: Array[ParamMap]): this.type = set(estimatorParamMaps, value)
+
+  /** @group setParam */
+  def setEvaluator(value: Evaluator): this.type = set(evaluator, value)
+
+  /** @group setParam */
+  def setTrainRatio(value: Double): this.type = set(trainRatio, value)
+
+  override def fit(dataset: DataFrame): TrainValidationSplitModel = {
+    val schema = dataset.schema
+    transformSchema(schema, logging = true)
+    val sqlCtx = dataset.sqlContext
+    val est = $(estimator)
+    val eval = $(evaluator)
+    val epm = $(estimatorParamMaps)
+    val numModels = epm.length
+    val metrics = new Array[Double](epm.length)
+
+    val Array(training, validation) =
+      dataset.rdd.randomSplit(Array($(trainRatio), 1 - $(trainRatio)))
+    val trainingDataset = sqlCtx.createDataFrame(training, schema).cache()
+    val validationDataset = sqlCtx.createDataFrame(validation, schema).cache()
+
+    // multi-model training
+    logDebug(s"Train split with multiple sets of parameters.")
+    val models = est.fit(trainingDataset, epm).asInstanceOf[Seq[Model[_]]]
+    trainingDataset.unpersist()
+    var i = 0
+    while (i < numModels) {
+      // TODO: duplicate evaluator to take extra params from input
+      val metric = eval.evaluate(models(i).transform(validationDataset, epm(i)))
+      logDebug(s"Got metric $metric for model trained with ${epm(i)}.")
+      metrics(i) += metric
+      i += 1
+    }
+    validationDataset.unpersist()
+
+    logInfo(s"Train validation split metrics: ${metrics.toSeq}")
+    val (bestMetric, bestIndex) = metrics.zipWithIndex.maxBy(_._1)
+    logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
+    logInfo(s"Best train validation split metric: $bestMetric.")
+    val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
+    copyValues(new TrainValidationSplitModel(uid, bestModel, metrics).setParent(this))
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    $(estimator).transformSchema(schema)
+  }
+
+  override def validateParams(): Unit = {
+    super.validateParams()
+    val est = $(estimator)
+    for (paramMap <- $(estimatorParamMaps)) {
+      est.copy(paramMap).validateParams()
+    }
+  }
+
+  override def copy(extra: ParamMap): TrainValidationSplit = {
+    val copied = defaultCopy(extra).asInstanceOf[TrainValidationSplit]
+    if (copied.isDefined(estimator)) {
+      copied.setEstimator(copied.getEstimator.copy(extra))
+    }
+    if (copied.isDefined(evaluator)) {
+      copied.setEvaluator(copied.getEvaluator.copy(extra))
+    }
+    copied
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Model from train validation split.
+ *
+ * @param uid Id.
+ * @param bestModel Estimator determined best model.
+ * @param validationMetrics Evaluated validation metrics.
+ */
+@Experimental
+class TrainValidationSplitModel private[ml] (
+    override val uid: String,
+    val bestModel: Model[_],
+    val validationMetrics: Array[Double])
+  extends Model[TrainValidationSplitModel] with TrainValidationSplitParams {
+
+  override def validateParams(): Unit = {
+    bestModel.validateParams()
+  }
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    transformSchema(dataset.schema, logging = true)
+    bestModel.transform(dataset)
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    bestModel.transformSchema(schema)
+  }
+
+  override def copy(extra: ParamMap): TrainValidationSplitModel = {
+    val copied = new TrainValidationSplitModel (
+      uid,
+      bestModel.copy(extra).asInstanceOf[Model[_]],
+      validationMetrics.clone())
+    copyValues(copied, extra)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ValidatorParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ValidatorParams.scala
new file mode 100644
index 0000000000000..8897ab0825acd
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ValidatorParams.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tuning
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.ml.Estimator
+import org.apache.spark.ml.evaluation.Evaluator
+import org.apache.spark.ml.param.{ParamMap, Param, Params}
+
+/**
+ * :: DeveloperApi ::
+ * Common params for [[TrainValidationSplitParams]] and [[CrossValidatorParams]].
+ */
+@DeveloperApi
+private[ml] trait ValidatorParams extends Params {
+
+  /**
+   * param for the estimator to be validated
+   * @group param
+   */
+  val estimator: Param[Estimator[_]] = new Param(this, "estimator", "estimator for selection")
+
+  /** @group getParam */
+  def getEstimator: Estimator[_] = $(estimator)
+
+  /**
+   * param for estimator param maps
+   * @group param
+   */
+  val estimatorParamMaps: Param[Array[ParamMap]] =
+    new Param(this, "estimatorParamMaps", "param maps for the estimator")
+
+  /** @group getParam */
+  def getEstimatorParamMaps: Array[ParamMap] = $(estimatorParamMaps)
+
+  /**
+   * param for the evaluator used to select hyper-parameters that maximize the validated metric
+   * @group param
+   */
+  val evaluator: Param[Evaluator] = new Param(this, "evaluator",
+    "evaluator used to select hyper-parameters that maximize the validated metric")
+
+  /** @group getParam */
+  def getEvaluator: Evaluator = $(evaluator)
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
new file mode 100644
index 0000000000000..c8e58f216cceb
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tuning
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.classification.LogisticRegression
+import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
+import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.param.shared.HasInputCol
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInput
+import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.types.StructType
+
+class TrainValidationSplitSuite extends SparkFunSuite with MLlibTestSparkContext {
+  test("train validation with logistic regression") {
+    val dataset = sqlContext.createDataFrame(
+      sc.parallelize(generateLogisticInput(1.0, 1.0, 100, 42), 2))
+
+    val lr = new LogisticRegression
+    val lrParamMaps = new ParamGridBuilder()
+      .addGrid(lr.regParam, Array(0.001, 1000.0))
+      .addGrid(lr.maxIter, Array(0, 10))
+      .build()
+    val eval = new BinaryClassificationEvaluator
+    val cv = new TrainValidationSplit()
+      .setEstimator(lr)
+      .setEstimatorParamMaps(lrParamMaps)
+      .setEvaluator(eval)
+      .setTrainRatio(0.5)
+    val cvModel = cv.fit(dataset)
+    val parent = cvModel.bestModel.parent.asInstanceOf[LogisticRegression]
+    assert(cv.getTrainRatio === 0.5)
+    assert(parent.getRegParam === 0.001)
+    assert(parent.getMaxIter === 10)
+    assert(cvModel.validationMetrics.length === lrParamMaps.length)
+  }
+
+  test("train validation with linear regression") {
+    val dataset = sqlContext.createDataFrame(
+        sc.parallelize(LinearDataGenerator.generateLinearInput(
+            6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))
+
+    val trainer = new LinearRegression
+    val lrParamMaps = new ParamGridBuilder()
+      .addGrid(trainer.regParam, Array(1000.0, 0.001))
+      .addGrid(trainer.maxIter, Array(0, 10))
+      .build()
+    val eval = new RegressionEvaluator()
+    val cv = new TrainValidationSplit()
+      .setEstimator(trainer)
+      .setEstimatorParamMaps(lrParamMaps)
+      .setEvaluator(eval)
+      .setTrainRatio(0.5)
+    val cvModel = cv.fit(dataset)
+    val parent = cvModel.bestModel.parent.asInstanceOf[LinearRegression]
+    assert(parent.getRegParam === 0.001)
+    assert(parent.getMaxIter === 10)
+    assert(cvModel.validationMetrics.length === lrParamMaps.length)
+
+      eval.setMetricName("r2")
+    val cvModel2 = cv.fit(dataset)
+    val parent2 = cvModel2.bestModel.parent.asInstanceOf[LinearRegression]
+    assert(parent2.getRegParam === 0.001)
+    assert(parent2.getMaxIter === 10)
+    assert(cvModel2.validationMetrics.length === lrParamMaps.length)
+  }
+
+  test("validateParams should check estimatorParamMaps") {
+    import TrainValidationSplitSuite._
+
+    val est = new MyEstimator("est")
+    val eval = new MyEvaluator
+    val paramMaps = new ParamGridBuilder()
+      .addGrid(est.inputCol, Array("input1", "input2"))
+      .build()
+
+    val cv = new TrainValidationSplit()
+      .setEstimator(est)
+      .setEstimatorParamMaps(paramMaps)
+      .setEvaluator(eval)
+      .setTrainRatio(0.5)
+    cv.validateParams() // This should pass.
+
+    val invalidParamMaps = paramMaps :+ ParamMap(est.inputCol -> "")
+    cv.setEstimatorParamMaps(invalidParamMaps)
+    intercept[IllegalArgumentException] {
+      cv.validateParams()
+    }
+  }
+}
+
+object TrainValidationSplitSuite {
+
+  abstract class MyModel extends Model[MyModel]
+
+  class MyEstimator(override val uid: String) extends Estimator[MyModel] with HasInputCol {
+
+    override def validateParams(): Unit = require($(inputCol).nonEmpty)
+
+    override def fit(dataset: DataFrame): MyModel = {
+      throw new UnsupportedOperationException
+    }
+
+    override def transformSchema(schema: StructType): StructType = {
+      throw new UnsupportedOperationException
+    }
+
+    override def copy(extra: ParamMap): MyEstimator = defaultCopy(extra)
+  }
+
+  class MyEvaluator extends Evaluator {
+
+    override def evaluate(dataset: DataFrame): Double = {
+      throw new UnsupportedOperationException
+    }
+
+    override val uid: String = "eval"
+
+    override def copy(extra: ParamMap): MyEvaluator = defaultCopy(extra)
+  }
+}

From d71a13f475df2d05a7db9e25738d1353cbc8cfc7 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 22 Jul 2015 21:02:19 -0700
Subject: [PATCH 328/549] [SPARK-9262][build] Treat Scala compiler warnings as
 errors

I've seen a few cases in the past few weeks that the compiler is throwing warnings that are caused by legitimate bugs. This patch upgrades warnings to errors, except deprecation warnings.

Note that ideally we should be able to mark deprecation warnings as errors as well. However, due to the lack of ability to suppress individual warning messages in the Scala compiler, we cannot do that (since we do need to access deprecated APIs in Hadoop).

Most of the work are done by ericl.

Author: Reynold Xin <rxin@databricks.com>
Author: Eric Liang <ekl@databricks.com>

Closes #7598 from rxin/warnings and squashes the following commits:

beb311b [Reynold Xin] Fixed tests.
542c031 [Reynold Xin] Fixed one more warning.
87c354a [Reynold Xin] Fixed all non-deprecation warnings.
78660ac [Eric Liang] first effort to fix warnings
---
 .../apache/spark/api/r/RBackendHandler.scala  |  1 +
 .../org/apache/spark/rdd/CoGroupedRDD.scala   |  7 ++--
 .../org/apache/spark/util/JsonProtocol.scala  |  2 ++
 .../util/SerializableConfiguration.scala      |  2 --
 .../spark/util/SerializableJobConf.scala      |  2 --
 .../stat/test/KolmogorovSmirnovTest.scala     |  4 +--
 project/SparkBuild.scala                      | 33 ++++++++++++++++++-
 .../sql/catalyst/CatalystTypeConverters.scala |  3 +-
 .../apache/spark/sql/DataFrameWriter.scala    |  3 ++
 .../sql/execution/datasources/commands.scala  |  4 ++-
 .../spark/sql/hive/orc/OrcFilters.scala       |  6 ++--
 .../sql/sources/hadoopFsRelationSuites.scala  |  6 ++--
 12 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
index 9658e9a696ffa..a5de10fe89c42 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
@@ -20,6 +20,7 @@ package org.apache.spark.api.r
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
 
 import scala.collection.mutable.HashMap
+import scala.language.existentials
 
 import io.netty.channel.ChannelHandler.Sharable
 import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler}
diff --git a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
index 658e8c8b89318..130b58882d8ee 100644
--- a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala
@@ -94,13 +94,14 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[_ <: Product2[K, _]]], part:
   }
 
   override def getDependencies: Seq[Dependency[_]] = {
-    rdds.map { rdd: RDD[_ <: Product2[K, _]] =>
+    rdds.map { rdd: RDD[_] =>
       if (rdd.partitioner == Some(part)) {
         logDebug("Adding one-to-one dependency with " + rdd)
         new OneToOneDependency(rdd)
       } else {
         logDebug("Adding shuffle dependency with " + rdd)
-        new ShuffleDependency[K, Any, CoGroupCombiner](rdd, part, serializer)
+        new ShuffleDependency[K, Any, CoGroupCombiner](
+          rdd.asInstanceOf[RDD[_ <: Product2[K, _]]], part, serializer)
       }
     }
   }
@@ -133,7 +134,7 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[_ <: Product2[K, _]]], part:
     // A list of (rdd iterator, dependency number) pairs
     val rddIterators = new ArrayBuffer[(Iterator[Product2[K, Any]], Int)]
     for ((dep, depNum) <- dependencies.zipWithIndex) dep match {
-      case oneToOneDependency: OneToOneDependency[Product2[K, Any]] =>
+      case oneToOneDependency: OneToOneDependency[Product2[K, Any]] @unchecked =>
         val dependencyPartition = split.narrowDeps(depNum).get.split
         // Read them from the parent
         val it = oneToOneDependency.rdd.iterator(dependencyPartition, context)
diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
index a078f14af52a1..c600319d9ddb4 100644
--- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
@@ -94,6 +94,8 @@ private[spark] object JsonProtocol {
         logStartToJson(logStart)
       case metricsUpdate: SparkListenerExecutorMetricsUpdate =>
         executorMetricsUpdateToJson(metricsUpdate)
+      case blockUpdated: SparkListenerBlockUpdated =>
+        throw new MatchError(blockUpdated)  // TODO(ekl) implement this
     }
   }
 
diff --git a/core/src/main/scala/org/apache/spark/util/SerializableConfiguration.scala b/core/src/main/scala/org/apache/spark/util/SerializableConfiguration.scala
index 30bcf1d2f24d5..3354a923273ff 100644
--- a/core/src/main/scala/org/apache/spark/util/SerializableConfiguration.scala
+++ b/core/src/main/scala/org/apache/spark/util/SerializableConfiguration.scala
@@ -20,8 +20,6 @@ import java.io.{ObjectInputStream, ObjectOutputStream}
 
 import org.apache.hadoop.conf.Configuration
 
-import org.apache.spark.util.Utils
-
 private[spark]
 class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
   private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
diff --git a/core/src/main/scala/org/apache/spark/util/SerializableJobConf.scala b/core/src/main/scala/org/apache/spark/util/SerializableJobConf.scala
index afbcc6efc850c..cadae472b3f85 100644
--- a/core/src/main/scala/org/apache/spark/util/SerializableJobConf.scala
+++ b/core/src/main/scala/org/apache/spark/util/SerializableJobConf.scala
@@ -21,8 +21,6 @@ import java.io.{ObjectInputStream, ObjectOutputStream}
 
 import org.apache.hadoop.mapred.JobConf
 
-import org.apache.spark.util.Utils
-
 private[spark]
 class SerializableJobConf(@transient var value: JobConf) extends Serializable {
   private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
index d89b0059d83f3..2b3ed6df486c9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
@@ -20,7 +20,7 @@ package org.apache.spark.mllib.stat.test
 import scala.annotation.varargs
 
 import org.apache.commons.math3.distribution.{NormalDistribution, RealDistribution}
-import org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest
+import org.apache.commons.math3.stat.inference.{KolmogorovSmirnovTest => CommonMathKolmogorovSmirnovTest}
 
 import org.apache.spark.Logging
 import org.apache.spark.rdd.RDD
@@ -187,7 +187,7 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
   }
 
   private def evalOneSampleP(ksStat: Double, n: Long): KolmogorovSmirnovTestResult = {
-    val pval = 1 - new KolmogorovSmirnovTest().cdf(ksStat, n.toInt)
+    val pval = 1 - new CommonMathKolmogorovSmirnovTest().cdf(ksStat, n.toInt)
     new KolmogorovSmirnovTestResult(pval, ksStat, NullHypothesis.OneSampleTwoSided.toString)
   }
 }
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 12828547d7077..61a05d375d99e 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -154,7 +154,38 @@ object SparkBuild extends PomBuild {
       if (major.toInt >= 1 && minor.toInt >= 8) Seq("-Xdoclint:all", "-Xdoclint:-missing") else Seq.empty
     },
 
-    javacOptions in Compile ++= Seq("-encoding", "UTF-8")
+    javacOptions in Compile ++= Seq("-encoding", "UTF-8"),
+
+    // Implements -Xfatal-warnings, ignoring deprecation warnings.
+    // Code snippet taken from https://issues.scala-lang.org/browse/SI-8410.
+    compile in Compile := {
+      val analysis = (compile in Compile).value
+      val s = streams.value
+
+      def logProblem(l: (=> String) => Unit, f: File, p: xsbti.Problem) = {
+        l(f.toString + ":" + p.position.line.fold("")(_ + ":") + " " + p.message)
+        l(p.position.lineContent)
+        l("")
+      }
+
+      var failed = 0
+      analysis.infos.allInfos.foreach { case (k, i) =>
+        i.reportedProblems foreach { p =>
+          val deprecation = p.message.contains("is deprecated")
+
+          if (!deprecation) {
+            failed = failed + 1
+          }
+
+          logProblem(if (deprecation) s.log.warn else s.log.error, k, p)
+        }
+      }
+
+      if (failed > 0) {
+        sys.error(s"$failed fatal warnings")
+      }
+      analysis
+    }
   )
 
   def enable(settings: Seq[Setting[_]])(projectRef: ProjectRef) = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 8f63d2120ad0e..ae0ab2f4c63f5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -24,6 +24,7 @@ import java.util.{Map => JavaMap}
 import javax.annotation.Nullable
 
 import scala.collection.mutable.HashMap
+import scala.language.existentials
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
@@ -401,7 +402,7 @@ object CatalystTypeConverters {
     case seq: Seq[Any] => seq.map(convertToCatalyst)
     case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*)
     case arr: Array[Any] => arr.toSeq.map(convertToCatalyst).toArray
-    case m: Map[Any, Any] =>
+    case m: Map[_, _] =>
       m.map { case (k, v) => (convertToCatalyst(k), convertToCatalyst(v)) }.toMap
     case other => other
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index ee0201a9d4cb2..05da05d7b8050 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -197,6 +197,9 @@ final class DataFrameWriter private[sql](df: DataFrame) {
           // the table. But, insertInto with Overwrite requires the schema of data be the same
           // the schema of the table.
           insertInto(tableName)
+
+        case SaveMode.Overwrite =>
+          throw new UnsupportedOperationException("overwrite mode unsupported.")
       }
     } else {
       val cmd =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala
index 84a0441e145c5..cd2aa7f7433c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala
@@ -100,7 +100,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
     val pathExists = fs.exists(qualifiedOutputPath)
     val doInsertion = (mode, pathExists) match {
       case (SaveMode.ErrorIfExists, true) =>
-        sys.error(s"path $qualifiedOutputPath already exists.")
+        throw new AnalysisException(s"path $qualifiedOutputPath already exists.")
       case (SaveMode.Overwrite, true) =>
         fs.delete(qualifiedOutputPath, true)
         true
@@ -108,6 +108,8 @@ private[sql] case class InsertIntoHadoopFsRelation(
         true
       case (SaveMode.Ignore, exists) =>
         !exists
+      case (s, exists) =>
+        throw new IllegalStateException(s"unsupported save mode $s ($exists)")
     }
     // If we are appending data to an existing dir.
     val isAppend = pathExists && (mode == SaveMode.Append)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
index 250e73a4dba92..ddd5d24717add 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
@@ -41,10 +41,10 @@ private[orc] object OrcFilters extends Logging {
   private def buildSearchArgument(expression: Filter, builder: Builder): Option[Builder] = {
     def newBuilder = SearchArgument.FACTORY.newBuilder()
 
-    def isSearchableLiteral(value: Any) = value match {
+    def isSearchableLiteral(value: Any): Boolean = value match {
       // These are types recognized by the `SearchArgumentImpl.BuilderImpl.boxLiteral()` method.
-      case _: String | _: Long | _: Double | _: DateWritable | _: HiveDecimal | _: HiveChar |
-           _: HiveVarchar | _: Byte | _: Short | _: Integer | _: Float => true
+      case _: String | _: Long | _: Double | _: Byte | _: Short | _: Integer | _: Float => true
+      case _: DateWritable | _: HiveDecimal | _: HiveChar | _: HiveVarchar => true
       case _ => false
     }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index 1cef83fd5e990..2a8748d913569 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -134,7 +134,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
 
   test("save()/load() - non-partitioned table - ErrorIfExists") {
     withTempDir { file =>
-      intercept[RuntimeException] {
+      intercept[AnalysisException] {
         testDF.write.format(dataSourceName).mode(SaveMode.ErrorIfExists).save(file.getCanonicalPath)
       }
     }
@@ -233,7 +233,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
 
   test("save()/load() - partitioned table - ErrorIfExists") {
     withTempDir { file =>
-      intercept[RuntimeException] {
+      intercept[AnalysisException] {
         partitionedTestDF.write
           .format(dataSourceName)
           .mode(SaveMode.ErrorIfExists)
@@ -696,7 +696,7 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
       // This should only complain that the destination directory already exists, rather than file
       // "empty" is not a Parquet file.
       assert {
-        intercept[RuntimeException] {
+        intercept[AnalysisException] {
           df.write.format("parquet").mode(SaveMode.ErrorIfExists).save(path)
         }.getMessage.contains("already exists")
       }

From b217230f2a96c6d5a0554c593bdf1d1374878688 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Wed, 22 Jul 2015 21:04:04 -0700
Subject: [PATCH 329/549] [SPARK-9144] Remove
 DAGScheduler.runLocallyWithinThread and spark.localExecution.enabled

Spark has an option called spark.localExecution.enabled; according to the docs:

> Enables Spark to run certain jobs, such as first() or take() on the driver, without sending tasks to the cluster. This can make certain jobs execute very quickly, but may require shipping a whole partition of data to the driver.

This feature ends up adding quite a bit of complexity to DAGScheduler, especially in the runLocallyWithinThread method, but as far as I know nobody uses this feature (I searched the mailing list and haven't seen any recent mentions of the configuration nor stacktraces including the runLocally method). As a step towards scheduler complexity reduction, I propose that we remove this feature and all code related to it for Spark 1.5.

This pull request simply brings #7484 up to date.

Author: Josh Rosen <joshrosen@databricks.com>
Author: Reynold Xin <rxin@databricks.com>

Closes #7585 from rxin/remove-local-exec and squashes the following commits:

84bd10e [Reynold Xin] Python fix.
1d9739a [Reynold Xin] Merge pull request #7484 from JoshRosen/remove-localexecution
eec39fa [Josh Rosen] Remove allowLocal(); deprecate user-facing uses of it.
b0835dc [Josh Rosen] Remove local execution code in DAGScheduler
8975d96 [Josh Rosen] Remove local execution tests.
ffa8c9b [Josh Rosen] Remove documentation for configuration
---
 .../scala/org/apache/spark/SparkContext.scala |  86 ++++++++++---
 .../apache/spark/api/java/JavaRDDLike.scala   |   2 +-
 .../apache/spark/api/python/PythonRDD.scala   |   5 +-
 .../org/apache/spark/executor/Executor.scala  |   2 -
 .../apache/spark/rdd/PairRDDFunctions.scala   |   2 +-
 .../main/scala/org/apache/spark/rdd/RDD.scala |   4 +-
 .../apache/spark/rdd/ZippedWithIndexRDD.scala |   3 +-
 .../apache/spark/scheduler/DAGScheduler.scala | 117 +++---------------
 .../spark/scheduler/DAGSchedulerEvent.scala   |   1 -
 .../scala/org/apache/spark/rdd/RDDSuite.scala |   2 +-
 .../spark/scheduler/DAGSchedulerSuite.scala   |  83 ++-----------
 .../OutputCommitCoordinatorSuite.scala        |   4 +-
 .../spark/scheduler/SparkListenerSuite.scala  |   2 +-
 docs/configuration.md                         |   9 --
 .../spark/streaming/kafka/KafkaRDD.scala      |   3 +-
 .../apache/spark/mllib/rdd/SlidingRDD.scala   |   2 +-
 python/pyspark/context.py                     |   3 +-
 python/pyspark/rdd.py                         |   4 +-
 .../spark/sql/execution/SparkPlan.scala       |   3 +-
 19 files changed, 108 insertions(+), 229 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 4976e5eb49468..6a6b94a271cfc 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1758,16 +1758,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
 
   /**
    * Run a function on a given set of partitions in an RDD and pass the results to the given
-   * handler function. This is the main entry point for all actions in Spark. The allowLocal
-   * flag specifies whether the scheduler can run the computation on the driver rather than
-   * shipping it out to the cluster, for short actions like first().
+   * handler function. This is the main entry point for all actions in Spark.
    */
   def runJob[T, U: ClassTag](
       rdd: RDD[T],
       func: (TaskContext, Iterator[T]) => U,
       partitions: Seq[Int],
-      allowLocal: Boolean,
-      resultHandler: (Int, U) => Unit) {
+      resultHandler: (Int, U) => Unit): Unit = {
     if (stopped.get()) {
       throw new IllegalStateException("SparkContext has been shutdown")
     }
@@ -1777,54 +1774,104 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     if (conf.getBoolean("spark.logLineage", false)) {
       logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
     }
-    dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal,
-      resultHandler, localProperties.get)
+    dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
     progressBar.foreach(_.finishAll())
     rdd.doCheckpoint()
   }
 
   /**
-   * Run a function on a given set of partitions in an RDD and return the results as an array. The
-   * allowLocal flag specifies whether the scheduler can run the computation on the driver rather
-   * than shipping it out to the cluster, for short actions like first().
+   * Run a function on a given set of partitions in an RDD and return the results as an array.
+   */
+  def runJob[T, U: ClassTag](
+      rdd: RDD[T],
+      func: (TaskContext, Iterator[T]) => U,
+      partitions: Seq[Int]): Array[U] = {
+    val results = new Array[U](partitions.size)
+    runJob[T, U](rdd, func, partitions, (index, res) => results(index) = res)
+    results
+  }
+
+  /**
+   * Run a job on a given set of partitions of an RDD, but take a function of type
+   * `Iterator[T] => U` instead of `(TaskContext, Iterator[T]) => U`.
+   */
+  def runJob[T, U: ClassTag](
+      rdd: RDD[T],
+      func: Iterator[T] => U,
+      partitions: Seq[Int]): Array[U] = {
+    val cleanedFunc = clean(func)
+    runJob(rdd, (ctx: TaskContext, it: Iterator[T]) => cleanedFunc(it), partitions)
+  }
+
+
+  /**
+   * Run a function on a given set of partitions in an RDD and pass the results to the given
+   * handler function. This is the main entry point for all actions in Spark.
+   *
+   * The allowLocal flag is deprecated as of Spark 1.5.0+.
+   */
+  @deprecated("use the version of runJob without the allowLocal parameter", "1.5.0")
+  def runJob[T, U: ClassTag](
+      rdd: RDD[T],
+      func: (TaskContext, Iterator[T]) => U,
+      partitions: Seq[Int],
+      allowLocal: Boolean,
+      resultHandler: (Int, U) => Unit): Unit = {
+    if (allowLocal) {
+      logWarning("sc.runJob with allowLocal=true is deprecated in Spark 1.5.0+")
+    }
+    runJob(rdd, func, partitions, resultHandler)
+  }
+
+  /**
+   * Run a function on a given set of partitions in an RDD and return the results as an array.
+   *
+   * The allowLocal flag is deprecated as of Spark 1.5.0+.
    */
+  @deprecated("use the version of runJob without the allowLocal parameter", "1.5.0")
   def runJob[T, U: ClassTag](
       rdd: RDD[T],
       func: (TaskContext, Iterator[T]) => U,
       partitions: Seq[Int],
       allowLocal: Boolean
       ): Array[U] = {
-    val results = new Array[U](partitions.size)
-    runJob[T, U](rdd, func, partitions, allowLocal, (index, res) => results(index) = res)
-    results
+    if (allowLocal) {
+      logWarning("sc.runJob with allowLocal=true is deprecated in Spark 1.5.0+")
+    }
+    runJob(rdd, func, partitions)
   }
 
   /**
    * Run a job on a given set of partitions of an RDD, but take a function of type
    * `Iterator[T] => U` instead of `(TaskContext, Iterator[T]) => U`.
+   *
+   * The allowLocal argument is deprecated as of Spark 1.5.0+.
    */
+  @deprecated("use the version of runJob without the allowLocal parameter", "1.5.0")
   def runJob[T, U: ClassTag](
       rdd: RDD[T],
       func: Iterator[T] => U,
       partitions: Seq[Int],
       allowLocal: Boolean
       ): Array[U] = {
-    val cleanedFunc = clean(func)
-    runJob(rdd, (ctx: TaskContext, it: Iterator[T]) => cleanedFunc(it), partitions, allowLocal)
+    if (allowLocal) {
+      logWarning("sc.runJob with allowLocal=true is deprecated in Spark 1.5.0+")
+    }
+    runJob(rdd, func, partitions)
   }
 
   /**
    * Run a job on all partitions in an RDD and return the results in an array.
    */
   def runJob[T, U: ClassTag](rdd: RDD[T], func: (TaskContext, Iterator[T]) => U): Array[U] = {
-    runJob(rdd, func, 0 until rdd.partitions.size, false)
+    runJob(rdd, func, 0 until rdd.partitions.length)
   }
 
   /**
    * Run a job on all partitions in an RDD and return the results in an array.
    */
   def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
-    runJob(rdd, func, 0 until rdd.partitions.size, false)
+    runJob(rdd, func, 0 until rdd.partitions.length)
   }
 
   /**
@@ -1835,7 +1882,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     processPartition: (TaskContext, Iterator[T]) => U,
     resultHandler: (Int, U) => Unit)
   {
-    runJob[T, U](rdd, processPartition, 0 until rdd.partitions.size, false, resultHandler)
+    runJob[T, U](rdd, processPartition, 0 until rdd.partitions.length, resultHandler)
   }
 
   /**
@@ -1847,7 +1894,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       resultHandler: (Int, U) => Unit)
   {
     val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter)
-    runJob[T, U](rdd, processFunc, 0 until rdd.partitions.size, false, resultHandler)
+    runJob[T, U](rdd, processFunc, 0 until rdd.partitions.length, resultHandler)
   }
 
   /**
@@ -1892,7 +1939,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       (context: TaskContext, iter: Iterator[T]) => cleanF(iter),
       partitions,
       callSite,
-      allowLocal = false,
       resultHandler,
       localProperties.get)
     new SimpleFutureAction(waiter, resultFunc)
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
index c95615a5a9307..829fae1d1d9bf 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -364,7 +364,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
     // This is useful for implementing `take` from other language frontends
     // like Python where the data is serialized.
     import scala.collection.JavaConversions._
-    val res = context.runJob(rdd, (it: Iterator[T]) => it.toArray, partitionIds, true)
+    val res = context.runJob(rdd, (it: Iterator[T]) => it.toArray, partitionIds)
     res.map(x => new java.util.ArrayList(x.toSeq)).toArray
   }
 
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index dc9f62f39e6d5..598953ac3bcc8 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -358,12 +358,11 @@ private[spark] object PythonRDD extends Logging {
   def runJob(
       sc: SparkContext,
       rdd: JavaRDD[Array[Byte]],
-      partitions: JArrayList[Int],
-      allowLocal: Boolean): Int = {
+      partitions: JArrayList[Int]): Int = {
     type ByteArray = Array[Byte]
     type UnrolledPartition = Array[ByteArray]
     val allPartitions: Array[UnrolledPartition] =
-      sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions, allowLocal)
+      sc.runJob(rdd, (x: Iterator[ByteArray]) => x.toArray, partitions)
     val flattenedPartition: UnrolledPartition = Array.concat(allPartitions: _*)
     serveIterator(flattenedPartition.iterator,
       s"serve RDD ${rdd.id} with partitions ${partitions.mkString(",")}")
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 66624ffbe4790..581b40003c6c4 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -215,8 +215,6 @@ private[spark] class Executor(
             attemptNumber = attemptNumber,
             metricsSystem = env.metricsSystem)
         } finally {
-          // Note: this memory freeing logic is duplicated in DAGScheduler.runLocallyWithinThread;
-          // when changing this, make sure to update both copies.
           val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory()
           if (freedMemory > 0) {
             val errMsg = s"Managed memory leak detected; size = $freedMemory bytes, TID = $taskId"
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index 91a6a2d039852..326fafb230a40 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -881,7 +881,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
           }
           buf
         } : Seq[V]
-        val res = self.context.runJob(self, process, Array(index), false)
+        val res = self.context.runJob(self, process, Array(index))
         res(0)
       case None =>
         self.filter(_._1 == key).map(_._2).collect()
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 9f7ebae3e9af3..394c6686cbabd 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -897,7 +897,7 @@ abstract class RDD[T: ClassTag](
    */
   def toLocalIterator: Iterator[T] = withScope {
     def collectPartition(p: Int): Array[T] = {
-      sc.runJob(this, (iter: Iterator[T]) => iter.toArray, Seq(p), allowLocal = false).head
+      sc.runJob(this, (iter: Iterator[T]) => iter.toArray, Seq(p)).head
     }
     (0 until partitions.length).iterator.flatMap(i => collectPartition(i))
   }
@@ -1273,7 +1273,7 @@ abstract class RDD[T: ClassTag](
 
         val left = num - buf.size
         val p = partsScanned until math.min(partsScanned + numPartsToTry, totalParts)
-        val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p, allowLocal = true)
+        val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p)
 
         res.foreach(buf ++= _.take(num - buf.size))
         partsScanned += numPartsToTry
diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala
index 523aaf2b860b5..e277ae28d588f 100644
--- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala
@@ -50,8 +50,7 @@ class ZippedWithIndexRDD[T: ClassTag](@transient prev: RDD[T]) extends RDD[(T, L
       prev.context.runJob(
         prev,
         Utils.getIteratorSize _,
-        0 until n - 1, // do not need to count the last partition
-        allowLocal = false
+        0 until n - 1 // do not need to count the last partition
       ).scanLeft(0L)(_ + _)
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index b829d06923404..552dabcfa5139 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -38,7 +38,6 @@ import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator
 import org.apache.spark.rdd.RDD
 import org.apache.spark.rpc.RpcTimeout
 import org.apache.spark.storage._
-import org.apache.spark.unsafe.memory.TaskMemoryManager
 import org.apache.spark.util._
 import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat
 
@@ -128,10 +127,6 @@ class DAGScheduler(
   // This is only safe because DAGScheduler runs in a single thread.
   private val closureSerializer = SparkEnv.get.closureSerializer.newInstance()
 
-
-  /** If enabled, we may run certain actions like take() and first() locally. */
-  private val localExecutionEnabled = sc.getConf.getBoolean("spark.localExecution.enabled", false)
-
   /** If enabled, FetchFailed will not cause stage retry, in order to surface the problem. */
   private val disallowStageRetryForTest = sc.getConf.getBoolean("spark.test.noStageRetry", false)
 
@@ -515,7 +510,6 @@ class DAGScheduler(
       func: (TaskContext, Iterator[T]) => U,
       partitions: Seq[Int],
       callSite: CallSite,
-      allowLocal: Boolean,
       resultHandler: (Int, U) => Unit,
       properties: Properties): JobWaiter[U] = {
     // Check to make sure we are not launching a task on a partition that does not exist.
@@ -535,7 +529,7 @@ class DAGScheduler(
     val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
     val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
     eventProcessLoop.post(JobSubmitted(
-      jobId, rdd, func2, partitions.toArray, allowLocal, callSite, waiter,
+      jobId, rdd, func2, partitions.toArray, callSite, waiter,
       SerializationUtils.clone(properties)))
     waiter
   }
@@ -545,11 +539,10 @@ class DAGScheduler(
       func: (TaskContext, Iterator[T]) => U,
       partitions: Seq[Int],
       callSite: CallSite,
-      allowLocal: Boolean,
       resultHandler: (Int, U) => Unit,
       properties: Properties): Unit = {
     val start = System.nanoTime
-    val waiter = submitJob(rdd, func, partitions, callSite, allowLocal, resultHandler, properties)
+    val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
     waiter.awaitResult() match {
       case JobSucceeded =>
         logInfo("Job %d finished: %s, took %f s".format
@@ -576,8 +569,7 @@ class DAGScheduler(
     val partitions = (0 until rdd.partitions.size).toArray
     val jobId = nextJobId.getAndIncrement()
     eventProcessLoop.post(JobSubmitted(
-      jobId, rdd, func2, partitions, allowLocal = false, callSite, listener,
-      SerializationUtils.clone(properties)))
+      jobId, rdd, func2, partitions, callSite, listener, SerializationUtils.clone(properties)))
     listener.awaitResult()    // Will throw an exception if the job fails
   }
 
@@ -654,74 +646,6 @@ class DAGScheduler(
     }
   }
 
-  /**
-   * Run a job on an RDD locally, assuming it has only a single partition and no dependencies.
-   * We run the operation in a separate thread just in case it takes a bunch of time, so that we
-   * don't block the DAGScheduler event loop or other concurrent jobs.
-   */
-  protected def runLocally(job: ActiveJob) {
-    logInfo("Computing the requested partition locally")
-    new Thread("Local computation of job " + job.jobId) {
-      override def run() {
-        runLocallyWithinThread(job)
-      }
-    }.start()
-  }
-
-  // Broken out for easier testing in DAGSchedulerSuite.
-  protected def runLocallyWithinThread(job: ActiveJob) {
-    var jobResult: JobResult = JobSucceeded
-    try {
-      val rdd = job.finalStage.rdd
-      val split = rdd.partitions(job.partitions(0))
-      val taskMemoryManager = new TaskMemoryManager(env.executorMemoryManager)
-      val taskContext =
-        new TaskContextImpl(
-          job.finalStage.id,
-          job.partitions(0),
-          taskAttemptId = 0,
-          attemptNumber = 0,
-          taskMemoryManager = taskMemoryManager,
-          metricsSystem = env.metricsSystem,
-          runningLocally = true)
-      TaskContext.setTaskContext(taskContext)
-      try {
-        val result = job.func(taskContext, rdd.iterator(split, taskContext))
-        job.listener.taskSucceeded(0, result)
-      } finally {
-        taskContext.markTaskCompleted()
-        TaskContext.unset()
-        // Note: this memory freeing logic is duplicated in Executor.run(); when changing this,
-        // make sure to update both copies.
-        val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory()
-        if (freedMemory > 0) {
-          if (sc.getConf.getBoolean("spark.unsafe.exceptionOnMemoryLeak", false)) {
-            throw new SparkException(s"Managed memory leak detected; size = $freedMemory bytes")
-          } else {
-            logError(s"Managed memory leak detected; size = $freedMemory bytes")
-          }
-        }
-      }
-    } catch {
-      case e: Exception =>
-        val exception = new SparkDriverExecutionException(e)
-        jobResult = JobFailed(exception)
-        job.listener.jobFailed(exception)
-      case oom: OutOfMemoryError =>
-        val exception = new SparkException("Local job aborted due to out of memory error", oom)
-        jobResult = JobFailed(exception)
-        job.listener.jobFailed(exception)
-    } finally {
-      val s = job.finalStage
-      // clean up data structures that were populated for a local job,
-      // but that won't get cleaned up via the normal paths through
-      // completion events or stage abort
-      stageIdToStage -= s.id
-      jobIdToStageIds -= job.jobId
-      listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), jobResult))
-    }
-  }
-
   /** Finds the earliest-created active job that needs the stage */
   // TODO: Probably should actually find among the active jobs that need this
   // stage the one with the highest priority (highest-priority pool, earliest created).
@@ -784,7 +708,6 @@ class DAGScheduler(
       finalRDD: RDD[_],
       func: (TaskContext, Iterator[_]) => _,
       partitions: Array[Int],
-      allowLocal: Boolean,
       callSite: CallSite,
       listener: JobListener,
       properties: Properties) {
@@ -802,29 +725,20 @@ class DAGScheduler(
     if (finalStage != null) {
       val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
       clearCacheLocs()
-      logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
-        job.jobId, callSite.shortForm, partitions.length, allowLocal))
+      logInfo("Got job %s (%s) with %d output partitions".format(
+        job.jobId, callSite.shortForm, partitions.length))
       logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
       logInfo("Parents of final stage: " + finalStage.parents)
       logInfo("Missing parents: " + getMissingParentStages(finalStage))
-      val shouldRunLocally =
-        localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
       val jobSubmissionTime = clock.getTimeMillis()
-      if (shouldRunLocally) {
-        // Compute very short actions like first() or take() with no parent stages locally.
-        listenerBus.post(
-          SparkListenerJobStart(job.jobId, jobSubmissionTime, Seq.empty, properties))
-        runLocally(job)
-      } else {
-        jobIdToActiveJob(jobId) = job
-        activeJobs += job
-        finalStage.resultOfJob = Some(job)
-        val stageIds = jobIdToStageIds(jobId).toArray
-        val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
-        listenerBus.post(
-          SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
-        submitStage(finalStage)
-      }
+      jobIdToActiveJob(jobId) = job
+      activeJobs += job
+      finalStage.resultOfJob = Some(job)
+      val stageIds = jobIdToStageIds(jobId).toArray
+      val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
+      listenerBus.post(
+        SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
+      submitStage(finalStage)
     }
     submitWaitingStages()
   }
@@ -1486,9 +1400,8 @@ private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler
   }
 
   private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
-    case JobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite, listener, properties) =>
-      dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite,
-        listener, properties)
+    case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
+      dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
 
     case StageCancelled(stageId) =>
       dagScheduler.handleStageCancellation(stageId)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala
index a927eae2b04be..a213d419cf033 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala
@@ -40,7 +40,6 @@ private[scheduler] case class JobSubmitted(
     finalRDD: RDD[_],
     func: (TaskContext, Iterator[_]) => _,
     partitions: Array[Int],
-    allowLocal: Boolean,
     callSite: CallSite,
     listener: JobListener,
     properties: Properties = null)
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
index f6da9f98ad253..5f718ea9f7be1 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -679,7 +679,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext {
 
   test("runJob on an invalid partition") {
     intercept[IllegalArgumentException] {
-      sc.runJob(sc.parallelize(1 to 10, 2), {iter: Iterator[Int] => iter.size}, Seq(0, 1, 2), false)
+      sc.runJob(sc.parallelize(1 to 10, 2), {iter: Iterator[Int] => iter.size}, Seq(0, 1, 2))
     }
   }
 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index 3462a82c9cdd3..86dff8fb577d5 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -153,9 +153,7 @@ class DAGSchedulerSuite
   }
 
   before {
-    // Enable local execution for this test
-    val conf = new SparkConf().set("spark.localExecution.enabled", "true")
-    sc = new SparkContext("local", "DAGSchedulerSuite", conf)
+    sc = new SparkContext("local", "DAGSchedulerSuite")
     sparkListener.submittedStageInfos.clear()
     sparkListener.successfulStages.clear()
     sparkListener.failedStages.clear()
@@ -172,12 +170,7 @@ class DAGSchedulerSuite
         sc.listenerBus,
         mapOutputTracker,
         blockManagerMaster,
-        sc.env) {
-      override def runLocally(job: ActiveJob) {
-        // don't bother with the thread while unit testing
-        runLocallyWithinThread(job)
-      }
-    }
+        sc.env)
     dagEventProcessLoopTester = new DAGSchedulerEventProcessLoopTester(scheduler)
   }
 
@@ -241,10 +234,9 @@ class DAGSchedulerSuite
       rdd: RDD[_],
       partitions: Array[Int],
       func: (TaskContext, Iterator[_]) => _ = jobComputeFunc,
-      allowLocal: Boolean = false,
       listener: JobListener = jobListener): Int = {
     val jobId = scheduler.nextJobId.getAndIncrement()
-    runEvent(JobSubmitted(jobId, rdd, func, partitions, allowLocal, CallSite("", ""), listener))
+    runEvent(JobSubmitted(jobId, rdd, func, partitions, CallSite("", ""), listener))
     jobId
   }
 
@@ -284,37 +276,6 @@ class DAGSchedulerSuite
     assertDataStructuresEmpty()
   }
 
-  test("local job") {
-    val rdd = new PairOfIntsRDD(sc, Nil) {
-      override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] =
-        Array(42 -> 0).iterator
-      override def getPartitions: Array[Partition] =
-        Array( new Partition { override def index: Int = 0 } )
-      override def getPreferredLocations(split: Partition): List[String] = Nil
-      override def toString: String = "DAGSchedulerSuite Local RDD"
-    }
-    val jobId = scheduler.nextJobId.getAndIncrement()
-    runEvent(
-      JobSubmitted(jobId, rdd, jobComputeFunc, Array(0), true, CallSite("", ""), jobListener))
-    assert(results === Map(0 -> 42))
-    assertDataStructuresEmpty()
-  }
-
-  test("local job oom") {
-    val rdd = new PairOfIntsRDD(sc, Nil) {
-      override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] =
-        throw new java.lang.OutOfMemoryError("test local job oom")
-      override def getPartitions = Array( new Partition { override def index = 0 } )
-      override def getPreferredLocations(split: Partition) = Nil
-      override def toString = "DAGSchedulerSuite Local RDD"
-    }
-    val jobId = scheduler.nextJobId.getAndIncrement()
-    runEvent(
-      JobSubmitted(jobId, rdd, jobComputeFunc, Array(0), true, CallSite("", ""), jobListener))
-    assert(results.size == 0)
-    assertDataStructuresEmpty()
-  }
-
   test("run trivial job w/ dependency") {
     val baseRdd = new MyRDD(sc, 1, Nil)
     val finalRdd = new MyRDD(sc, 1, List(new OneToOneDependency(baseRdd)))
@@ -452,12 +413,7 @@ class DAGSchedulerSuite
       sc.listenerBus,
       mapOutputTracker,
       blockManagerMaster,
-      sc.env) {
-      override def runLocally(job: ActiveJob) {
-        // don't bother with the thread while unit testing
-        runLocallyWithinThread(job)
-      }
-    }
+      sc.env)
     dagEventProcessLoopTester = new DAGSchedulerEventProcessLoopTester(noKillScheduler)
     val jobId = submit(new MyRDD(sc, 1, Nil), Array(0))
     cancel(jobId)
@@ -889,40 +845,23 @@ class DAGSchedulerSuite
     // Run this on executors
     sc.parallelize(1 to 10, 2).foreach { item => acc.add(1) }
 
-    // Run this within a local thread
-    sc.parallelize(1 to 10, 2).map { item => acc.add(1) }.take(1)
-
-    // Make sure we can still run local commands as well as cluster commands.
+    // Make sure we can still run commands
     assert(sc.parallelize(1 to 10, 2).count() === 10)
-    assert(sc.parallelize(1 to 10, 2).first() === 1)
   }
 
   test("misbehaved resultHandler should not crash DAGScheduler and SparkContext") {
-    val e1 = intercept[SparkDriverExecutionException] {
-      val rdd = sc.parallelize(1 to 10, 2)
-      sc.runJob[Int, Int](
-        rdd,
-        (context: TaskContext, iter: Iterator[Int]) => iter.size,
-        Seq(0),
-        allowLocal = true,
-        (part: Int, result: Int) => throw new DAGSchedulerSuiteDummyException)
-    }
-    assert(e1.getCause.isInstanceOf[DAGSchedulerSuiteDummyException])
-
-    val e2 = intercept[SparkDriverExecutionException] {
+    val e = intercept[SparkDriverExecutionException] {
       val rdd = sc.parallelize(1 to 10, 2)
       sc.runJob[Int, Int](
         rdd,
         (context: TaskContext, iter: Iterator[Int]) => iter.size,
         Seq(0, 1),
-        allowLocal = false,
         (part: Int, result: Int) => throw new DAGSchedulerSuiteDummyException)
     }
-    assert(e2.getCause.isInstanceOf[DAGSchedulerSuiteDummyException])
+    assert(e.getCause.isInstanceOf[DAGSchedulerSuiteDummyException])
 
-    // Make sure we can still run local commands as well as cluster commands.
+    // Make sure we can still run commands
     assert(sc.parallelize(1 to 10, 2).count() === 10)
-    assert(sc.parallelize(1 to 10, 2).first() === 1)
   }
 
   test("getPartitions exceptions should not crash DAGScheduler and SparkContext (SPARK-8606)") {
@@ -935,9 +874,8 @@ class DAGSchedulerSuite
       rdd.reduceByKey(_ + _, 1).count()
     }
 
-    // Make sure we can still run local commands as well as cluster commands.
+    // Make sure we can still run commands
     assert(sc.parallelize(1 to 10, 2).count() === 10)
-    assert(sc.parallelize(1 to 10, 2).first() === 1)
   }
 
   test("getPreferredLocations errors should not crash DAGScheduler and SparkContext (SPARK-8606)") {
@@ -951,9 +889,8 @@ class DAGSchedulerSuite
     }
     assert(e1.getMessage.contains(classOf[DAGSchedulerSuiteDummyException].getName))
 
-    // Make sure we can still run local commands as well as cluster commands.
+    // Make sure we can still run commands
     assert(sc.parallelize(1 to 10, 2).count() === 10)
-    assert(sc.parallelize(1 to 10, 2).first() === 1)
   }
 
   test("accumulator not calculated for resubmitted result stage") {
diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
index a9036da9cc93d..e5ecd4b7c2610 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
@@ -134,14 +134,14 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter {
   test("Only one of two duplicate commit tasks should commit") {
     val rdd = sc.parallelize(Seq(1), 1)
     sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).commitSuccessfully _,
-      0 until rdd.partitions.size, allowLocal = false)
+      0 until rdd.partitions.size)
     assert(tempDir.list().size === 1)
   }
 
   test("If commit fails, if task is retried it should not be locked, and will succeed.") {
     val rdd = sc.parallelize(Seq(1), 1)
     sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).failFirstCommitAttempt _,
-      0 until rdd.partitions.size, allowLocal = false)
+      0 until rdd.partitions.size)
     assert(tempDir.list().size === 1)
   }
 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
index 651295b7344c5..730535ece7878 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala
@@ -188,7 +188,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match
     sc.addSparkListener(listener)
     val rdd1 = sc.parallelize(1 to 100, 4)
     val rdd2 = rdd1.map(_.toString)
-    sc.runJob(rdd2, (items: Iterator[String]) => items.size, Seq(0, 1), true)
+    sc.runJob(rdd2, (items: Iterator[String]) => items.size, Seq(0, 1))
 
     sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
 
diff --git a/docs/configuration.md b/docs/configuration.md
index fea259204ae68..200f3cd212e46 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1048,15 +1048,6 @@ Apart from these, the following properties are also available, and may be useful
     infinite (all available cores) on Mesos.
   </td>
 </tr>
-<tr>
-  <td><code>spark.localExecution.enabled</code></td>
-  <td>false</td>
-  <td>
-    Enables Spark to run certain jobs, such as first() or take() on the driver, without sending
-    tasks to the cluster. This can make certain jobs execute very quickly, but may require
-    shipping a whole partition of data to the driver.
-  </td>
-</tr>
 <tr>
   <td><code>spark.locality.wait</code></td>
   <td>3s</td>
diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala
index c5cd2154772ac..1a9d78c0d4f59 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala
@@ -98,8 +98,7 @@ class KafkaRDD[
     val res = context.runJob(
       this,
       (tc: TaskContext, it: Iterator[R]) => it.take(parts(tc.partitionId)).toArray,
-      parts.keys.toArray,
-      allowLocal = true)
+      parts.keys.toArray)
     res.foreach(buf ++= _)
     buf.toArray
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala
index 35e81fcb3de0d..1facf83d806d0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala
@@ -72,7 +72,7 @@ class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int
       val w1 = windowSize - 1
       // Get the first w1 items of each partition, starting from the second partition.
       val nextHeads =
-        parent.context.runJob(parent, (iter: Iterator[T]) => iter.take(w1).toArray, 1 until n, true)
+        parent.context.runJob(parent, (iter: Iterator[T]) => iter.take(w1).toArray, 1 until n)
       val partitions = mutable.ArrayBuffer[SlidingRDDPartition[T]]()
       var i = 0
       var partitionIndex = 0
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 43bde5ae41e23..eb5b0bbbdac4b 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -913,8 +913,7 @@ def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
         # by runJob() in order to avoid having to pass a Python lambda into
         # SparkContext#runJob.
         mappedRDD = rdd.mapPartitions(partitionFunc)
-        port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions,
-                                          allowLocal)
+        port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
         return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
 
     def show_profiles(self):
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 7e788148d981c..fa8e0a0574a62 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1293,7 +1293,7 @@ def takeUpToNumLeft(iterator):
                     taken += 1
 
             p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-            res = self.context.runJob(self, takeUpToNumLeft, p, True)
+            res = self.context.runJob(self, takeUpToNumLeft, p)
 
             items += res
             partsScanned += numPartsToTry
@@ -2193,7 +2193,7 @@ def lookup(self, key):
         values = self.filter(lambda kv: kv[0] == key).values()
 
         if self.partitioner is not None:
-            return self.ctx.runJob(values, lambda x: x, [self.partitioner(key)], False)
+            return self.ctx.runJob(values, lambda x: x, [self.partitioner(key)])
 
         return values.collect()
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index b0d56b7bf0b86..50c27def8ea54 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -165,8 +165,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
       val p = partsScanned until math.min(partsScanned + numPartsToTry, totalParts)
       val sc = sqlContext.sparkContext
       val res =
-        sc.runJob(childRDD, (it: Iterator[InternalRow]) => it.take(left).toArray, p,
-          allowLocal = false)
+        sc.runJob(childRDD, (it: Iterator[InternalRow]) => it.take(left).toArray, p)
 
       res.foreach(buf ++= _.take(n - buf.size))
       partsScanned += numPartsToTry

From 2f5cbd860e487e7339e627dd7e2c9baa5116b819 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Wed, 22 Jul 2015 21:40:23 -0700
Subject: [PATCH 330/549] [SPARK-8364] [SPARKR] Add crosstab to SparkR
 DataFrames

Add `crosstab` to SparkR DataFrames, which takes two column names and returns a local R data.frame. This is similar to `table` in R. However, `table` in SparkR is used for loading SQL tables as DataFrames. The return type is data.frame instead table for `crosstab` to be compatible with Scala/Python.

I couldn't run R tests successfully on my local. Many unit tests failed. So let's try Jenkins.

Author: Xiangrui Meng <meng@databricks.com>

Closes #7318 from mengxr/SPARK-8364 and squashes the following commits:

d75e894 [Xiangrui Meng] fix tests
53f6ddd [Xiangrui Meng] fix tests
f1348d6 [Xiangrui Meng] update test
47cb088 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-8364
5621262 [Xiangrui Meng] first version without test
---
 R/pkg/NAMESPACE                  |  1 +
 R/pkg/R/DataFrame.R              | 28 ++++++++++++++++++++++++++++
 R/pkg/R/generics.R               |  4 ++++
 R/pkg/inst/tests/test_sparkSQL.R | 13 +++++++++++++
 4 files changed, 46 insertions(+)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 5834813319bfd..7f7a8a2e4de24 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -26,6 +26,7 @@ exportMethods("arrange",
               "collect",
               "columns",
               "count",
+              "crosstab",
               "describe",
               "distinct",
               "dropna",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index a58433df3c8c1..06dd6b75dff3d 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1554,3 +1554,31 @@ setMethod("fillna",
             }
             dataFrame(sdf)
           })
+
+#' crosstab
+#'
+#' Computes a pair-wise frequency table of the given columns. Also known as a contingency
+#' table. The number of distinct values for each column should be less than 1e4. At most 1e6
+#' non-zero pair frequencies will be returned.
+#'
+#' @param col1 name of the first column. Distinct items will make the first item of each row.
+#' @param col2 name of the second column. Distinct items will make the column names of the output.
+#' @return a local R data.frame representing the contingency table. The first column of each row
+#'         will be the distinct values of `col1` and the column names will be the distinct values
+#'         of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no
+#'         occurrences will have `null` as their counts.
+#'
+#' @rdname statfunctions
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlCtx, "/path/to/file.json")
+#' ct = crosstab(df, "title", "gender")
+#' }
+setMethod("crosstab",
+          signature(x = "DataFrame", col1 = "character", col2 = "character"),
+          function(x, col1, col2) {
+            statFunctions <- callJMethod(x@sdf, "stat")
+            sct <- callJMethod(statFunctions, "crosstab", col1, col2)
+            collect(dataFrame(sct))
+          })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 39b5586f7c90e..836e0175c391f 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -59,6 +59,10 @@ setGeneric("count", function(x) { standardGeneric("count") })
 # @export
 setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
 
+# @rdname statfunctions
+# @export
+setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
+
 # @rdname distinct
 # @export
 setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index a3039d36c9402..62fe48a5d6c7b 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -987,6 +987,19 @@ test_that("fillna() on a DataFrame", {
   expect_identical(expected, actual)
 })
 
+test_that("crosstab() on a DataFrame", {
+  rdd <- lapply(parallelize(sc, 0:3), function(x) {
+    list(paste0("a", x %% 3), paste0("b", x %% 2))
+  })
+  df <- toDF(rdd, list("a", "b"))
+  ct <- crosstab(df, "a", "b")
+  ordered <- ct[order(ct$a_b),]
+  row.names(ordered) <- NULL
+  expected <- data.frame("a_b" = c("a0", "a1", "a2"), "b0" = c(1, 0, 1), "b1" = c(1, 1, 0),
+                         stringsAsFactors = FALSE, row.names = NULL)
+  expect_identical(expected, ordered)
+})
+
 unlink(parquetPath)
 unlink(jsonPath)
 unlink(jsonPathNa)

From 410dd41cf6618b93b6daa6147d17339deeaa49ae Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Wed, 22 Jul 2015 23:27:25 -0700
Subject: [PATCH 331/549] [SPARK-9268] [ML] Removed varargs annotation from
 Params.setDefault taking multiple params

Removed varargs annotation from Params.setDefault taking multiple params.

Though varargs is technically correct, it often requires that developers do clean assembly, rather than (not clean) assembly, which is a nuisance during development.

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #7604 from jkbradley/params-setdefault-varargs and squashes the following commits:

6016dc6 [Joseph K. Bradley] removed varargs annotation from Params.setDefault taking multiple params
---
 mllib/src/main/scala/org/apache/spark/ml/param/params.scala  | 5 ++++-
 .../test/java/org/apache/spark/ml/param/JavaTestParams.java  | 3 ---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 824efa5ed4b28..954aa17e26a02 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -476,11 +476,14 @@ trait Params extends Identifiable with Serializable {
   /**
    * Sets default values for a list of params.
    *
+   * Note: Java developers should use the single-parameter [[setDefault()]].
+   *       Annotating this with varargs can cause compilation failures due to a Scala compiler bug.
+   *       See SPARK-9268.
+   *
    * @param paramPairs  a list of param pairs that specify params and their default values to set
    *                    respectively. Make sure that the params are initialized before this method
    *                    gets called.
    */
-  @varargs
   protected final def setDefault(paramPairs: ParamPair[_]*): this.type = {
     paramPairs.foreach { p =>
       setDefault(p.param.asInstanceOf[Param[Any]], p.value)
diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
index 3ae09d39ef500..dc6ce8061f62b 100644
--- a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
+++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java
@@ -96,11 +96,8 @@ private void init() {
       new DoubleArrayParam(this, "myDoubleArrayParam", "this is a double param");
 
     setDefault(myIntParam(), 1);
-    setDefault(myIntParam().w(1));
     setDefault(myDoubleParam(), 0.5);
-    setDefault(myIntParam().w(1), myDoubleParam().w(0.5));
     setDefault(myDoubleArrayParam(), new double[] {1.0, 2.0});
-    setDefault(myDoubleArrayParam().w(new double[] {1.0, 2.0}));
   }
 
   @Override

From 825ab1e4526059a77e3278769797c4d065f48bd3 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Wed, 22 Jul 2015 23:29:26 -0700
Subject: [PATCH 332/549] [SPARK-7254] [MLLIB] Run PowerIterationClustering
 directly on graph

JIRA: https://issues.apache.org/jira/browse/SPARK-7254

Author: Liang-Chi Hsieh <viirya@appier.com>
Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #6054 from viirya/pic_on_graph and squashes the following commits:

8b87b81 [Liang-Chi Hsieh] Fix scala style.
a22fb8b [Liang-Chi Hsieh] For comment.
ef565a0 [Liang-Chi Hsieh] Fix indentation.
d249aa1 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into pic_on_graph
82d7351 [Liang-Chi Hsieh] Run PowerIterationClustering directly on graph.
---
 .../clustering/PowerIterationClustering.scala | 46 ++++++++++++++++++
 .../PowerIterationClusteringSuite.scala       | 48 +++++++++++++++++++
 2 files changed, 94 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index e7a243f854e33..407e43a024a2e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -153,6 +153,27 @@ class PowerIterationClustering private[clustering] (
     this
   }
 
+  /**
+   * Run the PIC algorithm on Graph.
+   *
+   * @param graph an affinity matrix represented as graph, which is the matrix A in the PIC paper.
+   *              The similarity s,,ij,, represented as the edge between vertices (i, j) must
+   *              be nonnegative. This is a symmetric matrix and hence s,,ij,, = s,,ji,,. For
+   *              any (i, j) with nonzero similarity, there should be either (i, j, s,,ij,,)
+   *              or (j, i, s,,ji,,) in the input. Tuples with i = j are ignored, because we
+   *              assume s,,ij,, = 0.0.
+   *
+   * @return a [[PowerIterationClusteringModel]] that contains the clustering result
+   */
+  def run(graph: Graph[Double, Double]): PowerIterationClusteringModel = {
+    val w = normalize(graph)
+    val w0 = initMode match {
+      case "random" => randomInit(w)
+      case "degree" => initDegreeVector(w)
+    }
+    pic(w0)
+  }
+
   /**
    * Run the PIC algorithm.
    *
@@ -212,6 +233,31 @@ object PowerIterationClustering extends Logging {
   @Experimental
   case class Assignment(id: Long, cluster: Int)
 
+  /**
+   * Normalizes the affinity graph (A) and returns the normalized affinity matrix (W).
+   */
+  private[clustering]
+  def normalize(graph: Graph[Double, Double]): Graph[Double, Double] = {
+    val vD = graph.aggregateMessages[Double](
+      sendMsg = ctx => {
+        val i = ctx.srcId
+        val j = ctx.dstId
+        val s = ctx.attr
+        if (s < 0.0) {
+          throw new SparkException("Similarity must be nonnegative but found s($i, $j) = $s.")
+        }
+        if (s > 0.0) {
+          ctx.sendToSrc(s)
+        }
+      },
+      mergeMsg = _ + _,
+      TripletFields.EdgeOnly)
+    GraphImpl.fromExistingRDDs(vD, graph.edges)
+      .mapTriplets(
+        e => e.attr / math.max(e.srcAttr, MLUtils.EPSILON),
+        TripletFields.Src)
+  }
+
   /**
    * Normalizes the affinity matrix (A) by row sums and returns the normalized affinity matrix (W).
    */
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
index 19e65f1b53ab5..189000512155f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala
@@ -68,6 +68,54 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon
     assert(predictions2.toSet == Set((0 to 3).toSet, (4 to 15).toSet))
   }
 
+  test("power iteration clustering on graph") {
+    /*
+     We use the following graph to test PIC. All edges are assigned similarity 1.0 except 0.1 for
+     edge (3, 4).
+
+     15-14 -13 -12
+     |           |
+     4 . 3 - 2  11
+     |   | x |   |
+     5   0 - 1  10
+     |           |
+     6 - 7 - 8 - 9
+     */
+
+    val similarities = Seq[(Long, Long, Double)]((0, 1, 1.0), (0, 2, 1.0), (0, 3, 1.0), (1, 2, 1.0),
+      (1, 3, 1.0), (2, 3, 1.0), (3, 4, 0.1), // (3, 4) is a weak edge
+      (4, 5, 1.0), (4, 15, 1.0), (5, 6, 1.0), (6, 7, 1.0), (7, 8, 1.0), (8, 9, 1.0), (9, 10, 1.0),
+      (10, 11, 1.0), (11, 12, 1.0), (12, 13, 1.0), (13, 14, 1.0), (14, 15, 1.0))
+
+    val edges = similarities.flatMap { case (i, j, s) =>
+      if (i != j) {
+        Seq(Edge(i, j, s), Edge(j, i, s))
+      } else {
+        None
+      }
+    }
+    val graph = Graph.fromEdges(sc.parallelize(edges, 2), 0.0)
+
+    val model = new PowerIterationClustering()
+      .setK(2)
+      .run(graph)
+    val predictions = Array.fill(2)(mutable.Set.empty[Long])
+    model.assignments.collect().foreach { a =>
+      predictions(a.cluster) += a.id
+    }
+    assert(predictions.toSet == Set((0 to 3).toSet, (4 to 15).toSet))
+
+    val model2 = new PowerIterationClustering()
+      .setK(2)
+      .setInitializationMode("degree")
+      .run(sc.parallelize(similarities, 2))
+    val predictions2 = Array.fill(2)(mutable.Set.empty[Long])
+    model2.assignments.collect().foreach { a =>
+      predictions2(a.cluster) += a.id
+    }
+    assert(predictions2.toSet == Set((0 to 3).toSet, (4 to 15).toSet))
+  }
+
   test("normalize and powerIter") {
     /*
      Test normalize() with the following graph:

From 6d0d8b406942edcf9fc97e76fb227ff1eb35ca3a Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Wed, 22 Jul 2015 23:44:08 -0700
Subject: [PATCH 333/549] [SPARK-8935] [SQL] Implement code generation for all
 casts

JIRA: https://issues.apache.org/jira/browse/SPARK-8935

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7365 from yjshen/cast_codegen and squashes the following commits:

ef6e8b5 [Yijie Shen] getColumn and setColumn in struct cast, autounboxing in array and map
eaece18 [Yijie Shen] remove null case in cast code gen
fd7eba4 [Yijie Shen] resolve comments
80378a5 [Yijie Shen] the missing self cast
611d66e [Yijie Shen] Bug fix: NullType & primitive object unboxing
6d5c0fe [Yijie Shen] rebase and add Interval codegen
9424b65 [Yijie Shen] tiny style fix
4a1c801 [Yijie Shen] remove CodeHolder class, use function instead.
3f5df88 [Yijie Shen] CodeHolder for complex dataTypes
c286f13 [Yijie Shen] moved all the cast code into class body
4edfd76 [Yijie Shen] [WIP] finished primitive part
---
 .../spark/sql/catalyst/expressions/Cast.scala | 523 ++++++++++++++++--
 .../expressions/DateExpressionsSuite.scala    |  36 +-
 2 files changed, 508 insertions(+), 51 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 3346d3c9f9e61..e66cd828481bf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -26,6 +26,8 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.{Interval, UTF8String}
 
+import scala.collection.mutable
+
 
 object Cast {
 
@@ -418,51 +420,506 @@ case class Cast(child: Expression, dataType: DataType)
   protected override def nullSafeEval(input: Any): Any = cast(input)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    // TODO: Add support for more data types.
-    (child.dataType, dataType) match {
+    val eval = child.gen(ctx)
+    val nullSafeCast = nullSafeCastFunction(child.dataType, dataType, ctx)
+    eval.code +
+      castCode(ctx, eval.primitive, eval.isNull, ev.primitive, ev.isNull, dataType, nullSafeCast)
+  }
+
+  // three function arguments are: child.primitive, result.primitive and result.isNull
+  // it returns the code snippets to be put in null safe evaluation region
+  private[this] type CastFunction = (String, String, String) => String
+
+  private[this] def nullSafeCastFunction(
+      from: DataType,
+      to: DataType,
+      ctx: CodeGenContext): CastFunction = to match {
+
+    case _ if from == NullType => (c, evPrim, evNull) => s"$evNull = true;"
+    case _ if to == from => (c, evPrim, evNull) => s"$evPrim = $c;"
+    case StringType => castToStringCode(from, ctx)
+    case BinaryType => castToBinaryCode(from)
+    case DateType => castToDateCode(from, ctx)
+    case decimal: DecimalType => castToDecimalCode(from, decimal)
+    case TimestampType => castToTimestampCode(from, ctx)
+    case IntervalType => castToIntervalCode(from)
+    case BooleanType => castToBooleanCode(from)
+    case ByteType => castToByteCode(from)
+    case ShortType => castToShortCode(from)
+    case IntegerType => castToIntCode(from)
+    case FloatType => castToFloatCode(from)
+    case LongType => castToLongCode(from)
+    case DoubleType => castToDoubleCode(from)
+
+    case array: ArrayType => castArrayCode(from.asInstanceOf[ArrayType], array, ctx)
+    case map: MapType => castMapCode(from.asInstanceOf[MapType], map, ctx)
+    case struct: StructType => castStructCode(from.asInstanceOf[StructType], struct, ctx)
+  }
+
+  // Since we need to cast child expressions recursively inside ComplexTypes, such as Map's
+  // Key and Value, Struct's field, we need to name out all the variable names involved in a cast.
+  private[this] def castCode(ctx: CodeGenContext, childPrim: String, childNull: String,
+    resultPrim: String, resultNull: String, resultType: DataType, cast: CastFunction): String = {
+    s"""
+      boolean $resultNull = $childNull;
+      ${ctx.javaType(resultType)} $resultPrim = ${ctx.defaultValue(resultType)};
+      if (!${childNull}) {
+        ${cast(childPrim, resultPrim, resultNull)}
+      }
+    """
+  }
+
+  private[this] def castToStringCode(from: DataType, ctx: CodeGenContext): CastFunction = {
+    from match {
+      case BinaryType =>
+        (c, evPrim, evNull) => s"$evPrim = UTF8String.fromBytes($c);"
+      case DateType =>
+        (c, evPrim, evNull) => s"""$evPrim = UTF8String.fromString(
+          org.apache.spark.sql.catalyst.util.DateTimeUtils.dateToString($c));"""
+      case TimestampType =>
+        (c, evPrim, evNull) => s"""$evPrim = UTF8String.fromString(
+          org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampToString($c));"""
+      case _ =>
+        (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));"
+    }
+  }
+
+  private[this] def castToBinaryCode(from: DataType): CastFunction = from match {
+    case StringType =>
+      (c, evPrim, evNull) => s"$evPrim = $c.getBytes();"
+  }
+
+  private[this] def castToDateCode(
+      from: DataType,
+      ctx: CodeGenContext): CastFunction = from match {
+    case StringType =>
+      val intOpt = ctx.freshName("intOpt")
+      (c, evPrim, evNull) => s"""
+        scala.Option<Integer> $intOpt =
+          org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c);
+        if ($intOpt.isDefined()) {
+          $evPrim = ((Integer) $intOpt.get()).intValue();
+        } else {
+          $evNull = true;
+        }
+       """
+    case TimestampType =>
+      (c, evPrim, evNull) =>
+        s"$evPrim = org.apache.spark.sql.catalyst.util.DateTimeUtils.millisToDays($c / 1000L);";
+    case _ =>
+      (c, evPrim, evNull) => s"$evNull = true;"
+  }
+
+  private[this] def changePrecision(d: String, decimalType: DecimalType,
+      evPrim: String, evNull: String): String = {
+    decimalType match {
+      case DecimalType.Unlimited =>
+        s"$evPrim = $d;"
+      case DecimalType.Fixed(precision, scale) =>
+        s"""
+          if ($d.changePrecision($precision, $scale)) {
+            $evPrim = $d;
+          } else {
+            $evNull = true;
+          }
+        """
+    }
+  }
 
-      case (BinaryType, StringType) =>
-        defineCodeGen (ctx, ev, c =>
-          s"UTF8String.fromBytes($c)")
+  private[this] def castToDecimalCode(from: DataType, target: DecimalType): CastFunction = {
+    from match {
+      case StringType =>
+        (c, evPrim, evNull) =>
+          s"""
+            try {
+              org.apache.spark.sql.types.Decimal tmpDecimal =
+                new org.apache.spark.sql.types.Decimal().set(
+                  new scala.math.BigDecimal(
+                    new java.math.BigDecimal($c.toString())));
+              ${changePrecision("tmpDecimal", target, evPrim, evNull)}
+            } catch (java.lang.NumberFormatException e) {
+              $evNull = true;
+            }
+          """
+      case BooleanType =>
+        (c, evPrim, evNull) =>
+          s"""
+            org.apache.spark.sql.types.Decimal tmpDecimal = null;
+            if ($c) {
+              tmpDecimal = new org.apache.spark.sql.types.Decimal().set(1);
+            } else {
+              tmpDecimal = new org.apache.spark.sql.types.Decimal().set(0);
+            }
+            ${changePrecision("tmpDecimal", target, evPrim, evNull)}
+          """
+      case DateType =>
+        // date can't cast to decimal in Hive
+        (c, evPrim, evNull) => s"$evNull = true;"
+      case TimestampType =>
+        // Note that we lose precision here.
+        (c, evPrim, evNull) =>
+          s"""
+            org.apache.spark.sql.types.Decimal tmpDecimal =
+              new org.apache.spark.sql.types.Decimal().set(
+                scala.math.BigDecimal.valueOf(${timestampToDoubleCode(c)}));
+            ${changePrecision("tmpDecimal", target, evPrim, evNull)}
+          """
+      case DecimalType() =>
+        (c, evPrim, evNull) =>
+          s"""
+            org.apache.spark.sql.types.Decimal tmpDecimal = $c.clone();
+            ${changePrecision("tmpDecimal", target, evPrim, evNull)}
+          """
+      case LongType =>
+        (c, evPrim, evNull) =>
+          s"""
+            org.apache.spark.sql.types.Decimal tmpDecimal =
+              new org.apache.spark.sql.types.Decimal().set($c);
+            ${changePrecision("tmpDecimal", target, evPrim, evNull)}
+          """
+      case x: NumericType =>
+        // All other numeric types can be represented precisely as Doubles
+        (c, evPrim, evNull) =>
+          s"""
+            try {
+              org.apache.spark.sql.types.Decimal tmpDecimal =
+                new org.apache.spark.sql.types.Decimal().set(
+                  scala.math.BigDecimal.valueOf((double) $c));
+              ${changePrecision("tmpDecimal", target, evPrim, evNull)}
+            } catch (java.lang.NumberFormatException e) {
+              $evNull = true;
+            }
+          """
+    }
+  }
 
-      case (DateType, StringType) =>
-        defineCodeGen(ctx, ev, c =>
-          s"""UTF8String.fromString(
-                org.apache.spark.sql.catalyst.util.DateTimeUtils.dateToString($c))""")
+  private[this] def castToTimestampCode(
+      from: DataType,
+      ctx: CodeGenContext): CastFunction = from match {
+    case StringType =>
+      val longOpt = ctx.freshName("longOpt")
+      (c, evPrim, evNull) =>
+        s"""
+          scala.Option<Long> $longOpt =
+            org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestamp($c);
+          if ($longOpt.isDefined()) {
+            $evPrim = ((Long) $longOpt.get()).longValue();
+          } else {
+            $evNull = true;
+          }
+         """
+    case BooleanType =>
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1L : 0;"
+    case _: IntegralType =>
+      (c, evPrim, evNull) => s"$evPrim = ${longToTimeStampCode(c)};"
+    case DateType =>
+      (c, evPrim, evNull) =>
+        s"$evPrim = org.apache.spark.sql.catalyst.util.DateTimeUtils.daysToMillis($c) * 1000;"
+    case DecimalType() =>
+      (c, evPrim, evNull) => s"$evPrim = ${decimalToTimestampCode(c)};"
+    case DoubleType =>
+      (c, evPrim, evNull) =>
+        s"""
+          if (Double.isNaN($c) || Double.isInfinite($c)) {
+            $evNull = true;
+          } else {
+            $evPrim = (long)($c * 1000000L);
+          }
+        """
+    case FloatType =>
+      (c, evPrim, evNull) =>
+        s"""
+          if (Float.isNaN($c) || Float.isInfinite($c)) {
+            $evNull = true;
+          } else {
+            $evPrim = (long)($c * 1000000L);
+          }
+        """
+  }
 
-      case (TimestampType, StringType) =>
-        defineCodeGen(ctx, ev, c =>
-          s"""UTF8String.fromString(
-                org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampToString($c))""")
+  private[this] def castToIntervalCode(from: DataType): CastFunction = from match {
+    case StringType =>
+      (c, evPrim, evNull) =>
+        s"$evPrim = org.apache.spark.unsafe.types.Interval.fromString($c.toString());"
+  }
+
+  private[this] def decimalToTimestampCode(d: String): String =
+    s"($d.toBigDecimal().bigDecimal().multiply(new java.math.BigDecimal(1000000L))).longValue()"
+  private[this] def longToTimeStampCode(l: String): String = s"$l * 1000L"
+  private[this] def timestampToIntegerCode(ts: String): String =
+    s"java.lang.Math.floor((double) $ts / 1000000L)"
+  private[this] def timestampToDoubleCode(ts: String): String = s"$ts / 1000000.0"
+
+  private[this] def castToBooleanCode(from: DataType): CastFunction = from match {
+    case StringType =>
+      (c, evPrim, evNull) => s"$evPrim = $c.numBytes() != 0;"
+    case TimestampType =>
+      (c, evPrim, evNull) => s"$evPrim = $c != 0;"
+    case DateType =>
+      // Hive would return null when cast from date to boolean
+      (c, evPrim, evNull) => s"$evNull = true;"
+    case DecimalType() =>
+      (c, evPrim, evNull) => s"$evPrim = !$c.isZero();"
+    case n: NumericType =>
+      (c, evPrim, evNull) => s"$evPrim = $c != 0;"
+  }
+
+  private[this] def castToByteCode(from: DataType): CastFunction = from match {
+    case StringType =>
+      (c, evPrim, evNull) =>
+        s"""
+          try {
+            $evPrim = Byte.valueOf($c.toString());
+          } catch (java.lang.NumberFormatException e) {
+            $evNull = true;
+          }
+        """
+    case BooleanType =>
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+    case DateType =>
+      (c, evPrim, evNull) => s"$evNull = true;"
+    case TimestampType =>
+      (c, evPrim, evNull) => s"$evPrim = (byte) ${timestampToIntegerCode(c)};"
+    case DecimalType() =>
+      (c, evPrim, evNull) => s"$evPrim = $c.toByte();"
+    case x: NumericType =>
+      (c, evPrim, evNull) => s"$evPrim = (byte) $c;"
+  }
 
-      case (_, StringType) =>
-        defineCodeGen(ctx, ev, c => s"UTF8String.fromString(String.valueOf($c))")
+  private[this] def castToShortCode(from: DataType): CastFunction = from match {
+    case StringType =>
+      (c, evPrim, evNull) =>
+        s"""
+          try {
+            $evPrim = Short.valueOf($c.toString());
+          } catch (java.lang.NumberFormatException e) {
+            $evNull = true;
+          }
+        """
+    case BooleanType =>
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+    case DateType =>
+      (c, evPrim, evNull) => s"$evNull = true;"
+    case TimestampType =>
+      (c, evPrim, evNull) => s"$evPrim = (short) ${timestampToIntegerCode(c)};"
+    case DecimalType() =>
+      (c, evPrim, evNull) => s"$evPrim = $c.toShort();"
+    case x: NumericType =>
+      (c, evPrim, evNull) => s"$evPrim = (short) $c;"
+  }
 
-      case (StringType, IntervalType) =>
-        defineCodeGen(ctx, ev, c =>
-          s"org.apache.spark.unsafe.types.Interval.fromString($c.toString())")
+  private[this] def castToIntCode(from: DataType): CastFunction = from match {
+    case StringType =>
+      (c, evPrim, evNull) =>
+        s"""
+          try {
+            $evPrim = Integer.valueOf($c.toString());
+          } catch (java.lang.NumberFormatException e) {
+            $evNull = true;
+          }
+        """
+    case BooleanType =>
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+    case DateType =>
+      (c, evPrim, evNull) => s"$evNull = true;"
+    case TimestampType =>
+      (c, evPrim, evNull) => s"$evPrim = (int) ${timestampToIntegerCode(c)};"
+    case DecimalType() =>
+      (c, evPrim, evNull) => s"$evPrim = $c.toInt();"
+    case x: NumericType =>
+      (c, evPrim, evNull) => s"$evPrim = (int) $c;"
+  }
 
-      // fallback for DecimalType, this must be before other numeric types
-      case (_, dt: DecimalType) =>
-        super.genCode(ctx, ev)
+  private[this] def castToLongCode(from: DataType): CastFunction = from match {
+    case StringType =>
+      (c, evPrim, evNull) =>
+        s"""
+          try {
+            $evPrim = Long.valueOf($c.toString());
+          } catch (java.lang.NumberFormatException e) {
+            $evNull = true;
+          }
+        """
+    case BooleanType =>
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+    case DateType =>
+      (c, evPrim, evNull) => s"$evNull = true;"
+    case TimestampType =>
+      (c, evPrim, evNull) => s"$evPrim = (long) ${timestampToIntegerCode(c)};"
+    case DecimalType() =>
+      (c, evPrim, evNull) => s"$evPrim = $c.toLong();"
+    case x: NumericType =>
+      (c, evPrim, evNull) => s"$evPrim = (long) $c;"
+  }
 
-      case (BooleanType, dt: NumericType) =>
-        defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})($c ? 1 : 0)")
+  private[this] def castToFloatCode(from: DataType): CastFunction = from match {
+    case StringType =>
+      (c, evPrim, evNull) =>
+        s"""
+          try {
+            $evPrim = Float.valueOf($c.toString());
+          } catch (java.lang.NumberFormatException e) {
+            $evNull = true;
+          }
+        """
+    case BooleanType =>
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+    case DateType =>
+      (c, evPrim, evNull) => s"$evNull = true;"
+    case TimestampType =>
+      (c, evPrim, evNull) => s"$evPrim = (float) (${timestampToDoubleCode(c)});"
+    case DecimalType() =>
+      (c, evPrim, evNull) => s"$evPrim = $c.toFloat();"
+    case x: NumericType =>
+      (c, evPrim, evNull) => s"$evPrim = (float) $c;"
+  }
 
-      case (dt: DecimalType, BooleanType) =>
-        defineCodeGen(ctx, ev, c => s"!$c.isZero()")
+  private[this] def castToDoubleCode(from: DataType): CastFunction = from match {
+    case StringType =>
+      (c, evPrim, evNull) =>
+        s"""
+          try {
+            $evPrim = Double.valueOf($c.toString());
+          } catch (java.lang.NumberFormatException e) {
+            $evNull = true;
+          }
+        """
+    case BooleanType =>
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+    case DateType =>
+      (c, evPrim, evNull) => s"$evNull = true;"
+    case TimestampType =>
+      (c, evPrim, evNull) => s"$evPrim = ${timestampToDoubleCode(c)};"
+    case DecimalType() =>
+      (c, evPrim, evNull) => s"$evPrim = $c.toDouble();"
+    case x: NumericType =>
+      (c, evPrim, evNull) => s"$evPrim = (double) $c;"
+  }
 
-      case (dt: NumericType, BooleanType) =>
-        defineCodeGen(ctx, ev, c => s"$c != 0")
+  private[this] def castArrayCode(
+      from: ArrayType, to: ArrayType, ctx: CodeGenContext): CastFunction = {
+    val elementCast = nullSafeCastFunction(from.elementType, to.elementType, ctx)
+
+    val arraySeqClass = classOf[mutable.ArraySeq[Any]].getName
+    val fromElementNull = ctx.freshName("feNull")
+    val fromElementPrim = ctx.freshName("fePrim")
+    val toElementNull = ctx.freshName("teNull")
+    val toElementPrim = ctx.freshName("tePrim")
+    val size = ctx.freshName("n")
+    val j = ctx.freshName("j")
+    val result = ctx.freshName("result")
+
+    (c, evPrim, evNull) =>
+      s"""
+        final int $size = $c.size();
+        final $arraySeqClass<Object> $result = new $arraySeqClass<Object>($size);
+        for (int $j = 0; $j < $size; $j ++) {
+          if ($c.apply($j) == null) {
+            $result.update($j, null);
+          } else {
+            boolean $fromElementNull = false;
+            ${ctx.javaType(from.elementType)} $fromElementPrim =
+              (${ctx.boxedType(from.elementType)}) $c.apply($j);
+            ${castCode(ctx, fromElementPrim,
+              fromElementNull, toElementPrim, toElementNull, to.elementType, elementCast)}
+            if ($toElementNull) {
+              $result.update($j, null);
+            } else {
+              $result.update($j, $toElementPrim);
+            }
+          }
+        }
+        $evPrim = $result;
+      """
+  }
 
-      case (_: DecimalType, dt: NumericType) =>
-        defineCodeGen(ctx, ev, c => s"($c).to${ctx.primitiveTypeName(dt)}()")
+  private[this] def castMapCode(from: MapType, to: MapType, ctx: CodeGenContext): CastFunction = {
+    val keyCast = nullSafeCastFunction(from.keyType, to.keyType, ctx)
+    val valueCast = nullSafeCastFunction(from.valueType, to.valueType, ctx)
+
+    val hashMapClass = classOf[mutable.HashMap[Any, Any]].getName
+    val fromKeyPrim = ctx.freshName("fkp")
+    val fromKeyNull = ctx.freshName("fkn")
+    val fromValuePrim = ctx.freshName("fvp")
+    val fromValueNull = ctx.freshName("fvn")
+    val toKeyPrim = ctx.freshName("tkp")
+    val toKeyNull = ctx.freshName("tkn")
+    val toValuePrim = ctx.freshName("tvp")
+    val toValueNull = ctx.freshName("tvn")
+    val result = ctx.freshName("result")
+
+    (c, evPrim, evNull) =>
+      s"""
+        final $hashMapClass $result = new $hashMapClass();
+        scala.collection.Iterator iter = $c.iterator();
+        while (iter.hasNext()) {
+          scala.Tuple2 kv = (scala.Tuple2) iter.next();
+          boolean $fromKeyNull = false;
+          ${ctx.javaType(from.keyType)} $fromKeyPrim =
+            (${ctx.boxedType(from.keyType)}) kv._1();
+          ${castCode(ctx, fromKeyPrim,
+            fromKeyNull, toKeyPrim, toKeyNull, to.keyType, keyCast)}
+
+          boolean $fromValueNull = kv._2() == null;
+          if ($fromValueNull) {
+            $result.put($toKeyPrim, null);
+          } else {
+            ${ctx.javaType(from.valueType)} $fromValuePrim =
+              (${ctx.boxedType(from.valueType)}) kv._2();
+            ${castCode(ctx, fromValuePrim,
+              fromValueNull, toValuePrim, toValueNull, to.valueType, valueCast)}
+            if ($toValueNull) {
+              $result.put($toKeyPrim, null);
+            } else {
+              $result.put($toKeyPrim, $toValuePrim);
+            }
+          }
+        }
+        $evPrim = $result;
+      """
+  }
 
-      case (_: NumericType, dt: NumericType) =>
-        defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})($c)")
+  private[this] def castStructCode(
+      from: StructType, to: StructType, ctx: CodeGenContext): CastFunction = {
 
-      case other =>
-        super.genCode(ctx, ev)
+    val fieldsCasts = from.fields.zip(to.fields).map {
+      case (fromField, toField) => nullSafeCastFunction(fromField.dataType, toField.dataType, ctx)
     }
+    val rowClass = classOf[GenericMutableRow].getName
+    val result = ctx.freshName("result")
+    val tmpRow = ctx.freshName("tmpRow")
+
+    val fieldsEvalCode = fieldsCasts.zipWithIndex.map { case (cast, i) => {
+      val fromFieldPrim = ctx.freshName("ffp")
+      val fromFieldNull = ctx.freshName("ffn")
+      val toFieldPrim = ctx.freshName("tfp")
+      val toFieldNull = ctx.freshName("tfn")
+      val fromType = ctx.javaType(from.fields(i).dataType)
+      s"""
+        boolean $fromFieldNull = $tmpRow.isNullAt($i);
+        if ($fromFieldNull) {
+          $result.setNullAt($i);
+        } else {
+          $fromType $fromFieldPrim =
+            ${ctx.getColumn(tmpRow, from.fields(i).dataType, i)};
+          ${castCode(ctx, fromFieldPrim,
+            fromFieldNull, toFieldPrim, toFieldNull, to.fields(i).dataType, cast)}
+          if ($toFieldNull) {
+            $result.setNullAt($i);
+          } else {
+            ${ctx.setColumn(result, to.fields(i).dataType, i, toFieldPrim)};
+          }
+        }
+       """
+      }
+    }.mkString("\n")
+
+    (c, evPrim, evNull) =>
+      s"""
+        final $rowClass $result = new $rowClass(${fieldsCasts.size});
+        final InternalRow $tmpRow = $c;
+        $fieldsEvalCode
+        $evPrim = $result.copy();
+      """
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index f724bab4d8839..bdba6ce891386 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -39,7 +39,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Literal(new Date(c.getTimeInMillis))),
             sdfDay.format(c.getTime).toInt)
         }
       }
@@ -51,7 +51,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Literal(new Date(c.getTimeInMillis))),
             sdfDay.format(c.getTime).toInt)
         }
       }
@@ -63,7 +63,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Literal(new Date(c.getTimeInMillis))),
             sdfDay.format(c.getTime).toInt)
         }
       }
@@ -75,7 +75,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Literal(new Date(c.getTimeInMillis))),
             sdfDay.format(c.getTime).toInt)
         }
       }
@@ -87,7 +87,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.DATE, i)
-          checkEvaluation(DayOfYear(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(DayOfYear(Literal(new Date(c.getTimeInMillis))),
             sdfDay.format(c.getTime).toInt)
         }
       }
@@ -96,7 +96,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("Year") {
     checkEvaluation(Year(Literal.create(null, DateType)), null)
-    checkEvaluation(Year(Cast(Literal(d), DateType)), 2015)
+    checkEvaluation(Year(Literal(d)), 2015)
     checkEvaluation(Year(Cast(Literal(sdfDate.format(d)), DateType)), 2015)
     checkEvaluation(Year(Cast(Literal(ts), DateType)), 2013)
 
@@ -106,7 +106,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
         c.set(y, m, 28)
         (0 to 5 * 24).foreach { i =>
           c.add(Calendar.HOUR_OF_DAY, 1)
-          checkEvaluation(Year(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(Year(Literal(new Date(c.getTimeInMillis))),
             c.get(Calendar.YEAR))
         }
       }
@@ -115,7 +115,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("Quarter") {
     checkEvaluation(Quarter(Literal.create(null, DateType)), null)
-    checkEvaluation(Quarter(Cast(Literal(d), DateType)), 2)
+    checkEvaluation(Quarter(Literal(d)), 2)
     checkEvaluation(Quarter(Cast(Literal(sdfDate.format(d)), DateType)), 2)
     checkEvaluation(Quarter(Cast(Literal(ts), DateType)), 4)
 
@@ -125,7 +125,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
         c.set(y, m, 28, 0, 0, 0)
         (0 to 5 * 24).foreach { i =>
           c.add(Calendar.HOUR_OF_DAY, 1)
-          checkEvaluation(Quarter(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(Quarter(Literal(new Date(c.getTimeInMillis))),
             c.get(Calendar.MONTH) / 3 + 1)
         }
       }
@@ -134,7 +134,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("Month") {
     checkEvaluation(Month(Literal.create(null, DateType)), null)
-    checkEvaluation(Month(Cast(Literal(d), DateType)), 4)
+    checkEvaluation(Month(Literal(d)), 4)
     checkEvaluation(Month(Cast(Literal(sdfDate.format(d)), DateType)), 4)
     checkEvaluation(Month(Cast(Literal(ts), DateType)), 11)
 
@@ -144,7 +144,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.HOUR_OF_DAY, i)
-          checkEvaluation(Month(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(Month(Literal(new Date(c.getTimeInMillis))),
             c.get(Calendar.MONTH) + 1)
         }
       }
@@ -156,7 +156,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.HOUR_OF_DAY, i)
-          checkEvaluation(Month(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+          checkEvaluation(Month(Literal(new Date(c.getTimeInMillis))),
             c.get(Calendar.MONTH) + 1)
         }
       }
@@ -166,7 +166,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("Day / DayOfMonth") {
     checkEvaluation(DayOfMonth(Cast(Literal("2000-02-29"), DateType)), 29)
     checkEvaluation(DayOfMonth(Literal.create(null, DateType)), null)
-    checkEvaluation(DayOfMonth(Cast(Literal(d), DateType)), 8)
+    checkEvaluation(DayOfMonth(Literal(d)), 8)
     checkEvaluation(DayOfMonth(Cast(Literal(sdfDate.format(d)), DateType)), 8)
     checkEvaluation(DayOfMonth(Cast(Literal(ts), DateType)), 8)
 
@@ -175,7 +175,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       c.set(y, 0, 1, 0, 0, 0)
       (0 to 365).foreach { d =>
         c.add(Calendar.DATE, 1)
-        checkEvaluation(DayOfMonth(Cast(Literal(new Date(c.getTimeInMillis)), DateType)),
+        checkEvaluation(DayOfMonth(Literal(new Date(c.getTimeInMillis))),
           c.get(Calendar.DAY_OF_MONTH))
       }
     }
@@ -190,14 +190,14 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val c = Calendar.getInstance()
     (0 to 60 by 5).foreach { s =>
       c.set(2015, 18, 3, 3, 5, s)
-      checkEvaluation(Second(Cast(Literal(new Timestamp(c.getTimeInMillis)), TimestampType)),
+      checkEvaluation(Second(Literal(new Timestamp(c.getTimeInMillis))),
         c.get(Calendar.SECOND))
     }
   }
 
   test("WeekOfYear") {
     checkEvaluation(WeekOfYear(Literal.create(null, DateType)), null)
-    checkEvaluation(WeekOfYear(Cast(Literal(d), DateType)), 15)
+    checkEvaluation(WeekOfYear(Literal(d)), 15)
     checkEvaluation(WeekOfYear(Cast(Literal(sdfDate.format(d)), DateType)), 15)
     checkEvaluation(WeekOfYear(Cast(Literal(ts), DateType)), 45)
     checkEvaluation(WeekOfYear(Cast(Literal("2011-05-06"), DateType)), 18)
@@ -223,7 +223,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       (0 to 60 by 15).foreach { m =>
         (0 to 60 by 15).foreach { s =>
           c.set(2015, 18, 3, h, m, s)
-          checkEvaluation(Hour(Cast(Literal(new Timestamp(c.getTimeInMillis)), TimestampType)),
+          checkEvaluation(Hour(Literal(new Timestamp(c.getTimeInMillis))),
             c.get(Calendar.HOUR_OF_DAY))
         }
       }
@@ -240,7 +240,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     (0 to 60 by 5).foreach { m =>
       (0 to 60 by 15).foreach { s =>
         c.set(2015, 18, 3, 3, m, s)
-        checkEvaluation(Minute(Cast(Literal(new Timestamp(c.getTimeInMillis)), TimestampType)),
+        checkEvaluation(Minute(Literal(new Timestamp(c.getTimeInMillis))),
           c.get(Calendar.MINUTE))
       }
     }

From b983d493b490ca8bafe7eb988b62a250987ae353 Mon Sep 17 00:00:00 2001
From: "Perinkulam I. Ganesh" <gip@us.ibm.com>
Date: Thu, 23 Jul 2015 07:46:20 +0100
Subject: [PATCH 334/549] [SPARK-8695] [CORE] [MLLIB] TreeAggregation shouldn't
 be triggered when it doesn't save wall-clock time.

Author: Perinkulam I. Ganesh <gip@us.ibm.com>

Closes #7397 from piganesh/SPARK-8695 and squashes the following commits:

041620c [Perinkulam I. Ganesh] [SPARK-8695][CORE][MLlib] TreeAggregation shouldn't be triggered when it doesn't save wall-clock time.
9ad067c [Perinkulam I. Ganesh] [SPARK-8695] [core] [WIP] TreeAggregation shouldn't be triggered for 5 partitions
a6fed07 [Perinkulam I. Ganesh] [SPARK-8695] [core] [WIP] TreeAggregation shouldn't be triggered for 5 partitions
---
 core/src/main/scala/org/apache/spark/rdd/RDD.scala | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 394c6686cbabd..6d61d227382d7 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1082,7 +1082,9 @@ abstract class RDD[T: ClassTag](
       val scale = math.max(math.ceil(math.pow(numPartitions, 1.0 / depth)).toInt, 2)
       // If creating an extra level doesn't help reduce
       // the wall-clock time, we stop tree aggregation.
-      while (numPartitions > scale + numPartitions / scale) {
+
+      // Don't trigger TreeAggregation when it doesn't save wall-clock time
+      while (numPartitions > scale + math.ceil(numPartitions.toDouble / scale)) {
         numPartitions /= scale
         val curNumPartitions = numPartitions
         partiallyAggregated = partiallyAggregated.mapPartitionsWithIndex {

From ac3ae0f2be88e0b53f65342efe5fcbe67b5c2106 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 23 Jul 2015 00:43:26 -0700
Subject: [PATCH 335/549] [SPARK-9266] Prevent "managed memory leak detected"
 exception from masking original exception

When a task fails with an exception and also fails to properly clean up its managed memory, the `spark.unsafe.exceptionOnMemoryLeak` memory leak detection mechanism's exceptions will mask the original exception that caused the task to fail. We should throw the memory leak exception only if no other exception occurred.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7603 from JoshRosen/SPARK-9266 and squashes the following commits:

c268cb5 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-9266
c1f0167 [Josh Rosen] Fix the error masking problem
448eae8 [Josh Rosen] Add regression test
---
 .../org/apache/spark/executor/Executor.scala  |  7 ++++--
 .../scala/org/apache/spark/FailureSuite.scala | 25 +++++++++++++++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 581b40003c6c4..e76664f1bd7b0 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -209,16 +209,19 @@ private[spark] class Executor(
 
         // Run the actual task and measure its runtime.
         taskStart = System.currentTimeMillis()
+        var threwException = true
         val (value, accumUpdates) = try {
-          task.run(
+          val res = task.run(
             taskAttemptId = taskId,
             attemptNumber = attemptNumber,
             metricsSystem = env.metricsSystem)
+          threwException = false
+          res
         } finally {
           val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory()
           if (freedMemory > 0) {
             val errMsg = s"Managed memory leak detected; size = $freedMemory bytes, TID = $taskId"
-            if (conf.getBoolean("spark.unsafe.exceptionOnMemoryLeak", false)) {
+            if (conf.getBoolean("spark.unsafe.exceptionOnMemoryLeak", false) && !threwException) {
               throw new SparkException(errMsg)
             } else {
               logError(errMsg)
diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala
index b099cd3fb7965..69cb4b44cf7ef 100644
--- a/core/src/test/scala/org/apache/spark/FailureSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala
@@ -141,5 +141,30 @@ class FailureSuite extends SparkFunSuite with LocalSparkContext {
     FailureSuiteState.clear()
   }
 
+  test("managed memory leak error should not mask other failures (SPARK-9266") {
+    val conf = new SparkConf().set("spark.unsafe.exceptionOnMemoryLeak", "true")
+    sc = new SparkContext("local[1,1]", "test", conf)
+
+    // If a task leaks memory but fails due to some other cause, then make sure that the original
+    // cause is preserved
+    val thrownDueToTaskFailure = intercept[SparkException] {
+      sc.parallelize(Seq(0)).mapPartitions { iter =>
+        TaskContext.get().taskMemoryManager().allocate(128)
+        throw new Exception("intentional task failure")
+        iter
+      }.count()
+    }
+    assert(thrownDueToTaskFailure.getMessage.contains("intentional task failure"))
+
+    // If the task succeeded but memory was leaked, then the task should fail due to that leak
+    val thrownDueToMemoryLeak = intercept[SparkException] {
+      sc.parallelize(Seq(0)).mapPartitions { iter =>
+        TaskContext.get().taskMemoryManager().allocate(128)
+        iter
+      }.count()
+    }
+    assert(thrownDueToMemoryLeak.getMessage.contains("memory leak"))
+  }
+
   // TODO: Need to add tests with shuffle fetch failures.
 }

From fb36397b3ce569d77db26df07ac339731cc07b1c Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 23 Jul 2015 01:51:34 -0700
Subject: [PATCH 336/549] Revert "[SPARK-8579] [SQL] support arbitrary object
 in UnsafeRow"

Reverts ObjectPool. As it stands, it has a few problems:

1. ObjectPool doesn't work with spilling and memory accounting.
2. I don't think in the long run the idea of an object pool is what we want to support, since it essentially goes back to unmanaged memory, and creates pressure on GC, and is hard to account for the total in memory size.
3. The ObjectPool patch removed the specialized getters for strings and binary, and as a result, actually introduced branches when reading non primitive data types.

If we do want to support arbitrary user defined types in the future, I think we can just add an object array in UnsafeRow, rather than relying on indirect memory addressing through a pool. We also need to pick execution strategies that are optimized for those, rather than keeping a lot of unserialized JVM objects in memory during aggregation.

This is probably the hardest thing I had to revert in Spark, due to recent patches that also change the same part of the code. Would be great to get a careful look.

Author: Reynold Xin <rxin@databricks.com>

Closes #7591 from rxin/revert-object-pool and squashes the following commits:

01db0bc [Reynold Xin] Scala style.
eda89fc [Reynold Xin] Fixed describe.
2967118 [Reynold Xin] Fixed accessor for JoinedRow.
e3294eb [Reynold Xin] Merge branch 'master' into revert-object-pool
657855f [Reynold Xin] Temp commit.
c20f2c8 [Reynold Xin] Style fix.
fe37079 [Reynold Xin] Revert "[SPARK-8579] [SQL] support arbitrary object in UnsafeRow"
---
 project/SparkBuild.scala                      |   2 +-
 .../UnsafeFixedWidthAggregationMap.java       | 150 ++++++------
 .../sql/catalyst/expressions/UnsafeRow.java   | 229 ++++++++----------
 .../spark/sql/catalyst/util/ObjectPool.java   |  78 ------
 .../sql/catalyst/util/UniqueObjectPool.java   |  59 -----
 .../execution/UnsafeExternalRowSorter.java    |  16 +-
 .../sql/catalyst/CatalystTypeConverters.scala |   3 +-
 .../spark/sql/catalyst/InternalRow.scala      |   9 +-
 .../catalyst/expressions/BoundAttribute.scala |   2 +
 .../sql/catalyst/expressions/Projection.scala |  53 ++++
 .../expressions/UnsafeRowConverter.scala      |  42 ++--
 .../expressions/codegen/CodeGenerator.scala   |   9 +-
 .../codegen/GenerateUnsafeProjection.scala    |   4 +-
 .../plans/logical/LocalRelation.scala         |   7 +
 .../UnsafeFixedWidthAggregationMapSuite.scala |  65 ++---
 .../expressions/UnsafeRowConverterSuite.scala | 137 +++--------
 .../sql/catalyst/util/ObjectPoolSuite.scala   |  57 -----
 .../org/apache/spark/sql/DataFrame.scala      |  13 +-
 .../sql/execution/GeneratedAggregate.scala    |  17 +-
 .../spark/sql/execution/LocalTableScan.scala  |   2 -
 .../sql/execution/UnsafeRowSerializer.scala   |   8 +-
 .../org/apache/spark/sql/UnsafeRowSuite.scala |   3 +-
 .../execution/UnsafeExternalSortSuite.scala   |   7 +-
 .../execution/UnsafeRowSerializerSuite.scala  |  14 +-
 24 files changed, 355 insertions(+), 631 deletions(-)
 delete mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/ObjectPool.java
 delete mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/UniqueObjectPool.java
 delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ObjectPoolSuite.scala

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 61a05d375d99e..b5b0adf630b9e 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -543,7 +543,7 @@ object TestSettings {
     javaOptions in Test += "-Dspark.ui.enabled=false",
     javaOptions in Test += "-Dspark.ui.showConsoleProgress=false",
     javaOptions in Test += "-Dspark.driver.allowMultipleContexts=true",
-    javaOptions in Test += "-Dspark.unsafe.exceptionOnMemoryLeak=true",
+    //javaOptions in Test += "-Dspark.unsafe.exceptionOnMemoryLeak=true",
     javaOptions in Test += "-Dsun.io.serialization.extendedDebugInfo=true",
     javaOptions in Test += "-Dderby.system.durability=test",
     javaOptions in Test ++= System.getProperties.filter(_._1 startsWith "spark")
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
index 79d55b36dab01..2f7e84a7f59e2 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
@@ -19,11 +19,9 @@
 
 import java.util.Iterator;
 
-import scala.Function1;
-
 import org.apache.spark.sql.catalyst.InternalRow;
-import org.apache.spark.sql.catalyst.util.ObjectPool;
-import org.apache.spark.sql.catalyst.util.UniqueObjectPool;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.map.BytesToBytesMap;
 import org.apache.spark.unsafe.memory.MemoryLocation;
@@ -40,48 +38,26 @@ public final class UnsafeFixedWidthAggregationMap {
    * An empty aggregation buffer, encoded in UnsafeRow format. When inserting a new key into the
    * map, we copy this buffer and use it as the value.
    */
-  private final byte[] emptyBuffer;
+  private final byte[] emptyAggregationBuffer;
 
-  /**
-   * An empty row used by `initProjection`
-   */
-  private static final InternalRow emptyRow = new GenericInternalRow();
+  private final StructType aggregationBufferSchema;
 
-  /**
-   * Whether can the empty aggregation buffer be reuse without calling `initProjection` or not.
-   */
-  private final boolean reuseEmptyBuffer;
+  private final StructType groupingKeySchema;
 
   /**
-   * The projection used to initialize the emptyBuffer
+   * Encodes grouping keys as UnsafeRows.
    */
-  private final Function1<InternalRow, InternalRow> initProjection;
-
-  /**
-   * Encodes grouping keys or buffers as UnsafeRows.
-   */
-  private final UnsafeRowConverter keyConverter;
-  private final UnsafeRowConverter bufferConverter;
+  private final UnsafeRowConverter groupingKeyToUnsafeRowConverter;
 
   /**
    * A hashmap which maps from opaque bytearray keys to bytearray values.
    */
   private final BytesToBytesMap map;
 
-  /**
-   * An object pool for objects that are used in grouping keys.
-   */
-  private final UniqueObjectPool keyPool;
-
-  /**
-   * An object pool for objects that are used in aggregation buffers.
-   */
-  private final ObjectPool bufferPool;
-
   /**
    * Re-used pointer to the current aggregation buffer
    */
-  private final UnsafeRow currentBuffer = new UnsafeRow();
+  private final UnsafeRow currentAggregationBuffer = new UnsafeRow();
 
   /**
    * Scratch space that is used when encoding grouping keys into UnsafeRow format.
@@ -93,41 +69,69 @@ public final class UnsafeFixedWidthAggregationMap {
 
   private final boolean enablePerfMetrics;
 
+  /**
+   * @return true if UnsafeFixedWidthAggregationMap supports grouping keys with the given schema,
+   *         false otherwise.
+   */
+  public static boolean supportsGroupKeySchema(StructType schema) {
+    for (StructField field: schema.fields()) {
+      if (!UnsafeRow.readableFieldTypes.contains(field.dataType())) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * @return true if UnsafeFixedWidthAggregationMap supports aggregation buffers with the given
+   *         schema, false otherwise.
+   */
+  public static boolean supportsAggregationBufferSchema(StructType schema) {
+    for (StructField field: schema.fields()) {
+      if (!UnsafeRow.settableFieldTypes.contains(field.dataType())) {
+        return false;
+      }
+    }
+    return true;
+  }
+
   /**
    * Create a new UnsafeFixedWidthAggregationMap.
    *
-   * @param initProjection the default value for new keys (a "zero" of the agg. function)
-   * @param keyConverter the converter of the grouping key, used for row conversion.
-   * @param bufferConverter the converter of the aggregation buffer, used for row conversion.
+   * @param emptyAggregationBuffer the default value for new keys (a "zero" of the agg. function)
+   * @param aggregationBufferSchema the schema of the aggregation buffer, used for row conversion.
+   * @param groupingKeySchema the schema of the grouping key, used for row conversion.
    * @param memoryManager the memory manager used to allocate our Unsafe memory structures.
    * @param initialCapacity the initial capacity of the map (a sizing hint to avoid re-hashing).
    * @param enablePerfMetrics if true, performance metrics will be recorded (has minor perf impact)
    */
   public UnsafeFixedWidthAggregationMap(
-      Function1<InternalRow, InternalRow> initProjection,
-      UnsafeRowConverter keyConverter,
-      UnsafeRowConverter bufferConverter,
+      InternalRow emptyAggregationBuffer,
+      StructType aggregationBufferSchema,
+      StructType groupingKeySchema,
       TaskMemoryManager memoryManager,
       int initialCapacity,
       boolean enablePerfMetrics) {
-    this.initProjection = initProjection;
-    this.keyConverter = keyConverter;
-    this.bufferConverter = bufferConverter;
-    this.enablePerfMetrics = enablePerfMetrics;
-
+    this.emptyAggregationBuffer =
+      convertToUnsafeRow(emptyAggregationBuffer, aggregationBufferSchema);
+    this.aggregationBufferSchema = aggregationBufferSchema;
+    this.groupingKeyToUnsafeRowConverter = new UnsafeRowConverter(groupingKeySchema);
+    this.groupingKeySchema = groupingKeySchema;
     this.map = new BytesToBytesMap(memoryManager, initialCapacity, enablePerfMetrics);
-    this.keyPool = new UniqueObjectPool(100);
-    this.bufferPool = new ObjectPool(initialCapacity);
+    this.enablePerfMetrics = enablePerfMetrics;
+  }
 
-    InternalRow initRow = initProjection.apply(emptyRow);
-    int emptyBufferSize = bufferConverter.getSizeRequirement(initRow);
-    this.emptyBuffer = new byte[emptyBufferSize];
-    int writtenLength = bufferConverter.writeRow(
-      initRow, emptyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, emptyBufferSize,
-      bufferPool);
-    assert (writtenLength == emptyBuffer.length): "Size requirement calculation was wrong!";
-    // re-use the empty buffer only when there is no object saved in pool.
-    reuseEmptyBuffer = bufferPool.size() == 0;
+  /**
+   * Convert a Java object row into an UnsafeRow, allocating it into a new byte array.
+   */
+  private static byte[] convertToUnsafeRow(InternalRow javaRow, StructType schema) {
+    final UnsafeRowConverter converter = new UnsafeRowConverter(schema);
+    final int size = converter.getSizeRequirement(javaRow);
+    final byte[] unsafeRow = new byte[size];
+    final int writtenLength =
+      converter.writeRow(javaRow, unsafeRow, PlatformDependent.BYTE_ARRAY_OFFSET, size);
+    assert (writtenLength == unsafeRow.length): "Size requirement calculation was wrong!";
+    return unsafeRow;
   }
 
   /**
@@ -135,17 +139,16 @@ public UnsafeFixedWidthAggregationMap(
    * return the same object.
    */
   public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
-    final int groupingKeySize = keyConverter.getSizeRequirement(groupingKey);
+    final int groupingKeySize = groupingKeyToUnsafeRowConverter.getSizeRequirement(groupingKey);
     // Make sure that the buffer is large enough to hold the key. If it's not, grow it:
     if (groupingKeySize > groupingKeyConversionScratchSpace.length) {
       groupingKeyConversionScratchSpace = new byte[groupingKeySize];
     }
-    final int actualGroupingKeySize = keyConverter.writeRow(
+    final int actualGroupingKeySize = groupingKeyToUnsafeRowConverter.writeRow(
       groupingKey,
       groupingKeyConversionScratchSpace,
       PlatformDependent.BYTE_ARRAY_OFFSET,
-      groupingKeySize,
-      keyPool);
+      groupingKeySize);
     assert (groupingKeySize == actualGroupingKeySize) : "Size requirement calculation was wrong!";
 
     // Probe our map using the serialized key
@@ -156,32 +159,25 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
     if (!loc.isDefined()) {
       // This is the first time that we've seen this grouping key, so we'll insert a copy of the
       // empty aggregation buffer into the map:
-      if (!reuseEmptyBuffer) {
-        // There is some objects referenced by emptyBuffer, so generate a new one
-        InternalRow initRow = initProjection.apply(emptyRow);
-        bufferConverter.writeRow(initRow, emptyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET,
-          groupingKeySize, bufferPool);
-      }
       loc.putNewKey(
         groupingKeyConversionScratchSpace,
         PlatformDependent.BYTE_ARRAY_OFFSET,
         groupingKeySize,
-        emptyBuffer,
+        emptyAggregationBuffer,
         PlatformDependent.BYTE_ARRAY_OFFSET,
-        emptyBuffer.length
+        emptyAggregationBuffer.length
       );
     }
 
     // Reset the pointer to point to the value that we just stored or looked up:
     final MemoryLocation address = loc.getValueAddress();
-    currentBuffer.pointTo(
+    currentAggregationBuffer.pointTo(
       address.getBaseObject(),
       address.getBaseOffset(),
-      bufferConverter.numFields(),
-      loc.getValueLength(),
-      bufferPool
+      aggregationBufferSchema.length(),
+      loc.getValueLength()
     );
-    return currentBuffer;
+    return currentAggregationBuffer;
   }
 
   /**
@@ -217,16 +213,14 @@ public MapEntry next() {
         entry.key.pointTo(
           keyAddress.getBaseObject(),
           keyAddress.getBaseOffset(),
-          keyConverter.numFields(),
-          loc.getKeyLength(),
-          keyPool
+          groupingKeySchema.length(),
+          loc.getKeyLength()
         );
         entry.value.pointTo(
           valueAddress.getBaseObject(),
           valueAddress.getBaseOffset(),
-          bufferConverter.numFields(),
-          loc.getValueLength(),
-          bufferPool
+          aggregationBufferSchema.length(),
+          loc.getValueLength()
         );
         return entry;
       }
@@ -254,8 +248,6 @@ public void printPerfMetrics() {
     System.out.println("Number of hash collisions: " + map.getNumHashCollisions());
     System.out.println("Time spent resizing (ns): " + map.getTimeSpentResizingNs());
     System.out.println("Total memory consumption (bytes): " + map.getTotalMemoryConsumption());
-    System.out.println("Number of unique objects in keys: " + keyPool.size());
-    System.out.println("Number of objects in buffers: " + bufferPool.size());
   }
 
 }
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 7f08bf7b742dc..fa1216b455a9e 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -19,14 +19,19 @@
 
 import java.io.IOException;
 import java.io.OutputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
 
-import org.apache.spark.sql.catalyst.util.ObjectPool;
+import org.apache.spark.sql.types.DataType;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.bitset.BitSetMethods;
 import org.apache.spark.unsafe.hash.Murmur3_x86_32;
 import org.apache.spark.unsafe.types.UTF8String;
 
+import static org.apache.spark.sql.types.DataTypes.*;
 
 /**
  * An Unsafe implementation of Row which is backed by raw memory instead of Java objects.
@@ -40,20 +45,7 @@
  * primitive types, such as long, double, or int, we store the value directly in the word. For
  * fields with non-primitive or variable-length values, we store a relative offset (w.r.t. the
  * base address of the row) that points to the beginning of the variable-length field, and length
- * (they are combined into a long). For other objects, they are stored in a pool, the indexes of
- * them are hold in the the word.
- *
- * In order to support fast hashing and equality checks for UnsafeRows that contain objects
- * when used as grouping key in BytesToBytesMap, we put the objects in an UniqueObjectPool to make
- * sure all the key have the same index for same object, then we can hash/compare the objects by
- * hash/compare the index.
- *
- * For non-primitive types, the word of a field could be:
- *   UNION {
- *     [1] [offset: 31bits] [length: 31bits]  // StringType
- *     [0] [offset: 31bits] [length: 31bits]  // BinaryType
- *     - [index: 63bits]                      // StringType, Binary, index to object in pool
- *   }
+ * (they are combined into a long).
  *
  * Instances of `UnsafeRow` act as pointers to row data stored in this format.
  */
@@ -62,13 +54,9 @@ public final class UnsafeRow extends MutableRow {
   private Object baseObject;
   private long baseOffset;
 
-  /** A pool to hold non-primitive objects */
-  private ObjectPool pool;
-
   public Object getBaseObject() { return baseObject; }
   public long getBaseOffset() { return baseOffset; }
   public int getSizeInBytes() { return sizeInBytes; }
-  public ObjectPool getPool() { return pool; }
 
   /** The number of fields in this row, used for calculating the bitset width (and in assertions) */
   private int numFields;
@@ -89,7 +77,42 @@ public static int calculateBitSetWidthInBytes(int numFields) {
     return ((numFields / 64) + (numFields % 64 == 0 ? 0 : 1)) * 8;
   }
 
-  public static final long OFFSET_BITS = 31L;
+  /**
+   * Field types that can be updated in place in UnsafeRows (e.g. we support set() for these types)
+   */
+  public static final Set<DataType> settableFieldTypes;
+
+  /**
+   * Fields types can be read(but not set (e.g. set() will throw UnsupportedOperationException).
+   */
+  public static final Set<DataType> readableFieldTypes;
+
+  // TODO: support DecimalType
+  static {
+    settableFieldTypes = Collections.unmodifiableSet(
+      new HashSet<>(
+        Arrays.asList(new DataType[] {
+          NullType,
+          BooleanType,
+          ByteType,
+          ShortType,
+          IntegerType,
+          LongType,
+          FloatType,
+          DoubleType,
+          DateType,
+          TimestampType
+    })));
+
+    // We support get() on a superset of the types for which we support set():
+    final Set<DataType> _readableFieldTypes = new HashSet<>(
+      Arrays.asList(new DataType[]{
+        StringType,
+        BinaryType
+      }));
+    _readableFieldTypes.addAll(settableFieldTypes);
+    readableFieldTypes = Collections.unmodifiableSet(_readableFieldTypes);
+  }
 
   /**
    * Construct a new UnsafeRow. The resulting row won't be usable until `pointTo()` has been called,
@@ -104,17 +127,14 @@ public UnsafeRow() { }
    * @param baseOffset the offset within the base object
    * @param numFields the number of fields in this row
    * @param sizeInBytes the size of this row's backing data, in bytes
-   * @param pool the object pool to hold arbitrary objects
    */
-  public void pointTo(
-      Object baseObject, long baseOffset, int numFields, int sizeInBytes, ObjectPool pool) {
+  public void pointTo(Object baseObject, long baseOffset, int numFields, int sizeInBytes) {
     assert numFields >= 0 : "numFields should >= 0";
     this.bitSetWidthInBytes = calculateBitSetWidthInBytes(numFields);
     this.baseObject = baseObject;
     this.baseOffset = baseOffset;
     this.numFields = numFields;
     this.sizeInBytes = sizeInBytes;
-    this.pool = pool;
   }
 
   private void assertIndexIsValid(int index) {
@@ -137,68 +157,9 @@ private void setNotNullAt(int i) {
     BitSetMethods.unset(baseObject, baseOffset, i);
   }
 
-  /**
-   * Updates the column `i` as Object `value`, which cannot be primitive types.
-   */
   @Override
-  public void update(int i, Object value) {
-    if (value == null) {
-      if (!isNullAt(i)) {
-        // remove the old value from pool
-        long idx = getLong(i);
-        if (idx <= 0) {
-          // this is the index of old value in pool, remove it
-          pool.replace((int)-idx, null);
-        } else {
-          // there will be some garbage left (UTF8String or byte[])
-        }
-        setNullAt(i);
-      }
-      return;
-    }
-
-    if (isNullAt(i)) {
-      // there is not an old value, put the new value into pool
-      int idx = pool.put(value);
-      setLong(i, (long)-idx);
-    } else {
-      // there is an old value, check the type, then replace it or update it
-      long v = getLong(i);
-      if (v <= 0) {
-        // it's the index in the pool, replace old value with new one
-        int idx = (int)-v;
-        pool.replace(idx, value);
-      } else {
-        // old value is UTF8String or byte[], try to reuse the space
-        boolean isString;
-        byte[] newBytes;
-        if (value instanceof UTF8String) {
-          newBytes = ((UTF8String) value).getBytes();
-          isString = true;
-        } else {
-          newBytes = (byte[]) value;
-          isString = false;
-        }
-        int offset = (int) ((v >> OFFSET_BITS) & Integer.MAX_VALUE);
-        int oldLength = (int) (v & Integer.MAX_VALUE);
-        if (newBytes.length <= oldLength) {
-          // the new value can fit in the old buffer, re-use it
-          PlatformDependent.copyMemory(
-            newBytes,
-            PlatformDependent.BYTE_ARRAY_OFFSET,
-            baseObject,
-            baseOffset + offset,
-            newBytes.length);
-          long flag = isString ? 1L << (OFFSET_BITS * 2) : 0L;
-          setLong(i, flag | (((long) offset) << OFFSET_BITS) | (long) newBytes.length);
-        } else {
-          // Cannot fit in the buffer
-          int idx = pool.put(value);
-          setLong(i, (long) -idx);
-        }
-      }
-    }
-    setNotNullAt(i);
+  public void update(int ordinal, Object value) {
+    throw new UnsupportedOperationException();
   }
 
   @Override
@@ -256,40 +217,14 @@ public void setFloat(int ordinal, float value) {
     PlatformDependent.UNSAFE.putFloat(baseObject, getFieldOffset(ordinal), value);
   }
 
-  /**
-   * Returns the object for column `i`, which should not be primitive type.
-   */
+  @Override
+  public int size() {
+    return numFields;
+  }
+
   @Override
   public Object get(int i) {
-    assertIndexIsValid(i);
-    if (isNullAt(i)) {
-      return null;
-    }
-    long v = PlatformDependent.UNSAFE.getLong(baseObject, getFieldOffset(i));
-    if (v <= 0) {
-      // It's an index to object in the pool.
-      int idx = (int)-v;
-      return pool.get(idx);
-    } else {
-      // The column could be StingType or BinaryType
-      boolean isString = (v >> (OFFSET_BITS * 2)) > 0;
-      int offset = (int) ((v >> OFFSET_BITS) & Integer.MAX_VALUE);
-      int size = (int) (v & Integer.MAX_VALUE);
-      final byte[] bytes = new byte[size];
-      // TODO(davies): Avoid the copy once we can manage the life cycle of Row well.
-      PlatformDependent.copyMemory(
-        baseObject,
-        baseOffset + offset,
-        bytes,
-        PlatformDependent.BYTE_ARRAY_OFFSET,
-        size
-      );
-      if (isString) {
-        return UTF8String.fromBytes(bytes);
-      } else {
-        return bytes;
-      }
-    }
+    throw new UnsupportedOperationException();
   }
 
   @Override
@@ -348,6 +283,38 @@ public double getDouble(int i) {
     }
   }
 
+  @Override
+  public UTF8String getUTF8String(int i) {
+    assertIndexIsValid(i);
+    return isNullAt(i) ? null : UTF8String.fromBytes(getBinary(i));
+  }
+
+  @Override
+  public byte[] getBinary(int i) {
+    if (isNullAt(i)) {
+      return null;
+    } else {
+      assertIndexIsValid(i);
+      final long offsetAndSize = getLong(i);
+      final int offset = (int) (offsetAndSize >> 32);
+      final int size = (int) (offsetAndSize & ((1L << 32) - 1));
+      final byte[] bytes = new byte[size];
+      PlatformDependent.copyMemory(
+        baseObject,
+        baseOffset + offset,
+        bytes,
+        PlatformDependent.BYTE_ARRAY_OFFSET,
+        size
+      );
+      return bytes;
+    }
+  }
+
+  @Override
+  public String getString(int i) {
+    return getUTF8String(i).toString();
+  }
+
   /**
    * Copies this row, returning a self-contained UnsafeRow that stores its data in an internal
    * byte array rather than referencing data stored in a data page.
@@ -356,23 +323,17 @@ public double getDouble(int i) {
    */
   @Override
   public UnsafeRow copy() {
-    if (pool != null) {
-      throw new UnsupportedOperationException(
-        "Copy is not supported for UnsafeRows that use object pools");
-    } else {
-      UnsafeRow rowCopy = new UnsafeRow();
-      final byte[] rowDataCopy = new byte[sizeInBytes];
-      PlatformDependent.copyMemory(
-        baseObject,
-        baseOffset,
-        rowDataCopy,
-        PlatformDependent.BYTE_ARRAY_OFFSET,
-        sizeInBytes
-      );
-      rowCopy.pointTo(
-        rowDataCopy, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, sizeInBytes, null);
-      return rowCopy;
-    }
+    UnsafeRow rowCopy = new UnsafeRow();
+    final byte[] rowDataCopy = new byte[sizeInBytes];
+    PlatformDependent.copyMemory(
+      baseObject,
+      baseOffset,
+      rowDataCopy,
+      PlatformDependent.BYTE_ARRAY_OFFSET,
+      sizeInBytes
+    );
+    rowCopy.pointTo(rowDataCopy, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, sizeInBytes);
+    return rowCopy;
   }
 
   /**
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/ObjectPool.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/ObjectPool.java
deleted file mode 100644
index 97f89a7d0b758..0000000000000
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/ObjectPool.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.util;
-
-/**
- * A object pool stores a collection of objects in array, then they can be referenced by the
- * pool plus an index.
- */
-public class ObjectPool {
-
-  /**
-   * An array to hold objects, which will grow as needed.
-   */
-  private Object[] objects;
-
-  /**
-   * How many objects in the pool.
-   */
-  private int numObj;
-
-  public ObjectPool(int capacity) {
-    objects = new Object[capacity];
-    numObj = 0;
-  }
-
-  /**
-   * Returns how many objects in the pool.
-   */
-  public int size() {
-    return numObj;
-  }
-
-  /**
-   * Returns the object at position `idx` in the array.
-   */
-  public Object get(int idx) {
-    assert (idx < numObj);
-    return objects[idx];
-  }
-
-  /**
-   * Puts an object `obj` at the end of array, returns the index of it.
-   * <p/>
-   * The array will grow as needed.
-   */
-  public int put(Object obj) {
-    if (numObj >= objects.length) {
-      Object[] tmp = new Object[objects.length * 2];
-      System.arraycopy(objects, 0, tmp, 0, objects.length);
-      objects = tmp;
-    }
-    objects[numObj++] = obj;
-    return numObj - 1;
-  }
-
-  /**
-   * Replaces the object at `idx` with new one `obj`.
-   */
-  public void replace(int idx, Object obj) {
-    assert (idx < numObj);
-    objects[idx] = obj;
-  }
-}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/UniqueObjectPool.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/UniqueObjectPool.java
deleted file mode 100644
index d512392dcaacc..0000000000000
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/UniqueObjectPool.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.util;
-
-import java.util.HashMap;
-
-/**
- * An unique object pool stores a collection of unique objects in it.
- */
-public class UniqueObjectPool extends ObjectPool {
-
-  /**
-   * A hash map from objects to their indexes in the array.
-   */
-  private HashMap<Object, Integer> objIndex;
-
-  public UniqueObjectPool(int capacity) {
-    super(capacity);
-    objIndex = new HashMap<Object, Integer>();
-  }
-
-  /**
-   * Put an object `obj` into the pool. If there is an existing object equals to `obj`, it will
-   * return the index of the existing one.
-   */
-  @Override
-  public int put(Object obj) {
-    if (objIndex.containsKey(obj)) {
-      return objIndex.get(obj);
-    } else {
-      int idx = super.put(obj);
-      objIndex.put(obj, idx);
-      return idx;
-    }
-  }
-
-  /**
-   * The objects can not be replaced.
-   */
-  @Override
-  public void replace(int idx, Object obj) {
-    throw new UnsupportedOperationException();
-  }
-}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
index 39fd6e1bc6d13..be4ff400c4754 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
@@ -30,7 +30,6 @@
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
-import org.apache.spark.sql.catalyst.util.ObjectPool;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.util.collection.unsafe.sort.PrefixComparator;
@@ -72,7 +71,7 @@ public UnsafeExternalRowSorter(
       sparkEnv.shuffleMemoryManager(),
       sparkEnv.blockManager(),
       taskContext,
-      new RowComparator(ordering, schema.length(), null),
+      new RowComparator(ordering, schema.length()),
       prefixComparator,
       4096,
       sparkEnv.conf()
@@ -140,8 +139,7 @@ public InternalRow next() {
               sortedIterator.getBaseObject(),
               sortedIterator.getBaseOffset(),
               numFields,
-              sortedIterator.getRecordLength(),
-              null);
+              sortedIterator.getRecordLength());
             if (!hasNext()) {
               row.copy(); // so that we don't have dangling pointers to freed page
               cleanupResources();
@@ -174,27 +172,25 @@ public Iterator<InternalRow> sort(Iterator<InternalRow> inputIterator) throws IO
    * Return true if UnsafeExternalRowSorter can sort rows with the given schema, false otherwise.
    */
   public static boolean supportsSchema(StructType schema) {
-    // TODO: add spilling note to explain why we do this for now:
     return UnsafeProjection.canSupport(schema);
   }
 
   private static final class RowComparator extends RecordComparator {
     private final Ordering<InternalRow> ordering;
     private final int numFields;
-    private final ObjectPool objPool;
     private final UnsafeRow row1 = new UnsafeRow();
     private final UnsafeRow row2 = new UnsafeRow();
 
-    public RowComparator(Ordering<InternalRow> ordering, int numFields, ObjectPool objPool) {
+    public RowComparator(Ordering<InternalRow> ordering, int numFields) {
       this.numFields = numFields;
       this.ordering = ordering;
-      this.objPool = objPool;
     }
 
     @Override
     public int compare(Object baseObj1, long baseOff1, Object baseObj2, long baseOff2) {
-      row1.pointTo(baseObj1, baseOff1, numFields, -1, objPool);
-      row2.pointTo(baseObj2, baseOff2, numFields, -1, objPool);
+      // TODO: Why are the sizes -1?
+      row1.pointTo(baseObj1, baseOff1, numFields, -1);
+      row2.pointTo(baseObj2, baseOff2, numFields, -1);
       return ordering.compare(row1, row2);
     }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index ae0ab2f4c63f5..4067833d5e648 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -281,7 +281,8 @@ object CatalystTypeConverters {
     }
     override def toScala(catalystValue: UTF8String): String =
       if (catalystValue == null) null else catalystValue.toString
-    override def toScalaImpl(row: InternalRow, column: Int): String = row(column).toString
+    override def toScalaImpl(row: InternalRow, column: Int): String =
+      row.getUTF8String(column).toString
   }
 
   private object DateConverter extends CatalystTypeConverter[Date, Date, Any] {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index 024973a6b9fcd..c7ec49b3d6c3d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -27,11 +27,12 @@ import org.apache.spark.unsafe.types.UTF8String
  */
 abstract class InternalRow extends Row {
 
+  def getUTF8String(i: Int): UTF8String = getAs[UTF8String](i)
+
+  def getBinary(i: Int): Array[Byte] = getAs[Array[Byte]](i)
+
   // This is only use for test
-  override def getString(i: Int): String = {
-    val str = getAs[UTF8String](i)
-    if (str != null) str.toString else null
-  }
+  override def getString(i: Int): String = getAs[UTF8String](i).toString
 
   // These expensive API should not be used internally.
   final override def getDecimal(i: Int): java.math.BigDecimal =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 4a13b687bf4ce..6aa4930cb8587 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -46,6 +46,8 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
         case LongType | TimestampType => input.getLong(ordinal)
         case FloatType => input.getFloat(ordinal)
         case DoubleType => input.getDouble(ordinal)
+        case StringType => input.getUTF8String(ordinal)
+        case BinaryType => input.getBinary(ordinal)
         case _ => input.get(ordinal)
       }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index 69758e653eba0..04872fbc8b091 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateUnsafeProjection, GenerateMutableProjection}
 import org.apache.spark.sql.types.{StructType, DataType}
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * A [[Projection]] that is calculated by calling the `eval` of each of the specified expressions.
@@ -177,6 +178,14 @@ class JoinedRow extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
+  override def getUTF8String(i: Int): UTF8String = {
+    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+  }
+
+  override def getBinary(i: Int): Array[Byte] = {
+    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+  }
+
   override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
@@ -271,6 +280,14 @@ class JoinedRow2 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
+  override def getUTF8String(i: Int): UTF8String = {
+    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+  }
+
+  override def getBinary(i: Int): Array[Byte] = {
+    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+  }
+
   override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
@@ -359,6 +376,15 @@ class JoinedRow3 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
+  override def getUTF8String(i: Int): UTF8String = {
+    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+  }
+
+  override def getBinary(i: Int): Array[Byte] = {
+    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+  }
+
+
   override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
@@ -447,6 +473,15 @@ class JoinedRow4 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
+  override def getUTF8String(i: Int): UTF8String = {
+    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+  }
+
+  override def getBinary(i: Int): Array[Byte] = {
+    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+  }
+
+
   override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
@@ -535,6 +570,15 @@ class JoinedRow5 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
+  override def getUTF8String(i: Int): UTF8String = {
+    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+  }
+
+  override def getBinary(i: Int): Array[Byte] = {
+    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+  }
+
+
   override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
@@ -623,6 +667,15 @@ class JoinedRow6 extends InternalRow {
 
   override def length: Int = row1.length + row2.length
 
+  override def getUTF8String(i: Int): UTF8String = {
+    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+  }
+
+  override def getBinary(i: Int): Array[Byte] = {
+    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+  }
+
+
   override def get(i: Int): Any =
     if (i < row1.length) row1(i) else row2(i - row1.length)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
index 885ab091fcdf5..c47b16c0f8585 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
@@ -17,13 +17,15 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import scala.util.Try
+
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.util.ObjectPool
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.array.ByteArrayMethods
 import org.apache.spark.unsafe.types.UTF8String
 
+
 /**
  * Converts Rows into UnsafeRow format. This class is NOT thread-safe.
  *
@@ -35,8 +37,6 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) {
     this(schema.fields.map(_.dataType))
   }
 
-  def numFields: Int = fieldTypes.length
-
   /** Re-used pointer to the unsafe row being written */
   private[this] val unsafeRow = new UnsafeRow()
 
@@ -77,9 +77,8 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) {
       row: InternalRow,
       baseObject: Object,
       baseOffset: Long,
-      rowLengthInBytes: Int,
-      pool: ObjectPool): Int = {
-    unsafeRow.pointTo(baseObject, baseOffset, writers.length, rowLengthInBytes, pool)
+      rowLengthInBytes: Int): Int = {
+    unsafeRow.pointTo(baseObject, baseOffset, writers.length, rowLengthInBytes)
 
     if (writers.length > 0) {
       // zero-out the bitset
@@ -94,16 +93,16 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) {
     }
 
     var fieldNumber = 0
-    var cursor: Int = fixedLengthSize
+    var appendCursor: Int = fixedLengthSize
     while (fieldNumber < writers.length) {
       if (row.isNullAt(fieldNumber)) {
         unsafeRow.setNullAt(fieldNumber)
       } else {
-        cursor += writers(fieldNumber).write(row, unsafeRow, fieldNumber, cursor)
+        appendCursor += writers(fieldNumber).write(row, unsafeRow, fieldNumber, appendCursor)
       }
       fieldNumber += 1
     }
-    cursor
+    appendCursor
   }
 
 }
@@ -118,11 +117,11 @@ private abstract class UnsafeColumnWriter {
    * @param source the row being converted
    * @param target a pointer to the converted unsafe row
    * @param column the column to write
-   * @param cursor the offset from the start of the unsafe row to the end of the row;
+   * @param appendCursor the offset from the start of the unsafe row to the end of the row;
    *                     used for calculating where variable-length data should be written
    * @return the number of variable-length bytes written
    */
-  def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int
+  def write(source: InternalRow, target: UnsafeRow, column: Int, appendCursor: Int): Int
 
   /**
    * Return the number of bytes that are needed to write this variable-length value.
@@ -144,21 +143,19 @@ private object UnsafeColumnWriter {
       case DoubleType => DoubleUnsafeColumnWriter
       case StringType => StringUnsafeColumnWriter
       case BinaryType => BinaryUnsafeColumnWriter
-      case t => ObjectUnsafeColumnWriter
+      case t =>
+        throw new UnsupportedOperationException(s"Do not know how to write columns of type $t")
     }
   }
 
   /**
    * Returns whether the dataType can be embedded into UnsafeRow (not using ObjectPool).
    */
-  def canEmbed(dataType: DataType): Boolean = {
-    forType(dataType) != ObjectUnsafeColumnWriter
-  }
+  def canEmbed(dataType: DataType): Boolean = Try(forType(dataType)).isSuccess
 }
 
 // ------------------------------------------------------------------------------------------------
 
-
 private abstract class PrimitiveUnsafeColumnWriter extends UnsafeColumnWriter {
   // Primitives don't write to the variable-length region:
   def getSize(sourceRow: InternalRow, column: Int): Int = 0
@@ -249,8 +246,7 @@ private abstract class BytesUnsafeColumnWriter extends UnsafeColumnWriter {
       offset,
       numBytes
     )
-    val flag = if (isString) 1L << (UnsafeRow.OFFSET_BITS * 2) else 0
-    target.setLong(column, flag | (cursor.toLong << UnsafeRow.OFFSET_BITS) | numBytes.toLong)
+    target.setLong(column, (cursor.toLong << 32) | numBytes.toLong)
     ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes)
   }
 }
@@ -278,13 +274,3 @@ private object BinaryUnsafeColumnWriter extends BytesUnsafeColumnWriter {
   def getSize(value: Array[Byte]): Int =
     ByteArrayMethods.roundNumberOfBytesToNearestWord(value.length)
 }
-
-private object ObjectUnsafeColumnWriter extends UnsafeColumnWriter {
-  override def getSize(sourceRow: InternalRow, column: Int): Int = 0
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    val obj = source.get(column)
-    val idx = target.getPool.put(obj)
-    target.setLong(column, - idx)
-    0
-  }
-}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 319dcd1c04316..48225e1574600 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -105,10 +105,11 @@ class CodeGenContext {
    */
   def getColumn(row: String, dataType: DataType, ordinal: Int): String = {
     val jt = javaType(dataType)
-    if (isPrimitiveType(jt)) {
-      s"$row.get${primitiveTypeName(jt)}($ordinal)"
-    } else {
-      s"($jt)$row.apply($ordinal)"
+    dataType match {
+      case _ if isPrimitiveType(jt) => s"$row.get${primitiveTypeName(jt)}($ordinal)"
+      case StringType => s"$row.getUTF8String($ordinal)"
+      case BinaryType => s"$row.getBinary($ordinal)"
+      case _ => s"($jt)$row.apply($ordinal)"
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index 3a8e8302b24fd..d65e5c38ebf5c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -98,7 +98,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
       }
 
       public UnsafeRow apply(InternalRow i) {
-        ${allExprs}
+        $allExprs
 
         // additionalSize had '+' in the beginning
         int numBytes = $fixedSize $additionalSize;
@@ -106,7 +106,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           buffer = new byte[numBytes];
         }
         target.pointTo(buffer, org.apache.spark.unsafe.PlatformDependent.BYTE_ARRAY_OFFSET,
-          ${expressions.size}, numBytes, null);
+          ${expressions.size}, numBytes);
         int cursor = $fixedSize;
         $writers
         return target;
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala
index 1868f119f0e97..e3e7a11dba973 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LocalRelation.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.plans.logical
 
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, analysis}
 import org.apache.spark.sql.types.{StructField, StructType}
@@ -28,6 +29,12 @@ object LocalRelation {
     new LocalRelation(StructType(output1 +: output).toAttributes)
   }
 
+  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
+    val schema = StructType.fromAttributes(output)
+    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
+    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
+  }
+
   def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
     val schema = StructType.fromAttributes(output)
     val converter = CatalystTypeConverters.createToCatalystConverter(schema)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
index c9667e90a0aaa..7566cb59e34ee 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
@@ -24,9 +24,8 @@ import org.scalatest.{BeforeAndAfterEach, Matchers}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager}
+import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, TaskMemoryManager, MemoryAllocator}
 import org.apache.spark.unsafe.types.UTF8String
 
 
@@ -35,10 +34,10 @@ class UnsafeFixedWidthAggregationMapSuite
   with Matchers
   with BeforeAndAfterEach {
 
+  import UnsafeFixedWidthAggregationMap._
+
   private val groupKeySchema = StructType(StructField("product", StringType) :: Nil)
   private val aggBufferSchema = StructType(StructField("salePrice", IntegerType) :: Nil)
-  private def emptyProjection: Projection =
-    GenerateProjection.generate(Seq(Literal(0)), Seq(AttributeReference("price", IntegerType)()))
   private def emptyAggregationBuffer: InternalRow = InternalRow(0)
 
   private var memoryManager: TaskMemoryManager = null
@@ -54,11 +53,21 @@ class UnsafeFixedWidthAggregationMapSuite
     }
   }
 
+  test("supported schemas") {
+    assert(!supportsAggregationBufferSchema(StructType(StructField("x", StringType) :: Nil)))
+    assert(supportsGroupKeySchema(StructType(StructField("x", StringType) :: Nil)))
+
+    assert(
+      !supportsAggregationBufferSchema(StructType(StructField("x", ArrayType(IntegerType)) :: Nil)))
+    assert(
+      !supportsGroupKeySchema(StructType(StructField("x", ArrayType(IntegerType)) :: Nil)))
+  }
+
   test("empty map") {
     val map = new UnsafeFixedWidthAggregationMap(
-      emptyProjection,
-      new UnsafeRowConverter(groupKeySchema),
-      new UnsafeRowConverter(aggBufferSchema),
+      emptyAggregationBuffer,
+      aggBufferSchema,
+      groupKeySchema,
       memoryManager,
       1024, // initial capacity
       false // disable perf metrics
@@ -69,9 +78,9 @@ class UnsafeFixedWidthAggregationMapSuite
 
   test("updating values for a single key") {
     val map = new UnsafeFixedWidthAggregationMap(
-      emptyProjection,
-      new UnsafeRowConverter(groupKeySchema),
-      new UnsafeRowConverter(aggBufferSchema),
+      emptyAggregationBuffer,
+      aggBufferSchema,
+      groupKeySchema,
       memoryManager,
       1024, // initial capacity
       false // disable perf metrics
@@ -95,9 +104,9 @@ class UnsafeFixedWidthAggregationMapSuite
 
   test("inserting large random keys") {
     val map = new UnsafeFixedWidthAggregationMap(
-      emptyProjection,
-      new UnsafeRowConverter(groupKeySchema),
-      new UnsafeRowConverter(aggBufferSchema),
+      emptyAggregationBuffer,
+      aggBufferSchema,
+      groupKeySchema,
       memoryManager,
       128, // initial capacity
       false // disable perf metrics
@@ -112,36 +121,6 @@ class UnsafeFixedWidthAggregationMapSuite
     }.toSet
     seenKeys.size should be (groupKeys.size)
     seenKeys should be (groupKeys)
-
-    map.free()
-  }
-
-  test("with decimal in the key and values") {
-    val groupKeySchema = StructType(StructField("price", DecimalType(10, 0)) :: Nil)
-    val aggBufferSchema = StructType(StructField("amount", DecimalType.Unlimited) :: Nil)
-    val emptyProjection = GenerateProjection.generate(Seq(Literal(Decimal(0))),
-      Seq(AttributeReference("price", DecimalType.Unlimited)()))
-    val map = new UnsafeFixedWidthAggregationMap(
-      emptyProjection,
-      new UnsafeRowConverter(groupKeySchema),
-      new UnsafeRowConverter(aggBufferSchema),
-      memoryManager,
-      1, // initial capacity
-      false // disable perf metrics
-    )
-
-    (0 until 100).foreach { i =>
-      val groupKey = InternalRow(Decimal(i % 10))
-      val row = map.getAggregationBuffer(groupKey)
-      row.update(0, Decimal(i))
-    }
-    val seenKeys: Set[Int] = map.iterator().asScala.map { entry =>
-      entry.key.getAs[Decimal](0).toInt
-    }.toSet
-    seenKeys.size should be (10)
-    seenKeys should be ((0 until 10).toSet)
-
-    map.free()
   }
 
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index dff5faf9f6ec8..8819234e78e60 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -24,7 +24,7 @@ import org.scalatest.Matchers
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.util.{ObjectPool, DateTimeUtils}
+import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.array.ByteArrayMethods
@@ -45,12 +45,11 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(sizeRequired === 8 + (3 * 8))
     val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
     val numBytesWritten =
-      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired, null)
+      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired)
     assert(numBytesWritten === sizeRequired)
 
     val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(
-      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired, null)
+    unsafeRow.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired)
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.getLong(1) === 1)
     assert(unsafeRow.getInt(2) === 2)
@@ -87,67 +86,15 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       ByteArrayMethods.roundNumberOfBytesToNearestWord("World".getBytes.length))
     val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
     val numBytesWritten = converter.writeRow(
-      row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired, null)
+      row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired)
     assert(numBytesWritten === sizeRequired)
 
     val unsafeRow = new UnsafeRow()
-    val pool = new ObjectPool(10)
     unsafeRow.pointTo(
-      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired, pool)
+      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired)
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.getString(1) === "Hello")
-    assert(unsafeRow.get(2) === "World".getBytes)
-
-    unsafeRow.update(1, UTF8String.fromString("World"))
-    assert(unsafeRow.getString(1) === "World")
-    assert(pool.size === 0)
-    unsafeRow.update(1, UTF8String.fromString("Hello World"))
-    assert(unsafeRow.getString(1) === "Hello World")
-    assert(pool.size === 1)
-
-    unsafeRow.update(2, "World".getBytes)
-    assert(unsafeRow.get(2) === "World".getBytes)
-    assert(pool.size === 1)
-    unsafeRow.update(2, "Hello World".getBytes)
-    assert(unsafeRow.get(2) === "Hello World".getBytes)
-    assert(pool.size === 2)
-
-    // We do not support copy() for UnsafeRows that reference ObjectPools
-    intercept[UnsupportedOperationException] {
-      unsafeRow.copy()
-    }
-  }
-
-  test("basic conversion with primitive, decimal and array") {
-    val fieldTypes: Array[DataType] = Array(LongType, DecimalType(10, 0), ArrayType(StringType))
-    val converter = new UnsafeRowConverter(fieldTypes)
-
-    val row = new SpecificMutableRow(fieldTypes)
-    row.setLong(0, 0)
-    row.update(1, Decimal(1))
-    row.update(2, Array(2))
-
-    val pool = new ObjectPool(10)
-    val sizeRequired: Int = converter.getSizeRequirement(row)
-    assert(sizeRequired === 8 + (8 * 3))
-    val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
-    val numBytesWritten =
-      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired, pool)
-    assert(numBytesWritten === sizeRequired)
-    assert(pool.size === 2)
-
-    val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(
-      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired, pool)
-    assert(unsafeRow.getLong(0) === 0)
-    assert(unsafeRow.get(1) === Decimal(1))
-    assert(unsafeRow.get(2) === Array(2))
-
-    unsafeRow.update(1, Decimal(2))
-    assert(unsafeRow.get(1) === Decimal(2))
-    unsafeRow.update(2, Array(3, 4))
-    assert(unsafeRow.get(2) === Array(3, 4))
-    assert(pool.size === 2)
+    assert(unsafeRow.getBinary(2) === "World".getBytes)
   }
 
   test("basic conversion with primitive, string, date and timestamp types") {
@@ -165,25 +112,25 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       ByteArrayMethods.roundNumberOfBytesToNearestWord("Hello".getBytes.length))
     val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
     val numBytesWritten =
-      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired, null)
+      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired)
     assert(numBytesWritten === sizeRequired)
 
     val unsafeRow = new UnsafeRow()
     unsafeRow.pointTo(
-      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired, null)
+      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired)
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.getString(1) === "Hello")
     // Date is represented as Int in unsafeRow
     assert(DateTimeUtils.toJavaDate(unsafeRow.getInt(2)) === Date.valueOf("1970-01-01"))
     // Timestamp is represented as Long in unsafeRow
     DateTimeUtils.toJavaTimestamp(unsafeRow.getLong(3)) should be
-      (Timestamp.valueOf("2015-05-08 08:10:25"))
+    (Timestamp.valueOf("2015-05-08 08:10:25"))
 
     unsafeRow.setInt(2, DateTimeUtils.fromJavaDate(Date.valueOf("2015-06-22")))
     assert(DateTimeUtils.toJavaDate(unsafeRow.getInt(2)) === Date.valueOf("2015-06-22"))
     unsafeRow.setLong(3, DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-06-22 08:10:25")))
     DateTimeUtils.toJavaTimestamp(unsafeRow.getLong(3)) should be
-      (Timestamp.valueOf("2015-06-22 08:10:25"))
+    (Timestamp.valueOf("2015-06-22 08:10:25"))
   }
 
   test("null handling") {
@@ -197,9 +144,9 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       FloatType,
       DoubleType,
       StringType,
-      BinaryType,
-      DecimalType.Unlimited,
-      ArrayType(IntegerType)
+      BinaryType
+      // DecimalType.Unlimited,
+      // ArrayType(IntegerType)
     )
     val converter = new UnsafeRowConverter(fieldTypes)
 
@@ -215,14 +162,13 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     val createdFromNullBuffer: Array[Long] = new Array[Long](sizeRequired / 8)
     val numBytesWritten = converter.writeRow(
       rowWithAllNullColumns, createdFromNullBuffer, PlatformDependent.LONG_ARRAY_OFFSET,
-      sizeRequired, null)
+      sizeRequired)
     assert(numBytesWritten === sizeRequired)
 
     val createdFromNull = new UnsafeRow()
     createdFromNull.pointTo(
-      createdFromNullBuffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length,
-      sizeRequired, null)
-    for (i <- 0 to fieldTypes.length - 1) {
+      createdFromNullBuffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired)
+    for (i <- fieldTypes.indices) {
       assert(createdFromNull.isNullAt(i))
     }
     assert(createdFromNull.getBoolean(1) === false)
@@ -232,10 +178,10 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(createdFromNull.getLong(5) === 0)
     assert(java.lang.Float.isNaN(createdFromNull.getFloat(6)))
     assert(java.lang.Double.isNaN(createdFromNull.getDouble(7)))
-    assert(createdFromNull.getString(8) === null)
-    assert(createdFromNull.get(9) === null)
-    assert(createdFromNull.get(10) === null)
-    assert(createdFromNull.get(11) === null)
+    assert(createdFromNull.getUTF8String(8) === null)
+    assert(createdFromNull.getBinary(9) === null)
+    // assert(createdFromNull.get(10) === null)
+    // assert(createdFromNull.get(11) === null)
 
     // If we have an UnsafeRow with columns that are initially non-null and we null out those
     // columns, then the serialized row representation should be identical to what we would get by
@@ -252,19 +198,18 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       r.setDouble(7, 700)
       r.update(8, UTF8String.fromString("hello"))
       r.update(9, "world".getBytes)
-      r.update(10, Decimal(10))
-      r.update(11, Array(11))
+      // r.update(10, Decimal(10))
+      // r.update(11, Array(11))
       r
     }
-    val pool = new ObjectPool(1)
     val setToNullAfterCreationBuffer: Array[Long] = new Array[Long](sizeRequired / 8 + 2)
     converter.writeRow(
       rowWithNoNullColumns, setToNullAfterCreationBuffer, PlatformDependent.LONG_ARRAY_OFFSET,
-      sizeRequired, pool)
+      sizeRequired)
     val setToNullAfterCreation = new UnsafeRow()
     setToNullAfterCreation.pointTo(
       setToNullAfterCreationBuffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length,
-      sizeRequired, pool)
+      sizeRequired)
 
     assert(setToNullAfterCreation.isNullAt(0) === rowWithNoNullColumns.isNullAt(0))
     assert(setToNullAfterCreation.getBoolean(1) === rowWithNoNullColumns.getBoolean(1))
@@ -275,14 +220,11 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(setToNullAfterCreation.getFloat(6) === rowWithNoNullColumns.getFloat(6))
     assert(setToNullAfterCreation.getDouble(7) === rowWithNoNullColumns.getDouble(7))
     assert(setToNullAfterCreation.getString(8) === rowWithNoNullColumns.getString(8))
-    assert(setToNullAfterCreation.get(9) === rowWithNoNullColumns.get(9))
-    assert(setToNullAfterCreation.get(10) === rowWithNoNullColumns.get(10))
-    assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11))
+    assert(setToNullAfterCreation.getBinary(9) === rowWithNoNullColumns.get(9))
+    // assert(setToNullAfterCreation.get(10) === rowWithNoNullColumns.get(10))
+    // assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11))
 
-    for (i <- 0 to fieldTypes.length - 1) {
-      if (i >= 8) {
-        setToNullAfterCreation.update(i, null)
-      }
+    for (i <- fieldTypes.indices) {
       setToNullAfterCreation.setNullAt(i)
     }
     // There are some garbage left in the var-length area
@@ -297,10 +239,10 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     setToNullAfterCreation.setLong(5, 500)
     setToNullAfterCreation.setFloat(6, 600)
     setToNullAfterCreation.setDouble(7, 700)
-    setToNullAfterCreation.update(8, UTF8String.fromString("hello"))
-    setToNullAfterCreation.update(9, "world".getBytes)
-    setToNullAfterCreation.update(10, Decimal(10))
-    setToNullAfterCreation.update(11, Array(11))
+    // setToNullAfterCreation.update(8, UTF8String.fromString("hello"))
+    // setToNullAfterCreation.update(9, "world".getBytes)
+    // setToNullAfterCreation.update(10, Decimal(10))
+    // setToNullAfterCreation.update(11, Array(11))
 
     assert(setToNullAfterCreation.isNullAt(0) === rowWithNoNullColumns.isNullAt(0))
     assert(setToNullAfterCreation.getBoolean(1) === rowWithNoNullColumns.getBoolean(1))
@@ -310,10 +252,10 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(setToNullAfterCreation.getLong(5) === rowWithNoNullColumns.getLong(5))
     assert(setToNullAfterCreation.getFloat(6) === rowWithNoNullColumns.getFloat(6))
     assert(setToNullAfterCreation.getDouble(7) === rowWithNoNullColumns.getDouble(7))
-    assert(setToNullAfterCreation.getString(8) === rowWithNoNullColumns.getString(8))
-    assert(setToNullAfterCreation.get(9) === rowWithNoNullColumns.get(9))
-    assert(setToNullAfterCreation.get(10) === rowWithNoNullColumns.get(10))
-    assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11))
+    // assert(setToNullAfterCreation.getString(8) === rowWithNoNullColumns.getString(8))
+    // assert(setToNullAfterCreation.get(9) === rowWithNoNullColumns.get(9))
+    // assert(setToNullAfterCreation.get(10) === rowWithNoNullColumns.get(10))
+    // assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11))
   }
 
   test("NaN canonicalization") {
@@ -330,12 +272,9 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     val converter = new UnsafeRowConverter(fieldTypes)
     val row1Buffer = new Array[Byte](converter.getSizeRequirement(row1))
     val row2Buffer = new Array[Byte](converter.getSizeRequirement(row2))
-    converter.writeRow(
-      row1, row1Buffer, PlatformDependent.BYTE_ARRAY_OFFSET, row1Buffer.length, null)
-    converter.writeRow(
-      row2, row2Buffer, PlatformDependent.BYTE_ARRAY_OFFSET, row2Buffer.length, null)
+    converter.writeRow(row1, row1Buffer, PlatformDependent.BYTE_ARRAY_OFFSET, row1Buffer.length)
+    converter.writeRow(row2, row2Buffer, PlatformDependent.BYTE_ARRAY_OFFSET, row2Buffer.length)
 
     assert(row1Buffer.toSeq === row2Buffer.toSeq)
   }
-
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ObjectPoolSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ObjectPoolSuite.scala
deleted file mode 100644
index 94764df4b9cdb..0000000000000
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/ObjectPoolSuite.scala
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.util
-
-import org.scalatest.Matchers
-
-import org.apache.spark.SparkFunSuite
-
-class ObjectPoolSuite extends SparkFunSuite with Matchers {
-
-  test("pool") {
-    val pool = new ObjectPool(1)
-    assert(pool.put(1) === 0)
-    assert(pool.put("hello") === 1)
-    assert(pool.put(false) === 2)
-
-    assert(pool.get(0) === 1)
-    assert(pool.get(1) === "hello")
-    assert(pool.get(2) === false)
-    assert(pool.size() === 3)
-
-    pool.replace(1, "world")
-    assert(pool.get(1) === "world")
-    assert(pool.size() === 3)
-  }
-
-  test("unique pool") {
-    val pool = new UniqueObjectPool(1)
-    assert(pool.put(1) === 0)
-    assert(pool.put("hello") === 1)
-    assert(pool.put(1) === 0)
-    assert(pool.put("hello") === 1)
-
-    assert(pool.get(0) === 1)
-    assert(pool.get(1) === "hello")
-    assert(pool.size() === 2)
-
-    intercept[UnsupportedOperationException] {
-      pool.replace(1, "world")
-    }
-  }
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 323ff17357fda..fa942a1f8fd93 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -20,6 +20,8 @@ package org.apache.spark.sql
 import java.io.CharArrayWriter
 import java.util.Properties
 
+import org.apache.spark.unsafe.types.UTF8String
+
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
@@ -1282,7 +1284,7 @@ class DataFrame private[sql](
 
     val outputCols = (if (cols.isEmpty) numericColumns.map(_.prettyString) else cols).toList
 
-    val ret: Seq[InternalRow] = if (outputCols.nonEmpty) {
+    val ret: Seq[Row] = if (outputCols.nonEmpty) {
       val aggExprs = statistics.flatMap { case (_, colToAgg) =>
         outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c))
       }
@@ -1290,19 +1292,18 @@ class DataFrame private[sql](
       val row = agg(aggExprs.head, aggExprs.tail: _*).head().toSeq
 
       // Pivot the data so each summary is one row
-      row.grouped(outputCols.size).toSeq.zip(statistics).map {
-        case (aggregation, (statistic, _)) =>
-          InternalRow(statistic :: aggregation.toList: _*)
+      row.grouped(outputCols.size).toSeq.zip(statistics).map { case (aggregation, (statistic, _)) =>
+        Row(statistic :: aggregation.toList: _*)
       }
     } else {
       // If there are no output columns, just output a single column that contains the stats.
-      statistics.map { case (name, _) => InternalRow(name) }
+      statistics.map { case (name, _) => Row(name) }
     }
 
     // All columns are string type
     val schema = StructType(
       StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes
-    LocalRelation(schema, ret)
+    LocalRelation.fromExternalRows(schema, ret)
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 0e63f2fe29cb3..16176abe3a51d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -239,6 +239,11 @@ case class GeneratedAggregate(
       StructType(fields)
     }
 
+    val schemaSupportsUnsafe: Boolean = {
+      UnsafeFixedWidthAggregationMap.supportsAggregationBufferSchema(aggregationBufferSchema) &&
+        UnsafeFixedWidthAggregationMap.supportsGroupKeySchema(groupKeySchema)
+    }
+
     child.execute().mapPartitions { iter =>
       // Builds a new custom class for holding the results of aggregation for a group.
       val initialValues = computeFunctions.flatMap(_.initialValues)
@@ -290,13 +295,14 @@ case class GeneratedAggregate(
 
         val resultProjection = resultProjectionBuilder()
         Iterator(resultProjection(buffer))
-      } else if (unsafeEnabled) {
+
+      } else if (unsafeEnabled && schemaSupportsUnsafe) {
         assert(iter.hasNext, "There should be at least one row for this path")
         log.info("Using Unsafe-based aggregator")
         val aggregationMap = new UnsafeFixedWidthAggregationMap(
-          newAggregationBuffer,
-          new UnsafeRowConverter(groupKeySchema),
-          new UnsafeRowConverter(aggregationBufferSchema),
+          newAggregationBuffer(EmptyRow),
+          aggregationBufferSchema,
+          groupKeySchema,
           TaskContext.get.taskMemoryManager(),
           1024 * 16, // initial capacity
           false // disable tracking of performance metrics
@@ -331,6 +337,9 @@ case class GeneratedAggregate(
           }
         }
       } else {
+        if (unsafeEnabled) {
+          log.info("Not using Unsafe-based aggregator because it is not supported for this schema")
+        }
         val buffers = new java.util.HashMap[InternalRow, MutableRow]()
 
         var currentRow: InternalRow = null
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala
index cd341180b6100..34e926e4582be 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala
@@ -34,13 +34,11 @@ private[sql] case class LocalTableScan(
 
   protected override def doExecute(): RDD[InternalRow] = rdd
 
-
   override def executeCollect(): Array[Row] = {
     val converter = CatalystTypeConverters.createToScalaConverter(schema)
     rows.map(converter(_).asInstanceOf[Row]).toArray
   }
 
-
   override def executeTake(limit: Int): Array[Row] = {
     val converter = CatalystTypeConverters.createToScalaConverter(schema)
     rows.map(converter(_).asInstanceOf[Row]).take(limit).toArray
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
index 318550e5ed899..16498da080c88 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala
@@ -37,9 +37,6 @@ import org.apache.spark.unsafe.PlatformDependent
  * Note that this serializer implements only the [[Serializer]] methods that are used during
  * shuffle, so certain [[SerializerInstance]] methods will throw UnsupportedOperationException.
  *
- * This serializer does not support UnsafeRows that use
- * [[org.apache.spark.sql.catalyst.util.ObjectPool]].
- *
  * @param numFields the number of fields in the row being serialized.
  */
 private[sql] class UnsafeRowSerializer(numFields: Int) extends Serializer with Serializable {
@@ -65,7 +62,6 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst
 
     override def writeValue[T: ClassTag](value: T): SerializationStream = {
       val row = value.asInstanceOf[UnsafeRow]
-      assert(row.getPool == null, "UnsafeRowSerializer does not support ObjectPool")
       dOut.writeInt(row.getSizeInBytes)
       row.writeToStream(out, writeBuffer)
       this
@@ -118,7 +114,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst
               rowBuffer = new Array[Byte](rowSize)
             }
             ByteStreams.readFully(in, rowBuffer, 0, rowSize)
-            row.pointTo(rowBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, rowSize, null)
+            row.pointTo(rowBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, rowSize)
             rowSize = dIn.readInt() // read the next row's size
             if (rowSize == EOF) { // We are returning the last row in this stream
               val _rowTuple = rowTuple
@@ -152,7 +148,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst
           rowBuffer = new Array[Byte](rowSize)
         }
         ByteStreams.readFully(in, rowBuffer, 0, rowSize)
-        row.pointTo(rowBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, rowSize, null)
+        row.pointTo(rowBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, rowSize)
         row.asInstanceOf[T]
       }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
index d36e2639376e7..ad3bb1744cb3c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
@@ -53,8 +53,7 @@ class UnsafeRowSuite extends SparkFunSuite {
           offheapRowPage.getBaseObject,
           offheapRowPage.getBaseOffset,
           3, // num fields
-          arrayBackedUnsafeRow.getSizeInBytes,
-          null // object pool
+          arrayBackedUnsafeRow.getSizeInBytes
         )
         assert(offheapUnsafeRow.getBaseObject === null)
         val baos = new ByteArrayOutputStream()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
index 5fe73f7e0b072..7a4baa9e4a49d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
@@ -39,7 +39,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
   ignore("sort followed by limit should not leak memory") {
     // TODO: this test is going to fail until we implement a proper iterator interface
     // with a close() method.
-    TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "true")
+    TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "false")
     checkThatPlansAgree(
       (1 to 100).map(v => Tuple1(v)).toDF("a"),
       (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
@@ -58,7 +58,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
         sortAnswers = false
       )
     } finally {
-      TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "true")
+      TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "false")
 
     }
   }
@@ -91,7 +91,8 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
       assert(UnsafeExternalSort.supportsSchema(inputDf.schema))
       checkThatPlansAgree(
         inputDf,
-        UnsafeExternalSort(sortOrder, global = true, _: SparkPlan, testSpillFrequency = 23),
+        plan => ConvertToSafe(
+          UnsafeExternalSort(sortOrder, global = true, plan: SparkPlan, testSpillFrequency = 23)),
         Sort(sortOrder, global = true, _: SparkPlan),
         sortAnswers = false
       )
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala
index bd788ec8c14b1..a1e1695717e23 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala
@@ -23,29 +23,25 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeRowConverter}
-import org.apache.spark.sql.catalyst.util.ObjectPool
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.PlatformDependent
 
 class UnsafeRowSerializerSuite extends SparkFunSuite {
 
-  private def toUnsafeRow(
-      row: Row,
-      schema: Array[DataType],
-      objPool: ObjectPool = null): UnsafeRow = {
+  private def toUnsafeRow(row: Row, schema: Array[DataType]): UnsafeRow = {
     val internalRow = CatalystTypeConverters.convertToCatalyst(row).asInstanceOf[InternalRow]
     val rowConverter = new UnsafeRowConverter(schema)
     val rowSizeInBytes = rowConverter.getSizeRequirement(internalRow)
     val byteArray = new Array[Byte](rowSizeInBytes)
     rowConverter.writeRow(
-      internalRow, byteArray, PlatformDependent.BYTE_ARRAY_OFFSET, rowSizeInBytes, objPool)
+      internalRow, byteArray, PlatformDependent.BYTE_ARRAY_OFFSET, rowSizeInBytes)
     val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(
-      byteArray, PlatformDependent.BYTE_ARRAY_OFFSET, row.length, rowSizeInBytes, objPool)
+    unsafeRow.pointTo(byteArray, PlatformDependent.BYTE_ARRAY_OFFSET, row.length, rowSizeInBytes)
     unsafeRow
   }
 
-  test("toUnsafeRow() test helper method") {
+  ignore("toUnsafeRow() test helper method") {
+    // This currently doesnt work because the generic getter throws an exception.
     val row = Row("Hello", 123)
     val unsafeRow = toUnsafeRow(row, Array(StringType, IntegerType))
     assert(row.getString(0) === unsafeRow.get(0).toString)

From 26ed22aec8af42c6dc161e0a2827a4235a49a9a4 Mon Sep 17 00:00:00 2001
From: "Zhang, Liye" <liye.zhang@intel.com>
Date: Thu, 23 Jul 2015 12:43:54 +0100
Subject: [PATCH 337/549] [SPARK-9212] [CORE] upgrade Netty version to
 4.0.29.Final

related JIRA: [SPARK-9212](https://issues.apache.org/jira/browse/SPARK-9212) and [SPARK-8101](https://issues.apache.org/jira/browse/SPARK-8101)

Author: Zhang, Liye <liye.zhang@intel.com>

Closes #7562 from liyezhang556520/SPARK-9212 and squashes the following commits:

1917729 [Zhang, Liye] SPARK-9212 upgrade Netty version to 4.0.29.Final
---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 1f44dc8abe1d4..35fc8c44bc1b0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -573,7 +573,7 @@
       <dependency>
         <groupId>io.netty</groupId>
         <artifactId>netty-all</artifactId>
-        <version>4.0.28.Final</version>
+        <version>4.0.29.Final</version>
       </dependency>
       <dependency>
         <groupId>org.apache.derby</groupId>

From 52ef76de219c4bf19c54c99414b89a67d0bf457b Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Thu, 23 Jul 2015 09:37:53 -0700
Subject: [PATCH 338/549] [SPARK-9082] [SQL] [FOLLOW-UP] use `partition` in
 `PushPredicateThroughProject`

a follow up of https://github.com/apache/spark/pull/7446

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7607 from cloud-fan/tmp and squashes the following commits:

7106989 [Wenchen Fan] use `partition` in `PushPredicateThroughProject`
---
 .../sql/catalyst/optimizer/Optimizer.scala    | 22 +++++++------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index d2db3dd3d078e..b59f800e7cc0f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -553,33 +553,27 @@ object PushPredicateThroughProject extends Rule[LogicalPlan] with PredicateHelpe
       // Split the condition into small conditions by `And`, so that we can push down part of this
       // condition without nondeterministic expressions.
       val andConditions = splitConjunctivePredicates(condition)
-      val nondeterministicConditions = andConditions.filter(hasNondeterministic(_, aliasMap))
+
+      val (deterministic, nondeterministic) = andConditions.partition(_.collect {
+        case a: Attribute if aliasMap.contains(a) => aliasMap(a)
+      }.forall(_.deterministic))
 
       // If there is no nondeterministic conditions, push down the whole condition.
-      if (nondeterministicConditions.isEmpty) {
+      if (nondeterministic.isEmpty) {
         project.copy(child = Filter(replaceAlias(condition, aliasMap), grandChild))
       } else {
         // If they are all nondeterministic conditions, leave it un-changed.
-        if (nondeterministicConditions.length == andConditions.length) {
+        if (deterministic.isEmpty) {
           filter
         } else {
-          val deterministicConditions = andConditions.filterNot(hasNondeterministic(_, aliasMap))
           // Push down the small conditions without nondeterministic expressions.
-          val pushedCondition = deterministicConditions.map(replaceAlias(_, aliasMap)).reduce(And)
-          Filter(nondeterministicConditions.reduce(And),
+          val pushedCondition = deterministic.map(replaceAlias(_, aliasMap)).reduce(And)
+          Filter(nondeterministic.reduce(And),
             project.copy(child = Filter(pushedCondition, grandChild)))
         }
       }
   }
 
-  private def hasNondeterministic(
-      condition: Expression,
-      sourceAliases: AttributeMap[Expression]) = {
-    condition.collect {
-      case a: Attribute if sourceAliases.contains(a) => sourceAliases(a)
-    }.exists(!_.deterministic)
-  }
-
   // Substitute any attributes that are produced by the child projection, so that we safely
   // eliminate it.
   private def replaceAlias(condition: Expression, sourceAliases: AttributeMap[Expression]) = {

From 19aeab57c1b0c739edb5ba351f98e930e1a0f984 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Thu, 23 Jul 2015 10:28:20 -0700
Subject: [PATCH 339/549] [Build][Minor] Fix building error & performance

1. When build the latest code with sbt, it throws exception like:
[error] /home/hcheng/git/catalyst/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala:78: match may not be exhaustive.
[error] It would fail on the following input: UNKNOWN
[error]       val classNameByStatus = status match {
[error]

2. Potential performance issue when implicitly convert an Array[Any] to Seq[Any]

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7611 from chenghao-intel/toseq and squashes the following commits:

cab75c5 [Cheng Hao] remove the toArray
24df682 [Cheng Hao] fix building error & performance
---
 core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala  | 1 +
 .../org/apache/spark/sql/catalyst/CatalystTypeConverters.scala  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala
index 2ce670ad02e97..e72547df7254b 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala
@@ -79,6 +79,7 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") {
         case JobExecutionStatus.SUCCEEDED => "succeeded"
         case JobExecutionStatus.FAILED => "failed"
         case JobExecutionStatus.RUNNING => "running"
+        case JobExecutionStatus.UNKNOWN => "unknown"
       }
 
       // The timeline library treats contents as HTML, so we have to escape them; for the
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 4067833d5e648..bfaee04f33b7f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -402,7 +402,7 @@ object CatalystTypeConverters {
     case d: JavaBigDecimal => BigDecimalConverter.toCatalyst(d)
     case seq: Seq[Any] => seq.map(convertToCatalyst)
     case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*)
-    case arr: Array[Any] => arr.toSeq.map(convertToCatalyst).toArray
+    case arr: Array[Any] => arr.map(convertToCatalyst)
     case m: Map[_, _] =>
       m.map { case (k, v) => (convertToCatalyst(k), convertToCatalyst(v)) }.toMap
     case other => other

From d2666a3c70dad037776dc4015fa561356381357b Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Thu, 23 Jul 2015 10:31:12 -0700
Subject: [PATCH 340/549] [SPARK-9183] confusing error message when looking up
 missing function in Spark SQL

JIRA: https://issues.apache.org/jira/browse/SPARK-9183

cc rxin

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7613 from yjshen/npe_udf and squashes the following commits:

44f58f2 [Yijie Shen] add jira ticket number
903c963 [Yijie Shen] add explanation comments
f44dd3c [Yijie Shen] Change two hive class LogLevel to avoid annoying messages
---
 conf/log4j.properties.template                                | 4 ++++
 .../resources/org/apache/spark/log4j-defaults-repl.properties | 4 ++++
 .../main/resources/org/apache/spark/log4j-defaults.properties | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/conf/log4j.properties.template b/conf/log4j.properties.template
index 3a2a88219818f..27006e45e932b 100644
--- a/conf/log4j.properties.template
+++ b/conf/log4j.properties.template
@@ -10,3 +10,7 @@ log4j.logger.org.spark-project.jetty=WARN
 log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
 log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
 log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties b/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties
index b146f8a784127..689afea64f8db 100644
--- a/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties
+++ b/core/src/main/resources/org/apache/spark/log4j-defaults-repl.properties
@@ -10,3 +10,7 @@ log4j.logger.org.spark-project.jetty=WARN
 log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
 log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
 log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults.properties b/core/src/main/resources/org/apache/spark/log4j-defaults.properties
index 3a2a88219818f..27006e45e932b 100644
--- a/core/src/main/resources/org/apache/spark/log4j-defaults.properties
+++ b/core/src/main/resources/org/apache/spark/log4j-defaults.properties
@@ -10,3 +10,7 @@ log4j.logger.org.spark-project.jetty=WARN
 log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
 log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
 log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR

From ecfb3127670c7f15e3a15e7f51fa578532480cda Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 23 Jul 2015 10:32:11 -0700
Subject: [PATCH 341/549] [SPARK-9243] [Documentation] null -> zero in crosstab
 doc

We forgot to update doc. brkyvz

Author: Xiangrui Meng <meng@databricks.com>

Closes #7608 from mengxr/SPARK-9243 and squashes the following commits:

0ea3236 [Xiangrui Meng] null -> zero in crosstab doc
---
 R/pkg/R/DataFrame.R                                             | 2 +-
 python/pyspark/sql/dataframe.py                                 | 2 +-
 .../scala/org/apache/spark/sql/DataFrameStatFunctions.scala     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 06dd6b75dff3d..f4c93d3c7dd67 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1566,7 +1566,7 @@ setMethod("fillna",
 #' @return a local R data.frame representing the contingency table. The first column of each row
 #'         will be the distinct values of `col1` and the column names will be the distinct values
 #'         of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no
-#'         occurrences will have `null` as their counts.
+#'         occurrences will have zero as their counts.
 #'
 #' @rdname statfunctions
 #' @export
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 83e02b85f06f1..d76e051bd73a1 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1130,7 +1130,7 @@ def crosstab(self, col1, col2):
         non-zero pair frequencies will be returned.
         The first column of each row will be the distinct values of `col1` and the column names
         will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.
-        Pairs that have no occurrences will have `null` as their counts.
+        Pairs that have no occurrences will have zero as their counts.
         :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.
 
         :param col1: The name of the first column. Distinct items will make the first item of
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 587869e57f96e..4ec58082e7aef 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -77,7 +77,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * pair frequencies will be returned.
    * The first column of each row will be the distinct values of `col1` and the column names will
    * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts
-   * will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
+   * will be returned as `Long`s. Pairs that have no occurrences will have zero as their counts.
    * Null elements will be replaced by "null", and back ticks will be dropped from elements if they
    * exist.
    *

From 662d60db3f4a758b6869de5bd971d23bd5962c3b Mon Sep 17 00:00:00 2001
From: David Arroyo Cazorla <darroyo@stratio.com>
Date: Thu, 23 Jul 2015 10:34:32 -0700
Subject: [PATCH 342/549] [SPARK-5447][SQL] Replace reference 'schema rdd' with
 DataFrame @rxin.

Author: David Arroyo Cazorla <darroyo@stratio.com>

Closes #7618 from darroyocazorla/master and squashes the following commits:

5f91379 [David Arroyo Cazorla] [SPARK-5447][SQL] Replace reference 'schema rdd' with DataFrame
---
 .../scala/org/apache/spark/sql/execution/CacheManager.scala     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
index a4b38d364d54a..d3e5c378d037d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
@@ -84,7 +84,7 @@ private[sql] class CacheManager(sqlContext: SQLContext) extends Logging {
   }
 
   /**
-   * Caches the data produced by the logical representation of the given schema rdd.  Unlike
+   * Caches the data produced by the logical representation of the given [[DataFrame]]. Unlike
    * `RDD.cache()`, the default storage level is set to be `MEMORY_AND_DISK` because recomputing
    * the in-memory columnar representation of the underlying table is expensive.
    */

From b2f3aca1e8c182b93e250f9d9c4aa69f97eaa11a Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 23 Jul 2015 16:08:07 -0700
Subject: [PATCH 343/549] [SPARK-9286] [SQL] Methods in Unevaluable should be
 final and AlgebraicAggregate should extend Unevaluable.

This patch marks the Unevaluable.eval() and UnevaluablegenCode() methods as final and fixes two cases where they were overridden.  It also updates AggregateFunction2 to extend Unevaluable.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7627 from JoshRosen/unevaluable-fix and squashes the following commits:

8d9ed22 [Josh Rosen] AlgebraicAggregate should extend Unevaluable
65329c2 [Josh Rosen] Do not have AggregateFunction1 inherit from AggregateExpression1
fa68a22 [Josh Rosen] Make eval() and genCode() final
---
 .../sql/catalyst/expressions/Expression.scala     |  4 ++--
 .../expressions/aggregate/interfaces.scala        | 15 +++------------
 .../sql/catalyst/expressions/aggregates.scala     | 11 +++++------
 3 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 29ae47e842ddb..3f72e6e184db1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -184,10 +184,10 @@ abstract class Expression extends TreeNode[Expression] {
  */
 trait Unevaluable extends Expression {
 
-  override def eval(input: InternalRow = null): Any =
+  final override def eval(input: InternalRow = null): Any =
     throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
 
-  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
+  final override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
     throw new UnsupportedOperationException(s"Cannot evaluate expression: $this")
 }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
index 577ede73cb01f..d3fee1ade05e6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
@@ -63,10 +63,6 @@ private[sql] case object Complete extends AggregateMode
  */
 private[sql] case object NoOp extends Expression with Unevaluable {
   override def nullable: Boolean = true
-  override def eval(input: InternalRow): Any = {
-    throw new TreeNodeException(
-      this, s"No function to evaluate expression. type: ${this.nodeName}")
-  }
   override def dataType: DataType = NullType
   override def children: Seq[Expression] = Nil
 }
@@ -151,8 +147,7 @@ abstract class AggregateFunction2
 /**
  * A helper class for aggregate functions that can be implemented in terms of catalyst expressions.
  */
-abstract class AlgebraicAggregate extends AggregateFunction2 with Serializable {
-  self: Product =>
+abstract class AlgebraicAggregate extends AggregateFunction2 with Serializable with Unevaluable {
 
   val initialValues: Seq[Expression]
   val updateExpressions: Seq[Expression]
@@ -188,19 +183,15 @@ abstract class AlgebraicAggregate extends AggregateFunction2 with Serializable {
     }
   }
 
-  override def update(buffer: MutableRow, input: InternalRow): Unit = {
+  override final def update(buffer: MutableRow, input: InternalRow): Unit = {
     throw new UnsupportedOperationException(
       "AlgebraicAggregate's update should not be called directly")
   }
 
-  override def merge(buffer1: MutableRow, buffer2: InternalRow): Unit = {
+  override final def merge(buffer1: MutableRow, buffer2: InternalRow): Unit = {
     throw new UnsupportedOperationException(
       "AlgebraicAggregate's merge should not be called directly")
   }
 
-  override def eval(buffer: InternalRow): Any = {
-    throw new UnsupportedOperationException(
-      "AlgebraicAggregate's eval should not be called directly")
-  }
 }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index e07c920a41d0a..d3295b8bafa80 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -20,8 +20,8 @@ package org.apache.spark.sql.catalyst.expressions
 import com.clearspring.analytics.stream.cardinality.HyperLogLog
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
@@ -71,8 +71,7 @@ trait PartialAggregate1 extends AggregateExpression1 {
  * A specific implementation of an aggregate function. Used to wrap a generic
  * [[AggregateExpression1]] with an algorithm that will be used to compute one specific result.
  */
-abstract class AggregateFunction1
-  extends LeafExpression with AggregateExpression1 with Serializable {
+abstract class AggregateFunction1 extends LeafExpression with Serializable {
 
   /** Base should return the generic aggregate expression that this function is computing */
   val base: AggregateExpression1
@@ -82,9 +81,9 @@ abstract class AggregateFunction1
 
   def update(input: InternalRow): Unit
 
-  // Do we really need this?
-  override def newInstance(): AggregateFunction1 = {
-    makeCopy(productIterator.map { case a: AnyRef => a }.toArray)
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    throw new UnsupportedOperationException(
+      "AggregateFunction1 should not be used for generated aggregates")
   }
 }
 

From bebe3f7b45f7b0a96f20d5af9b80633fd40cff06 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Thu, 23 Jul 2015 17:49:33 -0700
Subject: [PATCH 344/549] [SPARK-9207] [SQL] Enables Parquet filter push-down
 by default

PARQUET-136 and PARQUET-173 have been fixed in parquet-mr 1.7.0. It's time to enable filter push-down by default now.

Author: Cheng Lian <lian@databricks.com>

Closes #7612 from liancheng/spark-9207 and squashes the following commits:

77e6b5e [Cheng Lian] Enables Parquet filter push-down by default
---
 docs/sql-programming-guide.md                            | 9 ++-------
 .../src/main/scala/org/apache/spark/sql/SQLConf.scala    | 8 ++------
 2 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 5838bc172fe86..95945eb7fc8a0 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1332,13 +1332,8 @@ Configuration of Parquet can be done using the `setConf` method on `SQLContext`
 </tr>
 <tr>
   <td><code>spark.sql.parquet.filterPushdown</code></td>
-  <td>false</td>
-  <td>
-    Turn on Parquet filter pushdown optimization. This feature is turned off by default because of a known
-    bug in Parquet 1.6.0rc3 (<a href="https://issues.apache.org/jira/browse/PARQUET-136">PARQUET-136</a>).
-    However, if your table doesn't contain any nullable string or binary columns, it's still safe to turn
-    this feature on.
-  </td>
+  <td>true</td>
+  <td>Enables Parquet filter push-down optimization when set to true.</td>
 </tr>
 <tr>
   <td><code>spark.sql.hive.convertMetastoreParquet</code></td>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 1474b170ba896..2a641b9d64a95 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -273,12 +273,8 @@ private[spark] object SQLConf {
       "uncompressed, snappy, gzip, lzo.")
 
   val PARQUET_FILTER_PUSHDOWN_ENABLED = booleanConf("spark.sql.parquet.filterPushdown",
-    defaultValue = Some(false),
-    doc = "Turn on Parquet filter pushdown optimization. This feature is turned off by default " +
-      "because of a known bug in Parquet 1.6.0rc3 " +
-      "(PARQUET-136, https://issues.apache.org/jira/browse/PARQUET-136). However, " +
-      "if your table doesn't contain any nullable string or binary columns, it's still safe to " +
-      "turn this feature on.")
+    defaultValue = Some(true),
+    doc = "Enables Parquet filter push-down optimization when set to true.")
 
   val PARQUET_USE_DATA_SOURCE_API = booleanConf("spark.sql.parquet.useDataSourceApi",
     defaultValue = Some(true),

From 8a94eb23d53e291441e3144a1b800fe054457040 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 23 Jul 2015 18:31:13 -0700
Subject: [PATCH 345/549] [SPARK-9069] [SPARK-9264] [SQL] remove unlimited
 precision support for DecimalType

Romove Decimal.Unlimited (change to support precision up to 38, to match with Hive and other databases).

In order to keep backward source compatibility, Decimal.Unlimited is still there, but change to Decimal(38, 18).

If no precision and scale is provide, it's Decimal(10, 0) as before.

Author: Davies Liu <davies@databricks.com>

Closes #7605 from davies/decimal_unlimited and squashes the following commits:

aa3f115 [Davies Liu] fix tests and style
fb0d20d [Davies Liu] address comments
bfaae35 [Davies Liu] fix style
df93657 [Davies Liu] address comments and clean up
06727fd [Davies Liu] Merge branch 'master' of github.com:apache/spark into decimal_unlimited
4c28969 [Davies Liu] fix tests
8d783cc [Davies Liu] fix tests
788631c [Davies Liu] fix double with decimal in Union/except
1779bde [Davies Liu] fix scala style
c9c7c78 [Davies Liu] remove Decimal.Unlimited
---
 .../spark/ml/attribute/AttributeSuite.scala   |   2 +-
 python/pyspark/sql/types.py                   |  36 +--
 .../org/apache/spark/sql/types/DataTypes.java |   8 +-
 .../sql/catalyst/JavaTypeInference.scala      |   2 +-
 .../spark/sql/catalyst/ScalaReflection.scala  |  10 +-
 .../apache/spark/sql/catalyst/SqlParser.scala |   5 +-
 .../catalyst/analysis/HiveTypeCoercion.scala  | 255 +++++++-----------
 .../spark/sql/catalyst/dsl/package.scala      |   2 +-
 .../spark/sql/catalyst/expressions/Cast.scala |   7 +-
 .../expressions/aggregate/functions.scala     |  24 +-
 .../sql/catalyst/expressions/aggregates.scala |  46 ++--
 .../sql/catalyst/expressions/arithmetic.scala |  17 +-
 .../sql/catalyst/expressions/literals.scala   |   6 +-
 .../spark/sql/catalyst/plans/QueryPlan.scala  |   4 +-
 .../org/apache/spark/sql/types/DataType.scala |   4 +-
 .../spark/sql/types/DataTypeParser.scala      |   2 +-
 .../apache/spark/sql/types/DecimalType.scala  | 110 +++++---
 .../spark/sql/RandomDataGenerator.scala       |   4 +-
 .../spark/sql/RandomDataGeneratorSuite.scala  |   4 +-
 .../sql/catalyst/ScalaReflectionSuite.scala   |  14 +-
 .../sql/catalyst/analysis/AnalysisSuite.scala |   6 +-
 .../analysis/DecimalPrecisionSuite.scala      |  54 ++--
 .../analysis/HiveTypeCoercionSuite.scala      |  45 ++--
 .../sql/catalyst/expressions/CastSuite.scala  |  46 ++--
 .../ConditionalExpressionSuite.scala          |   2 +-
 .../expressions/LiteralExpressionSuite.scala  |   2 +-
 .../expressions/NullFunctionsSuite.scala      |   2 +-
 .../UnsafeFixedWidthAggregationMapSuite.scala |   2 +
 .../expressions/UnsafeRowConverterSuite.scala |   2 +-
 .../spark/sql/types/DataTypeParserSuite.scala |   4 +-
 .../spark/sql/types/DataTypeSuite.scala       |   4 +-
 .../spark/sql/types/DataTypeTestUtils.scala   |   2 +-
 .../scala/org/apache/spark/sql/Column.scala   |   2 +-
 .../spark/sql/columnar/ColumnType.scala       |   2 +-
 .../sql/execution/GeneratedAggregate.scala    |  10 +-
 .../datasources/PartitioningUtils.scala       |   7 +-
 .../org/apache/spark/sql/jdbc/JDBCRDD.scala   |  22 +-
 .../org/apache/spark/sql/jdbc/jdbc.scala      |   7 +-
 .../apache/spark/sql/json/InferSchema.scala   |  11 +-
 .../sql/parquet/CatalystSchemaConverter.scala |   4 -
 .../sql/parquet/ParquetTableSupport.scala     |   8 +-
 .../spark/sql/JavaApplySchemaSuite.java       |  14 +-
 .../columnar/InMemoryColumnarQuerySuite.scala |   2 +-
 .../spark/sql/execution/PlannerSuite.scala    |   2 +-
 .../execution/SparkSqlSerializer2Suite.scala  |   4 +-
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala |  20 +-
 .../org/apache/spark/sql/json/JsonSuite.scala |  57 ++--
 .../spark/sql/parquet/ParquetIOSuite.scala    |   8 -
 .../ParquetPartitionDiscoverySuite.scala      |   2 +-
 .../spark/sql/sources/DDLTestSuite.scala      |   2 +-
 .../spark/sql/sources/TableScanSuite.scala    |   2 +-
 .../spark/sql/hive/HiveInspectors.scala       |   9 +-
 .../org/apache/spark/sql/hive/HiveQl.scala    |   4 +-
 53 files changed, 459 insertions(+), 473 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala
index c5fd2f9d5a22a..6355e0f179496 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala
@@ -218,7 +218,7 @@ class AttributeSuite extends SparkFunSuite {
     // Attribute.fromStructField should accept any NumericType, not just DoubleType
     val longFldWithMeta = new StructField("x", LongType, false, metadata)
     assert(Attribute.fromStructField(longFldWithMeta).isNumeric)
-    val decimalFldWithMeta = new StructField("x", DecimalType(None), false, metadata)
+    val decimalFldWithMeta = new StructField("x", DecimalType(38, 18), false, metadata)
     assert(Attribute.fromStructField(decimalFldWithMeta).isNumeric)
   }
 }
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 10ad89ea14a8d..b97d50c945f24 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -194,30 +194,33 @@ def fromInternal(self, ts):
 
 class DecimalType(FractionalType):
     """Decimal (decimal.Decimal) data type.
+
+    The DecimalType must have fixed precision (the maximum total number of digits)
+    and scale (the number of digits on the right of dot). For example, (5, 2) can
+    support the value from [-999.99 to 999.99].
+
+    The precision can be up to 38, the scale must less or equal to precision.
+
+    When create a DecimalType, the default precision and scale is (10, 0). When infer
+    schema from decimal.Decimal objects, it will be DecimalType(38, 18).
+
+    :param precision: the maximum total number of digits (default: 10)
+    :param scale: the number of digits on right side of dot. (default: 0)
     """
 
-    def __init__(self, precision=None, scale=None):
+    def __init__(self, precision=10, scale=0):
         self.precision = precision
         self.scale = scale
-        self.hasPrecisionInfo = precision is not None
+        self.hasPrecisionInfo = True  # this is public API
 
     def simpleString(self):
-        if self.hasPrecisionInfo:
-            return "decimal(%d,%d)" % (self.precision, self.scale)
-        else:
-            return "decimal(10,0)"
+        return "decimal(%d,%d)" % (self.precision, self.scale)
 
     def jsonValue(self):
-        if self.hasPrecisionInfo:
-            return "decimal(%d,%d)" % (self.precision, self.scale)
-        else:
-            return "decimal"
+        return "decimal(%d,%d)" % (self.precision, self.scale)
 
     def __repr__(self):
-        if self.hasPrecisionInfo:
-            return "DecimalType(%d,%d)" % (self.precision, self.scale)
-        else:
-            return "DecimalType()"
+        return "DecimalType(%d,%d)" % (self.precision, self.scale)
 
 
 class DoubleType(FractionalType):
@@ -761,7 +764,10 @@ def _infer_type(obj):
         return obj.__UDT__
 
     dataType = _type_mappings.get(type(obj))
-    if dataType is not None:
+    if dataType is DecimalType:
+        # the precision and scale of `obj` may be different from row to row.
+        return DecimalType(38, 18)
+    elif dataType is not None:
         return dataType()
 
     if isinstance(obj, dict):
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java b/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java
index d22ad6794d608..5703de42393de 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java
@@ -111,12 +111,18 @@ public static ArrayType createArrayType(DataType elementType, boolean containsNu
     return new ArrayType(elementType, containsNull);
   }
 
+  /**
+   * Creates a DecimalType by specifying the precision and scale.
+   */
   public static DecimalType createDecimalType(int precision, int scale) {
     return DecimalType$.MODULE$.apply(precision, scale);
   }
 
+  /**
+   * Creates a DecimalType with default precision and scale, which are 10 and 0.
+   */
   public static DecimalType createDecimalType() {
-    return DecimalType$.MODULE$.Unlimited();
+    return DecimalType$.MODULE$.USER_DEFAULT();
   }
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
index 9a3f9694e4c48..88a457f87ce4e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
@@ -75,7 +75,7 @@ private [sql] object JavaTypeInference {
       case c: Class[_] if c == classOf[java.lang.Float] => (FloatType, true)
       case c: Class[_] if c == classOf[java.lang.Boolean] => (BooleanType, true)
 
-      case c: Class[_] if c == classOf[java.math.BigDecimal] => (DecimalType(), true)
+      case c: Class[_] if c == classOf[java.math.BigDecimal] => (DecimalType.SYSTEM_DEFAULT, true)
       case c: Class[_] if c == classOf[java.sql.Date] => (DateType, true)
       case c: Class[_] if c == classOf[java.sql.Timestamp] => (TimestampType, true)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
index 21b1de1ab9cb1..2442341da106d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
@@ -131,10 +131,10 @@ trait ScalaReflection {
       case t if t <:< localTypeOf[String] => Schema(StringType, nullable = true)
       case t if t <:< localTypeOf[java.sql.Timestamp] => Schema(TimestampType, nullable = true)
       case t if t <:< localTypeOf[java.sql.Date] => Schema(DateType, nullable = true)
-      case t if t <:< localTypeOf[BigDecimal] => Schema(DecimalType.Unlimited, nullable = true)
+      case t if t <:< localTypeOf[BigDecimal] => Schema(DecimalType.SYSTEM_DEFAULT, nullable = true)
       case t if t <:< localTypeOf[java.math.BigDecimal] =>
-        Schema(DecimalType.Unlimited, nullable = true)
-      case t if t <:< localTypeOf[Decimal] => Schema(DecimalType.Unlimited, nullable = true)
+        Schema(DecimalType.SYSTEM_DEFAULT, nullable = true)
+      case t if t <:< localTypeOf[Decimal] => Schema(DecimalType.SYSTEM_DEFAULT, nullable = true)
       case t if t <:< localTypeOf[java.lang.Integer] => Schema(IntegerType, nullable = true)
       case t if t <:< localTypeOf[java.lang.Long] => Schema(LongType, nullable = true)
       case t if t <:< localTypeOf[java.lang.Double] => Schema(DoubleType, nullable = true)
@@ -167,8 +167,8 @@ trait ScalaReflection {
     case obj: Float => FloatType
     case obj: Double => DoubleType
     case obj: java.sql.Date => DateType
-    case obj: java.math.BigDecimal => DecimalType.Unlimited
-    case obj: Decimal => DecimalType.Unlimited
+    case obj: java.math.BigDecimal => DecimalType.SYSTEM_DEFAULT
+    case obj: Decimal => DecimalType.SYSTEM_DEFAULT
     case obj: java.sql.Timestamp => TimestampType
     case null => NullType
     // For other cases, there is no obvious mapping from the type of the given object to a
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index 29cfc064da89a..c494e5d704213 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -322,7 +322,10 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
 
   protected lazy val numericLiteral: Parser[Literal] =
     ( integral  ^^ { case i => Literal(toNarrowestIntegerType(i)) }
-    | sign.? ~ unsignedFloat ^^ { case s ~ f => Literal((s.getOrElse("") + f).toDouble) }
+    | sign.? ~ unsignedFloat ^^ {
+      // TODO(davies): some precisions may loss, we should create decimal literal
+      case s ~ f => Literal(BigDecimal(s.getOrElse("") + f).doubleValue())
+    }
     )
 
   protected lazy val unsignedFloat: Parser[String] =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index e214545726249..d56ceeadc9e85 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -19,7 +19,9 @@ package org.apache.spark.sql.catalyst.analysis
 
 import javax.annotation.Nullable
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.types._
@@ -58,8 +60,7 @@ object HiveTypeCoercion {
       IntegerType,
       LongType,
       FloatType,
-      DoubleType,
-      DecimalType.Unlimited)
+      DoubleType)
 
   /**
    * Find the tightest common type of two types that might be used in a binary expression.
@@ -72,15 +73,16 @@ object HiveTypeCoercion {
     case (NullType, t1) => Some(t1)
     case (t1, NullType) => Some(t1)
 
-    // Promote numeric types to the highest of the two and all numeric types to unlimited decimal
+    case (t1: IntegralType, t2: DecimalType) if t2.isWiderThan(t1) =>
+      Some(t2)
+    case (t1: DecimalType, t2: IntegralType) if t1.isWiderThan(t2) =>
+      Some(t1)
+
+    // Promote numeric types to the highest of the two
     case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) =>
       val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2)
       Some(numericPrecedence(index))
 
-    // Fixed-precision decimals can up-cast into unlimited
-    case (DecimalType.Unlimited, _: DecimalType) => Some(DecimalType.Unlimited)
-    case (_: DecimalType, DecimalType.Unlimited) => Some(DecimalType.Unlimited)
-
     case _ => None
   }
 
@@ -101,7 +103,7 @@ object HiveTypeCoercion {
     types.foldLeft[Option[DataType]](Some(NullType))((r, c) => r match {
       case None => None
       case Some(d) =>
-        findTightestCommonTypeOfTwo(d, c).orElse(findTightestCommonTypeToString(d, c))
+        findTightestCommonTypeToString(d, c)
     })
   }
 
@@ -158,6 +160,9 @@ object HiveTypeCoercion {
    *     converted to DOUBLE.
    *   - TINYINT, SMALLINT, and INT can all be converted to FLOAT.
    *   - BOOLEAN types cannot be converted to any other type.
+   *   - Any integral numeric type can be implicitly converted to decimal type.
+   *   - two different decimal types will be converted into a wider decimal type for both of them.
+   *   - decimal type will be converted into double if there float or double together with it.
    *
    * Additionally, all types when UNION-ed with strings will be promoted to strings.
    * Other string conversions are handled by PromoteStrings.
@@ -166,55 +171,50 @@ object HiveTypeCoercion {
    * - IntegerType to FloatType
    * - LongType to FloatType
    * - LongType to DoubleType
+   * - DecimalType to Double
+   *
+   * This rule is only applied to Union/Except/Intersect
    */
   object WidenTypes extends Rule[LogicalPlan] {
 
-    private[this] def widenOutputTypes(planName: String, left: LogicalPlan, right: LogicalPlan):
-        (LogicalPlan, LogicalPlan) = {
-
-      // TODO: with fixed-precision decimals
-      val castedInput = left.output.zip(right.output).map {
-        // When a string is found on one side, make the other side a string too.
-        case (lhs, rhs) if lhs.dataType == StringType && rhs.dataType != StringType =>
-          (lhs, Alias(Cast(rhs, StringType), rhs.name)())
-        case (lhs, rhs) if lhs.dataType != StringType && rhs.dataType == StringType =>
-          (Alias(Cast(lhs, StringType), lhs.name)(), rhs)
+    private[this] def widenOutputTypes(
+        planName: String,
+        left: LogicalPlan,
+        right: LogicalPlan): (LogicalPlan, LogicalPlan) = {
 
+      val castedTypes = left.output.zip(right.output).map {
         case (lhs, rhs) if lhs.dataType != rhs.dataType =>
-          logDebug(s"Resolving mismatched $planName input ${lhs.dataType}, ${rhs.dataType}")
-          findTightestCommonTypeOfTwo(lhs.dataType, rhs.dataType).map { widestType =>
-            val newLeft =
-              if (lhs.dataType == widestType) lhs else Alias(Cast(lhs, widestType), lhs.name)()
-            val newRight =
-              if (rhs.dataType == widestType) rhs else Alias(Cast(rhs, widestType), rhs.name)()
-
-            (newLeft, newRight)
-          }.getOrElse {
-            // If there is no applicable conversion, leave expression unchanged.
-            (lhs, rhs)
+          (lhs.dataType, rhs.dataType) match {
+            case (t1: DecimalType, t2: DecimalType) =>
+              Some(DecimalPrecision.widerDecimalType(t1, t2))
+            case (t: IntegralType, d: DecimalType) =>
+              Some(DecimalPrecision.widerDecimalType(DecimalType.forType(t), d))
+            case (d: DecimalType, t: IntegralType) =>
+              Some(DecimalPrecision.widerDecimalType(DecimalType.forType(t), d))
+            case (t: FractionalType, d: DecimalType) =>
+              Some(DoubleType)
+            case (d: DecimalType, t: FractionalType) =>
+              Some(DoubleType)
+            case _ =>
+              findTightestCommonTypeToString(lhs.dataType, rhs.dataType)
           }
-
-        case other => other
+        case other => None
       }
 
-      val (castedLeft, castedRight) = castedInput.unzip
-
-      val newLeft =
-        if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) {
-          logDebug(s"Widening numeric types in $planName $castedLeft ${left.output}")
-          Project(castedLeft, left)
-        } else {
-          left
+      def castOutput(plan: LogicalPlan): LogicalPlan = {
+        val casted = plan.output.zip(castedTypes).map {
+          case (hs, Some(dt)) if dt != hs.dataType =>
+            Alias(Cast(hs, dt), hs.name)()
+          case (hs, _) => hs
         }
+        Project(casted, plan)
+      }
 
-      val newRight =
-        if (castedRight.map(_.dataType) != right.output.map(_.dataType)) {
-          logDebug(s"Widening numeric types in $planName $castedRight ${right.output}")
-          Project(castedRight, right)
-        } else {
-          right
-        }
-      (newLeft, newRight)
+      if (castedTypes.exists(_.isDefined)) {
+        (castOutput(left), castOutput(right))
+      } else {
+        (left, right)
+      }
     }
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
@@ -334,144 +334,94 @@ object HiveTypeCoercion {
    * - SHORT gets turned into DECIMAL(5, 0)
    * - INT gets turned into DECIMAL(10, 0)
    * - LONG gets turned into DECIMAL(20, 0)
-   * - FLOAT and DOUBLE
-   *   1. Union, Intersect and Except operations:
-   *      FLOAT gets turned into DECIMAL(7, 7), DOUBLE gets turned into DECIMAL(15, 15) (this is the
-   *      same as Hive)
-   *   2. Other operation:
-   *      FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE (this is the same as Hive,
-   *   but note that unlimited decimals are considered bigger than doubles in WidenTypes)
+   * - FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE
+   *
+   * Note: Union/Except/Interact is handled by WidenTypes
    */
   // scalastyle:on
   object DecimalPrecision extends Rule[LogicalPlan] {
     import scala.math.{max, min}
 
-    // Conversion rules for integer types into fixed-precision decimals
-    private val intTypeToFixed: Map[DataType, DecimalType] = Map(
-      ByteType -> DecimalType(3, 0),
-      ShortType -> DecimalType(5, 0),
-      IntegerType -> DecimalType(10, 0),
-      LongType -> DecimalType(20, 0)
-    )
-
     private def isFloat(t: DataType): Boolean = t == FloatType || t == DoubleType
 
-    // Conversion rules for float and double into fixed-precision decimals
-    private val floatTypeToFixed: Map[DataType, DecimalType] = Map(
-      FloatType -> DecimalType(7, 7),
-      DoubleType -> DecimalType(15, 15)
-    )
-
-    private def castDecimalPrecision(
-        left: LogicalPlan,
-        right: LogicalPlan): (LogicalPlan, LogicalPlan) = {
-      val castedInput = left.output.zip(right.output).map {
-        case (lhs, rhs) if lhs.dataType != rhs.dataType =>
-          (lhs.dataType, rhs.dataType) match {
-            case (DecimalType.Fixed(p1, s1), DecimalType.Fixed(p2, s2)) =>
-              // Decimals with precision/scale p1/s2 and p2/s2  will be promoted to
-              // DecimalType(max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2))
-              val fixedType = DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2), max(s1, s2))
-              (Alias(Cast(lhs, fixedType), lhs.name)(), Alias(Cast(rhs, fixedType), rhs.name)())
-            case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) =>
-              (Alias(Cast(lhs, intTypeToFixed(t)), lhs.name)(), rhs)
-            case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) =>
-              (lhs, Alias(Cast(rhs, intTypeToFixed(t)), rhs.name)())
-            case (t, DecimalType.Fixed(p, s)) if floatTypeToFixed.contains(t) =>
-              (Alias(Cast(lhs, floatTypeToFixed(t)), lhs.name)(), rhs)
-            case (DecimalType.Fixed(p, s), t) if floatTypeToFixed.contains(t) =>
-              (lhs, Alias(Cast(rhs, floatTypeToFixed(t)), rhs.name)())
-            case _ => (lhs, rhs)
-          }
-        case other => other
-      }
-
-      val (castedLeft, castedRight) = castedInput.unzip
+    // Returns the wider decimal type that's wider than both of them
+    def widerDecimalType(d1: DecimalType, d2: DecimalType): DecimalType = {
+      widerDecimalType(d1.precision, d1.scale, d2.precision, d2.scale)
+    }
+    // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2)
+    def widerDecimalType(p1: Int, s1: Int, p2: Int, s2: Int): DecimalType = {
+      val scale = max(s1, s2)
+      val range = max(p1 - s1, p2 - s2)
+      DecimalType.bounded(range + scale, scale)
+    }
 
-      val newLeft =
-        if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) {
-          Project(castedLeft, left)
-        } else {
-          left
-        }
+    /**
+     * An expression used to wrap the children when promote the precision of DecimalType to avoid
+     * promote multiple times.
+     */
+    case class ChangePrecision(child: Expression) extends UnaryExpression {
+      override def dataType: DataType = child.dataType
+      override def eval(input: InternalRow): Any = child.eval(input)
+      override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
+      override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = ""
+      override def prettyName: String = "change_precision"
+    }
 
-      val newRight =
-        if (castedRight.map(_.dataType) != right.output.map(_.dataType)) {
-          Project(castedRight, right)
-        } else {
-          right
-        }
-      (newLeft, newRight)
+    def changePrecision(e: Expression, dataType: DataType): Expression = {
+      ChangePrecision(Cast(e, dataType))
     }
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-      // fix decimal precision for union, intersect and except
-      case u @ Union(left, right) if u.childrenResolved && !u.resolved =>
-        val (newLeft, newRight) = castDecimalPrecision(left, right)
-        Union(newLeft, newRight)
-      case i @ Intersect(left, right) if i.childrenResolved && !i.resolved =>
-        val (newLeft, newRight) = castDecimalPrecision(left, right)
-        Intersect(newLeft, newRight)
-      case e @ Except(left, right) if e.childrenResolved && !e.resolved =>
-        val (newLeft, newRight) = castDecimalPrecision(left, right)
-        Except(newLeft, newRight)
-
       // fix decimal precision for expressions
       case q => q.transformExpressions {
         // Skip nodes whose children have not been resolved yet
         case e if !e.childrenResolved => e
 
+        // Skip nodes who is already promoted
+        case e: BinaryArithmetic if e.left.isInstanceOf[ChangePrecision] => e
+
         case Add(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
-          Cast(
-            Add(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)),
-            DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2))
-          )
+          val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2))
+          Add(changePrecision(e1, dt), changePrecision(e2, dt))
 
         case Subtract(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
-          Cast(
-            Subtract(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)),
-            DecimalType(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2))
-          )
+          val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2))
+          Subtract(changePrecision(e1, dt), changePrecision(e2, dt))
 
         case Multiply(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
-          Cast(
-            Multiply(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)),
-            DecimalType(p1 + p2 + 1, s1 + s2)
-          )
+          val dt = DecimalType.bounded(p1 + p2 + 1, s1 + s2)
+          Multiply(changePrecision(e1, dt), changePrecision(e2, dt))
 
         case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
-          Cast(
-            Divide(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)),
-            DecimalType(p1 - s1 + s2 + max(6, s1 + p2 + 1), max(6, s1 + p2 + 1))
-          )
+          val dt = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 1), max(6, s1 + p2 + 1))
+          Divide(changePrecision(e1, dt), changePrecision(e2, dt))
 
         case Remainder(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
-          Cast(
-            Remainder(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)),
-            DecimalType(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2))
-          )
+          val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2))
+          // resultType may have lower precision, so we cast them into wider type first.
+          val widerType = widerDecimalType(p1, s1, p2, s2)
+          Cast(Remainder(changePrecision(e1, widerType), changePrecision(e2, widerType)),
+            resultType)
 
         case Pmod(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
-          Cast(
-            Pmod(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)),
-            DecimalType(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2))
-          )
+          val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2))
+          // resultType may have lower precision, so we cast them into wider type first.
+          val widerType = widerDecimalType(p1, s1, p2, s2)
+          Cast(Pmod(changePrecision(e1, widerType), changePrecision(e2, widerType)), resultType)
 
-        // When we compare 2 decimal types with different precisions, cast them to the smallest
-        // common precision.
         case b @ BinaryComparison(e1 @ DecimalType.Expression(p1, s1),
                                   e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 =>
-          val resultType = DecimalType(max(p1, p2), max(s1, s2))
+          val resultType = widerDecimalType(p1, s1, p2, s2)
           b.makeCopy(Array(Cast(e1, resultType), Cast(e2, resultType)))
 
         // Promote integers inside a binary expression with fixed-precision decimals to decimals,
         // and fixed-precision decimals in an expression with floats / doubles to doubles
         case b @ BinaryOperator(left, right) if left.dataType != right.dataType =>
           (left.dataType, right.dataType) match {
-            case (t, DecimalType.Fixed(p, s)) if intTypeToFixed.contains(t) =>
-              b.makeCopy(Array(Cast(left, intTypeToFixed(t)), right))
-            case (DecimalType.Fixed(p, s), t) if intTypeToFixed.contains(t) =>
-              b.makeCopy(Array(left, Cast(right, intTypeToFixed(t))))
+            case (t: IntegralType, DecimalType.Fixed(p, s)) =>
+              b.makeCopy(Array(Cast(left, DecimalType.forType(t)), right))
+            case (DecimalType.Fixed(p, s), t: IntegralType) =>
+              b.makeCopy(Array(left, Cast(right, DecimalType.forType(t))))
             case (t, DecimalType.Fixed(p, s)) if isFloat(t) =>
               b.makeCopy(Array(left, Cast(right, DoubleType)))
             case (DecimalType.Fixed(p, s), t) if isFloat(t) =>
@@ -485,7 +435,6 @@ object HiveTypeCoercion {
         // SUM and AVERAGE are handled by the implementations of those expressions
       }
     }
-
   }
 
   /**
@@ -563,7 +512,7 @@ object HiveTypeCoercion {
       case e if !e.childrenResolved => e
 
       case Cast(e @ StringType(), t: IntegralType) =>
-        Cast(Cast(e, DecimalType.Unlimited), t)
+        Cast(Cast(e, DecimalType.forType(LongType)), t)
     }
   }
 
@@ -756,8 +705,8 @@ object HiveTypeCoercion {
         // Implicit cast among numeric types. When we reach here, input type is not acceptable.
 
         // If input is a numeric type but not decimal, and we expect a decimal type,
-        // cast the input to unlimited precision decimal.
-        case (_: NumericType, DecimalType) => Cast(e, DecimalType.Unlimited)
+        // cast the input to decimal.
+        case (d: NumericType, DecimalType) => Cast(e, DecimalType.forType(d))
         // For any other numeric types, implicitly cast to each other, e.g. long -> int, int -> long
         case (_: NumericType, target: NumericType) => Cast(e, target)
 
@@ -766,7 +715,7 @@ object HiveTypeCoercion {
         case (TimestampType, DateType) => Cast(e, DateType)
 
         // Implicit cast from/to string
-        case (StringType, DecimalType) => Cast(e, DecimalType.Unlimited)
+        case (StringType, DecimalType) => Cast(e, DecimalType.SYSTEM_DEFAULT)
         case (StringType, target: NumericType) => Cast(e, target)
         case (StringType, DateType) => Cast(e, DateType)
         case (StringType, TimestampType) => Cast(e, TimestampType)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
index 51821757967d2..a7e3a49327655 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
@@ -201,7 +201,7 @@ package object dsl {
 
       /** Creates a new AttributeReference of type decimal */
       def decimal: AttributeReference =
-        AttributeReference(s, DecimalType.Unlimited, nullable = true)()
+        AttributeReference(s, DecimalType.SYSTEM_DEFAULT, nullable = true)()
 
       /** Creates a new AttributeReference of type decimal */
       def decimal(precision: Int, scale: Int): AttributeReference =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index e66cd828481bf..c66854d52c50b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -300,12 +300,7 @@ case class Cast(child: Expression, dataType: DataType)
    * NOTE: this modifies `value` in-place, so don't call it on external data.
    */
   private[this] def changePrecision(value: Decimal, decimalType: DecimalType): Decimal = {
-    decimalType match {
-      case DecimalType.Unlimited =>
-        value
-      case DecimalType.Fixed(precision, scale) =>
-        if (value.changePrecision(precision, scale)) value else null
-    }
+    if (value.changePrecision(decimalType.precision, decimalType.scale)) value else null
   }
 
   private[this] def castToDecimal(from: DataType, target: DecimalType): Any => Any = from match {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala
index b924af4cc84d8..88fb516e64aaf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala
@@ -36,14 +36,13 @@ case class Average(child: Expression) extends AlgebraicAggregate {
   override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(NumericType, NullType))
 
   private val resultType = child.dataType match {
-    case DecimalType.Fixed(precision, scale) =>
-      DecimalType(precision + 4, scale + 4)
-    case DecimalType.Unlimited => DecimalType.Unlimited
+    case DecimalType.Fixed(p, s) =>
+      DecimalType.bounded(p + 4, s + 4)
     case _ => DoubleType
   }
 
   private val sumDataType = child.dataType match {
-    case _ @ DecimalType() => DecimalType.Unlimited
+    case _ @ DecimalType.Fixed(p, s) => DecimalType.bounded(p + 10, s)
     case _ => DoubleType
   }
 
@@ -71,7 +70,14 @@ case class Average(child: Expression) extends AlgebraicAggregate {
   )
 
   // If all input are nulls, currentCount will be 0 and we will get null after the division.
-  override val evaluateExpression = Cast(currentSum, resultType) / Cast(currentCount, resultType)
+  override val evaluateExpression = child.dataType match {
+    case DecimalType.Fixed(p, s) =>
+      // increase the precision and scale to prevent precision loss
+      val dt = DecimalType.bounded(p + 14, s + 4)
+      Cast(Cast(currentSum, dt) / Cast(currentCount, dt), resultType)
+    case _ =>
+      Cast(currentSum, resultType) / Cast(currentCount, resultType)
+  }
 }
 
 case class Count(child: Expression) extends AlgebraicAggregate {
@@ -255,15 +261,11 @@ case class Sum(child: Expression) extends AlgebraicAggregate {
 
   private val resultType = child.dataType match {
     case DecimalType.Fixed(precision, scale) =>
-      DecimalType(precision + 4, scale + 4)
-    case DecimalType.Unlimited => DecimalType.Unlimited
+      DecimalType.bounded(precision + 10, scale)
     case _ => child.dataType
   }
 
-  private val sumDataType = child.dataType match {
-    case _ @ DecimalType() => DecimalType.Unlimited
-    case _ => child.dataType
-  }
+  private val sumDataType = resultType
 
   private val currentSum = AttributeReference("currentSum", sumDataType)()
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index d3295b8bafa80..73fde4e9164d7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -390,22 +390,21 @@ case class Average(child: Expression) extends UnaryExpression with PartialAggreg
 
   override def dataType: DataType = child.dataType match {
     case DecimalType.Fixed(precision, scale) =>
-      DecimalType(precision + 4, scale + 4)  // Add 4 digits after decimal point, like Hive
-    case DecimalType.Unlimited =>
-      DecimalType.Unlimited
+      // Add 4 digits after decimal point, like Hive
+      DecimalType.bounded(precision + 4, scale + 4)
     case _ =>
       DoubleType
   }
 
   override def asPartial: SplitEvaluation = {
     child.dataType match {
-      case DecimalType.Fixed(_, _) | DecimalType.Unlimited =>
-        // Turn the child to unlimited decimals for calculation, before going back to fixed
-        val partialSum = Alias(Sum(Cast(child, DecimalType.Unlimited)), "PartialSum")()
+      case DecimalType.Fixed(precision, scale) =>
+        val partialSum = Alias(Sum(child), "PartialSum")()
         val partialCount = Alias(Count(child), "PartialCount")()
 
-        val castedSum = Cast(Sum(partialSum.toAttribute), DecimalType.Unlimited)
-        val castedCount = Cast(Sum(partialCount.toAttribute), DecimalType.Unlimited)
+        // partialSum already increase the precision by 10
+        val castedSum = Cast(Sum(partialSum.toAttribute), partialSum.dataType)
+        val castedCount = Sum(partialCount.toAttribute)
         SplitEvaluation(
           Cast(Divide(castedSum, castedCount), dataType),
           partialCount :: partialSum :: Nil)
@@ -435,8 +434,8 @@ case class AverageFunction(expr: Expression, base: AggregateExpression1)
 
   private val calcType =
     expr.dataType match {
-      case DecimalType.Fixed(_, _) =>
-        DecimalType.Unlimited
+      case DecimalType.Fixed(precision, scale) =>
+        DecimalType.bounded(precision + 10, scale)
       case _ =>
         expr.dataType
     }
@@ -454,10 +453,9 @@ case class AverageFunction(expr: Expression, base: AggregateExpression1)
       null
     } else {
       expr.dataType match {
-        case DecimalType.Fixed(_, _) =>
-          Cast(Divide(
-            Cast(sum, DecimalType.Unlimited),
-            Cast(Literal(count), DecimalType.Unlimited)), dataType).eval(null)
+        case DecimalType.Fixed(precision, scale) =>
+          val dt = DecimalType.bounded(precision + 14, scale + 4)
+          Cast(Divide(Cast(sum, dt), Cast(Literal(count), dt)), dataType).eval(null)
         case _ =>
           Divide(
             Cast(sum, dataType),
@@ -481,9 +479,8 @@ case class Sum(child: Expression) extends UnaryExpression with PartialAggregate1
 
   override def dataType: DataType = child.dataType match {
     case DecimalType.Fixed(precision, scale) =>
-      DecimalType(precision + 10, scale)  // Add 10 digits left of decimal point, like Hive
-    case DecimalType.Unlimited =>
-      DecimalType.Unlimited
+      // Add 10 digits left of decimal point, like Hive
+      DecimalType.bounded(precision + 10, scale)
     case _ =>
       child.dataType
   }
@@ -491,7 +488,7 @@ case class Sum(child: Expression) extends UnaryExpression with PartialAggregate1
   override def asPartial: SplitEvaluation = {
     child.dataType match {
       case DecimalType.Fixed(_, _) =>
-        val partialSum = Alias(Sum(Cast(child, DecimalType.Unlimited)), "PartialSum")()
+        val partialSum = Alias(Sum(child), "PartialSum")()
         SplitEvaluation(
           Cast(CombineSum(partialSum.toAttribute), dataType),
           partialSum :: Nil)
@@ -515,8 +512,8 @@ case class SumFunction(expr: Expression, base: AggregateExpression1) extends Agg
 
   private val calcType =
     expr.dataType match {
-      case DecimalType.Fixed(_, _) =>
-        DecimalType.Unlimited
+      case DecimalType.Fixed(precision, scale) =>
+        DecimalType.bounded(precision + 10, scale)
       case _ =>
         expr.dataType
     }
@@ -572,8 +569,8 @@ case class CombineSumFunction(expr: Expression, base: AggregateExpression1)
 
   private val calcType =
     expr.dataType match {
-      case DecimalType.Fixed(_, _) =>
-        DecimalType.Unlimited
+      case DecimalType.Fixed(precision, scale) =>
+        DecimalType.bounded(precision + 10, scale)
       case _ =>
         expr.dataType
     }
@@ -608,9 +605,8 @@ case class SumDistinct(child: Expression) extends UnaryExpression with PartialAg
   override def nullable: Boolean = true
   override def dataType: DataType = child.dataType match {
     case DecimalType.Fixed(precision, scale) =>
-      DecimalType(precision + 10, scale)  // Add 10 digits left of decimal point, like Hive
-    case DecimalType.Unlimited =>
-      DecimalType.Unlimited
+      // Add 10 digits left of decimal point, like Hive
+      DecimalType.bounded(precision + 10, scale)
     case _ =>
       child.dataType
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 05b5ad88fee8f..7c254a8750a9f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -88,6 +88,8 @@ abstract class BinaryArithmetic extends BinaryOperator {
 
   override def dataType: DataType = left.dataType
 
+  override lazy val resolved = childrenResolved && checkInputDataTypes().isSuccess
+
   /** Name of the function for this expression on a [[Decimal]] type. */
   def decimalMethod: String =
     sys.error("BinaryArithmetics must override either decimalMethod or genCode")
@@ -114,9 +116,6 @@ case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
 
   override def symbol: String = "+"
 
-  override lazy val resolved =
-    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
-
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
@@ -146,9 +145,6 @@ case class Subtract(left: Expression, right: Expression) extends BinaryArithmeti
 
   override def symbol: String = "-"
 
-  override lazy val resolved =
-    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
-
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
@@ -179,9 +175,6 @@ case class Multiply(left: Expression, right: Expression) extends BinaryArithmeti
   override def symbol: String = "*"
   override def decimalMethod: String = "$times"
 
-  override lazy val resolved =
-    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
-
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = numeric.times(input1, input2)
@@ -195,9 +188,6 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic
   override def decimalMethod: String = "$div"
   override def nullable: Boolean = true
 
-  override lazy val resolved =
-    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
-
   private lazy val div: (Any, Any) => Any = dataType match {
     case ft: FractionalType => ft.fractional.asInstanceOf[Fractional[Any]].div
     case it: IntegralType => it.integral.asInstanceOf[Integral[Any]].quot
@@ -260,9 +250,6 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet
   override def decimalMethod: String = "remainder"
   override def nullable: Boolean = true
 
-  override lazy val resolved =
-    childrenResolved && checkInputDataTypes().isSuccess && !DecimalType.isFixed(dataType)
-
   private lazy val integral = dataType match {
     case i: IntegralType => i.integral.asInstanceOf[Integral[Any]]
     case i: FractionalType => i.asIntegral.asInstanceOf[Integral[Any]]
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index f25ac32679587..85060b7893556 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -36,9 +36,9 @@ object Literal {
     case s: Short => Literal(s, ShortType)
     case s: String => Literal(UTF8String.fromString(s), StringType)
     case b: Boolean => Literal(b, BooleanType)
-    case d: BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
-    case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
-    case d: Decimal => Literal(d, DecimalType.Unlimited)
+    case d: BigDecimal => Literal(Decimal(d), DecimalType(d.precision, d.scale))
+    case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType(d.precision(), d.scale()))
+    case d: Decimal => Literal(d, DecimalType(d.precision, d.scale))
     case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType)
     case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType)
     case a: Array[Byte] => Literal(a, BinaryType)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index d06a7a2add754..c610f70d38437 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.sql.catalyst.plans
 
-import org.apache.spark.sql.catalyst.expressions.{VirtualColumn, Attribute, AttributeSet, Expression}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, VirtualColumn}
 import org.apache.spark.sql.catalyst.trees.TreeNode
-import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType}
+import org.apache.spark.sql.types.{DataType, StructType}
 
 abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanType] {
   self: PlanType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index e98fd2583b931..591fb26e67c4a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -106,7 +106,7 @@ object DataType {
   private def nameToType(name: String): DataType = {
     val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)""".r
     name match {
-      case "decimal" => DecimalType.Unlimited
+      case "decimal" => DecimalType.USER_DEFAULT
       case FIXED_DECIMAL(precision, scale) => DecimalType(precision.toInt, scale.toInt)
       case other => nonDecimalNameToType(other)
     }
@@ -177,7 +177,7 @@ object DataType {
         | "BinaryType" ^^^ BinaryType
         | "BooleanType" ^^^ BooleanType
         | "DateType" ^^^ DateType
-        | "DecimalType()" ^^^ DecimalType.Unlimited
+        | "DecimalType()" ^^^ DecimalType.USER_DEFAULT
         | fixedDecimalType
         | "TimestampType" ^^^ TimestampType
         )
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
index 6b43224feb1f2..6e081ea9237bd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
@@ -48,7 +48,7 @@ private[sql] trait DataTypeParser extends StandardTokenParsers {
     "(?i)binary".r ^^^ BinaryType |
     "(?i)boolean".r ^^^ BooleanType |
     fixedDecimalType |
-    "(?i)decimal".r ^^^ DecimalType.Unlimited |
+    "(?i)decimal".r ^^^ DecimalType.USER_DEFAULT |
     "(?i)date".r ^^^ DateType |
     "(?i)timestamp".r ^^^ TimestampType |
     varchar
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
index 377c75f6e85a5..26b24616d98ec 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
@@ -26,25 +26,46 @@ import org.apache.spark.sql.catalyst.expressions.Expression
 
 
 /** Precision parameters for a Decimal */
+@deprecated("Use DecimalType(precision, scale) directly", "1.5")
 case class PrecisionInfo(precision: Int, scale: Int) {
   if (scale > precision) {
     throw new AnalysisException(
       s"Decimal scale ($scale) cannot be greater than precision ($precision).")
   }
+  if (precision > DecimalType.MAX_PRECISION) {
+    throw new AnalysisException(
+      s"DecimalType can only support precision up to 38"
+    )
+  }
 }
 
 /**
  * :: DeveloperApi ::
  * The data type representing `java.math.BigDecimal` values.
- * A Decimal that might have fixed precision and scale, or unlimited values for these.
+ * A Decimal that must have fixed precision (the maximum number of digits) and scale (the number
+ * of digits on right side of dot).
+ *
+ * The precision can be up to 38, scale can also be up to 38 (less or equal to precision).
+ *
+ * The default precision and scale is (10, 0).
  *
  * Please use [[DataTypes.createDecimalType()]] to create a specific instance.
  */
 @DeveloperApi
-case class DecimalType(precisionInfo: Option[PrecisionInfo]) extends FractionalType {
+case class DecimalType(precision: Int, scale: Int) extends FractionalType {
+
+  // default constructor for Java
+  def this(precision: Int) = this(precision, 0)
+  def this() = this(10)
+
+  @deprecated("Use DecimalType(precision, scale) instead", "1.5")
+  def this(precisionInfo: Option[PrecisionInfo]) {
+    this(precisionInfo.getOrElse(PrecisionInfo(10, 0)).precision,
+      precisionInfo.getOrElse(PrecisionInfo(10, 0)).scale)
+  }
 
-  /** No-arg constructor for kryo. */
-  protected def this() = this(null)
+  @deprecated("Use DecimalType.precision and DecimalType.scale instead", "1.5")
+  val precisionInfo = Some(PrecisionInfo(precision, scale))
 
   private[sql] type InternalType = Decimal
   @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[InternalType] }
@@ -53,18 +74,16 @@ case class DecimalType(precisionInfo: Option[PrecisionInfo]) extends FractionalT
   private[sql] val ordering = Decimal.DecimalIsFractional
   private[sql] val asIntegral = Decimal.DecimalAsIfIntegral
 
-  def precision: Int = precisionInfo.map(_.precision).getOrElse(-1)
-
-  def scale: Int = precisionInfo.map(_.scale).getOrElse(-1)
+  override def typeName: String = s"decimal($precision,$scale)"
 
-  override def typeName: String = precisionInfo match {
-    case Some(PrecisionInfo(precision, scale)) => s"decimal($precision,$scale)"
-    case None => "decimal"
-  }
+  override def toString: String = s"DecimalType($precision,$scale)"
 
-  override def toString: String = precisionInfo match {
-    case Some(PrecisionInfo(precision, scale)) => s"DecimalType($precision,$scale)"
-    case None => "DecimalType()"
+  private[sql] def isWiderThan(other: DataType): Boolean = other match {
+    case dt: DecimalType =>
+      (precision - scale) >= (dt.precision - dt.scale) && scale >= dt.scale
+    case dt: IntegralType =>
+      isWiderThan(DecimalType.forType(dt))
+    case _ => false
   }
 
   /**
@@ -72,10 +91,7 @@ case class DecimalType(precisionInfo: Option[PrecisionInfo]) extends FractionalT
    */
   override def defaultSize: Int = 4096
 
-  override def simpleString: String = precisionInfo match {
-    case Some(PrecisionInfo(precision, scale)) => s"decimal($precision,$scale)"
-    case None => "decimal(10,0)"
-  }
+  override def simpleString: String = s"decimal($precision,$scale)"
 
   private[spark] override def asNullable: DecimalType = this
 }
@@ -83,8 +99,47 @@ case class DecimalType(precisionInfo: Option[PrecisionInfo]) extends FractionalT
 
 /** Extra factory methods and pattern matchers for Decimals */
 object DecimalType extends AbstractDataType {
+  import scala.math.min
+
+  val MAX_PRECISION = 38
+  val MAX_SCALE = 38
+  val SYSTEM_DEFAULT: DecimalType = DecimalType(MAX_PRECISION, 18)
+  val USER_DEFAULT: DecimalType = DecimalType(10, 0)
+
+  @deprecated("Does not support unlimited precision, please specify the precision and scale", "1.5")
+  val Unlimited: DecimalType = SYSTEM_DEFAULT
+
+  // The decimal types compatible with other numberic types
+  private[sql] val ByteDecimal = DecimalType(3, 0)
+  private[sql] val ShortDecimal = DecimalType(5, 0)
+  private[sql] val IntDecimal = DecimalType(10, 0)
+  private[sql] val LongDecimal = DecimalType(20, 0)
+  private[sql] val FloatDecimal = DecimalType(14, 7)
+  private[sql] val DoubleDecimal = DecimalType(30, 15)
+
+  private[sql] def forType(dataType: DataType): DecimalType = dataType match {
+    case ByteType => ByteDecimal
+    case ShortType => ShortDecimal
+    case IntegerType => IntDecimal
+    case LongType => LongDecimal
+    case FloatType => FloatDecimal
+    case DoubleType => DoubleDecimal
+  }
 
-  override private[sql] def defaultConcreteType: DataType = Unlimited
+  @deprecated("please specify precision and scale", "1.5")
+  def apply(): DecimalType = USER_DEFAULT
+
+  @deprecated("Use DecimalType(precision, scale) instead", "1.5")
+  def apply(precisionInfo: Option[PrecisionInfo]) {
+    this(precisionInfo.getOrElse(PrecisionInfo(10, 0)).precision,
+      precisionInfo.getOrElse(PrecisionInfo(10, 0)).scale)
+  }
+
+  private[sql] def bounded(precision: Int, scale: Int): DecimalType = {
+    DecimalType(min(precision, MAX_PRECISION), min(scale, MAX_SCALE))
+  }
+
+  override private[sql] def defaultConcreteType: DataType = SYSTEM_DEFAULT
 
   override private[sql] def acceptsType(other: DataType): Boolean = {
     other.isInstanceOf[DecimalType]
@@ -92,31 +147,18 @@ object DecimalType extends AbstractDataType {
 
   override private[sql] def simpleString: String = "decimal"
 
-  val Unlimited: DecimalType = DecimalType(None)
-
   private[sql] object Fixed {
-    def unapply(t: DecimalType): Option[(Int, Int)] =
-      t.precisionInfo.map(p => (p.precision, p.scale))
+    def unapply(t: DecimalType): Option[(Int, Int)] = Some((t.precision, t.scale))
   }
 
   private[sql] object Expression {
     def unapply(e: Expression): Option[(Int, Int)] = e.dataType match {
-      case t: DecimalType => t.precisionInfo.map(p => (p.precision, p.scale))
+      case t: DecimalType => Some((t.precision, t.scale))
       case _ => None
     }
   }
 
-  def apply(): DecimalType = Unlimited
-
-  def apply(precision: Int, scale: Int): DecimalType =
-    DecimalType(Some(PrecisionInfo(precision, scale)))
-
   def unapply(t: DataType): Boolean = t.isInstanceOf[DecimalType]
 
   def unapply(e: Expression): Boolean = e.dataType.isInstanceOf[DecimalType]
-
-  def isFixed(dataType: DataType): Boolean = dataType match {
-    case DecimalType.Fixed(_, _) => true
-    case _ => false
-  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
index 13aad467fa578..b9f2ad7ec0481 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
@@ -94,8 +94,8 @@ object RandomDataGenerator {
       case BooleanType => Some(() => rand.nextBoolean())
       case DateType => Some(() => new java.sql.Date(rand.nextInt()))
       case TimestampType => Some(() => new java.sql.Timestamp(rand.nextLong()))
-      case DecimalType.Unlimited => Some(
-        () => BigDecimal.apply(rand.nextLong, rand.nextInt, MathContext.UNLIMITED))
+      case DecimalType.Fixed(precision, scale) => Some(
+        () => BigDecimal.apply(rand.nextLong, rand.nextInt, new MathContext(precision)))
       case DoubleType => randomNumeric[Double](
         rand, r => longBitsToDouble(r.nextLong()), Seq(Double.MinValue, Double.MinPositiveValue,
           Double.MaxValue, Double.PositiveInfinity, Double.NegativeInfinity, Double.NaN, 0.0))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
index dbba93dba668e..677ba0a18040c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
@@ -50,9 +50,7 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
   for (
     dataType <- DataTypeTestUtils.atomicTypes;
     nullable <- Seq(true, false)
-    if !dataType.isInstanceOf[DecimalType] ||
-      dataType.asInstanceOf[DecimalType].precisionInfo.isEmpty
-  ) {
+    if !dataType.isInstanceOf[DecimalType]) {
     test(s"$dataType (nullable=$nullable)") {
       testRandomDataGeneration(dataType)
     }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
index b4b00f558463f..3b848cfdf737f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala
@@ -102,7 +102,7 @@ class ScalaReflectionSuite extends SparkFunSuite {
         StructField("byteField", ByteType, nullable = true),
         StructField("booleanField", BooleanType, nullable = true),
         StructField("stringField", StringType, nullable = true),
-        StructField("decimalField", DecimalType.Unlimited, nullable = true),
+        StructField("decimalField", DecimalType.SYSTEM_DEFAULT, nullable = true),
         StructField("dateField", DateType, nullable = true),
         StructField("timestampField", TimestampType, nullable = true),
         StructField("binaryField", BinaryType, nullable = true))),
@@ -216,7 +216,7 @@ class ScalaReflectionSuite extends SparkFunSuite {
     assert(DoubleType === typeOfObject(1.7976931348623157E308))
 
     // DecimalType
-    assert(DecimalType.Unlimited ===
+    assert(DecimalType.SYSTEM_DEFAULT ===
       typeOfObject(new java.math.BigDecimal("1.7976931348623157E318")))
 
     // DateType
@@ -229,19 +229,19 @@ class ScalaReflectionSuite extends SparkFunSuite {
     assert(NullType === typeOfObject(null))
 
     def typeOfObject1: PartialFunction[Any, DataType] = typeOfObject orElse {
-      case value: java.math.BigInteger => DecimalType.Unlimited
-      case value: java.math.BigDecimal => DecimalType.Unlimited
+      case value: java.math.BigInteger => DecimalType.SYSTEM_DEFAULT
+      case value: java.math.BigDecimal => DecimalType.SYSTEM_DEFAULT
       case _ => StringType
     }
 
-    assert(DecimalType.Unlimited === typeOfObject1(
+    assert(DecimalType.SYSTEM_DEFAULT === typeOfObject1(
       new BigInteger("92233720368547758070")))
-    assert(DecimalType.Unlimited === typeOfObject1(
+    assert(DecimalType.SYSTEM_DEFAULT === typeOfObject1(
       new java.math.BigDecimal("1.7976931348623157E318")))
     assert(StringType === typeOfObject1(BigInt("92233720368547758070")))
 
     def typeOfObject2: PartialFunction[Any, DataType] = typeOfObject orElse {
-      case value: java.math.BigInteger => DecimalType.Unlimited
+      case value: java.math.BigInteger => DecimalType.SYSTEM_DEFAULT
     }
 
     intercept[MatchError](typeOfObject2(BigInt("92233720368547758070")))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 58df1de983a09..7e67427237a65 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -55,7 +55,7 @@ object AnalysisSuite {
     AttributeReference("a", StringType)(),
     AttributeReference("b", StringType)(),
     AttributeReference("c", DoubleType)(),
-    AttributeReference("d", DecimalType.Unlimited)(),
+    AttributeReference("d", DecimalType.SYSTEM_DEFAULT)(),
     AttributeReference("e", ShortType)())
 
   val nestedRelation = LocalRelation(
@@ -158,7 +158,7 @@ class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
       AttributeReference("a", StringType)(),
       AttributeReference("b", StringType)(),
       AttributeReference("c", DoubleType)(),
-      AttributeReference("d", DecimalType.Unlimited)(),
+      AttributeReference("d", DecimalType(10, 2))(),
       AttributeReference("e", ShortType)())
 
     val plan = caseInsensitiveAnalyzer.execute(
@@ -173,7 +173,7 @@ class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
     assert(pl(0).dataType == DoubleType)
     assert(pl(1).dataType == DoubleType)
     assert(pl(2).dataType == DoubleType)
-    assert(pl(3).dataType == DecimalType.Unlimited)
+    assert(pl(3).dataType == DoubleType)  // StringType will be promoted into Double
     assert(pl(4).dataType == DoubleType)
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
index 7bac97b7894f5..f9f15e7a6608d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
@@ -34,7 +34,7 @@ class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter {
     AttributeReference("i", IntegerType)(),
     AttributeReference("d1", DecimalType(2, 1))(),
     AttributeReference("d2", DecimalType(5, 2))(),
-    AttributeReference("u", DecimalType.Unlimited)(),
+    AttributeReference("u", DecimalType.SYSTEM_DEFAULT)(),
     AttributeReference("f", FloatType)(),
     AttributeReference("b", DoubleType)()
   )
@@ -92,11 +92,11 @@ class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter {
   }
 
   test("Comparison operations") {
-    checkComparison(EqualTo(i, d1), DecimalType(10, 1))
+    checkComparison(EqualTo(i, d1), DecimalType(11, 1))
     checkComparison(EqualNullSafe(d2, d1), DecimalType(5, 2))
-    checkComparison(LessThan(i, d1), DecimalType(10, 1))
+    checkComparison(LessThan(i, d1), DecimalType(11, 1))
     checkComparison(LessThanOrEqual(d1, d2), DecimalType(5, 2))
-    checkComparison(GreaterThan(d2, u), DecimalType.Unlimited)
+    checkComparison(GreaterThan(d2, u), DecimalType.SYSTEM_DEFAULT)
     checkComparison(GreaterThanOrEqual(d1, f), DoubleType)
     checkComparison(GreaterThan(d2, d2), DecimalType(5, 2))
   }
@@ -106,12 +106,12 @@ class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter {
     checkUnion(i, d2, DecimalType(12, 2))
     checkUnion(d1, d2, DecimalType(5, 2))
     checkUnion(d2, d1, DecimalType(5, 2))
-    checkUnion(d1, f, DecimalType(8, 7))
-    checkUnion(f, d2, DecimalType(10, 7))
-    checkUnion(d1, b, DecimalType(16, 15))
-    checkUnion(b, d2, DecimalType(18, 15))
-    checkUnion(d1, u, DecimalType.Unlimited)
-    checkUnion(u, d2, DecimalType.Unlimited)
+    checkUnion(d1, f, DoubleType)
+    checkUnion(f, d2, DoubleType)
+    checkUnion(d1, b, DoubleType)
+    checkUnion(b, d2, DoubleType)
+    checkUnion(d1, u, DecimalType.SYSTEM_DEFAULT)
+    checkUnion(u, d2, DecimalType.SYSTEM_DEFAULT)
   }
 
   test("bringing in primitive types") {
@@ -125,13 +125,33 @@ class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter {
     checkType(Add(d1, Cast(i, DoubleType)), DoubleType)
   }
 
-  test("unlimited decimals make everything else cast up") {
-    for (expr <- Seq(d1, d2, i, f, u)) {
-      checkType(Add(expr, u), DecimalType.Unlimited)
-      checkType(Subtract(expr, u), DecimalType.Unlimited)
-      checkType(Multiply(expr, u), DecimalType.Unlimited)
-      checkType(Divide(expr, u), DecimalType.Unlimited)
-      checkType(Remainder(expr, u), DecimalType.Unlimited)
+  test("maximum decimals") {
+    for (expr <- Seq(d1, d2, i, u)) {
+      checkType(Add(expr, u), DecimalType.SYSTEM_DEFAULT)
+      checkType(Subtract(expr, u), DecimalType.SYSTEM_DEFAULT)
+    }
+
+    checkType(Multiply(d1, u), DecimalType(38, 19))
+    checkType(Multiply(d2, u), DecimalType(38, 20))
+    checkType(Multiply(i, u), DecimalType(38, 18))
+    checkType(Multiply(u, u), DecimalType(38, 36))
+
+    checkType(Divide(u, d1), DecimalType(38, 21))
+    checkType(Divide(u, d2), DecimalType(38, 24))
+    checkType(Divide(u, i), DecimalType(38, 29))
+    checkType(Divide(u, u), DecimalType(38, 38))
+
+    checkType(Remainder(d1, u), DecimalType(19, 18))
+    checkType(Remainder(d2, u), DecimalType(21, 18))
+    checkType(Remainder(i, u), DecimalType(28, 18))
+    checkType(Remainder(u, u), DecimalType.SYSTEM_DEFAULT)
+
+    for (expr <- Seq(f, b)) {
+      checkType(Add(expr, u), DoubleType)
+      checkType(Subtract(expr, u), DoubleType)
+      checkType(Multiply(expr, u), DoubleType)
+      checkType(Divide(expr, u), DoubleType)
+      checkType(Remainder(expr, u), DoubleType)
     }
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 835220c563f41..d0fb95b580ad2 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -35,14 +35,14 @@ class HiveTypeCoercionSuite extends PlanTest {
 
     shouldCast(NullType, NullType, NullType)
     shouldCast(NullType, IntegerType, IntegerType)
-    shouldCast(NullType, DecimalType, DecimalType.Unlimited)
+    shouldCast(NullType, DecimalType, DecimalType.SYSTEM_DEFAULT)
 
     shouldCast(ByteType, IntegerType, IntegerType)
     shouldCast(IntegerType, IntegerType, IntegerType)
     shouldCast(IntegerType, LongType, LongType)
-    shouldCast(IntegerType, DecimalType, DecimalType.Unlimited)
+    shouldCast(IntegerType, DecimalType, DecimalType(10, 0))
     shouldCast(LongType, IntegerType, IntegerType)
-    shouldCast(LongType, DecimalType, DecimalType.Unlimited)
+    shouldCast(LongType, DecimalType, DecimalType(20, 0))
 
     shouldCast(DateType, TimestampType, TimestampType)
     shouldCast(TimestampType, DateType, DateType)
@@ -71,8 +71,8 @@ class HiveTypeCoercionSuite extends PlanTest {
     shouldCast(IntegerType, TypeCollection(StringType, BinaryType), StringType)
     shouldCast(IntegerType, TypeCollection(BinaryType, StringType), StringType)
 
-    shouldCast(
-      DecimalType.Unlimited, TypeCollection(IntegerType, DecimalType), DecimalType.Unlimited)
+    shouldCast(DecimalType.SYSTEM_DEFAULT,
+      TypeCollection(IntegerType, DecimalType), DecimalType.SYSTEM_DEFAULT)
     shouldCast(DecimalType(10, 2), TypeCollection(IntegerType, DecimalType), DecimalType(10, 2))
     shouldCast(DecimalType(10, 2), TypeCollection(DecimalType, IntegerType), DecimalType(10, 2))
     shouldCast(IntegerType, TypeCollection(DecimalType(10, 2), StringType), DecimalType(10, 2))
@@ -82,7 +82,7 @@ class HiveTypeCoercionSuite extends PlanTest {
 
     // NumericType should not be changed when function accepts any of them.
     Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType,
-      DecimalType.Unlimited, DecimalType(10, 2)).foreach { tpe =>
+      DecimalType.SYSTEM_DEFAULT, DecimalType(10, 2)).foreach { tpe =>
       shouldCast(tpe, NumericType, tpe)
     }
 
@@ -107,8 +107,8 @@ class HiveTypeCoercionSuite extends PlanTest {
     shouldNotCast(IntegerType, TimestampType)
     shouldNotCast(LongType, DateType)
     shouldNotCast(LongType, TimestampType)
-    shouldNotCast(DecimalType.Unlimited, DateType)
-    shouldNotCast(DecimalType.Unlimited, TimestampType)
+    shouldNotCast(DecimalType.SYSTEM_DEFAULT, DateType)
+    shouldNotCast(DecimalType.SYSTEM_DEFAULT, TimestampType)
 
     shouldNotCast(IntegerType, TypeCollection(DateType, TimestampType))
 
@@ -160,14 +160,6 @@ class HiveTypeCoercionSuite extends PlanTest {
     widenTest(LongType, FloatType, Some(FloatType))
     widenTest(LongType, DoubleType, Some(DoubleType))
 
-    // Casting up to unlimited-precision decimal
-    widenTest(IntegerType, DecimalType.Unlimited, Some(DecimalType.Unlimited))
-    widenTest(DoubleType, DecimalType.Unlimited, Some(DecimalType.Unlimited))
-    widenTest(DecimalType(3, 2), DecimalType.Unlimited, Some(DecimalType.Unlimited))
-    widenTest(DecimalType.Unlimited, IntegerType, Some(DecimalType.Unlimited))
-    widenTest(DecimalType.Unlimited, DoubleType, Some(DecimalType.Unlimited))
-    widenTest(DecimalType.Unlimited, DecimalType(3, 2), Some(DecimalType.Unlimited))
-
     // No up-casting for fixed-precision decimal (this is handled by arithmetic rules)
     widenTest(DecimalType(2, 1), DecimalType(3, 2), None)
     widenTest(DecimalType(2, 1), DoubleType, None)
@@ -242,9 +234,9 @@ class HiveTypeCoercionSuite extends PlanTest {
         :: Literal(1)
         :: Literal(new java.math.BigDecimal("1000000000000000000000"))
         :: Nil),
-      Coalesce(Cast(Literal(1L), DecimalType())
-        :: Cast(Literal(1), DecimalType())
-        :: Cast(Literal(new java.math.BigDecimal("1000000000000000000000")), DecimalType())
+      Coalesce(Cast(Literal(1L), DecimalType(22, 0))
+        :: Cast(Literal(1), DecimalType(22, 0))
+        :: Cast(Literal(new java.math.BigDecimal("1000000000000000000000")), DecimalType(22, 0))
         :: Nil))
   }
 
@@ -323,7 +315,7 @@ class HiveTypeCoercionSuite extends PlanTest {
 
     val left = LocalRelation(
       AttributeReference("i", IntegerType)(),
-      AttributeReference("u", DecimalType.Unlimited)(),
+      AttributeReference("u", DecimalType.SYSTEM_DEFAULT)(),
       AttributeReference("b", ByteType)(),
       AttributeReference("d", DoubleType)())
     val right = LocalRelation(
@@ -333,7 +325,7 @@ class HiveTypeCoercionSuite extends PlanTest {
       AttributeReference("l", LongType)())
 
     val wt = HiveTypeCoercion.WidenTypes
-    val expectedTypes = Seq(StringType, DecimalType.Unlimited, FloatType, DoubleType)
+    val expectedTypes = Seq(StringType, DecimalType.SYSTEM_DEFAULT, FloatType, DoubleType)
 
     val r1 = wt(Union(left, right)).asInstanceOf[Union]
     val r2 = wt(Except(left, right)).asInstanceOf[Except]
@@ -353,13 +345,13 @@ class HiveTypeCoercionSuite extends PlanTest {
       }
     }
 
-    val dp = HiveTypeCoercion.DecimalPrecision
+    val dp = HiveTypeCoercion.WidenTypes
 
     val left1 = LocalRelation(
       AttributeReference("l", DecimalType(10, 8))())
     val right1 = LocalRelation(
       AttributeReference("r", DecimalType(5, 5))())
-    val expectedType1 = Seq(DecimalType(math.max(8, 5) + math.max(10 - 8, 5 - 5), math.max(8, 5)))
+    val expectedType1 = Seq(DecimalType(10, 8))
 
     val r1 = dp(Union(left1, right1)).asInstanceOf[Union]
     val r2 = dp(Except(left1, right1)).asInstanceOf[Except]
@@ -372,12 +364,11 @@ class HiveTypeCoercionSuite extends PlanTest {
     checkOutput(r3.left, expectedType1)
     checkOutput(r3.right, expectedType1)
 
-    val plan1 = LocalRelation(
-      AttributeReference("l", DecimalType(10, 10))())
+    val plan1 = LocalRelation(AttributeReference("l", DecimalType(10, 5))())
 
     val rightTypes = Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType)
-    val expectedTypes = Seq(DecimalType(3, 0), DecimalType(5, 0), DecimalType(10, 0),
-      DecimalType(20, 0), DecimalType(7, 7), DecimalType(15, 15))
+    val expectedTypes = Seq(DecimalType(10, 5), DecimalType(10, 5), DecimalType(15, 5),
+      DecimalType(25, 5), DoubleType, DoubleType)
 
     rightTypes.zip(expectedTypes).map { case (rType, expectedType) =>
       val plan2 = LocalRelation(
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index ccf448eee0688..facf65c155148 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -185,7 +185,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkCast(1, 1.0)
     checkCast(123, "123")
 
-    checkEvaluation(cast(123, DecimalType.Unlimited), Decimal(123))
+    checkEvaluation(cast(123, DecimalType.USER_DEFAULT), Decimal(123))
     checkEvaluation(cast(123, DecimalType(3, 0)), Decimal(123))
     checkEvaluation(cast(123, DecimalType(3, 1)), null)
     checkEvaluation(cast(123, DecimalType(2, 0)), null)
@@ -203,7 +203,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkCast(1L, 1.0)
     checkCast(123L, "123")
 
-    checkEvaluation(cast(123L, DecimalType.Unlimited), Decimal(123))
+    checkEvaluation(cast(123L, DecimalType.USER_DEFAULT), Decimal(123))
     checkEvaluation(cast(123L, DecimalType(3, 0)), Decimal(123))
     checkEvaluation(cast(123L, DecimalType(3, 1)), Decimal(123.0))
 
@@ -225,7 +225,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(cast(1000, TimestampType), LongType), 1.toLong)
     checkEvaluation(cast(cast(-1200, TimestampType), LongType), -2.toLong)
 
-    checkEvaluation(cast(123, DecimalType.Unlimited), Decimal(123))
+    checkEvaluation(cast(123, DecimalType.USER_DEFAULT), Decimal(123))
     checkEvaluation(cast(123, DecimalType(3, 0)), Decimal(123))
     checkEvaluation(cast(123, DecimalType(3, 1)), null)
     checkEvaluation(cast(123, DecimalType(2, 0)), null)
@@ -267,7 +267,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     assert(cast("abcdef", IntegerType).nullable === true)
     assert(cast("abcdef", ShortType).nullable === true)
     assert(cast("abcdef", ByteType).nullable === true)
-    assert(cast("abcdef", DecimalType.Unlimited).nullable === true)
+    assert(cast("abcdef", DecimalType.USER_DEFAULT).nullable === true)
     assert(cast("abcdef", DecimalType(4, 2)).nullable === true)
     assert(cast("abcdef", DoubleType).nullable === true)
     assert(cast("abcdef", FloatType).nullable === true)
@@ -291,9 +291,9 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
       c.getTimeInMillis * 1000)
 
     checkEvaluation(cast("abdef", StringType), "abdef")
-    checkEvaluation(cast("abdef", DecimalType.Unlimited), null)
+    checkEvaluation(cast("abdef", DecimalType.USER_DEFAULT), null)
     checkEvaluation(cast("abdef", TimestampType), null)
-    checkEvaluation(cast("12.65", DecimalType.Unlimited), Decimal(12.65))
+    checkEvaluation(cast("12.65", DecimalType.SYSTEM_DEFAULT), Decimal(12.65))
 
     checkEvaluation(cast(cast(sd, DateType), StringType), sd)
     checkEvaluation(cast(cast(d, StringType), DateType), 0)
@@ -311,20 +311,20 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
       5.toLong)
     checkEvaluation(
       cast(cast(cast(cast(cast(cast("5", ByteType), TimestampType),
-        DecimalType.Unlimited), LongType), StringType), ShortType),
+        DecimalType.SYSTEM_DEFAULT), LongType), StringType), ShortType),
       0.toShort)
     checkEvaluation(
       cast(cast(cast(cast(cast(cast("5", TimestampType), ByteType),
-        DecimalType.Unlimited), LongType), StringType), ShortType),
+        DecimalType.SYSTEM_DEFAULT), LongType), StringType), ShortType),
       null)
-    checkEvaluation(cast(cast(cast(cast(cast(cast("5", DecimalType.Unlimited),
+    checkEvaluation(cast(cast(cast(cast(cast(cast("5", DecimalType.SYSTEM_DEFAULT),
       ByteType), TimestampType), LongType), StringType), ShortType),
       0.toShort)
 
     checkEvaluation(cast("23", DoubleType), 23d)
     checkEvaluation(cast("23", IntegerType), 23)
     checkEvaluation(cast("23", FloatType), 23f)
-    checkEvaluation(cast("23", DecimalType.Unlimited), Decimal(23))
+    checkEvaluation(cast("23", DecimalType.USER_DEFAULT), Decimal(23))
     checkEvaluation(cast("23", ByteType), 23.toByte)
     checkEvaluation(cast("23", ShortType), 23.toShort)
     checkEvaluation(cast("2012-12-11", DoubleType), null)
@@ -338,7 +338,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Add(Literal(23d), cast(true, DoubleType)), 24d)
     checkEvaluation(Add(Literal(23), cast(true, IntegerType)), 24)
     checkEvaluation(Add(Literal(23f), cast(true, FloatType)), 24f)
-    checkEvaluation(Add(Literal(Decimal(23)), cast(true, DecimalType.Unlimited)), Decimal(24))
+    checkEvaluation(Add(Literal(Decimal(23)), cast(true, DecimalType.USER_DEFAULT)), Decimal(24))
     checkEvaluation(Add(Literal(23.toByte), cast(true, ByteType)), 24.toByte)
     checkEvaluation(Add(Literal(23.toShort), cast(true, ShortType)), 24.toShort)
   }
@@ -362,10 +362,10 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     // - Values that would overflow the target precision should turn into null
     // - Because of this, casts to fixed-precision decimals should be nullable
 
-    assert(cast(123, DecimalType.Unlimited).nullable === false)
-    assert(cast(10.03f, DecimalType.Unlimited).nullable === true)
-    assert(cast(10.03, DecimalType.Unlimited).nullable === true)
-    assert(cast(Decimal(10.03), DecimalType.Unlimited).nullable === false)
+    assert(cast(123, DecimalType.USER_DEFAULT).nullable === true)
+    assert(cast(10.03f, DecimalType.SYSTEM_DEFAULT).nullable === true)
+    assert(cast(10.03, DecimalType.SYSTEM_DEFAULT).nullable === true)
+    assert(cast(Decimal(10.03), DecimalType.SYSTEM_DEFAULT).nullable === true)
 
     assert(cast(123, DecimalType(2, 1)).nullable === true)
     assert(cast(10.03f, DecimalType(2, 1)).nullable === true)
@@ -373,7 +373,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     assert(cast(Decimal(10.03), DecimalType(2, 1)).nullable === true)
 
 
-    checkEvaluation(cast(10.03, DecimalType.Unlimited), Decimal(10.03))
+    checkEvaluation(cast(10.03, DecimalType.SYSTEM_DEFAULT), Decimal(10.03))
     checkEvaluation(cast(10.03, DecimalType(4, 2)), Decimal(10.03))
     checkEvaluation(cast(10.03, DecimalType(3, 1)), Decimal(10.0))
     checkEvaluation(cast(10.03, DecimalType(2, 0)), Decimal(10))
@@ -383,7 +383,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(Decimal(10.03), DecimalType(3, 1)), Decimal(10.0))
     checkEvaluation(cast(Decimal(10.03), DecimalType(3, 2)), null)
 
-    checkEvaluation(cast(10.05, DecimalType.Unlimited), Decimal(10.05))
+    checkEvaluation(cast(10.05, DecimalType.SYSTEM_DEFAULT), Decimal(10.05))
     checkEvaluation(cast(10.05, DecimalType(4, 2)), Decimal(10.05))
     checkEvaluation(cast(10.05, DecimalType(3, 1)), Decimal(10.1))
     checkEvaluation(cast(10.05, DecimalType(2, 0)), Decimal(10))
@@ -409,10 +409,10 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(Decimal(-9.95), DecimalType(3, 1)), Decimal(-10.0))
     checkEvaluation(cast(Decimal(-9.95), DecimalType(1, 0)), null)
 
-    checkEvaluation(cast(Double.NaN, DecimalType.Unlimited), null)
-    checkEvaluation(cast(1.0 / 0.0, DecimalType.Unlimited), null)
-    checkEvaluation(cast(Float.NaN, DecimalType.Unlimited), null)
-    checkEvaluation(cast(1.0f / 0.0f, DecimalType.Unlimited), null)
+    checkEvaluation(cast(Double.NaN, DecimalType.SYSTEM_DEFAULT), null)
+    checkEvaluation(cast(1.0 / 0.0, DecimalType.SYSTEM_DEFAULT), null)
+    checkEvaluation(cast(Float.NaN, DecimalType.SYSTEM_DEFAULT), null)
+    checkEvaluation(cast(1.0f / 0.0f, DecimalType.SYSTEM_DEFAULT), null)
 
     checkEvaluation(cast(Double.NaN, DecimalType(2, 1)), null)
     checkEvaluation(cast(1.0 / 0.0, DecimalType(2, 1)), null)
@@ -427,7 +427,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(d, LongType), null)
     checkEvaluation(cast(d, FloatType), null)
     checkEvaluation(cast(d, DoubleType), null)
-    checkEvaluation(cast(d, DecimalType.Unlimited), null)
+    checkEvaluation(cast(d, DecimalType.SYSTEM_DEFAULT), null)
     checkEvaluation(cast(d, DecimalType(10, 2)), null)
     checkEvaluation(cast(d, StringType), "1970-01-01")
     checkEvaluation(cast(cast(d, TimestampType), StringType), "1970-01-01 00:00:00")
@@ -454,7 +454,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
       cast(cast(millis.toDouble / 1000, TimestampType), DoubleType),
       millis.toDouble / 1000)
     checkEvaluation(
-      cast(cast(Decimal(1), TimestampType), DecimalType.Unlimited),
+      cast(cast(Decimal(1), TimestampType), DecimalType.SYSTEM_DEFAULT),
       Decimal(1))
 
     // A test for higher precision than millis
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
index afa143bd5f331..b31d6661c8c1c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
@@ -60,7 +60,7 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
 
     testIf(_.toFloat, FloatType)
     testIf(_.toDouble, DoubleType)
-    testIf(Decimal(_), DecimalType.Unlimited)
+    testIf(Decimal(_), DecimalType.USER_DEFAULT)
 
     testIf(identity, DateType)
     testIf(_.toLong, TimestampType)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
index d924ff7a102f6..f6404d21611e5 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
@@ -33,7 +33,7 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Literal.create(null, LongType), null)
     checkEvaluation(Literal.create(null, StringType), null)
     checkEvaluation(Literal.create(null, BinaryType), null)
-    checkEvaluation(Literal.create(null, DecimalType()), null)
+    checkEvaluation(Literal.create(null, DecimalType.USER_DEFAULT), null)
     checkEvaluation(Literal.create(null, ArrayType(ByteType, true)), null)
     checkEvaluation(Literal.create(null, MapType(StringType, IntegerType)), null)
     checkEvaluation(Literal.create(null, StructType(Seq.empty)), null)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
index 0728f6695c39d..9efe44c83293d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
@@ -30,7 +30,7 @@ class NullFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     testFunc(1L, LongType)
     testFunc(1.0F, FloatType)
     testFunc(1.0, DoubleType)
-    testFunc(Decimal(1.5), DecimalType.Unlimited)
+    testFunc(Decimal(1.5), DecimalType(2, 1))
     testFunc(new java.sql.Date(10), DateType)
     testFunc(new java.sql.Timestamp(10), TimestampType)
     testFunc("abcd", StringType)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
index 7566cb59e34ee..48b7dc57451a3 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
@@ -121,6 +121,8 @@ class UnsafeFixedWidthAggregationMapSuite
     }.toSet
     seenKeys.size should be (groupKeys.size)
     seenKeys should be (groupKeys)
+
+    map.free()
   }
 
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index 8819234e78e60..a5d9806c20463 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -145,7 +145,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       DoubleType,
       StringType,
       BinaryType
-      // DecimalType.Unlimited,
+      // DecimalType.Default,
       // ArrayType(IntegerType)
     )
     val converter = new UnsafeRowConverter(fieldTypes)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala
index c6171b7b6916d..1ba290753ce48 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala
@@ -44,7 +44,7 @@ class DataTypeParserSuite extends SparkFunSuite {
   checkDataType("float", FloatType)
   checkDataType("dOUBle", DoubleType)
   checkDataType("decimal(10, 5)", DecimalType(10, 5))
-  checkDataType("decimal", DecimalType.Unlimited)
+  checkDataType("decimal", DecimalType.USER_DEFAULT)
   checkDataType("DATE", DateType)
   checkDataType("timestamp", TimestampType)
   checkDataType("string", StringType)
@@ -87,7 +87,7 @@ class DataTypeParserSuite extends SparkFunSuite {
     StructType(
       StructField("struct",
         StructType(
-          StructField("deciMal", DecimalType.Unlimited, true) ::
+          StructField("deciMal", DecimalType.USER_DEFAULT, true) ::
           StructField("anotherDecimal", DecimalType(5, 2), true) :: Nil), true) ::
       StructField("MAP", MapType(TimestampType, StringType), true) ::
       StructField("arrAy", ArrayType(DoubleType, true), true) :: Nil)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index 14e7b4a9561b6..88b221cd81d74 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -185,7 +185,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkDataTypeJsonRepr(FloatType)
   checkDataTypeJsonRepr(DoubleType)
   checkDataTypeJsonRepr(DecimalType(10, 5))
-  checkDataTypeJsonRepr(DecimalType.Unlimited)
+  checkDataTypeJsonRepr(DecimalType.SYSTEM_DEFAULT)
   checkDataTypeJsonRepr(DateType)
   checkDataTypeJsonRepr(TimestampType)
   checkDataTypeJsonRepr(StringType)
@@ -219,7 +219,7 @@ class DataTypeSuite extends SparkFunSuite {
   checkDefaultSize(FloatType, 4)
   checkDefaultSize(DoubleType, 8)
   checkDefaultSize(DecimalType(10, 5), 4096)
-  checkDefaultSize(DecimalType.Unlimited, 4096)
+  checkDefaultSize(DecimalType.SYSTEM_DEFAULT, 4096)
   checkDefaultSize(DateType, 4)
   checkDefaultSize(TimestampType, 8)
   checkDefaultSize(StringType, 4096)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala
index 32632b5d6e342..0ee9ddac815b8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala
@@ -34,7 +34,7 @@ object DataTypeTestUtils {
    * decimal types.
    */
   val fractionalTypes: Set[FractionalType] = Set(
-    DecimalType(precisionInfo = None),
+    DecimalType.SYSTEM_DEFAULT,
     DecimalType(2, 1),
     DoubleType,
     FloatType
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index 6e2a6525bf17e..b25dcbca82b9f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -996,7 +996,7 @@ class ColumnName(name: String) extends Column(name) {
    * Creates a new [[StructField]] of type decimal.
    * @since 1.3.0
    */
-  def decimal: StructField = StructField(name, DecimalType.Unlimited)
+  def decimal: StructField = StructField(name, DecimalType.USER_DEFAULT)
 
   /**
    * Creates a new [[StructField]] of type decimal.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
index fc72360c88fe1..9d8415f06399c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
@@ -375,7 +375,7 @@ private[sql] object TIMESTAMP extends NativeColumnType(TimestampType, 9, 8) {
 
 private[sql] case class FIXED_DECIMAL(precision: Int, scale: Int)
   extends NativeColumnType(
-    DecimalType(Some(PrecisionInfo(precision, scale))),
+    DecimalType(precision, scale),
     10,
     FIXED_DECIMAL.defaultSize) {
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 16176abe3a51d..5ed158b3d2912 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -21,9 +21,9 @@ import org.apache.spark.TaskContext
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.trees._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
+import org.apache.spark.sql.catalyst.trees._
 import org.apache.spark.sql.types._
 
 case class AggregateEvaluation(
@@ -92,8 +92,8 @@ case class GeneratedAggregate(
       case s @ Sum(expr) =>
         val calcType =
           expr.dataType match {
-            case DecimalType.Fixed(_, _) =>
-              DecimalType.Unlimited
+            case DecimalType.Fixed(p, s) =>
+              DecimalType.bounded(p + 10, s)
             case _ =>
               expr.dataType
           }
@@ -121,8 +121,8 @@ case class GeneratedAggregate(
       case cs @ CombineSum(expr) =>
         val calcType =
           expr.dataType match {
-            case DecimalType.Fixed(_, _) =>
-              DecimalType.Unlimited
+            case DecimalType.Fixed(p, s) =>
+              DecimalType.bounded(p + 10, s)
             case _ =>
               expr.dataType
           }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
index 6b4a359db22d1..9d0fa894b9942 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
@@ -25,6 +25,7 @@ import scala.util.Try
 
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.util.Shell
+
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
 import org.apache.spark.sql.types._
@@ -236,7 +237,7 @@ private[sql] object PartitioningUtils {
 
   /**
    * Converts a string to a [[Literal]] with automatic type inference.  Currently only supports
-   * [[IntegerType]], [[LongType]], [[DoubleType]], [[DecimalType.Unlimited]], and
+   * [[IntegerType]], [[LongType]], [[DoubleType]], [[DecimalType.SYSTEM_DEFAULT]], and
    * [[StringType]].
    */
   private[sql] def inferPartitionColumnValue(
@@ -249,7 +250,7 @@ private[sql] object PartitioningUtils {
         .orElse(Try(Literal.create(JLong.parseLong(raw), LongType)))
         // Then falls back to fractional types
         .orElse(Try(Literal.create(JDouble.parseDouble(raw), DoubleType)))
-        .orElse(Try(Literal.create(new JBigDecimal(raw), DecimalType.Unlimited)))
+        .orElse(Try(Literal(new JBigDecimal(raw))))
         // Then falls back to string
         .getOrElse {
           if (raw == defaultPartitionName) {
@@ -268,7 +269,7 @@ private[sql] object PartitioningUtils {
   }
 
   private val upCastingOrder: Seq[DataType] =
-    Seq(NullType, IntegerType, LongType, FloatType, DoubleType, DecimalType.Unlimited, StringType)
+    Seq(NullType, IntegerType, LongType, FloatType, DoubleType, StringType)
 
   /**
    * Given a collection of [[Literal]]s, resolves possible type conflicts by up-casting "lower"
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
index 7a27fba1780b9..3cf70db6b7b09 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala
@@ -66,8 +66,8 @@ private[sql] object JDBCRDD extends Logging {
       case java.sql.Types.DATALINK      => null
       case java.sql.Types.DATE          => DateType
       case java.sql.Types.DECIMAL
-        if precision != 0 || scale != 0 => DecimalType(precision, scale)
-      case java.sql.Types.DECIMAL       => DecimalType.Unlimited
+        if precision != 0 || scale != 0 => DecimalType.bounded(precision, scale)
+      case java.sql.Types.DECIMAL       => DecimalType.SYSTEM_DEFAULT
       case java.sql.Types.DISTINCT      => null
       case java.sql.Types.DOUBLE        => DoubleType
       case java.sql.Types.FLOAT         => FloatType
@@ -80,8 +80,8 @@ private[sql] object JDBCRDD extends Logging {
       case java.sql.Types.NCLOB         => StringType
       case java.sql.Types.NULL          => null
       case java.sql.Types.NUMERIC
-        if precision != 0 || scale != 0 => DecimalType(precision, scale)
-      case java.sql.Types.NUMERIC       => DecimalType.Unlimited
+        if precision != 0 || scale != 0 => DecimalType.bounded(precision, scale)
+      case java.sql.Types.NUMERIC       => DecimalType.SYSTEM_DEFAULT
       case java.sql.Types.NVARCHAR      => StringType
       case java.sql.Types.OTHER         => null
       case java.sql.Types.REAL          => DoubleType
@@ -314,7 +314,7 @@ private[sql] class JDBCRDD(
   abstract class JDBCConversion
   case object BooleanConversion extends JDBCConversion
   case object DateConversion extends JDBCConversion
-  case class  DecimalConversion(precisionInfo: Option[(Int, Int)]) extends JDBCConversion
+  case class  DecimalConversion(precision: Int, scale: Int) extends JDBCConversion
   case object DoubleConversion extends JDBCConversion
   case object FloatConversion extends JDBCConversion
   case object IntegerConversion extends JDBCConversion
@@ -331,8 +331,7 @@ private[sql] class JDBCRDD(
     schema.fields.map(sf => sf.dataType match {
       case BooleanType => BooleanConversion
       case DateType => DateConversion
-      case DecimalType.Unlimited => DecimalConversion(None)
-      case DecimalType.Fixed(d) => DecimalConversion(Some(d))
+      case DecimalType.Fixed(p, s) => DecimalConversion(p, s)
       case DoubleType => DoubleConversion
       case FloatType => FloatConversion
       case IntegerType => IntegerConversion
@@ -399,20 +398,13 @@ private[sql] class JDBCRDD(
             // DecimalType(12, 2). Thus, after saving the dataframe into parquet file and then
             // retrieve it, you will get wrong result 199.99.
             // So it is needed to set precision and scale for Decimal based on JDBC metadata.
-            case DecimalConversion(Some((p, s))) =>
+            case DecimalConversion(p, s) =>
               val decimalVal = rs.getBigDecimal(pos)
               if (decimalVal == null) {
                 mutableRow.update(i, null)
               } else {
                 mutableRow.update(i, Decimal(decimalVal, p, s))
               }
-            case DecimalConversion(None) =>
-              val decimalVal = rs.getBigDecimal(pos)
-              if (decimalVal == null) {
-                mutableRow.update(i, null)
-              } else {
-                mutableRow.update(i, Decimal(decimalVal))
-              }
             case DoubleConversion => mutableRow.setDouble(i, rs.getDouble(pos))
             case FloatConversion => mutableRow.setFloat(i, rs.getFloat(pos))
             case IntegerConversion => mutableRow.setInt(i, rs.getInt(pos))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
index f7ea852fe7f58..035e0510080ff 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala
@@ -89,8 +89,7 @@ package object jdbc {
                   case BinaryType => stmt.setBytes(i + 1, row.getAs[Array[Byte]](i))
                   case TimestampType => stmt.setTimestamp(i + 1, row.getAs[java.sql.Timestamp](i))
                   case DateType => stmt.setDate(i + 1, row.getAs[java.sql.Date](i))
-                  case DecimalType.Unlimited => stmt.setBigDecimal(i + 1,
-                      row.getAs[java.math.BigDecimal](i))
+                  case t: DecimalType => stmt.setBigDecimal(i + 1, row.getDecimal(i))
                   case _ => throw new IllegalArgumentException(
                       s"Can't translate non-null value for field $i")
                 }
@@ -145,7 +144,7 @@ package object jdbc {
             case BinaryType => "BLOB"
             case TimestampType => "TIMESTAMP"
             case DateType => "DATE"
-            case DecimalType.Unlimited => "DECIMAL(40,20)"
+            case t: DecimalType => s"DECIMAL(${t.precision}},${t.scale}})"
             case _ => throw new IllegalArgumentException(s"Don't know how to save $field to JDBC")
           })
         val nullable = if (field.nullable) "" else "NOT NULL"
@@ -177,7 +176,7 @@ package object jdbc {
             case BinaryType => java.sql.Types.BLOB
             case TimestampType => java.sql.Types.TIMESTAMP
             case DateType => java.sql.Types.DATE
-            case DecimalType.Unlimited => java.sql.Types.DECIMAL
+            case t: DecimalType => java.sql.Types.DECIMAL
             case _ => throw new IllegalArgumentException(
               s"Can't translate null value for field $field")
           })
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
index afe2c6c11ac69..0eb3b04007f8d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
@@ -113,7 +113,7 @@ private[sql] object InferSchema {
           case INT | LONG => LongType
           // Since we do not have a data type backed by BigInteger,
           // when we see a Java BigInteger, we use DecimalType.
-          case BIG_INTEGER | BIG_DECIMAL => DecimalType.Unlimited
+          case BIG_INTEGER | BIG_DECIMAL => DecimalType.SYSTEM_DEFAULT
           case FLOAT | DOUBLE => DoubleType
         }
 
@@ -168,8 +168,13 @@ private[sql] object InferSchema {
     HiveTypeCoercion.findTightestCommonTypeOfTwo(t1, t2).getOrElse {
       // t1 or t2 is a StructType, ArrayType, or an unexpected type.
       (t1, t2) match {
-        case (other: DataType, NullType) => other
-        case (NullType, other: DataType) => other
+        // Double support larger range than fixed decimal, DecimalType.Maximum should be enough
+        // in most case, also have better precision.
+        case (DoubleType, t: DecimalType) =>
+          if (t == DecimalType.SYSTEM_DEFAULT) t else DoubleType
+        case (t: DecimalType, DoubleType) =>
+          if (t == DecimalType.SYSTEM_DEFAULT) t else DoubleType
+
         case (StructType(fields1), StructType(fields2)) =>
           val newFields = (fields1 ++ fields2).groupBy(field => field.name).map {
             case (name, fieldTypes) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index 1ea6926af6d5b..1d3a0d15d336e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -439,10 +439,6 @@ private[parquet] class CatalystSchemaConverter(
           .length(minBytesForPrecision(precision))
           .named(field.name)
 
-      case dec @ DecimalType.Unlimited if followParquetFormatSpec =>
-        throw new AnalysisException(
-          s"Data type $dec is not supported. Decimal precision and scale must be specified.")
-
       // ===================================================
       // ArrayType and MapType (for Spark versions <= 1.4.x)
       // ===================================================
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index e8851ddb68026..d1040bf5562a2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -261,10 +261,10 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
         case BinaryType => writer.addBinary(
           Binary.fromByteArray(value.asInstanceOf[Array[Byte]]))
         case d: DecimalType =>
-          if (d.precisionInfo == None || d.precisionInfo.get.precision > 18) {
+          if (d.precision > 18) {
             sys.error(s"Unsupported datatype $d, cannot write to consumer")
           }
-          writeDecimal(value.asInstanceOf[Decimal], d.precisionInfo.get.precision)
+          writeDecimal(value.asInstanceOf[Decimal], d.precision)
         case _ => sys.error(s"Do not know how to writer $schema to consumer")
       }
     }
@@ -415,10 +415,10 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
       case BinaryType => writer.addBinary(
         Binary.fromByteArray(record(index).asInstanceOf[Array[Byte]]))
       case d: DecimalType =>
-        if (d.precisionInfo == None || d.precisionInfo.get.precision > 18) {
+        if (d.precision > 18) {
           sys.error(s"Unsupported datatype $d, cannot write to consumer")
         }
-        writeDecimal(record(index).asInstanceOf[Decimal], d.precisionInfo.get.precision)
+        writeDecimal(record(index).asInstanceOf[Decimal], d.precision)
       case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer")
     }
   }
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java
index fcb8f5499cf84..cb84e78d628ca 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java
@@ -22,7 +22,6 @@
 import java.util.Arrays;
 import java.util.List;
 
-import org.apache.spark.sql.test.TestSQLContext$;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
@@ -31,8 +30,14 @@
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
-import org.apache.spark.sql.*;
-import org.apache.spark.sql.types.*;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.test.TestSQLContext$;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
 
 // The test suite itself is Serializable so that anonymous Function implementations can be
 // serialized, as an alternative to converting these anonymous classes to static inner classes;
@@ -159,7 +164,8 @@ public void applySchemaToJSON() {
         "\"bigInteger\":92233720368547758069, \"double\":1.7976931348623157E305, " +
         "\"boolean\":false, \"null\":null}"));
     List<StructField> fields = new ArrayList<StructField>(7);
-    fields.add(DataTypes.createStructField("bigInteger", DataTypes.createDecimalType(), true));
+    fields.add(DataTypes.createStructField("bigInteger", DataTypes.createDecimalType(38, 18),
+      true));
     fields.add(DataTypes.createStructField("boolean", DataTypes.BooleanType, true));
     fields.add(DataTypes.createStructField("double", DataTypes.DoubleType, true));
     fields.add(DataTypes.createStructField("integer", DataTypes.LongType, true));
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
index 01bc23277fa88..037e2048a8631 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
@@ -148,7 +148,7 @@ class InMemoryColumnarQuerySuite extends QueryTest {
     val dataTypes =
       Seq(StringType, BinaryType, NullType, BooleanType,
         ByteType, ShortType, IntegerType, LongType,
-        FloatType, DoubleType, DecimalType.Unlimited, DecimalType(6, 5),
+        FloatType, DoubleType, DecimalType.SYSTEM_DEFAULT, DecimalType(6, 5),
         DateType, TimestampType,
         ArrayType(IntegerType), MapType(StringType, LongType), struct)
     val fields = dataTypes.zipWithIndex.map { case (dataType, index) =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index 3d71deb13e884..845ce669f0b33 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -109,7 +109,7 @@ class PlannerSuite extends SparkFunSuite {
       FloatType ::
       DoubleType ::
       DecimalType(10, 5) ::
-      DecimalType.Unlimited ::
+      DecimalType.SYSTEM_DEFAULT ::
       DateType ::
       TimestampType ::
       StringType ::
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
index 4a53fadd7e099..54f82f89ed18a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala
@@ -54,7 +54,7 @@ class SparkSqlSerializer2DataTypeSuite extends SparkFunSuite {
   checkSupported(StringType, isSupported = true)
   checkSupported(BinaryType, isSupported = true)
   checkSupported(DecimalType(10, 5), isSupported = true)
-  checkSupported(DecimalType.Unlimited, isSupported = true)
+  checkSupported(DecimalType.SYSTEM_DEFAULT, isSupported = true)
 
   // If NullType is the only data type in the schema, we do not support it.
   checkSupported(NullType, isSupported = false)
@@ -86,7 +86,7 @@ abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll
     val supportedTypes =
       Seq(StringType, BinaryType, NullType, BooleanType,
         ByteType, ShortType, IntegerType, LongType,
-        FloatType, DoubleType, DecimalType.Unlimited, DecimalType(6, 5),
+        FloatType, DoubleType, DecimalType.SYSTEM_DEFAULT, DecimalType(6, 5),
         DateType, TimestampType)
 
     val fields = supportedTypes.zipWithIndex.map { case (dataType, index) =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 0f82f13088d39..42f2449afb0f9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -134,7 +134,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
       """.stripMargin.replaceAll("\n", " "))
 
 
-    conn.prepareStatement("create table test.flttypes (a DOUBLE, b REAL, c DECIMAL(40, 20))"
+    conn.prepareStatement("create table test.flttypes (a DOUBLE, b REAL, c DECIMAL(38, 18))"
         ).executeUpdate()
     conn.prepareStatement("insert into test.flttypes values ("
       + "1.0000000000000002220446049250313080847263336181640625, "
@@ -152,7 +152,7 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
       s"""
         |create table test.nulltypes (a INT, b BOOLEAN, c TINYINT, d BINARY(20), e VARCHAR(20),
         |f VARCHAR_IGNORECASE(20), g CHAR(20), h BLOB, i CLOB, j TIME, k DATE, l TIMESTAMP,
-        |m DOUBLE, n REAL, o DECIMAL(40, 20))
+        |m DOUBLE, n REAL, o DECIMAL(38, 18))
       """.stripMargin.replaceAll("\n", " ")).executeUpdate()
     conn.prepareStatement("insert into test.nulltypes values ("
       + "null, null, null, null, null, null, null, null, null, "
@@ -357,14 +357,14 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter {
 
   test("H2 floating-point types") {
     val rows = sql("SELECT * FROM flttypes").collect()
-    assert(rows(0).getDouble(0) === 1.00000000000000022) // Yes, I meant ==.
-    assert(rows(0).getDouble(1) === 1.00000011920928955) // Yes, I meant ==.
-    assert(rows(0).getAs[BigDecimal](2)
-      .equals(new BigDecimal("123456789012345.54321543215432100000")))
-    assert(rows(0).schema.fields(2).dataType === DecimalType(40, 20))
-    val compareDecimal = sql("SELECT C FROM flttypes where C > C - 1").collect()
-    assert(compareDecimal(0).getAs[BigDecimal](0)
-      .equals(new BigDecimal("123456789012345.54321543215432100000")))
+    assert(rows(0).getDouble(0) === 1.00000000000000022)
+    assert(rows(0).getDouble(1) === 1.00000011920928955)
+    assert(rows(0).getAs[BigDecimal](2) ===
+      new BigDecimal("123456789012345.543215432154321000"))
+    assert(rows(0).schema.fields(2).dataType === DecimalType(38, 18))
+    val result = sql("SELECT C FROM flttypes where C > C - 1").collect()
+    assert(result(0).getAs[BigDecimal](0) ===
+      new BigDecimal("123456789012345.543215432154321000"))
   }
 
   test("SQL query as table name") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 1d04513a44672..3ac312d6f4c50 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -63,18 +63,18 @@ class JsonSuite extends QueryTest with TestJsonData {
     checkTypePromotion(intNumber.toLong, enforceCorrectType(intNumber, LongType))
     checkTypePromotion(intNumber.toDouble, enforceCorrectType(intNumber, DoubleType))
     checkTypePromotion(
-      Decimal(intNumber), enforceCorrectType(intNumber, DecimalType.Unlimited))
+      Decimal(intNumber), enforceCorrectType(intNumber, DecimalType.SYSTEM_DEFAULT))
 
     val longNumber: Long = 9223372036854775807L
     checkTypePromotion(longNumber, enforceCorrectType(longNumber, LongType))
     checkTypePromotion(longNumber.toDouble, enforceCorrectType(longNumber, DoubleType))
     checkTypePromotion(
-      Decimal(longNumber), enforceCorrectType(longNumber, DecimalType.Unlimited))
+      Decimal(longNumber), enforceCorrectType(longNumber, DecimalType.SYSTEM_DEFAULT))
 
     val doubleNumber: Double = 1.7976931348623157E308d
     checkTypePromotion(doubleNumber.toDouble, enforceCorrectType(doubleNumber, DoubleType))
     checkTypePromotion(
-      Decimal(doubleNumber), enforceCorrectType(doubleNumber, DecimalType.Unlimited))
+      Decimal(doubleNumber), enforceCorrectType(doubleNumber, DecimalType.SYSTEM_DEFAULT))
 
     checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(intNumber)),
         enforceCorrectType(intNumber, TimestampType))
@@ -115,7 +115,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     checkDataType(NullType, IntegerType, IntegerType)
     checkDataType(NullType, LongType, LongType)
     checkDataType(NullType, DoubleType, DoubleType)
-    checkDataType(NullType, DecimalType.Unlimited, DecimalType.Unlimited)
+    checkDataType(NullType, DecimalType.SYSTEM_DEFAULT, DecimalType.SYSTEM_DEFAULT)
     checkDataType(NullType, StringType, StringType)
     checkDataType(NullType, ArrayType(IntegerType), ArrayType(IntegerType))
     checkDataType(NullType, StructType(Nil), StructType(Nil))
@@ -126,7 +126,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     checkDataType(BooleanType, IntegerType, StringType)
     checkDataType(BooleanType, LongType, StringType)
     checkDataType(BooleanType, DoubleType, StringType)
-    checkDataType(BooleanType, DecimalType.Unlimited, StringType)
+    checkDataType(BooleanType, DecimalType.SYSTEM_DEFAULT, StringType)
     checkDataType(BooleanType, StringType, StringType)
     checkDataType(BooleanType, ArrayType(IntegerType), StringType)
     checkDataType(BooleanType, StructType(Nil), StringType)
@@ -135,7 +135,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     checkDataType(IntegerType, IntegerType, IntegerType)
     checkDataType(IntegerType, LongType, LongType)
     checkDataType(IntegerType, DoubleType, DoubleType)
-    checkDataType(IntegerType, DecimalType.Unlimited, DecimalType.Unlimited)
+    checkDataType(IntegerType, DecimalType.SYSTEM_DEFAULT, DecimalType.SYSTEM_DEFAULT)
     checkDataType(IntegerType, StringType, StringType)
     checkDataType(IntegerType, ArrayType(IntegerType), StringType)
     checkDataType(IntegerType, StructType(Nil), StringType)
@@ -143,23 +143,24 @@ class JsonSuite extends QueryTest with TestJsonData {
     // LongType
     checkDataType(LongType, LongType, LongType)
     checkDataType(LongType, DoubleType, DoubleType)
-    checkDataType(LongType, DecimalType.Unlimited, DecimalType.Unlimited)
+    checkDataType(LongType, DecimalType.SYSTEM_DEFAULT, DecimalType.SYSTEM_DEFAULT)
     checkDataType(LongType, StringType, StringType)
     checkDataType(LongType, ArrayType(IntegerType), StringType)
     checkDataType(LongType, StructType(Nil), StringType)
 
     // DoubleType
     checkDataType(DoubleType, DoubleType, DoubleType)
-    checkDataType(DoubleType, DecimalType.Unlimited, DecimalType.Unlimited)
+    checkDataType(DoubleType, DecimalType.SYSTEM_DEFAULT, DecimalType.SYSTEM_DEFAULT)
     checkDataType(DoubleType, StringType, StringType)
     checkDataType(DoubleType, ArrayType(IntegerType), StringType)
     checkDataType(DoubleType, StructType(Nil), StringType)
 
-    // DoubleType
-    checkDataType(DecimalType.Unlimited, DecimalType.Unlimited, DecimalType.Unlimited)
-    checkDataType(DecimalType.Unlimited, StringType, StringType)
-    checkDataType(DecimalType.Unlimited, ArrayType(IntegerType), StringType)
-    checkDataType(DecimalType.Unlimited, StructType(Nil), StringType)
+    // DecimalType
+    checkDataType(DecimalType.SYSTEM_DEFAULT, DecimalType.SYSTEM_DEFAULT,
+      DecimalType.SYSTEM_DEFAULT)
+    checkDataType(DecimalType.SYSTEM_DEFAULT, StringType, StringType)
+    checkDataType(DecimalType.SYSTEM_DEFAULT, ArrayType(IntegerType), StringType)
+    checkDataType(DecimalType.SYSTEM_DEFAULT, StructType(Nil), StringType)
 
     // StringType
     checkDataType(StringType, StringType, StringType)
@@ -213,7 +214,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     checkDataType(
       StructType(
         StructField("f1", IntegerType, true) :: Nil),
-      DecimalType.Unlimited,
+      DecimalType.SYSTEM_DEFAULT,
       StringType)
   }
 
@@ -240,7 +241,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     val jsonDF = ctx.read.json(primitiveFieldAndType)
 
     val expectedSchema = StructType(
-      StructField("bigInteger", DecimalType.Unlimited, true) ::
+      StructField("bigInteger", DecimalType.SYSTEM_DEFAULT, true) ::
       StructField("boolean", BooleanType, true) ::
       StructField("double", DoubleType, true) ::
       StructField("integer", LongType, true) ::
@@ -270,7 +271,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     val expectedSchema = StructType(
       StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) ::
       StructField("arrayOfArray2", ArrayType(ArrayType(DoubleType, true), true), true) ::
-      StructField("arrayOfBigInteger", ArrayType(DecimalType.Unlimited, true), true) ::
+      StructField("arrayOfBigInteger", ArrayType(DecimalType.SYSTEM_DEFAULT, true), true) ::
       StructField("arrayOfBoolean", ArrayType(BooleanType, true), true) ::
       StructField("arrayOfDouble", ArrayType(DoubleType, true), true) ::
       StructField("arrayOfInteger", ArrayType(LongType, true), true) ::
@@ -284,7 +285,7 @@ class JsonSuite extends QueryTest with TestJsonData {
           StructField("field3", StringType, true) :: Nil), true), true) ::
       StructField("struct", StructType(
         StructField("field1", BooleanType, true) ::
-        StructField("field2", DecimalType.Unlimited, true) :: Nil), true) ::
+        StructField("field2", DecimalType.SYSTEM_DEFAULT, true) :: Nil), true) ::
       StructField("structWithArrayFields", StructType(
         StructField("field1", ArrayType(LongType, true), true) ::
         StructField("field2", ArrayType(StringType, true), true) :: Nil), true) :: Nil)
@@ -385,7 +386,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     val expectedSchema = StructType(
       StructField("num_bool", StringType, true) ::
       StructField("num_num_1", LongType, true) ::
-      StructField("num_num_2", DecimalType.Unlimited, true) ::
+      StructField("num_num_2", DecimalType.SYSTEM_DEFAULT, true) ::
       StructField("num_num_3", DoubleType, true) ::
       StructField("num_str", StringType, true) ::
       StructField("str_bool", StringType, true) :: Nil)
@@ -421,11 +422,11 @@ class JsonSuite extends QueryTest with TestJsonData {
       Row(-89) :: Row(21474836370L) :: Row(21474836470L) :: Nil
     )
 
-    // Widening to DecimalType
+    // Widening to DoubleType
     checkAnswer(
-      sql("select num_num_2 + 1.2 from jsonTable where num_num_2 > 1.1"),
-      Row(new java.math.BigDecimal("21474836472.1")) ::
-        Row(new java.math.BigDecimal("92233720368547758071.2")) :: Nil
+      sql("select num_num_2 + 1.3 from jsonTable where num_num_2 > 1.1"),
+      Row(21474836472.2) ::
+        Row(92233720368547758071.3) :: Nil
     )
 
     // Widening to DoubleType
@@ -442,8 +443,8 @@ class JsonSuite extends QueryTest with TestJsonData {
 
     // Number and String conflict: resolve the type as number in this query.
     checkAnswer(
-      sql("select num_str + 1.2 from jsonTable where num_str > 92233720368547758060"),
-      Row(new java.math.BigDecimal("92233720368547758061.2").doubleValue)
+      sql("select num_str + 1.2 from jsonTable where num_str >= 92233720368547758060"),
+      Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue)
     )
 
     // String and Boolean conflict: resolve the type as string.
@@ -489,9 +490,9 @@ class JsonSuite extends QueryTest with TestJsonData {
     // in the Project.
     checkAnswer(
       jsonDF.
-        where('num_str > BigDecimal("92233720368547758060")).
+        where('num_str >= BigDecimal("92233720368547758060")).
         select(('num_str + 1.2).as("num")),
-      Row(new java.math.BigDecimal("92233720368547758061.2"))
+      Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue())
     )
 
     // The following test will fail. The type of num_str is StringType.
@@ -610,7 +611,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     val jsonDF = ctx.read.json(path)
 
     val expectedSchema = StructType(
-      StructField("bigInteger", DecimalType.Unlimited, true) ::
+      StructField("bigInteger", DecimalType.SYSTEM_DEFAULT, true) ::
       StructField("boolean", BooleanType, true) ::
       StructField("double", DoubleType, true) ::
       StructField("integer", LongType, true) ::
@@ -668,7 +669,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).saveAsTextFile(path)
 
     val schema = StructType(
-      StructField("bigInteger", DecimalType.Unlimited, true) ::
+      StructField("bigInteger", DecimalType.SYSTEM_DEFAULT, true) ::
       StructField("boolean", BooleanType, true) ::
       StructField("double", DoubleType, true) ::
       StructField("integer", IntegerType, true) ::
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
index 7b16eba00d6fb..3a5b860484e86 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
@@ -122,14 +122,6 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
         sqlContext.read.parquet(dir.getCanonicalPath).collect()
       }
     }
-
-    // Unlimited-length decimals are not yet supported
-    intercept[Throwable] {
-      withTempPath { dir =>
-        makeDecimalRDD(DecimalType.Unlimited).write.parquet(dir.getCanonicalPath)
-        sqlContext.read.parquet(dir.getCanonicalPath).collect()
-      }
-    }
   }
 
   test("date type") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 4f98776b91160..7f16b1125c7a5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -509,7 +509,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
         FloatType,
         DoubleType,
         DecimalType(10, 5),
-        DecimalType.Unlimited,
+        DecimalType.SYSTEM_DEFAULT,
         DateType,
         TimestampType,
         StringType)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
index 54e1efb6e36e7..da53ec16b5c41 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
@@ -44,7 +44,7 @@ case class SimpleDDLScan(from: Int, to: Int, table: String)(@transient val sqlCo
       StructField("doubleType", DoubleType, nullable = false),
       StructField("bigintType", LongType, nullable = false),
       StructField("tinyintType", ByteType, nullable = false),
-      StructField("decimalType", DecimalType.Unlimited, nullable = false),
+      StructField("decimalType", DecimalType.USER_DEFAULT, nullable = false),
       StructField("fixedDecimalType", DecimalType(5, 1), nullable = false),
       StructField("binaryType", BinaryType, nullable = false),
       StructField("booleanType", BooleanType, nullable = false),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
index 2c916f3322b6d..143aadc08b1c4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
@@ -202,7 +202,7 @@ class TableScanSuite extends DataSourceTest {
       StructField("longField_:,<>=+/~^", LongType, true) ::
       StructField("floatField", FloatType, true) ::
       StructField("doubleField", DoubleType, true) ::
-      StructField("decimalField1", DecimalType.Unlimited, true) ::
+      StructField("decimalField1", DecimalType.USER_DEFAULT, true) ::
       StructField("decimalField2", DecimalType(9, 2), true) ::
       StructField("dateField", DateType, true) ::
       StructField("timestampField", TimestampType, true) ::
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index a8f2ee37cb8ed..592cfa0ee8380 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -179,7 +179,7 @@ private[hive] trait HiveInspectors {
     // writable
     case c: Class[_] if c == classOf[hadoopIo.DoubleWritable] => DoubleType
     case c: Class[_] if c == classOf[hiveIo.DoubleWritable] => DoubleType
-    case c: Class[_] if c == classOf[hiveIo.HiveDecimalWritable] => DecimalType.Unlimited
+    case c: Class[_] if c == classOf[hiveIo.HiveDecimalWritable] => DecimalType.SYSTEM_DEFAULT
     case c: Class[_] if c == classOf[hiveIo.ByteWritable] => ByteType
     case c: Class[_] if c == classOf[hiveIo.ShortWritable] => ShortType
     case c: Class[_] if c == classOf[hiveIo.DateWritable] => DateType
@@ -195,8 +195,8 @@ private[hive] trait HiveInspectors {
     case c: Class[_] if c == classOf[java.lang.String] => StringType
     case c: Class[_] if c == classOf[java.sql.Date] => DateType
     case c: Class[_] if c == classOf[java.sql.Timestamp] => TimestampType
-    case c: Class[_] if c == classOf[HiveDecimal] => DecimalType.Unlimited
-    case c: Class[_] if c == classOf[java.math.BigDecimal] => DecimalType.Unlimited
+    case c: Class[_] if c == classOf[HiveDecimal] => DecimalType.SYSTEM_DEFAULT
+    case c: Class[_] if c == classOf[java.math.BigDecimal] => DecimalType.SYSTEM_DEFAULT
     case c: Class[_] if c == classOf[Array[Byte]] => BinaryType
     case c: Class[_] if c == classOf[java.lang.Short] => ShortType
     case c: Class[_] if c == classOf[java.lang.Integer] => IntegerType
@@ -813,9 +813,6 @@ private[hive] trait HiveInspectors {
 
     private def decimalTypeInfo(decimalType: DecimalType): TypeInfo = decimalType match {
       case DecimalType.Fixed(precision, scale) => new DecimalTypeInfo(precision, scale)
-      case _ => new DecimalTypeInfo(
-        HiveShim.UNLIMITED_DECIMAL_PRECISION,
-        HiveShim.UNLIMITED_DECIMAL_SCALE)
     }
 
     def toTypeInfo: TypeInfo = dt match {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 8518e333e8058..620b8a44d8a9b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -377,7 +377,7 @@ private[hive] object HiveQl extends Logging {
       DecimalType(precision.getText.toInt, scale.getText.toInt)
     case Token("TOK_DECIMAL", precision :: Nil) =>
       DecimalType(precision.getText.toInt, 0)
-    case Token("TOK_DECIMAL", Nil) => DecimalType.Unlimited
+    case Token("TOK_DECIMAL", Nil) => DecimalType.USER_DEFAULT
     case Token("TOK_BIGINT", Nil) => LongType
     case Token("TOK_INT", Nil) => IntegerType
     case Token("TOK_TINYINT", Nil) => ByteType
@@ -1369,7 +1369,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
     case Token("TOK_FUNCTION", Token("TOK_DECIMAL", precision :: Nil) :: arg :: Nil) =>
       Cast(nodeToExpr(arg), DecimalType(precision.getText.toInt, 0))
     case Token("TOK_FUNCTION", Token("TOK_DECIMAL", Nil) :: arg :: Nil) =>
-      Cast(nodeToExpr(arg), DecimalType.Unlimited)
+      Cast(nodeToExpr(arg), DecimalType.USER_DEFAULT)
     case Token("TOK_FUNCTION", Token("TOK_TIMESTAMP", Nil) :: arg :: Nil) =>
       Cast(nodeToExpr(arg), TimestampType)
     case Token("TOK_FUNCTION", Token("TOK_DATE", Nil) :: arg :: Nil) =>

From 52de3acca4ce8c36fd4c9ce162473a091701bbc7 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 23 Jul 2015 18:53:07 -0700
Subject: [PATCH 346/549] [SPARK-9122] [MLLIB] [PySpark] spark.mllib regression
 support batch predict

spark.mllib support batch predict for LinearRegressionModel, RidgeRegressionModel and LassoModel.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7614 from yanboliang/spark-9122 and squashes the following commits:

4e610c0 [Yanbo Liang] spark.mllib regression support batch predict
---
 python/pyspark/mllib/regression.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 8e90adee5f4c2..5b7afc15ddfba 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -97,9 +97,11 @@ class LinearRegressionModelBase(LinearModel):
 
     def predict(self, x):
         """
-        Predict the value of the dependent variable given a vector x
-        containing values for the independent variables.
+        Predict the value of the dependent variable given a vector or
+        an RDD of vectors containing values for the independent variables.
         """
+        if isinstance(x, RDD):
+            return x.map(self.predict)
         x = _convert_to_vector(x)
         return self.weights.dot(x) + self.intercept
 
@@ -124,6 +126,8 @@ class LinearRegressionModel(LinearRegressionModelBase):
     True
     >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
     True
+    >>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5
+    True
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> lrm.save(sc, path)
@@ -267,6 +271,8 @@ class LassoModel(LinearRegressionModelBase):
     True
     >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
     True
+    >>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5
+    True
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> lrm.save(sc, path)
@@ -382,6 +388,8 @@ class RidgeRegressionModel(LinearRegressionModelBase):
     True
     >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
     True
+    >>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5
+    True
     >>> import os, tempfile
     >>> path = tempfile.mkdtemp()
     >>> lrm.save(sc, path)

From d249636e59fabd8ca57a47dc2cbad9c4a4e7a750 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 23 Jul 2015 20:06:54 -0700
Subject: [PATCH 347/549] [SPARK-9216] [STREAMING] Define
 KinesisBackedBlockRDDs

For more information see master JIRA: https://issues.apache.org/jira/browse/SPARK-9215
Design Doc: https://docs.google.com/document/d/1k0dl270EnK7uExrsCE7jYw7PYx0YC935uBcxn3p0f58/edit

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #7578 from tdas/kinesis-rdd and squashes the following commits:

543d208 [Tathagata Das] Fixed scala style
5082a30 [Tathagata Das] Fixed scala style
3f40c2d [Tathagata Das] Addressed comments
c4f25d2 [Tathagata Das] Addressed comment
d3d64d1 [Tathagata Das] Minor update
f6e35c8 [Tathagata Das] Added retry logic to make it more robust
8874b70 [Tathagata Das] Updated Kinesis RDD
575bdbc [Tathagata Das] Fix scala style issues
4a36096 [Tathagata Das] Add license
5da3995 [Tathagata Das] Changed KinesisSuiteHelper to KinesisFunSuite
528e206 [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into kinesis-rdd
3ae0814 [Tathagata Das] Added KinesisBackedBlockRDD
---
 .../kinesis/KinesisBackedBlockRDD.scala       | 285 ++++++++++++++++++
 .../streaming/kinesis/KinesisTestUtils.scala  |   2 +-
 .../kinesis/KinesisBackedBlockRDDSuite.scala  | 246 +++++++++++++++
 .../streaming/kinesis/KinesisFunSuite.scala   |  13 +-
 .../kinesis/KinesisStreamSuite.scala          |   4 +-
 5 files changed, 545 insertions(+), 5 deletions(-)
 create mode 100644 extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala
 create mode 100644 extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala

diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala
new file mode 100644
index 0000000000000..8f144a4d974a8
--- /dev/null
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.kinesis
+
+import scala.collection.JavaConversions._
+import scala.util.control.NonFatal
+
+import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain}
+import com.amazonaws.services.kinesis.AmazonKinesisClient
+import com.amazonaws.services.kinesis.model._
+
+import org.apache.spark._
+import org.apache.spark.rdd.{BlockRDD, BlockRDDPartition}
+import org.apache.spark.storage.BlockId
+import org.apache.spark.util.NextIterator
+
+
+/** Class representing a range of Kinesis sequence numbers. Both sequence numbers are inclusive. */
+private[kinesis]
+case class SequenceNumberRange(
+    streamName: String, shardId: String, fromSeqNumber: String, toSeqNumber: String)
+
+/** Class representing an array of Kinesis sequence number ranges */
+private[kinesis]
+case class SequenceNumberRanges(ranges: Array[SequenceNumberRange]) {
+  def isEmpty(): Boolean = ranges.isEmpty
+  def nonEmpty(): Boolean = ranges.nonEmpty
+  override def toString(): String = ranges.mkString("SequenceNumberRanges(", ", ", ")")
+}
+
+private[kinesis]
+object SequenceNumberRanges {
+  def apply(range: SequenceNumberRange): SequenceNumberRanges = {
+    new SequenceNumberRanges(Array(range))
+  }
+}
+
+
+/** Partition storing the information of the ranges of Kinesis sequence numbers to read */
+private[kinesis]
+class KinesisBackedBlockRDDPartition(
+    idx: Int,
+    blockId: BlockId,
+    val isBlockIdValid: Boolean,
+    val seqNumberRanges: SequenceNumberRanges
+  ) extends BlockRDDPartition(blockId, idx)
+
+/**
+ * A BlockRDD where the block data is backed by Kinesis, which can accessed using the
+ * sequence numbers of the corresponding blocks.
+ */
+private[kinesis]
+class KinesisBackedBlockRDD(
+    sc: SparkContext,
+    regionId: String,
+    endpointUrl: String,
+    @transient blockIds: Array[BlockId],
+    @transient arrayOfseqNumberRanges: Array[SequenceNumberRanges],
+    @transient isBlockIdValid: Array[Boolean] = Array.empty,
+    retryTimeoutMs: Int = 10000,
+    awsCredentialsOption: Option[SerializableAWSCredentials] = None
+  ) extends BlockRDD[Array[Byte]](sc, blockIds) {
+
+  require(blockIds.length == arrayOfseqNumberRanges.length,
+    "Number of blockIds is not equal to the number of sequence number ranges")
+
+  override def isValid(): Boolean = true
+
+  override def getPartitions: Array[Partition] = {
+    Array.tabulate(blockIds.length) { i =>
+      val isValid = if (isBlockIdValid.length == 0) true else isBlockIdValid(i)
+      new KinesisBackedBlockRDDPartition(i, blockIds(i), isValid, arrayOfseqNumberRanges(i))
+    }
+  }
+
+  override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
+    val blockManager = SparkEnv.get.blockManager
+    val partition = split.asInstanceOf[KinesisBackedBlockRDDPartition]
+    val blockId = partition.blockId
+
+    def getBlockFromBlockManager(): Option[Iterator[Array[Byte]]] = {
+      logDebug(s"Read partition data of $this from block manager, block $blockId")
+      blockManager.get(blockId).map(_.data.asInstanceOf[Iterator[Array[Byte]]])
+    }
+
+    def getBlockFromKinesis(): Iterator[Array[Byte]] = {
+      val credenentials = awsCredentialsOption.getOrElse {
+        new DefaultAWSCredentialsProviderChain().getCredentials()
+      }
+      partition.seqNumberRanges.ranges.iterator.flatMap { range =>
+        new KinesisSequenceRangeIterator(
+          credenentials, endpointUrl, regionId, range, retryTimeoutMs)
+      }
+    }
+    if (partition.isBlockIdValid) {
+      getBlockFromBlockManager().getOrElse { getBlockFromKinesis() }
+    } else {
+      getBlockFromKinesis()
+    }
+  }
+}
+
+
+/**
+ * An iterator that return the Kinesis data based on the given range of sequence numbers.
+ * Internally, it repeatedly fetches sets of records starting from the fromSequenceNumber,
+ * until the endSequenceNumber is reached.
+ */
+private[kinesis]
+class KinesisSequenceRangeIterator(
+    credentials: AWSCredentials,
+    endpointUrl: String,
+    regionId: String,
+    range: SequenceNumberRange,
+    retryTimeoutMs: Int
+  ) extends NextIterator[Array[Byte]] with Logging {
+
+  private val client = new AmazonKinesisClient(credentials)
+  private val streamName = range.streamName
+  private val shardId = range.shardId
+
+  private var toSeqNumberReceived = false
+  private var lastSeqNumber: String = null
+  private var internalIterator: Iterator[Record] = null
+
+  client.setEndpoint(endpointUrl, "kinesis", regionId)
+
+  override protected def getNext(): Array[Byte] = {
+    var nextBytes: Array[Byte] = null
+    if (toSeqNumberReceived) {
+      finished = true
+    } else {
+
+      if (internalIterator == null) {
+
+        // If the internal iterator has not been initialized,
+        // then fetch records from starting sequence number
+        internalIterator = getRecords(ShardIteratorType.AT_SEQUENCE_NUMBER, range.fromSeqNumber)
+      } else if (!internalIterator.hasNext) {
+
+        // If the internal iterator does not have any more records,
+        // then fetch more records after the last consumed sequence number
+        internalIterator = getRecords(ShardIteratorType.AFTER_SEQUENCE_NUMBER, lastSeqNumber)
+      }
+
+      if (!internalIterator.hasNext) {
+
+        // If the internal iterator still does not have any data, then throw exception
+        // and terminate this iterator
+        finished = true
+        throw new SparkException(
+          s"Could not read until the end sequence number of the range: $range")
+      } else {
+
+        // Get the record, copy the data into a byte array and remember its sequence number
+        val nextRecord: Record = internalIterator.next()
+        val byteBuffer = nextRecord.getData()
+        nextBytes = new Array[Byte](byteBuffer.remaining())
+        byteBuffer.get(nextBytes)
+        lastSeqNumber = nextRecord.getSequenceNumber()
+
+        // If the this record's sequence number matches the stopping sequence number, then make sure
+        // the iterator is marked finished next time getNext() is called
+        if (nextRecord.getSequenceNumber == range.toSeqNumber) {
+          toSeqNumberReceived = true
+        }
+      }
+
+    }
+    nextBytes
+  }
+
+  override protected def close(): Unit = {
+    client.shutdown()
+  }
+
+  /**
+   * Get records starting from or after the given sequence number.
+   */
+  private def getRecords(iteratorType: ShardIteratorType, seqNum: String): Iterator[Record] = {
+    val shardIterator = getKinesisIterator(iteratorType, seqNum)
+    val result = getRecordsAndNextKinesisIterator(shardIterator)
+    result._1
+  }
+
+  /**
+   * Get the records starting from using a Kinesis shard iterator (which is a progress handle
+   * to get records from Kinesis), and get the next shard iterator for next consumption.
+   */
+  private def getRecordsAndNextKinesisIterator(
+      shardIterator: String): (Iterator[Record], String) = {
+    val getRecordsRequest = new GetRecordsRequest
+    getRecordsRequest.setRequestCredentials(credentials)
+    getRecordsRequest.setShardIterator(shardIterator)
+    val getRecordsResult = retryOrTimeout[GetRecordsResult](
+      s"getting records using shard iterator") {
+        client.getRecords(getRecordsRequest)
+      }
+    (getRecordsResult.getRecords.iterator(), getRecordsResult.getNextShardIterator)
+  }
+
+  /**
+   * Get the Kinesis shard iterator for getting records starting from or after the given
+   * sequence number.
+   */
+  private def getKinesisIterator(
+      iteratorType: ShardIteratorType,
+      sequenceNumber: String): String = {
+    val getShardIteratorRequest = new GetShardIteratorRequest
+    getShardIteratorRequest.setRequestCredentials(credentials)
+    getShardIteratorRequest.setStreamName(streamName)
+    getShardIteratorRequest.setShardId(shardId)
+    getShardIteratorRequest.setShardIteratorType(iteratorType.toString)
+    getShardIteratorRequest.setStartingSequenceNumber(sequenceNumber)
+    val getShardIteratorResult = retryOrTimeout[GetShardIteratorResult](
+        s"getting shard iterator from sequence number $sequenceNumber") {
+          client.getShardIterator(getShardIteratorRequest)
+        }
+    getShardIteratorResult.getShardIterator
+  }
+
+  /** Helper method to retry Kinesis API request with exponential backoff and timeouts */
+  private def retryOrTimeout[T](message: String)(body: => T): T = {
+    import KinesisSequenceRangeIterator._
+
+    var startTimeMs = System.currentTimeMillis()
+    var retryCount = 0
+    var waitTimeMs = MIN_RETRY_WAIT_TIME_MS
+    var result: Option[T] = None
+    var lastError: Throwable = null
+
+    def isTimedOut = (System.currentTimeMillis() - startTimeMs) >= retryTimeoutMs
+    def isMaxRetryDone = retryCount >= MAX_RETRIES
+
+    while (result.isEmpty && !isTimedOut && !isMaxRetryDone) {
+      if (retryCount > 0) {  // wait only if this is a retry
+        Thread.sleep(waitTimeMs)
+        waitTimeMs *= 2  // if you have waited, then double wait time for next round
+      }
+      try {
+        result = Some(body)
+      } catch {
+        case NonFatal(t) =>
+          lastError = t
+           t match {
+             case ptee: ProvisionedThroughputExceededException =>
+               logWarning(s"Error while $message [attempt = ${retryCount + 1}]", ptee)
+             case e: Throwable =>
+               throw new SparkException(s"Error while $message", e)
+           }
+      }
+      retryCount += 1
+    }
+    result.getOrElse {
+      if (isTimedOut) {
+        throw new SparkException(
+          s"Timed out after $retryTimeoutMs ms while $message, last exception: ", lastError)
+      } else {
+        throw new SparkException(
+          s"Gave up after $retryCount retries while $message, last exception: ", lastError)
+      }
+    }
+  }
+}
+
+private[streaming]
+object KinesisSequenceRangeIterator {
+  val MAX_RETRIES = 3
+  val MIN_RETRY_WAIT_TIME_MS = 100
+}
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
index f6bf552e6bb8e..0ff1b7ed0fd90 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
@@ -177,7 +177,7 @@ private class KinesisTestUtils(
 
 private[kinesis] object KinesisTestUtils {
 
-  val envVarName = "RUN_KINESIS_TESTS"
+  val envVarName = "ENABLE_KINESIS_TESTS"
 
   val shouldRunTests = sys.env.get(envVarName) == Some("1")
 
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
new file mode 100644
index 0000000000000..b2e2a4246dbd5
--- /dev/null
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
@@ -0,0 +1,246 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.kinesis
+
+import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
+
+import org.apache.spark.storage.{BlockId, BlockManager, StorageLevel, StreamBlockId}
+import org.apache.spark.{SparkConf, SparkContext, SparkException, SparkFunSuite}
+
+class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll {
+
+  private val regionId = "us-east-1"
+  private val endpointUrl = "https://kinesis.us-east-1.amazonaws.com"
+  private val testData = 1 to 8
+
+  private var testUtils: KinesisTestUtils = null
+  private var shardIds: Seq[String] = null
+  private var shardIdToData: Map[String, Seq[Int]] = null
+  private var shardIdToSeqNumbers: Map[String, Seq[String]] = null
+  private var shardIdToDataAndSeqNumbers: Map[String, Seq[(Int, String)]] = null
+  private var shardIdToRange: Map[String, SequenceNumberRange] = null
+  private var allRanges: Seq[SequenceNumberRange] = null
+
+  private var sc: SparkContext = null
+  private var blockManager: BlockManager = null
+
+
+  override def beforeAll(): Unit = {
+    runIfTestsEnabled("Prepare KinesisTestUtils") {
+      testUtils = new KinesisTestUtils(endpointUrl)
+      testUtils.createStream()
+
+      shardIdToDataAndSeqNumbers = testUtils.pushData(testData)
+      require(shardIdToDataAndSeqNumbers.size > 1, "Need data to be sent to multiple shards")
+
+      shardIds = shardIdToDataAndSeqNumbers.keySet.toSeq
+      shardIdToData = shardIdToDataAndSeqNumbers.mapValues { _.map { _._1 }}
+      shardIdToSeqNumbers = shardIdToDataAndSeqNumbers.mapValues { _.map { _._2 }}
+      shardIdToRange = shardIdToSeqNumbers.map { case (shardId, seqNumbers) =>
+        val seqNumRange = SequenceNumberRange(
+          testUtils.streamName, shardId, seqNumbers.head, seqNumbers.last)
+        (shardId, seqNumRange)
+      }
+      allRanges = shardIdToRange.values.toSeq
+
+      val conf = new SparkConf().setMaster("local[4]").setAppName("KinesisBackedBlockRDDSuite")
+      sc = new SparkContext(conf)
+      blockManager = sc.env.blockManager
+    }
+  }
+
+  override def afterAll(): Unit = {
+    if (sc != null) {
+      sc.stop()
+    }
+  }
+
+  testIfEnabled("Basic reading from Kinesis") {
+    // Verify all data using multiple ranges in a single RDD partition
+    val receivedData1 = new KinesisBackedBlockRDD(sc, regionId, endpointUrl,
+      fakeBlockIds(1),
+      Array(SequenceNumberRanges(allRanges.toArray))
+    ).map { bytes => new String(bytes).toInt }.collect()
+    assert(receivedData1.toSet === testData.toSet)
+
+    // Verify all data using one range in each of the multiple RDD partitions
+    val receivedData2 = new KinesisBackedBlockRDD(sc, regionId, endpointUrl,
+      fakeBlockIds(allRanges.size),
+      allRanges.map { range => SequenceNumberRanges(Array(range)) }.toArray
+    ).map { bytes => new String(bytes).toInt }.collect()
+    assert(receivedData2.toSet === testData.toSet)
+
+    // Verify ordering within each partition
+    val receivedData3 = new KinesisBackedBlockRDD(sc, regionId, endpointUrl,
+      fakeBlockIds(allRanges.size),
+      allRanges.map { range => SequenceNumberRanges(Array(range)) }.toArray
+    ).map { bytes => new String(bytes).toInt }.collectPartitions()
+    assert(receivedData3.length === allRanges.size)
+    for (i <- 0 until allRanges.size) {
+      assert(receivedData3(i).toSeq === shardIdToData(allRanges(i).shardId))
+    }
+  }
+
+  testIfEnabled("Read data available in both block manager and Kinesis") {
+    testRDD(numPartitions = 2, numPartitionsInBM = 2, numPartitionsInKinesis = 2)
+  }
+
+  testIfEnabled("Read data available only in block manager, not in Kinesis") {
+    testRDD(numPartitions = 2, numPartitionsInBM = 2, numPartitionsInKinesis = 0)
+  }
+
+  testIfEnabled("Read data available only in Kinesis, not in block manager") {
+    testRDD(numPartitions = 2, numPartitionsInBM = 0, numPartitionsInKinesis = 2)
+  }
+
+  testIfEnabled("Read data available partially in block manager, rest in Kinesis") {
+    testRDD(numPartitions = 2, numPartitionsInBM = 1, numPartitionsInKinesis = 1)
+  }
+
+  testIfEnabled("Test isBlockValid skips block fetching from block manager") {
+    testRDD(numPartitions = 2, numPartitionsInBM = 2, numPartitionsInKinesis = 0,
+      testIsBlockValid = true)
+  }
+
+  testIfEnabled("Test whether RDD is valid after removing blocks from block anager") {
+    testRDD(numPartitions = 2, numPartitionsInBM = 2, numPartitionsInKinesis = 2,
+      testBlockRemove = true)
+  }
+
+  /**
+   * Test the WriteAheadLogBackedRDD, by writing some partitions of the data to block manager
+   * and the rest to a write ahead log, and then reading reading it all back using the RDD.
+   * It can also test if the partitions that were read from the log were again stored in
+   * block manager.
+   *
+   *
+   *
+   * @param numPartitions Number of partitions in RDD
+   * @param numPartitionsInBM Number of partitions to write to the BlockManager.
+   *                          Partitions 0 to (numPartitionsInBM-1) will be written to BlockManager
+   * @param numPartitionsInKinesis Number of partitions to write to the Kinesis.
+   *                           Partitions (numPartitions - 1 - numPartitionsInKinesis) to
+   *                           (numPartitions - 1) will be written to Kinesis
+   * @param testIsBlockValid Test whether setting isBlockValid to false skips block fetching
+   * @param testBlockRemove Test whether calling rdd.removeBlock() makes the RDD still usable with
+   *                        reads falling back to the WAL
+   * Example with numPartitions = 5, numPartitionsInBM = 3, and numPartitionsInWAL = 4
+   *
+   *   numPartitionsInBM = 3
+   *   |------------------|
+   *   |                  |
+   *    0       1       2       3       4
+   *           |                         |
+   *           |-------------------------|
+   *              numPartitionsInKinesis = 4
+   */
+  private def testRDD(
+      numPartitions: Int,
+      numPartitionsInBM: Int,
+      numPartitionsInKinesis: Int,
+      testIsBlockValid: Boolean = false,
+      testBlockRemove: Boolean = false
+    ): Unit = {
+    require(shardIds.size > 1, "Need at least 2 shards to test")
+    require(numPartitionsInBM <= shardIds.size ,
+      "Number of partitions in BlockManager cannot be more than the Kinesis test shards available")
+    require(numPartitionsInKinesis <= shardIds.size ,
+      "Number of partitions in Kinesis cannot be more than the Kinesis test shards available")
+    require(numPartitionsInBM <= numPartitions,
+      "Number of partitions in BlockManager cannot be more than that in RDD")
+    require(numPartitionsInKinesis <= numPartitions,
+      "Number of partitions in Kinesis cannot be more than that in RDD")
+
+    // Put necessary blocks in the block manager
+    val blockIds = fakeBlockIds(numPartitions)
+    blockIds.foreach(blockManager.removeBlock(_))
+    (0 until numPartitionsInBM).foreach { i =>
+      val blockData = shardIdToData(shardIds(i)).iterator.map { _.toString.getBytes() }
+      blockManager.putIterator(blockIds(i), blockData, StorageLevel.MEMORY_ONLY)
+    }
+
+    // Create the necessary ranges to use in the RDD
+    val fakeRanges = Array.fill(numPartitions - numPartitionsInKinesis)(
+      SequenceNumberRanges(SequenceNumberRange("fakeStream", "fakeShardId", "xxx", "yyy")))
+    val realRanges = Array.tabulate(numPartitionsInKinesis) { i =>
+      val range = shardIdToRange(shardIds(i + (numPartitions - numPartitionsInKinesis)))
+      SequenceNumberRanges(Array(range))
+    }
+    val ranges = (fakeRanges ++ realRanges)
+
+
+    // Make sure that the left `numPartitionsInBM` blocks are in block manager, and others are not
+    require(
+      blockIds.take(numPartitionsInBM).forall(blockManager.get(_).nonEmpty),
+      "Expected blocks not in BlockManager"
+    )
+
+    require(
+      blockIds.drop(numPartitionsInBM).forall(blockManager.get(_).isEmpty),
+      "Unexpected blocks in BlockManager"
+    )
+
+    // Make sure that the right sequence `numPartitionsInKinesis` are configured, and others are not
+    require(
+      ranges.takeRight(numPartitionsInKinesis).forall {
+        _.ranges.forall { _.streamName == testUtils.streamName }
+      }, "Incorrect configuration of RDD, expected ranges not set: "
+    )
+
+    require(
+      ranges.dropRight(numPartitionsInKinesis).forall {
+        _.ranges.forall { _.streamName != testUtils.streamName }
+      }, "Incorrect configuration of RDD, unexpected ranges set"
+    )
+
+    val rdd = new KinesisBackedBlockRDD(sc, regionId, endpointUrl, blockIds, ranges)
+    val collectedData = rdd.map { bytes =>
+      new String(bytes).toInt
+    }.collect()
+    assert(collectedData.toSet === testData.toSet)
+
+    // Verify that the block fetching is skipped when isBlockValid is set to false.
+    // This is done by using a RDD whose data is only in memory but is set to skip block fetching
+    // Using that RDD will throw exception, as it skips block fetching even if the blocks are in
+    // in BlockManager.
+    if (testIsBlockValid) {
+      require(numPartitionsInBM === numPartitions, "All partitions must be in BlockManager")
+      require(numPartitionsInKinesis === 0, "No partitions must be in Kinesis")
+      val rdd2 = new KinesisBackedBlockRDD(sc, regionId, endpointUrl, blockIds.toArray,
+        ranges, isBlockIdValid = Array.fill(blockIds.length)(false))
+      intercept[SparkException] {
+        rdd2.collect()
+      }
+    }
+
+    // Verify that the RDD is not invalid after the blocks are removed and can still read data
+    // from write ahead log
+    if (testBlockRemove) {
+      require(numPartitions === numPartitionsInKinesis,
+        "All partitions must be in WAL for this test")
+      require(numPartitionsInBM > 0, "Some partitions must be in BlockManager for this test")
+      rdd.removeBlocks()
+      assert(rdd.map { bytes => new String(bytes).toInt }.collect().toSet === testData.toSet)
+    }
+  }
+
+  /** Generate fake block ids */
+  private def fakeBlockIds(num: Int): Array[BlockId] = {
+    Array.tabulate(num) { i => new StreamBlockId(0, i) }
+  }
+}
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala
index 6d011f295e7f7..8373138785a89 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala
@@ -23,15 +23,24 @@ import org.apache.spark.SparkFunSuite
  * Helper class that runs Kinesis real data transfer tests or
  * ignores them based on env variable is set or not.
  */
-trait KinesisSuiteHelper { self: SparkFunSuite =>
+trait KinesisFunSuite extends SparkFunSuite  {
   import KinesisTestUtils._
 
   /** Run the test if environment variable is set or ignore the test */
-  def testOrIgnore(testName: String)(testBody: => Unit) {
+  def testIfEnabled(testName: String)(testBody: => Unit) {
     if (shouldRunTests) {
       test(testName)(testBody)
     } else {
       ignore(s"$testName [enable by setting env var $envVarName=1]")(testBody)
     }
   }
+
+  /** Run the give body of code only if Kinesis tests are enabled */
+  def runIfTestsEnabled(message: String)(body: => Unit): Unit = {
+    if (shouldRunTests) {
+      body
+    } else {
+      ignore(s"$message [enable by setting env var $envVarName=1]")()
+    }
+  }
 }
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
index 50f71413abf37..f9c952b9468bb 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
@@ -30,7 +30,7 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming._
 import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 
-class KinesisStreamSuite extends SparkFunSuite with KinesisSuiteHelper
+class KinesisStreamSuite extends KinesisFunSuite
   with Eventually with BeforeAndAfter with BeforeAndAfterAll {
 
   // This is the name that KCL uses to save metadata to DynamoDB
@@ -83,7 +83,7 @@ class KinesisStreamSuite extends SparkFunSuite with KinesisSuiteHelper
    * you must have AWS credentials available through the default AWS provider chain,
    * and you have to set the system environment variable RUN_KINESIS_TESTS=1 .
    */
-  testOrIgnore("basic operation") {
+  testIfEnabled("basic operation") {
     val kinesisTestUtils = new KinesisTestUtils()
     try {
       kinesisTestUtils.createStream()

From d4d762f275749a923356cd84de549b14c22cc3eb Mon Sep 17 00:00:00 2001
From: Ram Sriharsha <rsriharsha@hw11853.local>
Date: Thu, 23 Jul 2015 22:35:41 -0700
Subject: [PATCH 348/549] [SPARK-8092] [ML] Allow OneVsRest Classifier feature
 and label column names to be configurable.

The base classifier input and output columns are ignored in favor of  the ones specified in OneVsRest.

Author: Ram Sriharsha <rsriharsha@hw11853.local>

Closes #6631 from harsha2010/SPARK-8092 and squashes the following commits:

6591dc6 [Ram Sriharsha] add documentation for params
b7024b1 [Ram Sriharsha] cleanup
f0e2bfb [Ram Sriharsha] merge with master
108d3d7 [Ram Sriharsha] merge with master
4f74126 [Ram Sriharsha] Allow label/ features columns to be configurable
---
 .../spark/ml/classification/OneVsRest.scala   | 17 ++++++++++++-
 .../ml/classification/OneVsRestSuite.scala    | 24 +++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index ea757c5e40c76..1741f19dc911c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -47,6 +47,8 @@ private[ml] trait OneVsRestParams extends PredictorParams {
 
   /**
    * param for the base binary classifier that we reduce multiclass classification into.
+   * The base classifier input and output columns are ignored in favor of
+   * the ones specified in [[OneVsRest]].
    * @group param
    */
   val classifier: Param[ClassifierType] = new Param(this, "classifier", "base binary classifier")
@@ -160,6 +162,15 @@ final class OneVsRest(override val uid: String)
     set(classifier, value.asInstanceOf[ClassifierType])
   }
 
+  /** @group setParam */
+  def setLabelCol(value: String): this.type = set(labelCol, value)
+
+  /** @group setParam */
+  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
+
+  /** @group setParam */
+  def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
   override def transformSchema(schema: StructType): StructType = {
     validateAndTransformSchema(schema, fitting = true, getClassifier.featuresDataType)
   }
@@ -195,7 +206,11 @@ final class OneVsRest(override val uid: String)
       val labelUDFWithNewMeta = labelUDF(col($(labelCol))).as(labelColName, newLabelMeta)
       val trainingDataset = multiclassLabeled.withColumn(labelColName, labelUDFWithNewMeta)
       val classifier = getClassifier
-      classifier.fit(trainingDataset, classifier.labelCol -> labelColName)
+      val paramMap = new ParamMap()
+      paramMap.put(classifier.labelCol -> labelColName)
+      paramMap.put(classifier.featuresCol -> getFeaturesCol)
+      paramMap.put(classifier.predictionCol -> getPredictionCol)
+      classifier.fit(trainingDataset, paramMap)
     }.toArray[ClassificationModel[_, _]]
 
     if (handlePersistence) {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 75cf5bd4ead4f..3775292f6dca7 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.ml.classification
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.attribute.NominalAttribute
+import org.apache.spark.ml.feature.StringIndexer
 import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
 import org.apache.spark.ml.util.MetadataUtils
 import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
@@ -104,6 +105,29 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext {
     ova.fit(datasetWithLabelMetadata)
   }
 
+  test("SPARK-8092: ensure label features and prediction cols are configurable") {
+    val labelIndexer = new StringIndexer()
+      .setInputCol("label")
+      .setOutputCol("indexed")
+
+    val indexedDataset = labelIndexer
+      .fit(dataset)
+      .transform(dataset)
+      .drop("label")
+      .withColumnRenamed("features", "f")
+
+    val ova = new OneVsRest()
+    ova.setClassifier(new LogisticRegression())
+      .setLabelCol(labelIndexer.getOutputCol)
+      .setFeaturesCol("f")
+      .setPredictionCol("p")
+
+    val ovaModel = ova.fit(indexedDataset)
+    val transformedDataset = ovaModel.transform(indexedDataset)
+    val outputFields = transformedDataset.schema.fieldNames.toSet
+    assert(outputFields.contains("p"))
+  }
+
   test("SPARK-8049: OneVsRest shouldn't output temp columns") {
     val logReg = new LogisticRegression()
       .setMaxIter(1)

From 408e64b284ef8bd6796d815b5eb603312d090b74 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Thu, 23 Jul 2015 23:40:01 -0700
Subject: [PATCH 349/549] [SPARK-9294][SQL] cleanup comments, code style,
 naming typo for the new aggregation

fix some comments and code style for https://github.com/apache/spark/pull/7458

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7619 from cloud-fan/agg-clean and squashes the following commits:

3925457 [Wenchen Fan] one more...
cc78357 [Wenchen Fan] one more cleanup
26f6a93 [Wenchen Fan] some minor cleanup for the new aggregation
---
 .../sql/catalyst/analysis/Analyzer.scala      |  2 +-
 .../expressions/aggregate/interfaces.scala    | 18 ++--
 .../apache/spark/sql/execution/Exchange.scala |  6 +-
 .../spark/sql/execution/SparkStrategies.scala |  8 +-
 .../aggregate/sortBasedIterators.scala        | 82 ++++++-------------
 .../spark/sql/execution/aggregate/utils.scala | 10 +--
 .../org/apache/spark/sql/SQLQuerySuite.scala  |  9 +-
 7 files changed, 46 insertions(+), 89 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 8cadbc57e87e1..e916887187dc8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -533,7 +533,7 @@ class Analyzer(
                 case min: Min if isDistinct => min
                 // For other aggregate functions, DISTINCT keyword is not supported for now.
                 // Once we converted to the new code path, we will allow using DISTINCT keyword.
-                case other if isDistinct =>
+                case other: AggregateExpression1 if isDistinct =>
                   failAnalysis(s"$name does not support DISTINCT keyword.")
                 // If it does not have DISTINCT keyword, we will return it as is.
                 case other => other
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
index d3fee1ade05e6..10bd19c8a840f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
@@ -23,18 +23,18 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCod
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
 
-/** The mode of an [[AggregateFunction1]]. */
+/** The mode of an [[AggregateFunction2]]. */
 private[sql] sealed trait AggregateMode
 
 /**
- * An [[AggregateFunction1]] with [[Partial]] mode is used for partial aggregation.
+ * An [[AggregateFunction2]] with [[Partial]] mode is used for partial aggregation.
  * This function updates the given aggregation buffer with the original input of this
  * function. When it has processed all input rows, the aggregation buffer is returned.
  */
 private[sql] case object Partial extends AggregateMode
 
 /**
- * An [[AggregateFunction1]] with [[PartialMerge]] mode is used to merge aggregation buffers
+ * An [[AggregateFunction2]] with [[PartialMerge]] mode is used to merge aggregation buffers
  * containing intermediate results for this function.
  * This function updates the given aggregation buffer by merging multiple aggregation buffers.
  * When it has processed all input rows, the aggregation buffer is returned.
@@ -42,8 +42,8 @@ private[sql] case object Partial extends AggregateMode
 private[sql] case object PartialMerge extends AggregateMode
 
 /**
- * An [[AggregateFunction1]] with [[PartialMerge]] mode is used to merge aggregation buffers
- * containing intermediate results for this function and the generate final result.
+ * An [[AggregateFunction2]] with [[PartialMerge]] mode is used to merge aggregation buffers
+ * containing intermediate results for this function and then generate final result.
  * This function updates the given aggregation buffer by merging multiple aggregation buffers.
  * When it has processed all input rows, the final result of this function is returned.
  */
@@ -85,12 +85,12 @@ private[sql] case class AggregateExpression2(
   override def nullable: Boolean = aggregateFunction.nullable
 
   override def references: AttributeSet = {
-    val childReferemces = mode match {
+    val childReferences = mode match {
       case Partial | Complete => aggregateFunction.references.toSeq
       case PartialMerge | Final => aggregateFunction.bufferAttributes
     }
 
-    AttributeSet(childReferemces)
+    AttributeSet(childReferences)
   }
 
   override def toString: String = s"(${aggregateFunction}2,mode=$mode,isDistinct=$isDistinct)"
@@ -99,10 +99,8 @@ private[sql] case class AggregateExpression2(
 abstract class AggregateFunction2
   extends Expression with ImplicitCastInputTypes {
 
-  self: Product =>
-
   /** An aggregate function is not foldable. */
-  override def foldable: Boolean = false
+  final override def foldable: Boolean = false
 
   /**
    * The offset of this function's buffer in the underlying buffer shared with other functions.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index d31e265a293e9..41a0c519ba527 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -224,13 +224,13 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
       // compatible.
       // TODO: ASSUMES TRANSITIVITY?
       def compatible: Boolean =
-        !operator.children
+        operator.children
           .map(_.outputPartitioning)
           .sliding(2)
-          .map {
+          .forall {
             case Seq(a) => true
             case Seq(a, b) => a.compatibleWith(b)
-          }.exists(!_)
+          }
 
       // Adds Exchange or Sort operators as required
       def addOperatorsIfNecessary(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index f54aa2027f6a6..eb4be1900b153 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -190,12 +190,12 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         sqlContext.conf.codegenEnabled).isDefined
     }
 
-    def canBeCodeGened(aggs: Seq[AggregateExpression1]): Boolean = !aggs.exists {
-      case _: CombineSum | _: Sum | _: Count | _: Max | _: Min |  _: CombineSetsAndCount => false
+    def canBeCodeGened(aggs: Seq[AggregateExpression1]): Boolean = aggs.forall {
+      case _: CombineSum | _: Sum | _: Count | _: Max | _: Min |  _: CombineSetsAndCount => true
       // The generated set implementation is pretty limited ATM.
       case CollectHashSet(exprs) if exprs.size == 1  &&
-           Seq(IntegerType, LongType).contains(exprs.head.dataType) => false
-      case _ => true
+           Seq(IntegerType, LongType).contains(exprs.head.dataType) => true
+      case _ => false
     }
 
     def allAggregates(exprs: Seq[Expression]): Seq[AggregateExpression1] =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
index ce1cbdc9cb090..b8e95a5a2a4da 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
@@ -67,13 +67,6 @@ private[sql] abstract class SortAggregationIterator(
     functions
   }
 
-  // All non-algebraic aggregate functions.
-  protected val nonAlgebraicAggregateFunctions: Array[AggregateFunction2] = {
-    aggregateFunctions.collect {
-      case func: AggregateFunction2 if !func.isInstanceOf[AlgebraicAggregate] => func
-    }.toArray
-  }
-
   // Positions of those non-algebraic aggregate functions in aggregateFunctions.
   // For example, we have func1, func2, func3, func4 in aggregateFunctions, and
   // func2 and func3 are non-algebraic aggregate functions.
@@ -91,6 +84,10 @@ private[sql] abstract class SortAggregationIterator(
     positions.toArray
   }
 
+  // All non-algebraic aggregate functions.
+  protected val nonAlgebraicAggregateFunctions: Array[AggregateFunction2] =
+    nonAlgebraicAggregateFunctionPositions.map(aggregateFunctions)
+
   // This is used to project expressions for the grouping expressions.
   protected val groupGenerator =
     newMutableProjection(groupingExpressions, inputAttributes)()
@@ -179,8 +176,6 @@ private[sql] abstract class SortAggregationIterator(
       // For the below compare method, we do not need to make a copy of groupingKey.
       val groupingKey = groupGenerator(currentRow)
       // Check if the current row belongs the current input row.
-      currentGroupingKey.equals(groupingKey)
-
       if (currentGroupingKey == groupingKey) {
         processRow(currentRow)
       } else {
@@ -288,10 +283,7 @@ class PartialSortAggregationIterator(
 
   // This projection is used to update buffer values for all AlgebraicAggregates.
   private val algebraicUpdateProjection = {
-    val bufferSchema = aggregateFunctions.flatMap {
-      case ae: AlgebraicAggregate => ae.bufferAttributes
-      case agg: AggregateFunction2 => agg.bufferAttributes
-    }
+    val bufferSchema = aggregateFunctions.flatMap(_.bufferAttributes)
     val updateExpressions = aggregateFunctions.flatMap {
       case ae: AlgebraicAggregate => ae.updateExpressions
       case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
@@ -348,19 +340,14 @@ class PartialMergeSortAggregationIterator(
     inputAttributes,
     inputIter) {
 
-  private val placeholderAttribtues =
+  private val placeholderAttributes =
     Seq.fill(initialBufferOffset)(AttributeReference("placeholder", NullType)())
 
   // This projection is used to merge buffer values for all AlgebraicAggregates.
   private val algebraicMergeProjection = {
     val bufferSchemata =
-      placeholderAttribtues ++ aggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.bufferAttributes
-        case agg: AggregateFunction2 => agg.bufferAttributes
-      } ++ placeholderAttribtues ++ aggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
-        case agg: AggregateFunction2 => agg.cloneBufferAttributes
-      }
+      placeholderAttributes ++ aggregateFunctions.flatMap(_.bufferAttributes) ++
+        placeholderAttributes ++ aggregateFunctions.flatMap(_.cloneBufferAttributes)
     val mergeExpressions = placeholderExpressions ++ aggregateFunctions.flatMap {
       case ae: AlgebraicAggregate => ae.mergeExpressions
       case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
@@ -444,13 +431,8 @@ class FinalSortAggregationIterator(
   // This projection is used to merge buffer values for all AlgebraicAggregates.
   private val algebraicMergeProjection = {
     val bufferSchemata =
-      offsetAttributes ++ aggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.bufferAttributes
-        case agg: AggregateFunction2 => agg.bufferAttributes
-      } ++ offsetAttributes ++ aggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
-        case agg: AggregateFunction2 => agg.cloneBufferAttributes
-      }
+      offsetAttributes ++ aggregateFunctions.flatMap(_.bufferAttributes) ++
+        offsetAttributes ++ aggregateFunctions.flatMap(_.cloneBufferAttributes)
     val mergeExpressions = placeholderExpressions ++ aggregateFunctions.flatMap {
       case ae: AlgebraicAggregate => ae.mergeExpressions
       case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
@@ -462,13 +444,8 @@ class FinalSortAggregationIterator(
   // This projection is used to evaluate all AlgebraicAggregates.
   private val algebraicEvalProjection = {
     val bufferSchemata =
-      offsetAttributes ++ aggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.bufferAttributes
-        case agg: AggregateFunction2 => agg.bufferAttributes
-      } ++ offsetAttributes ++ aggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
-        case agg: AggregateFunction2 => agg.cloneBufferAttributes
-      }
+      offsetAttributes ++ aggregateFunctions.flatMap(_.bufferAttributes) ++
+        offsetAttributes ++ aggregateFunctions.flatMap(_.cloneBufferAttributes)
     val evalExpressions = aggregateFunctions.map {
       case ae: AlgebraicAggregate => ae.evaluateExpression
       case agg: AggregateFunction2 => NoOp
@@ -599,11 +576,10 @@ class FinalAndCompleteSortAggregationIterator(
   }
 
   // All non-algebraic aggregate functions with mode Final.
-  private val finalNonAlgebraicAggregateFunctions: Array[AggregateFunction2] = {
+  private val finalNonAlgebraicAggregateFunctions: Array[AggregateFunction2] =
     finalAggregateFunctions.collect {
       case func: AggregateFunction2 if !func.isInstanceOf[AlgebraicAggregate] => func
-    }.toArray
-  }
+    }
 
   // All aggregate functions with mode Complete.
   private val completeAggregateFunctions: Array[AggregateFunction2] = {
@@ -617,11 +593,10 @@ class FinalAndCompleteSortAggregationIterator(
   }
 
   // All non-algebraic aggregate functions with mode Complete.
-  private val completeNonAlgebraicAggregateFunctions: Array[AggregateFunction2] = {
+  private val completeNonAlgebraicAggregateFunctions: Array[AggregateFunction2] =
     completeAggregateFunctions.collect {
       case func: AggregateFunction2 if !func.isInstanceOf[AlgebraicAggregate] => func
-    }.toArray
-  }
+    }
 
   // This projection is used to merge buffer values for all AlgebraicAggregates with mode
   // Final.
@@ -633,13 +608,9 @@ class FinalAndCompleteSortAggregationIterator(
     val completeOffsetExpressions = Seq.fill(numCompleteOffsetAttributes)(NoOp)
 
     val bufferSchemata =
-      offsetAttributes ++ finalAggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.bufferAttributes
-        case agg: AggregateFunction2 => agg.bufferAttributes
-      } ++ completeOffsetAttributes ++ offsetAttributes ++ finalAggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
-        case agg: AggregateFunction2 => agg.cloneBufferAttributes
-      } ++ completeOffsetAttributes
+      offsetAttributes ++ finalAggregateFunctions.flatMap(_.bufferAttributes) ++
+        completeOffsetAttributes ++ offsetAttributes ++
+        finalAggregateFunctions.flatMap(_.cloneBufferAttributes) ++ completeOffsetAttributes
     val mergeExpressions =
       placeholderExpressions ++ finalAggregateFunctions.flatMap {
         case ae: AlgebraicAggregate => ae.mergeExpressions
@@ -658,10 +629,8 @@ class FinalAndCompleteSortAggregationIterator(
     val finalOffsetExpressions = Seq.fill(numFinalOffsetAttributes)(NoOp)
 
     val bufferSchema =
-      offsetAttributes ++ finalOffsetAttributes ++ completeAggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.bufferAttributes
-        case agg: AggregateFunction2 => agg.bufferAttributes
-      }
+      offsetAttributes ++ finalOffsetAttributes ++
+        completeAggregateFunctions.flatMap(_.bufferAttributes)
     val updateExpressions =
       placeholderExpressions ++ finalOffsetExpressions ++ completeAggregateFunctions.flatMap {
         case ae: AlgebraicAggregate => ae.updateExpressions
@@ -673,13 +642,8 @@ class FinalAndCompleteSortAggregationIterator(
   // This projection is used to evaluate all AlgebraicAggregates.
   private val algebraicEvalProjection = {
     val bufferSchemata =
-      offsetAttributes ++ aggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.bufferAttributes
-        case agg: AggregateFunction2 => agg.bufferAttributes
-      } ++ offsetAttributes ++ aggregateFunctions.flatMap {
-        case ae: AlgebraicAggregate => ae.cloneBufferAttributes
-        case agg: AggregateFunction2 => agg.cloneBufferAttributes
-      }
+      offsetAttributes ++ aggregateFunctions.flatMap(_.bufferAttributes) ++
+        offsetAttributes ++ aggregateFunctions.flatMap(_.cloneBufferAttributes)
     val evalExpressions = aggregateFunctions.map {
       case ae: AlgebraicAggregate => ae.evaluateExpression
       case agg: AggregateFunction2 => NoOp
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
index 1cb27710e0480..5bbe6c162ff4b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
@@ -191,10 +191,7 @@ object Utils {
     }
     val groupExpressionMap = namedGroupingExpressions.toMap
     val namedGroupingAttributes = namedGroupingExpressions.map(_._2.toAttribute)
-    val partialAggregateExpressions = aggregateExpressions.map {
-      case AggregateExpression2(aggregateFunction, mode, isDistinct) =>
-        AggregateExpression2(aggregateFunction, Partial, isDistinct)
-    }
+    val partialAggregateExpressions = aggregateExpressions.map(_.copy(mode = Partial))
     val partialAggregateAttributes = partialAggregateExpressions.flatMap { agg =>
       agg.aggregateFunction.bufferAttributes
     }
@@ -208,10 +205,7 @@ object Utils {
         child)
 
     // 2. Create an Aggregate Operator for final aggregations.
-    val finalAggregateExpressions = aggregateExpressions.map {
-      case AggregateExpression2(aggregateFunction, mode, isDistinct) =>
-        AggregateExpression2(aggregateFunction, Final, isDistinct)
-    }
+    val finalAggregateExpressions = aggregateExpressions.map(_.copy(mode = Final))
     val finalAggregateAttributes =
       finalAggregateExpressions.map {
         expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index ab8dce603c117..95a1106cf072d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -1518,18 +1518,19 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
 
   test("SPARK-8945: add and subtract expressions for interval type") {
     import org.apache.spark.unsafe.types.Interval
+    import org.apache.spark.unsafe.types.Interval.MICROS_PER_WEEK
 
     val df = sql("select interval 3 years -3 month 7 week 123 microseconds as i")
-    checkAnswer(df, Row(new Interval(12 * 3 - 3, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123)))
+    checkAnswer(df, Row(new Interval(12 * 3 - 3, 7L * MICROS_PER_WEEK + 123)))
 
     checkAnswer(df.select(df("i") + new Interval(2, 123)),
-      Row(new Interval(12 * 3 - 3 + 2, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123 + 123)))
+      Row(new Interval(12 * 3 - 3 + 2, 7L * MICROS_PER_WEEK + 123 + 123)))
 
     checkAnswer(df.select(df("i") - new Interval(2, 123)),
-      Row(new Interval(12 * 3 - 3 - 2, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123 - 123)))
+      Row(new Interval(12 * 3 - 3 - 2, 7L * MICROS_PER_WEEK + 123 - 123)))
 
     // unary minus
     checkAnswer(df.select(-df("i")),
-      Row(new Interval(-(12 * 3 - 3), -(7L * 1000 * 1000 * 3600 * 24 * 7 + 123))))
+      Row(new Interval(-(12 * 3 - 3), -(7L * MICROS_PER_WEEK + 123))))
   }
 }

From cb8c241f05b9ab4ad0cd07df14d454cc5a4554cc Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 24 Jul 2015 01:18:43 -0700
Subject: [PATCH 350/549] [SPARK-9200][SQL] Don't implicitly cast non-atomic
 types to string type.

Author: Reynold Xin <rxin@databricks.com>

Closes #7636 from rxin/complex-string-implicit-cast and squashes the following commits:

3e67327 [Reynold Xin] [SPARK-9200][SQL] Don't implicitly cast non-atomic types to string type.
---
 .../spark/sql/catalyst/analysis/HiveTypeCoercion.scala    | 3 ++-
 .../sql/catalyst/analysis/HiveTypeCoercionSuite.scala     | 8 ++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index d56ceeadc9e85..87ffbfe791b93 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -720,7 +720,8 @@ object HiveTypeCoercion {
         case (StringType, DateType) => Cast(e, DateType)
         case (StringType, TimestampType) => Cast(e, TimestampType)
         case (StringType, BinaryType) => Cast(e, BinaryType)
-        case (any, StringType) if any != StringType => Cast(e, StringType)
+        // Cast any atomic type to string.
+        case (any: AtomicType, StringType) if any != StringType => Cast(e, StringType)
 
         // When we reach here, input type is not acceptable for any types in this type collection,
         // try to find the first one we can implicitly cast.
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index d0fb95b580ad2..55865bdb534b4 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -115,6 +115,14 @@ class HiveTypeCoercionSuite extends PlanTest {
     shouldNotCast(IntegerType, ArrayType)
     shouldNotCast(IntegerType, MapType)
     shouldNotCast(IntegerType, StructType)
+
+    shouldNotCast(IntervalType, StringType)
+
+    // Don't implicitly cast complex types to string.
+    shouldNotCast(ArrayType(StringType), StringType)
+    shouldNotCast(MapType(StringType, StringType), StringType)
+    shouldNotCast(new StructType().add("a1", StringType), StringType)
+    shouldNotCast(MapType(StringType, StringType), StringType)
   }
 
   test("tightest common bound for types") {

From 8fe32b4f7d49607ad5f2479d454b33ab3f079f7c Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 24 Jul 2015 01:47:13 -0700
Subject: [PATCH 351/549] [build] Enable memory leak detection for Tungsten.

This was turned off accidentally in #7591.

Author: Reynold Xin <rxin@databricks.com>

Closes #7637 from rxin/enable-mem-leak-detect and squashes the following commits:

34bc3ef [Reynold Xin] Enable memory leak detection for Tungsten.
---
 project/SparkBuild.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index b5b0adf630b9e..61a05d375d99e 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -543,7 +543,7 @@ object TestSettings {
     javaOptions in Test += "-Dspark.ui.enabled=false",
     javaOptions in Test += "-Dspark.ui.showConsoleProgress=false",
     javaOptions in Test += "-Dspark.driver.allowMultipleContexts=true",
-    //javaOptions in Test += "-Dspark.unsafe.exceptionOnMemoryLeak=true",
+    javaOptions in Test += "-Dspark.unsafe.exceptionOnMemoryLeak=true",
     javaOptions in Test += "-Dsun.io.serialization.extendedDebugInfo=true",
     javaOptions in Test += "-Dderby.system.durability=test",
     javaOptions in Test ++= System.getProperties.filter(_._1 startsWith "spark")

From 6a7e537f3a4fd5e99a905f9842dc0ad4c348e4fd Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Fri, 24 Jul 2015 17:39:57 +0800
Subject: [PATCH 352/549] [SPARK-8756] [SQL] Keep cached information and avoid
 re-calculating footers in ParquetRelation2

JIRA: https://issues.apache.org/jira/browse/SPARK-8756

Currently, in ParquetRelation2, footers are re-read every time refresh() is called. But we can check if it is possibly changed before we do the reading because reading all footers will be expensive when there are too many partitions. This pr fixes this by keeping some cached information to check it.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7154 from viirya/cached_footer_parquet_relation and squashes the following commits:

92e9347 [Liang-Chi Hsieh] Fix indentation.
ae0ec64 [Liang-Chi Hsieh] Fix wrong assignment.
c8fdfb7 [Liang-Chi Hsieh] Fix it.
a52b6d1 [Liang-Chi Hsieh] For comments.
c2a2420 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into cached_footer_parquet_relation
fa5458f [Liang-Chi Hsieh] Use Map to cache FileStatus and do merging previously loaded schema and newly loaded one.
6ae0911 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into cached_footer_parquet_relation
21bbdec [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into cached_footer_parquet_relation
12a0ed9 [Liang-Chi Hsieh] Add check of FileStatus's modification time.
186429d [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into cached_footer_parquet_relation
0ef8caf [Liang-Chi Hsieh] Keep cached information and avoid re-calculating footers.
---
 .../apache/spark/sql/parquet/newParquet.scala | 38 ++++++++++++-------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 2f9f880c70690..c384697c0ee62 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -345,24 +345,34 @@ private[sql] class ParquetRelation2(
     // Schema of the whole table, including partition columns.
     var schema: StructType = _
 
+    // Cached leaves
+    var cachedLeaves: Set[FileStatus] = null
+
     /**
      * Refreshes `FileStatus`es, footers, partition spec, and table schema.
      */
     def refresh(): Unit = {
-      // Lists `FileStatus`es of all leaf nodes (files) under all base directories.
-      val leaves = cachedLeafStatuses().filter { f =>
-        isSummaryFile(f.getPath) ||
-          !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith("."))
-      }.toArray
-
-      dataStatuses = leaves.filterNot(f => isSummaryFile(f.getPath))
-      metadataStatuses = leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE)
-      commonMetadataStatuses =
-        leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)
-
-      // If we already get the schema, don't need to re-compute it since the schema merging is
-      // time-consuming.
-      if (dataSchema == null) {
+      val currentLeafStatuses = cachedLeafStatuses()
+
+      // Check if cachedLeafStatuses is changed or not
+      val leafStatusesChanged = (cachedLeaves == null) ||
+        !cachedLeaves.equals(currentLeafStatuses)
+
+      if (leafStatusesChanged) {
+        cachedLeaves = currentLeafStatuses.toIterator.toSet
+
+        // Lists `FileStatus`es of all leaf nodes (files) under all base directories.
+        val leaves = currentLeafStatuses.filter { f =>
+          isSummaryFile(f.getPath) ||
+            !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith("."))
+        }.toArray
+
+        dataStatuses = leaves.filterNot(f => isSummaryFile(f.getPath))
+        metadataStatuses =
+          leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE)
+        commonMetadataStatuses =
+          leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)
+
         dataSchema = {
           val dataSchema0 = maybeDataSchema
             .orElse(readSchema())

From 6cd28cc21ed585ab8d1e0e7147a1a48b044c9c8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Garillot?= <francois@garillot.net>
Date: Fri, 24 Jul 2015 15:41:13 +0100
Subject: [PATCH 353/549] [SPARK-9236] [CORE] Make defaultPartitioner not reuse
 a parent RDD's partitioner if it has 0 partitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See also comments on https://issues.apache.org/jira/browse/SPARK-9236

Author: François Garillot <francois@garillot.net>

Closes #7616 from huitseeker/issue/SPARK-9236 and squashes the following commits:

217f902 [François Garillot] [SPARK-9236] Make defaultPartitioner not reuse a parent RDD's partitioner if it has 0 partitions
---
 .../scala/org/apache/spark/Partitioner.scala  |  2 +-
 .../spark/rdd/PairRDDFunctionsSuite.scala     | 23 +++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala
index ad68512dccb79..4b9d59975bdc2 100644
--- a/core/src/main/scala/org/apache/spark/Partitioner.scala
+++ b/core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -56,7 +56,7 @@ object Partitioner {
    */
   def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = {
     val bySize = (Seq(rdd) ++ others).sortBy(_.partitions.size).reverse
-    for (r <- bySize if r.partitioner.isDefined) {
+    for (r <- bySize if r.partitioner.isDefined && r.partitioner.get.numPartitions > 0) {
       return r.partitioner.get
     }
     if (rdd.context.conf.contains("spark.default.parallelism")) {
diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
index dfa102f432a02..1321ec84735b5 100644
--- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
@@ -282,6 +282,29 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext {
     ))
   }
 
+  // See SPARK-9326
+  test("cogroup with empty RDD") {
+    import scala.reflect.classTag
+    val intPairCT = classTag[(Int, Int)]
+
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1)))
+    val rdd2 = sc.emptyRDD[(Int, Int)](intPairCT)
+
+    val joined = rdd1.cogroup(rdd2).collect()
+    assert(joined.size > 0)
+  }
+
+  // See SPARK-9326
+  test("cogroup with groupByed RDD having 0 partitions") {
+    import scala.reflect.classTag
+    val intCT = classTag[Int]
+
+    val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1)))
+    val rdd2 = sc.emptyRDD[Int](intCT).groupBy((x) => 5)
+    val joined = rdd1.cogroup(rdd2).collect()
+    assert(joined.size > 0)
+  }
+
   test("rightOuterJoin") {
     val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1)))
     val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w')))

From dfb18be0366376be3b928dbf4570448c60fe652b Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Fri, 24 Jul 2015 08:24:13 -0700
Subject: [PATCH 354/549] [SPARK-9069] [SQL] follow up

Address comments for #7605

cc rxin

Author: Davies Liu <davies@databricks.com>

Closes #7634 from davies/decimal_unlimited2 and squashes the following commits:

b2d8b0d [Davies Liu] add doc and test for DecimalType.isWiderThan
65b251c [Davies Liu] fix test
6a91f32 [Davies Liu] fix style
ca9c973 [Davies Liu] address comments
---
 .../catalyst/analysis/HiveTypeCoercion.scala  | 30 +++++--------------
 .../expressions/decimalFunctions.scala        | 13 ++++++++
 .../apache/spark/sql/types/DecimalType.scala  |  6 +++-
 .../analysis/DecimalPrecisionSuite.scala      | 26 ++++++++++++++++
 .../analysis/HiveTypeCoercionSuite.scala      |  6 ++--
 5 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 87ffbfe791b93..e0527503442f0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -19,9 +19,7 @@ package org.apache.spark.sql.catalyst.analysis
 
 import javax.annotation.Nullable
 
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.types._
@@ -38,7 +36,7 @@ object HiveTypeCoercion {
   val typeCoercionRules =
     PropagateTypes ::
       InConversion ::
-      WidenTypes ::
+      WidenSetOperationTypes ::
       PromoteStrings ::
       DecimalPrecision ::
       BooleanEquality ::
@@ -175,7 +173,7 @@ object HiveTypeCoercion {
    *
    * This rule is only applied to Union/Except/Intersect
    */
-  object WidenTypes extends Rule[LogicalPlan] {
+  object WidenSetOperationTypes extends Rule[LogicalPlan] {
 
     private[this] def widenOutputTypes(
         planName: String,
@@ -203,9 +201,9 @@ object HiveTypeCoercion {
 
       def castOutput(plan: LogicalPlan): LogicalPlan = {
         val casted = plan.output.zip(castedTypes).map {
-          case (hs, Some(dt)) if dt != hs.dataType =>
-            Alias(Cast(hs, dt), hs.name)()
-          case (hs, _) => hs
+          case (e, Some(dt)) if e.dataType != dt =>
+            Alias(Cast(e, dt), e.name)()
+          case (e, _) => e
         }
         Project(casted, plan)
       }
@@ -355,20 +353,8 @@ object HiveTypeCoercion {
       DecimalType.bounded(range + scale, scale)
     }
 
-    /**
-     * An expression used to wrap the children when promote the precision of DecimalType to avoid
-     * promote multiple times.
-     */
-    case class ChangePrecision(child: Expression) extends UnaryExpression {
-      override def dataType: DataType = child.dataType
-      override def eval(input: InternalRow): Any = child.eval(input)
-      override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
-      override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = ""
-      override def prettyName: String = "change_precision"
-    }
-
-    def changePrecision(e: Expression, dataType: DataType): Expression = {
-      ChangePrecision(Cast(e, dataType))
+    private def changePrecision(e: Expression, dataType: DataType): Expression = {
+      ChangeDecimalPrecision(Cast(e, dataType))
     }
 
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
@@ -378,7 +364,7 @@ object HiveTypeCoercion {
         case e if !e.childrenResolved => e
 
         // Skip nodes who is already promoted
-        case e: BinaryArithmetic if e.left.isInstanceOf[ChangePrecision] => e
+        case e: BinaryArithmetic if e.left.isInstanceOf[ChangeDecimalPrecision] => e
 
         case Add(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
           val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2))
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
index b9d4736a65e26..adb33e4c8d4a1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalFunctions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.types._
 
@@ -60,3 +61,15 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un
     })
   }
 }
+
+/**
+ * An expression used to wrap the children when promote the precision of DecimalType to avoid
+ * promote multiple times.
+ */
+case class ChangeDecimalPrecision(child: Expression) extends UnaryExpression {
+  override def dataType: DataType = child.dataType
+  override def eval(input: InternalRow): Any = child.eval(input)
+  override def gen(ctx: CodeGenContext): GeneratedExpressionCode = child.gen(ctx)
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = ""
+  override def prettyName: String = "change_decimal_precision"
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
index 26b24616d98ec..0cd352d0fa928 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala
@@ -78,6 +78,10 @@ case class DecimalType(precision: Int, scale: Int) extends FractionalType {
 
   override def toString: String = s"DecimalType($precision,$scale)"
 
+  /**
+   * Returns whether this DecimalType is wider than `other`. If yes, it means `other`
+   * can be casted into `this` safely without losing any precision or range.
+   */
   private[sql] def isWiderThan(other: DataType): Boolean = other match {
     case dt: DecimalType =>
       (precision - scale) >= (dt.precision - dt.scale) && scale >= dt.scale
@@ -109,7 +113,7 @@ object DecimalType extends AbstractDataType {
   @deprecated("Does not support unlimited precision, please specify the precision and scale", "1.5")
   val Unlimited: DecimalType = SYSTEM_DEFAULT
 
-  // The decimal types compatible with other numberic types
+  // The decimal types compatible with other numeric types
   private[sql] val ByteDecimal = DecimalType(3, 0)
   private[sql] val ShortDecimal = DecimalType(5, 0)
   private[sql] val IntDecimal = DecimalType(10, 0)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
index f9f15e7a6608d..fc11627da6fd1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
@@ -154,4 +154,30 @@ class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter {
       checkType(Remainder(expr, u), DoubleType)
     }
   }
+
+  test("DecimalType.isWiderThan") {
+    val d0 = DecimalType(2, 0)
+    val d1 = DecimalType(2, 1)
+    val d2 = DecimalType(5, 2)
+    val d3 = DecimalType(15, 3)
+    val d4 = DecimalType(25, 4)
+
+    assert(d0.isWiderThan(d1) === false)
+    assert(d1.isWiderThan(d0) === false)
+    assert(d1.isWiderThan(d2) === false)
+    assert(d2.isWiderThan(d1) === true)
+    assert(d2.isWiderThan(d3) === false)
+    assert(d3.isWiderThan(d2) === true)
+    assert(d4.isWiderThan(d3) === true)
+
+    assert(d1.isWiderThan(ByteType) === false)
+    assert(d2.isWiderThan(ByteType) === true)
+    assert(d2.isWiderThan(ShortType) === false)
+    assert(d3.isWiderThan(ShortType) === true)
+    assert(d3.isWiderThan(IntegerType) === true)
+    assert(d3.isWiderThan(LongType) === false)
+    assert(d4.isWiderThan(LongType) === true)
+    assert(d4.isWiderThan(FloatType) === false)
+    assert(d4.isWiderThan(DoubleType) === false)
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 55865bdb534b4..4454d51b75877 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -314,7 +314,7 @@ class HiveTypeCoercionSuite extends PlanTest {
     )
   }
 
-  test("WidenTypes for union except and intersect") {
+  test("WidenSetOperationTypes for union except and intersect") {
     def checkOutput(logical: LogicalPlan, expectTypes: Seq[DataType]): Unit = {
       logical.output.zip(expectTypes).foreach { case (attr, dt) =>
         assert(attr.dataType === dt)
@@ -332,7 +332,7 @@ class HiveTypeCoercionSuite extends PlanTest {
       AttributeReference("f", FloatType)(),
       AttributeReference("l", LongType)())
 
-    val wt = HiveTypeCoercion.WidenTypes
+    val wt = HiveTypeCoercion.WidenSetOperationTypes
     val expectedTypes = Seq(StringType, DecimalType.SYSTEM_DEFAULT, FloatType, DoubleType)
 
     val r1 = wt(Union(left, right)).asInstanceOf[Union]
@@ -353,7 +353,7 @@ class HiveTypeCoercionSuite extends PlanTest {
       }
     }
 
-    val dp = HiveTypeCoercion.WidenTypes
+    val dp = HiveTypeCoercion.WidenSetOperationTypes
 
     val left1 = LocalRelation(
       AttributeReference("l", DecimalType(10, 8))())

From 846cf46282da8f4b87aeee64e407a38cdc80e13b Mon Sep 17 00:00:00 2001
From: "zhichao.li" <zhichao.li@intel.com>
Date: Fri, 24 Jul 2015 08:34:50 -0700
Subject: [PATCH 355/549] [SPARK-9238] [SQL] Remove two extra useless entries
 for bytesOfCodePointInUTF8

Only a trial thing, not sure if I understand correctly or not but I guess only 2 entries in `bytesOfCodePointInUTF8` for the case of 6 bytes codepoint(1111110x) is enough.
Details can be found from https://en.wikipedia.org/wiki/UTF-8 in "Description" section.

Author: zhichao.li <zhichao.li@intel.com>

Closes #7582 from zhichao-li/utf8 and squashes the following commits:

8bddd01 [zhichao.li] two extra entries
---
 .../src/main/java/org/apache/spark/unsafe/types/UTF8String.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 946d355f1fc28..6d8dcb1cbf876 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -48,7 +48,7 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
     4, 4, 4, 4, 4, 4, 4, 4,
     5, 5, 5, 5,
-    6, 6, 6, 6};
+    6, 6};
 
   public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");
 

From 428cde5d1c46adad344255447283dfb9716d65cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Garillot?= <francois@garillot.net>
Date: Fri, 24 Jul 2015 17:09:33 +0100
Subject: [PATCH 356/549] [SPARK-9250] Make change-scala-version more helpful
 w.r.t. valid Scala versions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Author: François Garillot <francois@garillot.net>

Closes #7595 from huitseeker/issue/SPARK-9250 and squashes the following commits:

80a0218 [François Garillot] [SPARK-9250] Make change-scala-version's usage more explicit, introduce a -h|--help option.
---
 dev/change-scala-version.sh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/dev/change-scala-version.sh b/dev/change-scala-version.sh
index b81c00c9d6d9d..d7975dfb6475c 100755
--- a/dev/change-scala-version.sh
+++ b/dev/change-scala-version.sh
@@ -19,19 +19,23 @@
 
 set -e
 
+VALID_VERSIONS=( 2.10 2.11 )
+
 usage() {
-  echo "Usage: $(basename $0) <version>" 1>&2
+  echo "Usage: $(basename $0) [-h|--help] <version>
+where :
+  -h| --help Display this help text
+  valid version values : ${VALID_VERSIONS[*]}
+" 1>&2
   exit 1
 }
 
-if [ $# -ne 1 ]; then
+if [[ ($# -ne 1) || ( $1 == "--help") ||  $1 == "-h" ]]; then
   usage
 fi
 
 TO_VERSION=$1
 
-VALID_VERSIONS=( 2.10 2.11 )
-
 check_scala_version() {
   for i in ${VALID_VERSIONS[*]}; do [ $i = "$1" ] && return 0; done
   echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2

From 3aec9f4e2d8fcce9ddf84ab4d0e10147c18afa16 Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA <yuu.ishikawa@gmail.com>
Date: Fri, 24 Jul 2015 09:10:11 -0700
Subject: [PATCH 357/549] [SPARK-9249] [SPARKR] local variable assigned but may
 not be used

[[SPARK-9249] local variable assigned but may not be used - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9249)

https://gist.github.com/yu-iskw/0e5b0253c11769457ea5

Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com>

Closes #7640 from yu-iskw/SPARK-9249 and squashes the following commits:

7a51cab [Yu ISHIKAWA] [SPARK-9249][SparkR] local variable assigned but may not be used
---
 R/pkg/R/deserialize.R | 4 ++--
 R/pkg/R/sparkR.R      | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R
index 7d1f6b0819ed0..6d364f77be7ee 100644
--- a/R/pkg/R/deserialize.R
+++ b/R/pkg/R/deserialize.R
@@ -102,11 +102,11 @@ readList <- function(con) {
 
 readRaw <- function(con) {
   dataLen <- readInt(con)
-  data <- readBin(con, raw(), as.integer(dataLen), endian = "big")
+  readBin(con, raw(), as.integer(dataLen), endian = "big")
 }
 
 readRawLen <- function(con, dataLen) {
-  data <- readBin(con, raw(), as.integer(dataLen), endian = "big")
+  readBin(con, raw(), as.integer(dataLen), endian = "big")
 }
 
 readDeserialize <- function(con) {
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 79b79d70943cb..76c15875b50d5 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -104,16 +104,13 @@ sparkR.init <- function(
     return(get(".sparkRjsc", envir = .sparkREnv))
   }
 
-  sparkMem <- Sys.getenv("SPARK_MEM", "1024m")
   jars <- suppressWarnings(normalizePath(as.character(sparkJars)))
 
   # Classpath separator is ";" on Windows
   # URI needs four /// as from http://stackoverflow.com/a/18522792
   if (.Platform$OS.type == "unix") {
-    collapseChar <- ":"
     uriSep <- "//"
   } else {
-    collapseChar <- ";"
     uriSep <- "////"
   }
 

From 431ca39be51352dfcdacc87de7e64c2af313558d Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 24 Jul 2015 09:37:36 -0700
Subject: [PATCH 358/549] [SPARK-9285][SQL] Remove InternalRow's inheritance
 from Row.

I also changed InternalRow's size/length function to numFields, to make it more obvious that it is not about bytes, but the number of fields.

Author: Reynold Xin <rxin@databricks.com>

Closes #7626 from rxin/internalRow and squashes the following commits:

e124daf [Reynold Xin] Fixed test case.
805ceb7 [Reynold Xin] Commented out the failed test suite.
f8a9ca5 [Reynold Xin] Fixed more bugs. Still at least one more remaining.
76d9081 [Reynold Xin] Fixed data sources.
7807f70 [Reynold Xin] Fixed DataFrameSuite.
cb60cd2 [Reynold Xin] Code review & small bug fixes.
0a2948b [Reynold Xin] Fixed style.
3280d03 [Reynold Xin] [SPARK-9285][SQL] Remove InternalRow's inheritance from Row.
---
 .../apache/spark/mllib/linalg/Matrices.scala  |   4 +-
 .../apache/spark/mllib/linalg/Vectors.scala   |   4 +-
 .../sql/catalyst/expressions/UnsafeRow.java   |   9 +-
 .../sql/catalyst/CatalystTypeConverters.scala |  14 +-
 .../spark/sql/catalyst/InternalRow.scala      | 153 ++++++++++++----
 .../spark/sql/catalyst/expressions/Cast.scala |   4 +-
 .../sql/catalyst/expressions/Projection.scala | 168 +++++++++---------
 .../expressions/SpecificMutableRow.scala      |   4 +-
 .../sql/catalyst/expressions/aggregates.scala |   4 +-
 .../codegen/GenerateProjection.scala          |   2 +-
 .../expressions/complexTypeExtractors.scala   |   4 +-
 .../spark/sql/catalyst/expressions/rows.scala |  57 +++---
 .../scala/org/apache/spark/sql/RowTest.scala  |  10 --
 .../sql/catalyst/expressions/CastSuite.scala  |  24 ++-
 .../expressions/ComplexTypeSuite.scala        |   7 +-
 .../spark/sql/columnar/ColumnType.scala       |   2 +-
 .../columnar/InMemoryColumnarTableScan.scala  |  12 +-
 .../sql/execution/SparkSqlSerializer2.scala   |  10 +-
 .../datasources/DataSourceStrategy.scala      |   4 +-
 .../sql/execution/datasources/commands.scala  |  53 ++++--
 .../spark/sql/execution/datasources/ddl.scala |  16 +-
 .../spark/sql/execution/pythonUDFs.scala      |   4 +-
 .../sql/expressions/aggregate/udaf.scala      |   3 +-
 .../apache/spark/sql/jdbc/JDBCRelation.scala  |   3 +-
 .../apache/spark/sql/json/JSONRelation.scala  |   6 +-
 .../sql/parquet/CatalystRowConverter.scala    |  10 +-
 .../sql/parquet/ParquetTableOperations.scala  |   2 +-
 .../sql/parquet/ParquetTableSupport.scala     |  12 +-
 .../apache/spark/sql/parquet/newParquet.scala |   6 +-
 .../apache/spark/sql/sources/interfaces.scala |  22 ++-
 .../scala/org/apache/spark/sql/RowSuite.scala |   4 +-
 .../spark/sql/sources/DDLTestSuite.scala      |   5 +-
 .../spark/sql/sources/PrunedScanSuite.scala   |   2 +-
 .../spark/sql/sources/TableScanSuite.scala    |   2 +-
 .../hive/execution/InsertIntoHiveTable.scala  |   9 +-
 .../spark/sql/hive/hiveWriterContainers.scala |   8 +-
 .../spark/sql/hive/orc/OrcRelation.scala      |   8 +-
 .../CommitFailureTestRelationSuite.scala      |  47 +++++
 .../ParquetHadoopFsRelationSuite.scala        | 139 +++++++++++++++
 .../SimpleTextHadoopFsRelationSuite.scala     |  57 ++++++
 .../sql/sources/hadoopFsRelationSuites.scala  | 166 -----------------
 41 files changed, 647 insertions(+), 433 deletions(-)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 55da0e094d132..b6e2c30fbf104 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -174,8 +174,8 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
   override def deserialize(datum: Any): Matrix = {
     datum match {
       case row: InternalRow =>
-        require(row.length == 7,
-          s"MatrixUDT.deserialize given row with length ${row.length} but requires length == 7")
+        require(row.numFields == 7,
+          s"MatrixUDT.deserialize given row with length ${row.numFields} but requires length == 7")
         val tpe = row.getByte(0)
         val numRows = row.getInt(1)
         val numCols = row.getInt(2)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 9067b3ba9a7bb..c884aad08889f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -203,8 +203,8 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
   override def deserialize(datum: Any): Vector = {
     datum match {
       case row: InternalRow =>
-        require(row.length == 4,
-          s"VectorUDT.deserialize given row with length ${row.length} but requires length == 4")
+        require(row.numFields == 4,
+          s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4")
         val tpe = row.getByte(0)
         tpe match {
           case 0 =>
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index fa1216b455a9e..a8986608855e2 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -64,7 +64,8 @@ public final class UnsafeRow extends MutableRow {
   /** The size of this row's backing data, in bytes) */
   private int sizeInBytes;
 
-  public int length() { return numFields; }
+  @Override
+  public int numFields() { return numFields; }
 
   /** The width of the null tracking bit set, in bytes */
   private int bitSetWidthInBytes;
@@ -218,12 +219,12 @@ public void setFloat(int ordinal, float value) {
   }
 
   @Override
-  public int size() {
-    return numFields;
+  public Object get(int i) {
+    throw new UnsupportedOperationException();
   }
 
   @Override
-  public Object get(int i) {
+  public <T> T getAs(int i) {
     throw new UnsupportedOperationException();
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index bfaee04f33b7f..5c3072a77aeba 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -140,14 +140,14 @@ object CatalystTypeConverters {
   private object IdentityConverter extends CatalystTypeConverter[Any, Any, Any] {
     override def toCatalystImpl(scalaValue: Any): Any = scalaValue
     override def toScala(catalystValue: Any): Any = catalystValue
-    override def toScalaImpl(row: InternalRow, column: Int): Any = row(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Any = row.get(column)
   }
 
   private case class UDTConverter(
       udt: UserDefinedType[_]) extends CatalystTypeConverter[Any, Any, Any] {
     override def toCatalystImpl(scalaValue: Any): Any = udt.serialize(scalaValue)
     override def toScala(catalystValue: Any): Any = udt.deserialize(catalystValue)
-    override def toScalaImpl(row: InternalRow, column: Int): Any = toScala(row(column))
+    override def toScalaImpl(row: InternalRow, column: Int): Any = toScala(row.get(column))
   }
 
   /** Converter for arrays, sequences, and Java iterables. */
@@ -184,7 +184,7 @@ object CatalystTypeConverters {
     }
 
     override def toScalaImpl(row: InternalRow, column: Int): Seq[Any] =
-      toScala(row(column).asInstanceOf[Seq[Any]])
+      toScala(row.get(column).asInstanceOf[Seq[Any]])
   }
 
   private case class MapConverter(
@@ -227,7 +227,7 @@ object CatalystTypeConverters {
     }
 
     override def toScalaImpl(row: InternalRow, column: Int): Map[Any, Any] =
-      toScala(row(column).asInstanceOf[Map[Any, Any]])
+      toScala(row.get(column).asInstanceOf[Map[Any, Any]])
   }
 
   private case class StructConverter(
@@ -260,9 +260,9 @@ object CatalystTypeConverters {
       if (row == null) {
         null
       } else {
-        val ar = new Array[Any](row.size)
+        val ar = new Array[Any](row.numFields)
         var idx = 0
-        while (idx < row.size) {
+        while (idx < row.numFields) {
           ar(idx) = converters(idx).toScala(row, idx)
           idx += 1
         }
@@ -271,7 +271,7 @@ object CatalystTypeConverters {
     }
 
     override def toScalaImpl(row: InternalRow, column: Int): Row =
-      toScala(row(column).asInstanceOf[InternalRow])
+      toScala(row.get(column).asInstanceOf[InternalRow])
   }
 
   private object StringConverter extends CatalystTypeConverter[Any, String, UTF8String] {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index c7ec49b3d6c3d..efc4faea569b2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -25,48 +25,139 @@ import org.apache.spark.unsafe.types.UTF8String
  * An abstract class for row used internal in Spark SQL, which only contain the columns as
  * internal types.
  */
-abstract class InternalRow extends Row {
+abstract class InternalRow extends Serializable {
 
-  def getUTF8String(i: Int): UTF8String = getAs[UTF8String](i)
+  def numFields: Int
 
-  def getBinary(i: Int): Array[Byte] = getAs[Array[Byte]](i)
+  def get(i: Int): Any
 
-  // This is only use for test
-  override def getString(i: Int): String = getAs[UTF8String](i).toString
-
-  // These expensive API should not be used internally.
-  final override def getDecimal(i: Int): java.math.BigDecimal =
-    throw new UnsupportedOperationException
-  final override def getDate(i: Int): java.sql.Date =
-    throw new UnsupportedOperationException
-  final override def getTimestamp(i: Int): java.sql.Timestamp =
-    throw new UnsupportedOperationException
-  final override def getSeq[T](i: Int): Seq[T] = throw new UnsupportedOperationException
-  final override def getList[T](i: Int): java.util.List[T] = throw new UnsupportedOperationException
-  final override def getMap[K, V](i: Int): scala.collection.Map[K, V] =
-    throw new UnsupportedOperationException
-  final override def getJavaMap[K, V](i: Int): java.util.Map[K, V] =
-    throw new UnsupportedOperationException
-  final override def getStruct(i: Int): Row = throw new UnsupportedOperationException
-  final override def getAs[T](fieldName: String): T = throw new UnsupportedOperationException
-  final override def getValuesMap[T](fieldNames: Seq[String]): Map[String, T] =
-    throw new UnsupportedOperationException
-
-  // A default implementation to change the return type
-  override def copy(): InternalRow = this
+  // TODO: Remove this.
+  def apply(i: Int): Any = get(i)
+
+  def getAs[T](i: Int): T = get(i).asInstanceOf[T]
+
+  def isNullAt(i: Int): Boolean = get(i) == null
+
+  def getBoolean(i: Int): Boolean = getAs[Boolean](i)
+
+  def getByte(i: Int): Byte = getAs[Byte](i)
+
+  def getShort(i: Int): Short = getAs[Short](i)
+
+  def getInt(i: Int): Int = getAs[Int](i)
+
+  def getLong(i: Int): Long = getAs[Long](i)
+
+  def getFloat(i: Int): Float = getAs[Float](i)
+
+  def getDouble(i: Int): Double = getAs[Double](i)
+
+  override def toString: String = s"[${this.mkString(",")}]"
+
+  /**
+   * Make a copy of the current [[InternalRow]] object.
+   */
+  def copy(): InternalRow = this
+
+  /** Returns true if there are any NULL values in this row. */
+  def anyNull: Boolean = {
+    val len = numFields
+    var i = 0
+    while (i < len) {
+      if (isNullAt(i)) { return true }
+      i += 1
+    }
+    false
+  }
+
+  override def equals(o: Any): Boolean = {
+    if (!o.isInstanceOf[InternalRow]) {
+      return false
+    }
+
+    val other = o.asInstanceOf[InternalRow]
+    if (other eq null) {
+      return false
+    }
+
+    val len = numFields
+    if (len != other.numFields) {
+      return false
+    }
+
+    var i = 0
+    while (i < len) {
+      if (isNullAt(i) != other.isNullAt(i)) {
+        return false
+      }
+      if (!isNullAt(i)) {
+        val o1 = get(i)
+        val o2 = other.get(i)
+        o1 match {
+          case b1: Array[Byte] =>
+            if (!o2.isInstanceOf[Array[Byte]] ||
+              !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
+              return false
+            }
+          case f1: Float if java.lang.Float.isNaN(f1) =>
+            if (!o2.isInstanceOf[Float] || ! java.lang.Float.isNaN(o2.asInstanceOf[Float])) {
+              return false
+            }
+          case d1: Double if java.lang.Double.isNaN(d1) =>
+            if (!o2.isInstanceOf[Double] || ! java.lang.Double.isNaN(o2.asInstanceOf[Double])) {
+              return false
+            }
+          case _ => if (o1 != o2) {
+            return false
+          }
+        }
+      }
+      i += 1
+    }
+    true
+  }
+
+  /* ---------------------- utility methods for Scala ---------------------- */
 
   /**
-   * Returns true if we can check equality for these 2 rows.
-   * Equality check between external row and internal row is not allowed.
-   * Here we do this check to prevent call `equals` on internal row with external row.
+   * Return a Scala Seq representing the row. Elements are placed in the same order in the Seq.
    */
-  protected override def canEqual(other: Row) = other.isInstanceOf[InternalRow]
+  def toSeq: Seq[Any] = {
+    val n = numFields
+    val values = new Array[Any](n)
+    var i = 0
+    while (i < n) {
+      values.update(i, get(i))
+      i += 1
+    }
+    values.toSeq
+  }
+
+  /** Displays all elements of this sequence in a string (without a separator). */
+  def mkString: String = toSeq.mkString
+
+  /** Displays all elements of this sequence in a string using a separator string. */
+  def mkString(sep: String): String = toSeq.mkString(sep)
+
+  /**
+   * Displays all elements of this traversable or iterator in a string using
+   * start, end, and separator strings.
+   */
+  def mkString(start: String, sep: String, end: String): String = toSeq.mkString(start, sep, end)
+
+  def getUTF8String(i: Int): UTF8String = getAs[UTF8String](i)
+
+  def getBinary(i: Int): Array[Byte] = getAs[Array[Byte]](i)
+
+  // This is only use for test
+  def getString(i: Int): String = getAs[UTF8String](i).toString
 
   // Custom hashCode function that matches the efficient code generated version.
   override def hashCode: Int = {
     var result: Int = 37
     var i = 0
-    while (i < length) {
+    val len = numFields
+    while (i < len) {
       val update: Int =
         if (isNullAt(i)) {
           0
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index c66854d52c50b..47ad3e089e4c7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -382,8 +382,8 @@ case class Cast(child: Expression, dataType: DataType)
     val newRow = new GenericMutableRow(from.fields.length)
     buildCast[InternalRow](_, row => {
       var i = 0
-      while (i < row.length) {
-        newRow.update(i, if (row.isNullAt(i)) null else casts(i)(row(i)))
+      while (i < row.numFields) {
+        newRow.update(i, if (row.isNullAt(i)) null else casts(i)(row.get(i)))
         i += 1
       }
       newRow.copy()
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index 04872fbc8b091..dbda05a792cbf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -176,49 +176,49 @@ class JoinedRow extends InternalRow {
 
   override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
 
-  override def length: Int = row1.length + row2.length
+  override def numFields: Int = row1.numFields + row2.numFields
 
   override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
   }
 
   override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
   }
 
   override def get(i: Int): Any =
-    if (i < row1.length) row1(i) else row2(i - row1.length)
+    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
 
   override def isNullAt(i: Int): Boolean =
-    if (i < row1.length) row1.isNullAt(i) else row2.isNullAt(i - row1.length)
+    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
 
   override def getInt(i: Int): Int =
-    if (i < row1.length) row1.getInt(i) else row2.getInt(i - row1.length)
+    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
 
   override def getLong(i: Int): Long =
-    if (i < row1.length) row1.getLong(i) else row2.getLong(i - row1.length)
+    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
 
   override def getDouble(i: Int): Double =
-    if (i < row1.length) row1.getDouble(i) else row2.getDouble(i - row1.length)
+    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
 
   override def getBoolean(i: Int): Boolean =
-    if (i < row1.length) row1.getBoolean(i) else row2.getBoolean(i - row1.length)
+    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
 
   override def getShort(i: Int): Short =
-    if (i < row1.length) row1.getShort(i) else row2.getShort(i - row1.length)
+    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
 
   override def getByte(i: Int): Byte =
-    if (i < row1.length) row1.getByte(i) else row2.getByte(i - row1.length)
+    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
 
   override def getFloat(i: Int): Float =
-    if (i < row1.length) row1.getFloat(i) else row2.getFloat(i - row1.length)
+    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
 
   override def copy(): InternalRow = {
-    val totalSize = row1.length + row2.length
+    val totalSize = row1.numFields + row2.numFields
     val copiedValues = new Array[Any](totalSize)
     var i = 0
     while(i < totalSize) {
-      copiedValues(i) = apply(i)
+      copiedValues(i) = get(i)
       i += 1
     }
     new GenericInternalRow(copiedValues)
@@ -278,49 +278,49 @@ class JoinedRow2 extends InternalRow {
 
   override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
 
-  override def length: Int = row1.length + row2.length
+  override def numFields: Int = row1.numFields + row2.numFields
 
   override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
   }
 
   override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
   }
 
   override def get(i: Int): Any =
-    if (i < row1.length) row1(i) else row2(i - row1.length)
+    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
 
   override def isNullAt(i: Int): Boolean =
-    if (i < row1.length) row1.isNullAt(i) else row2.isNullAt(i - row1.length)
+    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
 
   override def getInt(i: Int): Int =
-    if (i < row1.length) row1.getInt(i) else row2.getInt(i - row1.length)
+    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
 
   override def getLong(i: Int): Long =
-    if (i < row1.length) row1.getLong(i) else row2.getLong(i - row1.length)
+    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
 
   override def getDouble(i: Int): Double =
-    if (i < row1.length) row1.getDouble(i) else row2.getDouble(i - row1.length)
+    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
 
   override def getBoolean(i: Int): Boolean =
-    if (i < row1.length) row1.getBoolean(i) else row2.getBoolean(i - row1.length)
+    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
 
   override def getShort(i: Int): Short =
-    if (i < row1.length) row1.getShort(i) else row2.getShort(i - row1.length)
+    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
 
   override def getByte(i: Int): Byte =
-    if (i < row1.length) row1.getByte(i) else row2.getByte(i - row1.length)
+    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
 
   override def getFloat(i: Int): Float =
-    if (i < row1.length) row1.getFloat(i) else row2.getFloat(i - row1.length)
+    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
 
   override def copy(): InternalRow = {
-    val totalSize = row1.length + row2.length
+    val totalSize = row1.numFields + row2.numFields
     val copiedValues = new Array[Any](totalSize)
     var i = 0
     while(i < totalSize) {
-      copiedValues(i) = apply(i)
+      copiedValues(i) = get(i)
       i += 1
     }
     new GenericInternalRow(copiedValues)
@@ -374,50 +374,50 @@ class JoinedRow3 extends InternalRow {
 
   override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
 
-  override def length: Int = row1.length + row2.length
+  override def numFields: Int = row1.numFields + row2.numFields
 
   override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
   }
 
   override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
   }
 
 
   override def get(i: Int): Any =
-    if (i < row1.length) row1(i) else row2(i - row1.length)
+    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
 
   override def isNullAt(i: Int): Boolean =
-    if (i < row1.length) row1.isNullAt(i) else row2.isNullAt(i - row1.length)
+    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
 
   override def getInt(i: Int): Int =
-    if (i < row1.length) row1.getInt(i) else row2.getInt(i - row1.length)
+    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
 
   override def getLong(i: Int): Long =
-    if (i < row1.length) row1.getLong(i) else row2.getLong(i - row1.length)
+    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
 
   override def getDouble(i: Int): Double =
-    if (i < row1.length) row1.getDouble(i) else row2.getDouble(i - row1.length)
+    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
 
   override def getBoolean(i: Int): Boolean =
-    if (i < row1.length) row1.getBoolean(i) else row2.getBoolean(i - row1.length)
+    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
 
   override def getShort(i: Int): Short =
-    if (i < row1.length) row1.getShort(i) else row2.getShort(i - row1.length)
+    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
 
   override def getByte(i: Int): Byte =
-    if (i < row1.length) row1.getByte(i) else row2.getByte(i - row1.length)
+    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
 
   override def getFloat(i: Int): Float =
-    if (i < row1.length) row1.getFloat(i) else row2.getFloat(i - row1.length)
+    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
 
   override def copy(): InternalRow = {
-    val totalSize = row1.length + row2.length
+    val totalSize = row1.numFields + row2.numFields
     val copiedValues = new Array[Any](totalSize)
     var i = 0
     while(i < totalSize) {
-      copiedValues(i) = apply(i)
+      copiedValues(i) = get(i)
       i += 1
     }
     new GenericInternalRow(copiedValues)
@@ -471,50 +471,50 @@ class JoinedRow4 extends InternalRow {
 
   override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
 
-  override def length: Int = row1.length + row2.length
+  override def numFields: Int = row1.numFields + row2.numFields
 
   override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
   }
 
   override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
   }
 
 
   override def get(i: Int): Any =
-    if (i < row1.length) row1(i) else row2(i - row1.length)
+    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
 
   override def isNullAt(i: Int): Boolean =
-    if (i < row1.length) row1.isNullAt(i) else row2.isNullAt(i - row1.length)
+    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
 
   override def getInt(i: Int): Int =
-    if (i < row1.length) row1.getInt(i) else row2.getInt(i - row1.length)
+    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
 
   override def getLong(i: Int): Long =
-    if (i < row1.length) row1.getLong(i) else row2.getLong(i - row1.length)
+    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
 
   override def getDouble(i: Int): Double =
-    if (i < row1.length) row1.getDouble(i) else row2.getDouble(i - row1.length)
+    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
 
   override def getBoolean(i: Int): Boolean =
-    if (i < row1.length) row1.getBoolean(i) else row2.getBoolean(i - row1.length)
+    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
 
   override def getShort(i: Int): Short =
-    if (i < row1.length) row1.getShort(i) else row2.getShort(i - row1.length)
+    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
 
   override def getByte(i: Int): Byte =
-    if (i < row1.length) row1.getByte(i) else row2.getByte(i - row1.length)
+    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
 
   override def getFloat(i: Int): Float =
-    if (i < row1.length) row1.getFloat(i) else row2.getFloat(i - row1.length)
+    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
 
   override def copy(): InternalRow = {
-    val totalSize = row1.length + row2.length
+    val totalSize = row1.numFields + row2.numFields
     val copiedValues = new Array[Any](totalSize)
     var i = 0
     while(i < totalSize) {
-      copiedValues(i) = apply(i)
+      copiedValues(i) = get(i)
       i += 1
     }
     new GenericInternalRow(copiedValues)
@@ -568,50 +568,50 @@ class JoinedRow5 extends InternalRow {
 
   override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
 
-  override def length: Int = row1.length + row2.length
+  override def numFields: Int = row1.numFields + row2.numFields
 
   override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
   }
 
   override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
   }
 
 
   override def get(i: Int): Any =
-    if (i < row1.length) row1(i) else row2(i - row1.length)
+    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
 
   override def isNullAt(i: Int): Boolean =
-    if (i < row1.length) row1.isNullAt(i) else row2.isNullAt(i - row1.length)
+    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
 
   override def getInt(i: Int): Int =
-    if (i < row1.length) row1.getInt(i) else row2.getInt(i - row1.length)
+    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
 
   override def getLong(i: Int): Long =
-    if (i < row1.length) row1.getLong(i) else row2.getLong(i - row1.length)
+    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
 
   override def getDouble(i: Int): Double =
-    if (i < row1.length) row1.getDouble(i) else row2.getDouble(i - row1.length)
+    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
 
   override def getBoolean(i: Int): Boolean =
-    if (i < row1.length) row1.getBoolean(i) else row2.getBoolean(i - row1.length)
+    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
 
   override def getShort(i: Int): Short =
-    if (i < row1.length) row1.getShort(i) else row2.getShort(i - row1.length)
+    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
 
   override def getByte(i: Int): Byte =
-    if (i < row1.length) row1.getByte(i) else row2.getByte(i - row1.length)
+    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
 
   override def getFloat(i: Int): Float =
-    if (i < row1.length) row1.getFloat(i) else row2.getFloat(i - row1.length)
+    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
 
   override def copy(): InternalRow = {
-    val totalSize = row1.length + row2.length
+    val totalSize = row1.numFields + row2.numFields
     val copiedValues = new Array[Any](totalSize)
     var i = 0
     while(i < totalSize) {
-      copiedValues(i) = apply(i)
+      copiedValues(i) = get(i)
       i += 1
     }
     new GenericInternalRow(copiedValues)
@@ -665,50 +665,50 @@ class JoinedRow6 extends InternalRow {
 
   override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
 
-  override def length: Int = row1.length + row2.length
+  override def numFields: Int = row1.numFields + row2.numFields
 
   override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.length) row1.getUTF8String(i) else row2.getUTF8String(i - row1.length)
+    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
   }
 
   override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.length) row1.getBinary(i) else row2.getBinary(i - row1.length)
+    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
   }
 
 
   override def get(i: Int): Any =
-    if (i < row1.length) row1(i) else row2(i - row1.length)
+    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
 
   override def isNullAt(i: Int): Boolean =
-    if (i < row1.length) row1.isNullAt(i) else row2.isNullAt(i - row1.length)
+    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
 
   override def getInt(i: Int): Int =
-    if (i < row1.length) row1.getInt(i) else row2.getInt(i - row1.length)
+    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
 
   override def getLong(i: Int): Long =
-    if (i < row1.length) row1.getLong(i) else row2.getLong(i - row1.length)
+    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
 
   override def getDouble(i: Int): Double =
-    if (i < row1.length) row1.getDouble(i) else row2.getDouble(i - row1.length)
+    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
 
   override def getBoolean(i: Int): Boolean =
-    if (i < row1.length) row1.getBoolean(i) else row2.getBoolean(i - row1.length)
+    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
 
   override def getShort(i: Int): Short =
-    if (i < row1.length) row1.getShort(i) else row2.getShort(i - row1.length)
+    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
 
   override def getByte(i: Int): Byte =
-    if (i < row1.length) row1.getByte(i) else row2.getByte(i - row1.length)
+    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
 
   override def getFloat(i: Int): Float =
-    if (i < row1.length) row1.getFloat(i) else row2.getFloat(i - row1.length)
+    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
 
   override def copy(): InternalRow = {
-    val totalSize = row1.length + row2.length
+    val totalSize = row1.numFields + row2.numFields
     val copiedValues = new Array[Any](totalSize)
     var i = 0
     while(i < totalSize) {
-      copiedValues(i) = apply(i)
+      copiedValues(i) = get(i)
       i += 1
     }
     new GenericInternalRow(copiedValues)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
index 6f291d2c86c1e..4b4833bd06a3b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
@@ -211,7 +211,7 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
 
   def this() = this(Seq.empty)
 
-  override def length: Int = values.length
+  override def numFields: Int = values.length
 
   override def toSeq: Seq[Any] = values.map(_.boxed).toSeq
 
@@ -245,7 +245,7 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
   override def setString(ordinal: Int, value: String): Unit =
     update(ordinal, UTF8String.fromString(value))
 
-  override def getString(ordinal: Int): String = apply(ordinal).toString
+  override def getString(ordinal: Int): String = get(ordinal).toString
 
   override def setInt(ordinal: Int, value: Int): Unit = {
     val currentValue = values(ordinal).asInstanceOf[MutableInt]
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index 73fde4e9164d7..62b6cc834c9c9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -675,7 +675,7 @@ case class CombineSetsAndSumFunction(
     val inputSetEval = inputSet.eval(input).asInstanceOf[OpenHashSet[Any]]
     val inputIterator = inputSetEval.iterator
     while (inputIterator.hasNext) {
-      seen.add(inputIterator.next)
+      seen.add(inputIterator.next())
     }
   }
 
@@ -685,7 +685,7 @@ case class CombineSetsAndSumFunction(
       null
     } else {
       Cast(Literal(
-        casted.iterator.map(f => f.apply(0)).reduceLeft(
+        casted.iterator.map(f => f.get(0)).reduceLeft(
           base.dataType.asInstanceOf[NumericType].numeric.asInstanceOf[Numeric[Any]].plus)),
         base.dataType).eval(null)
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 405d6b0e3bc76..f0efc4bff12ba 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -178,7 +178,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
           $initColumns
         }
 
-        public int length() { return ${expressions.length};}
+        public int numFields() { return ${expressions.length};}
         protected boolean[] nullBits = new boolean[${expressions.length}];
         public void setNullAt(int i) { nullBits[i] = true; }
         public boolean isNullAt(int i) { return nullBits[i]; }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
index 5504781edca1b..c91122cda2a41 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
@@ -110,7 +110,7 @@ case class GetStructField(child: Expression, field: StructField, ordinal: Int)
   override def toString: String = s"$child.${field.name}"
 
   protected override def nullSafeEval(input: Any): Any =
-    input.asInstanceOf[InternalRow](ordinal)
+    input.asInstanceOf[InternalRow].get(ordinal)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     nullSafeCodeGen(ctx, ev, eval => {
@@ -142,7 +142,7 @@ case class GetArrayStructFields(
 
   protected override def nullSafeEval(input: Any): Any = {
     input.asInstanceOf[Seq[InternalRow]].map { row =>
-      if (row == null) null else row(ordinal)
+      if (row == null) null else row.get(ordinal)
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index d78be5a5958f9..53779dd4049d1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -44,9 +44,10 @@ abstract class MutableRow extends InternalRow {
   }
 
   override def copy(): InternalRow = {
-    val arr = new Array[Any](length)
+    val n = numFields
+    val arr = new Array[Any](n)
     var i = 0
-    while (i < length) {
+    while (i < n) {
       arr(i) = get(i)
       i += 1
     }
@@ -54,36 +55,23 @@ abstract class MutableRow extends InternalRow {
   }
 }
 
-/**
- * A row implementation that uses an array of objects as the underlying storage.
- */
-trait ArrayBackedRow {
-  self: Row =>
-
-  protected val values: Array[Any]
-
-  override def toSeq: Seq[Any] = values.toSeq
-
-  def length: Int = values.length
-
-  override def get(i: Int): Any = values(i)
-
-  def setNullAt(i: Int): Unit = { values(i) = null}
-
-  def update(i: Int, value: Any): Unit = { values(i) = value }
-}
-
 /**
  * A row implementation that uses an array of objects as the underlying storage.  Note that, while
  * the array is not copied, and thus could technically be mutated after creation, this is not
  * allowed.
  */
-class GenericRow(protected[sql] val values: Array[Any]) extends Row with ArrayBackedRow {
+class GenericRow(protected[sql] val values: Array[Any]) extends Row {
   /** No-arg constructor for serialization. */
   protected def this() = this(null)
 
   def this(size: Int) = this(new Array[Any](size))
 
+  override def length: Int = values.length
+
+  override def get(i: Int): Any = values(i)
+
+  override def toSeq: Seq[Any] = values.toSeq
+
   override def copy(): Row = this
 }
 
@@ -101,34 +89,49 @@ class GenericRowWithSchema(values: Array[Any], override val schema: StructType)
  * Note that, while the array is not copied, and thus could technically be mutated after creation,
  * this is not allowed.
  */
-class GenericInternalRow(protected[sql] val values: Array[Any])
-    extends InternalRow with ArrayBackedRow {
+class GenericInternalRow(protected[sql] val values: Array[Any]) extends InternalRow {
   /** No-arg constructor for serialization. */
   protected def this() = this(null)
 
   def this(size: Int) = this(new Array[Any](size))
 
+  override def toSeq: Seq[Any] = values.toSeq
+
+  override def numFields: Int = values.length
+
+  override def get(i: Int): Any = values(i)
+
   override def copy(): InternalRow = this
 }
 
 /**
  * This is used for serialization of Python DataFrame
  */
-class GenericInternalRowWithSchema(values: Array[Any], override val schema: StructType)
+class GenericInternalRowWithSchema(values: Array[Any], val schema: StructType)
   extends GenericInternalRow(values) {
 
   /** No-arg constructor for serialization. */
   protected def this() = this(null, null)
 
-  override def fieldIndex(name: String): Int = schema.fieldIndex(name)
+  def fieldIndex(name: String): Int = schema.fieldIndex(name)
 }
 
-class GenericMutableRow(val values: Array[Any]) extends MutableRow with ArrayBackedRow {
+class GenericMutableRow(val values: Array[Any]) extends MutableRow {
   /** No-arg constructor for serialization. */
   protected def this() = this(null)
 
   def this(size: Int) = this(new Array[Any](size))
 
+  override def toSeq: Seq[Any] = values.toSeq
+
+  override def numFields: Int = values.length
+
+  override def get(i: Int): Any = values(i)
+
+  override def setNullAt(i: Int): Unit = { values(i) = null}
+
+  override def update(i: Int, value: Any): Unit = { values(i) = value }
+
   override def copy(): InternalRow = new GenericInternalRow(values.clone())
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala
index 878a1bb9b7e6d..01ff84cb56054 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RowTest.scala
@@ -83,15 +83,5 @@ class RowTest extends FunSpec with Matchers {
     it("equality check for internal rows") {
       internalRow shouldEqual internalRow2
     }
-
-    it("throws an exception when check equality between external and internal rows") {
-      def assertError(f: => Unit): Unit = {
-        val e = intercept[UnsupportedOperationException](f)
-        e.getMessage.contains("cannot check equality between external and internal rows")
-      }
-
-      assertError(internalRow.equals(externalRow))
-      assertError(externalRow.equals(internalRow))
-    }
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index facf65c155148..408353cf70a49 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -24,6 +24,7 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 /**
  * Test suite for data type casting expression [[Cast]].
@@ -580,14 +581,21 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("cast from struct") {
     val struct = Literal.create(
-      InternalRow("123", "abc", "", null),
+      InternalRow(
+        UTF8String.fromString("123"),
+        UTF8String.fromString("abc"),
+        UTF8String.fromString(""),
+        null),
       StructType(Seq(
         StructField("a", StringType, nullable = true),
         StructField("b", StringType, nullable = true),
         StructField("c", StringType, nullable = true),
         StructField("d", StringType, nullable = true))))
     val struct_notNull = Literal.create(
-      InternalRow("123", "abc", ""),
+      InternalRow(
+        UTF8String.fromString("123"),
+        UTF8String.fromString("abc"),
+        UTF8String.fromString("")),
       StructType(Seq(
         StructField("a", StringType, nullable = false),
         StructField("b", StringType, nullable = false),
@@ -676,8 +684,11 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("complex casting") {
     val complex = Literal.create(
       InternalRow(
-        Seq("123", "abc", ""),
-        Map("a" -> "123", "b" -> "abc", "c" -> ""),
+        Seq(UTF8String.fromString("123"), UTF8String.fromString("abc"), UTF8String.fromString("")),
+        Map(
+          UTF8String.fromString("a") -> UTF8String.fromString("123"),
+          UTF8String.fromString("b") -> UTF8String.fromString("abc"),
+          UTF8String.fromString("c") -> UTF8String.fromString("")),
         InternalRow(0)),
       StructType(Seq(
         StructField("a",
@@ -700,7 +711,10 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     assert(ret.resolved === true)
     checkEvaluation(ret, InternalRow(
       Seq(123, null, null),
-      Map("a" -> true, "b" -> true, "c" -> false),
+      Map(
+        UTF8String.fromString("a") -> true,
+        UTF8String.fromString("b") -> true,
+        UTF8String.fromString("c") -> false),
       InternalRow(0L)))
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index a8aee8f634e03..fc842772f3480 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 
 class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -150,12 +151,14 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("CreateNamedStruct with literal field") {
     val row = InternalRow(1, 2, 3)
     val c1 = 'a.int.at(0)
-    checkEvaluation(CreateNamedStruct(Seq("a", c1, "b", "y")), InternalRow(1, "y"), row)
+    checkEvaluation(CreateNamedStruct(Seq("a", c1, "b", "y")),
+      InternalRow(1, UTF8String.fromString("y")), row)
   }
 
   test("CreateNamedStruct from all literal fields") {
     checkEvaluation(
-      CreateNamedStruct(Seq("a", "x", "b", 2.0)), InternalRow("x", 2.0), InternalRow.empty)
+      CreateNamedStruct(Seq("a", "x", "b", 2.0)),
+      InternalRow(UTF8String.fromString("x"), 2.0), InternalRow.empty)
   }
 
   test("test dsl for complex type") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
index 9d8415f06399c..ac42bde07c37d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
@@ -309,7 +309,7 @@ private[sql] object SHORT extends NativeColumnType(ShortType, 6, 2) {
 
 private[sql] object STRING extends NativeColumnType(StringType, 7, 8) {
   override def actualSize(row: InternalRow, ordinal: Int): Int = {
-    row.getString(ordinal).getBytes("utf-8").length + 4
+    row.getUTF8String(ordinal).numBytes() + 4
   }
 
   override def append(v: UTF8String, buffer: ByteBuffer): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
index 38720968c1313..5d5b0697d7016 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
@@ -134,13 +134,13 @@ private[sql] case class InMemoryRelation(
             // may result malformed rows, causing ArrayIndexOutOfBoundsException, which is somewhat
             // hard to decipher.
             assert(
-              row.size == columnBuilders.size,
-              s"""Row column number mismatch, expected ${output.size} columns, but got ${row.size}.
-                 |Row content: $row
-               """.stripMargin)
+              row.numFields == columnBuilders.size,
+              s"Row column number mismatch, expected ${output.size} columns, " +
+                s"but got ${row.numFields}." +
+                s"\nRow content: $row")
 
             var i = 0
-            while (i < row.length) {
+            while (i < row.numFields) {
               columnBuilders(i).appendFrom(row, i)
               i += 1
             }
@@ -304,7 +304,7 @@ private[sql] case class InMemoryColumnarTableScan(
 
           // Extract rows via column accessors
           new Iterator[InternalRow] {
-            private[this] val rowLen = nextRow.length
+            private[this] val rowLen = nextRow.numFields
             override def next(): InternalRow = {
               var i = 0
               while (i < rowLen) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
index c87e2064a8f33..83c4e8733f15f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
@@ -25,7 +25,6 @@ import scala.reflect.ClassTag
 
 import org.apache.spark.Logging
 import org.apache.spark.serializer._
-import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{MutableRow, SpecificMutableRow}
 import org.apache.spark.sql.types._
@@ -53,7 +52,7 @@ private[sql] class Serializer2SerializationStream(
   private val writeRowFunc = SparkSqlSerializer2.createSerializationFunction(rowSchema, rowOut)
 
   override def writeObject[T: ClassTag](t: T): SerializationStream = {
-    val kv = t.asInstanceOf[Product2[Row, Row]]
+    val kv = t.asInstanceOf[Product2[InternalRow, InternalRow]]
     writeKey(kv._1)
     writeValue(kv._2)
 
@@ -66,7 +65,7 @@ private[sql] class Serializer2SerializationStream(
   }
 
   override def writeValue[T: ClassTag](t: T): SerializationStream = {
-    writeRowFunc(t.asInstanceOf[Row])
+    writeRowFunc(t.asInstanceOf[InternalRow])
     this
   }
 
@@ -205,8 +204,9 @@ private[sql] object SparkSqlSerializer2 {
   /**
    * The util function to create the serialization function based on the given schema.
    */
-  def createSerializationFunction(schema: Array[DataType], out: DataOutputStream): Row => Unit = {
-    (row: Row) =>
+  def createSerializationFunction(schema: Array[DataType], out: DataOutputStream)
+    : InternalRow => Unit = {
+    (row: InternalRow) =>
       // If the schema is null, the returned function does nothing when it get called.
       if (schema != null) {
         var i = 0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 2b400926177fe..7f452daef33c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -206,7 +206,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         val mutableRow = new SpecificMutableRow(dataTypes)
         iterator.map { dataRow =>
           var i = 0
-          while (i < mutableRow.length) {
+          while (i < mutableRow.numFields) {
             mergers(i)(mutableRow, dataRow, i)
             i += 1
           }
@@ -315,7 +315,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
     if (relation.relation.needConversion) {
       execution.RDDConversions.rowToRowRdd(rdd, output.map(_.dataType))
     } else {
-      rdd.map(_.asInstanceOf[InternalRow])
+      rdd.asInstanceOf[RDD[InternalRow]]
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala
index cd2aa7f7433c5..d551f386eee6e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala
@@ -174,14 +174,19 @@ private[sql] case class InsertIntoHadoopFsRelation(
       try {
         writerContainer.executorSideSetup(taskContext)
 
-        val converter: InternalRow => Row = if (needsConversion) {
-          CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row]
+        if (needsConversion) {
+          val converter = CatalystTypeConverters.createToScalaConverter(dataSchema)
+            .asInstanceOf[InternalRow => Row]
+          while (iterator.hasNext) {
+            val internalRow = iterator.next()
+            writerContainer.outputWriterForRow(internalRow).write(converter(internalRow))
+          }
         } else {
-          r: InternalRow => r.asInstanceOf[Row]
-        }
-        while (iterator.hasNext) {
-          val internalRow = iterator.next()
-          writerContainer.outputWriterForRow(internalRow).write(converter(internalRow))
+          while (iterator.hasNext) {
+            val internalRow = iterator.next()
+            writerContainer.outputWriterForRow(internalRow)
+              .asInstanceOf[OutputWriterInternal].writeInternal(internalRow)
+          }
         }
 
         writerContainer.commitTask()
@@ -248,17 +253,23 @@ private[sql] case class InsertIntoHadoopFsRelation(
         val partitionProj = newProjection(codegenEnabled, partitionCasts, output)
         val dataProj = newProjection(codegenEnabled, dataOutput, output)
 
-        val dataConverter: InternalRow => Row = if (needsConversion) {
-          CatalystTypeConverters.createToScalaConverter(dataSchema).asInstanceOf[InternalRow => Row]
+        if (needsConversion) {
+          val converter = CatalystTypeConverters.createToScalaConverter(dataSchema)
+            .asInstanceOf[InternalRow => Row]
+          while (iterator.hasNext) {
+            val internalRow = iterator.next()
+            val partitionPart = partitionProj(internalRow)
+            val dataPart = converter(dataProj(internalRow))
+            writerContainer.outputWriterForRow(partitionPart).write(dataPart)
+          }
         } else {
-          r: InternalRow => r.asInstanceOf[Row]
-        }
-
-        while (iterator.hasNext) {
-          val internalRow = iterator.next()
-          val partitionPart = partitionProj(internalRow)
-          val dataPart = dataConverter(dataProj(internalRow))
-          writerContainer.outputWriterForRow(partitionPart).write(dataPart)
+          while (iterator.hasNext) {
+            val internalRow = iterator.next()
+            val partitionPart = partitionProj(internalRow)
+            val dataPart = dataProj(internalRow)
+            writerContainer.outputWriterForRow(partitionPart)
+              .asInstanceOf[OutputWriterInternal].writeInternal(dataPart)
+          }
         }
 
         writerContainer.commitTask()
@@ -530,8 +541,12 @@ private[sql] class DynamicPartitionWriterContainer(
       while (i < partitionColumns.length) {
         val col = partitionColumns(i)
         val partitionValueString = {
-          val string = row.getString(i)
-          if (string.eq(null)) defaultPartitionName else PartitioningUtils.escapePathName(string)
+          val string = row.getUTF8String(i)
+          if (string.eq(null)) {
+            defaultPartitionName
+          } else {
+            PartitioningUtils.escapePathName(string.toString)
+          }
         }
 
         if (i > 0) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
index c8033d3c0470a..1f2797ec5527a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
@@ -23,11 +23,11 @@ import scala.util.matching.Regex
 import org.apache.hadoop.fs.Path
 import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext, SaveMode}
+import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SQLContext, SaveMode}
+import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.catalyst.{AbstractSparkSQLParser, InternalRow}
 import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
@@ -415,12 +415,12 @@ private[sql] case class CreateTempTableUsing(
     provider: String,
     options: Map[String, String]) extends RunnableCommand {
 
-  def run(sqlContext: SQLContext): Seq[InternalRow] = {
+  def run(sqlContext: SQLContext): Seq[Row] = {
     val resolved = ResolvedDataSource(
       sqlContext, userSpecifiedSchema, Array.empty[String], provider, options)
     sqlContext.registerDataFrameAsTable(
       DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName)
-    Seq.empty
+    Seq.empty[Row]
   }
 }
 
@@ -432,20 +432,20 @@ private[sql] case class CreateTempTableUsingAsSelect(
     options: Map[String, String],
     query: LogicalPlan) extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
+  override def run(sqlContext: SQLContext): Seq[Row] = {
     val df = DataFrame(sqlContext, query)
     val resolved = ResolvedDataSource(sqlContext, provider, partitionColumns, mode, options, df)
     sqlContext.registerDataFrameAsTable(
       DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName)
 
-    Seq.empty
+    Seq.empty[Row]
   }
 }
 
 private[sql] case class RefreshTable(databaseName: String, tableName: String)
   extends RunnableCommand {
 
-  override def run(sqlContext: SQLContext): Seq[InternalRow] = {
+  override def run(sqlContext: SQLContext): Seq[Row] = {
     // Refresh the given table's metadata first.
     sqlContext.catalog.refreshTable(databaseName, tableName)
 
@@ -464,7 +464,7 @@ private[sql] case class RefreshTable(databaseName: String, tableName: String)
       sqlContext.cacheManager.cacheQuery(df, Some(tableName))
     }
 
-    Seq.empty[InternalRow]
+    Seq.empty[Row]
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index e6e27a87c7151..40bf03a3f1a62 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -126,9 +126,9 @@ object EvaluatePython {
     case (null, _) => null
 
     case (row: InternalRow, struct: StructType) =>
-      val values = new Array[Any](row.size)
+      val values = new Array[Any](row.numFields)
       var i = 0
-      while (i < row.size) {
+      while (i < row.numFields) {
         values(i) = toJava(row(i), struct.fields(i).dataType)
         i += 1
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
index 6c49a906c848a..46f0fac861282 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
@@ -148,7 +148,7 @@ class InputAggregationBuffer private[sql] (
     toCatalystConverters: Array[Any => Any],
     toScalaConverters: Array[Any => Any],
     bufferOffset: Int,
-    var underlyingInputBuffer: Row)
+    var underlyingInputBuffer: InternalRow)
   extends AggregationBuffer(toCatalystConverters, toScalaConverters, bufferOffset) {
 
   override def get(i: Int): Any = {
@@ -156,6 +156,7 @@ class InputAggregationBuffer private[sql] (
       throw new IllegalArgumentException(
         s"Could not access ${i}th value in this buffer because it only has $length values.")
     }
+    // TODO: Use buffer schema to avoid using generic getter.
     toScalaConverters(i)(underlyingInputBuffer(offsets(i)))
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
index 4d3aac464c538..41d0ecb4bbfbf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala
@@ -128,6 +128,7 @@ private[sql] case class JDBCRelation(
 
   override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
     val driver: String = DriverRegistry.getDriverClassName(url)
+    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
     JDBCRDD.scanTable(
       sqlContext.sparkContext,
       schema,
@@ -137,7 +138,7 @@ private[sql] case class JDBCRelation(
       table,
       requiredColumns,
       filters,
-      parts).map(_.asInstanceOf[Row])
+      parts).asInstanceOf[RDD[Row]]
   }
 
   override def insert(data: DataFrame, overwrite: Boolean): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
index 922794ac9aac5..562b058414d07 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala
@@ -154,17 +154,19 @@ private[sql] class JSONRelation(
   }
 
   override def buildScan(): RDD[Row] = {
+    // Rely on type erasure hack to pass RDD[InternalRow] back as RDD[Row]
     JacksonParser(
       baseRDD(),
       schema,
-      sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
+      sqlContext.conf.columnNameOfCorruptRecord).asInstanceOf[RDD[Row]]
   }
 
   override def buildScan(requiredColumns: Seq[Attribute], filters: Seq[Expression]): RDD[Row] = {
+    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
     JacksonParser(
       baseRDD(),
       StructType.fromAttributes(requiredColumns),
-      sqlContext.conf.columnNameOfCorruptRecord).map(_.asInstanceOf[Row])
+      sqlContext.conf.columnNameOfCorruptRecord).asInstanceOf[RDD[Row]]
   }
 
   override def insert(data: DataFrame, overwrite: Boolean): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
index 0c3d8fdab6bd2..b5e4263008f56 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
@@ -28,7 +28,7 @@ import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveCo
 import org.apache.parquet.schema.Type.Repetition
 import org.apache.parquet.schema.{GroupType, PrimitiveType, Type}
 
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
@@ -55,8 +55,8 @@ private[parquet] trait ParentContainerUpdater {
 private[parquet] object NoopUpdater extends ParentContainerUpdater
 
 /**
- * A [[CatalystRowConverter]] is used to convert Parquet "structs" into Spark SQL [[Row]]s.  Since
- * any Parquet record is also a struct, this converter can also be used as root converter.
+ * A [[CatalystRowConverter]] is used to convert Parquet "structs" into Spark SQL [[InternalRow]]s.
+ * Since any Parquet record is also a struct, this converter can also be used as root converter.
  *
  * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have
  * any "parent" container.
@@ -108,7 +108,7 @@ private[parquet] class CatalystRowConverter(
 
   override def start(): Unit = {
     var i = 0
-    while (i < currentRow.length) {
+    while (i < currentRow.numFields) {
       currentRow.setNullAt(i)
       i += 1
     }
@@ -178,7 +178,7 @@ private[parquet] class CatalystRowConverter(
 
       case t: StructType =>
         new CatalystRowConverter(parquetType.asGroupType(), t, new ParentContainerUpdater {
-          override def set(value: Any): Unit = updater.set(value.asInstanceOf[Row].copy())
+          override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy())
         })
 
       case t: UserDefinedType[_] =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 28cba5e54d69e..8cab27d6e1c46 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -178,7 +178,7 @@ private[sql] case class ParquetTableScan(
               val row = iter.next()._2.asInstanceOf[InternalRow]
 
               var i = 0
-              while (i < row.size) {
+              while (i < row.numFields) {
                 mutableRow(i) = row(i)
                 i += 1
               }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index d1040bf5562a2..c7c58e69d42ef 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -208,9 +208,9 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
 
   override def write(record: InternalRow): Unit = {
     val attributesSize = attributes.size
-    if (attributesSize > record.size) {
-      throw new IndexOutOfBoundsException(
-        s"Trying to write more fields than contained in row ($attributesSize > ${record.size})")
+    if (attributesSize > record.numFields) {
+      throw new IndexOutOfBoundsException("Trying to write more fields than contained in row " +
+        s"($attributesSize > ${record.numFields})")
     }
 
     var index = 0
@@ -378,9 +378,9 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
 private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
   override def write(record: InternalRow): Unit = {
     val attributesSize = attributes.size
-    if (attributesSize > record.size) {
-      throw new IndexOutOfBoundsException(
-        s"Trying to write more fields than contained in row ($attributesSize > ${record.size})")
+    if (attributesSize > record.numFields) {
+      throw new IndexOutOfBoundsException("Trying to write more fields than contained in row " +
+        s"($attributesSize > ${record.numFields})")
     }
 
     var index = 0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index c384697c0ee62..8ec228c2b25bc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -61,7 +61,7 @@ private[sql] class DefaultSource extends HadoopFsRelationProvider {
 
 // NOTE: This class is instantiated and used on executor side only, no need to be serializable.
 private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext)
-  extends OutputWriter {
+  extends OutputWriterInternal {
 
   private val recordWriter: RecordWriter[Void, InternalRow] = {
     val outputFormat = {
@@ -86,7 +86,7 @@ private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext
     outputFormat.getRecordWriter(context)
   }
 
-  override def write(row: Row): Unit = recordWriter.write(null, row.asInstanceOf[InternalRow])
+  override def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row)
 
   override def close(): Unit = recordWriter.close(context)
 }
@@ -324,7 +324,7 @@ private[sql] class ParquetRelation2(
             new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
           }
         }
-      }.values.map(_.asInstanceOf[Row])
+      }.values.asInstanceOf[RDD[Row]]  // type erasure hack to pass RDD[InternalRow] as RDD[Row]
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 7cd005b959488..119bac786d478 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -344,6 +344,18 @@ abstract class OutputWriter {
   def close(): Unit
 }
 
+/**
+ * This is an internal, private version of [[OutputWriter]] with an writeInternal method that
+ * accepts an [[InternalRow]] rather than an [[Row]]. Data sources that return this must have
+ * the conversion flag set to false.
+ */
+private[sql] abstract class OutputWriterInternal extends OutputWriter {
+
+  override def write(row: Row): Unit = throw new UnsupportedOperationException
+
+  def writeInternal(row: InternalRow): Unit
+}
+
 /**
  * ::Experimental::
  * A [[BaseRelation]] that provides much of the common code required for formats that store their
@@ -592,12 +604,12 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
       BoundReference(dataSchema.fieldIndex(col), field.dataType, field.nullable)
     }.toSeq
 
-    val rdd = buildScan(inputFiles)
-    val converted =
+    val rdd: RDD[Row] = buildScan(inputFiles)
+    val converted: RDD[InternalRow] =
       if (needConversion) {
         RDDConversions.rowToRowRdd(rdd, dataSchema.fields.map(_.dataType))
       } else {
-        rdd.map(_.asInstanceOf[InternalRow])
+        rdd.asInstanceOf[RDD[InternalRow]]
       }
     converted.mapPartitions { rows =>
       val buildProjection = if (codegenEnabled) {
@@ -606,8 +618,8 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
         () => new InterpretedMutableProjection(requiredOutput, dataSchema.toAttributes)
       }
       val mutableProjection = buildProjection()
-      rows.map(r => mutableProjection(r).asInstanceOf[Row])
-    }
+      rows.map(r => mutableProjection(r))
+    }.asInstanceOf[RDD[Row]]
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
index 7cc6ffd7548d0..0e5c5abff85f6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
@@ -35,14 +35,14 @@ class RowSuite extends SparkFunSuite {
     expected.update(2, false)
     expected.update(3, null)
     val actual1 = Row(2147483647, "this is a string", false, null)
-    assert(expected.size === actual1.size)
+    assert(expected.numFields === actual1.size)
     assert(expected.getInt(0) === actual1.getInt(0))
     assert(expected.getString(1) === actual1.getString(1))
     assert(expected.getBoolean(2) === actual1.getBoolean(2))
     assert(expected(3) === actual1(3))
 
     val actual2 = Row.fromSeq(Seq(2147483647, "this is a string", false, null))
-    assert(expected.size === actual2.size)
+    assert(expected.numFields === actual2.size)
     assert(expected.getInt(0) === actual2.getInt(0))
     assert(expected.getString(1) === actual2.getString(1))
     assert(expected.getBoolean(2) === actual2.getBoolean(2))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
index da53ec16b5c41..84855ce45e918 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLTestSuite.scala
@@ -61,9 +61,10 @@ case class SimpleDDLScan(from: Int, to: Int, table: String)(@transient val sqlCo
   override def needConversion: Boolean = false
 
   override def buildScan(): RDD[Row] = {
+    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
     sqlContext.sparkContext.parallelize(from to to).map { e =>
-      InternalRow(UTF8String.fromString(s"people$e"), e * 2): Row
-    }
+      InternalRow(UTF8String.fromString(s"people$e"), e * 2)
+    }.asInstanceOf[RDD[Row]]
   }
 }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala
index 257526feab945..0d5183444af78 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala
@@ -131,7 +131,7 @@ class PrunedScanSuite extends DataSourceTest {
             queryExecution)
       }
 
-      if (rawOutput.size != expectedColumns.size) {
+      if (rawOutput.numFields != expectedColumns.size) {
         fail(s"Wrong output row. Got $rawOutput\n$queryExecution")
       }
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
index 143aadc08b1c4..5e189c3563ca8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
@@ -93,7 +93,7 @@ case class AllDataTypesScan(
         InternalRow(i, UTF8String.fromString(i.toString)),
         InternalRow(Seq(UTF8String.fromString(s"str_$i"), UTF8String.fromString(s"str_${i + 1}")),
           InternalRow(Seq(DateTimeUtils.fromJavaDate(new Date(1970, 1, i + 1))))))
-    }
+    }.asInstanceOf[RDD[Row]]
   }
 }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index 8202e553afbfe..34b629403e128 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -122,7 +122,7 @@ case class InsertIntoHiveTable(
    *
    * Note: this is run once and then kept to avoid double insertions.
    */
-  protected[sql] lazy val sideEffectResult: Seq[InternalRow] = {
+  protected[sql] lazy val sideEffectResult: Seq[Row] = {
     // Have to pass the TableDesc object to RDD.mapPartitions and then instantiate new serializer
     // instances within the closure, since Serializer is not serializable while TableDesc is.
     val tableDesc = table.tableDesc
@@ -252,13 +252,12 @@ case class InsertIntoHiveTable(
     // however for now we return an empty list to simplify compatibility checks with hive, which
     // does not return anything for insert operations.
     // TODO: implement hive compatibility as rules.
-    Seq.empty[InternalRow]
+    Seq.empty[Row]
   }
 
-  override def executeCollect(): Array[Row] =
-    sideEffectResult.toArray
+  override def executeCollect(): Array[Row] = sideEffectResult.toArray
 
   protected override def doExecute(): RDD[InternalRow] = {
-    sqlContext.sparkContext.parallelize(sideEffectResult, 1)
+    sqlContext.sparkContext.parallelize(sideEffectResult.asInstanceOf[Seq[InternalRow]], 1)
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
index ecc78a5f8d321..8850e060d2a73 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala
@@ -34,6 +34,7 @@ import org.apache.hadoop.hive.common.FileUtils
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.sql.Row
 import org.apache.spark.{Logging, SerializableWritable, SparkHadoopWriter}
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.types._
@@ -94,7 +95,9 @@ private[hive] class SparkHiveWriterContainer(
     "part-" + numberFormat.format(splitID) + extension
   }
 
-  def getLocalFileWriter(row: Row, schema: StructType): FileSinkOperator.RecordWriter = writer
+  def getLocalFileWriter(row: InternalRow, schema: StructType): FileSinkOperator.RecordWriter = {
+    writer
+  }
 
   def close() {
     // Seems the boolean value passed into close does not matter.
@@ -197,7 +200,8 @@ private[spark] class SparkHiveDynamicPartitionWriterContainer(
     jobConf.setBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, oldMarker)
   }
 
-  override def getLocalFileWriter(row: Row, schema: StructType): FileSinkOperator.RecordWriter = {
+  override def getLocalFileWriter(row: InternalRow, schema: StructType)
+    : FileSinkOperator.RecordWriter = {
     def convertToHiveRawString(col: String, value: Any): String = {
       val raw = String.valueOf(value)
       schema(col).dataType match {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index de63ee56dd8e6..10623dc820316 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -66,7 +66,7 @@ private[orc] class OrcOutputWriter(
     path: String,
     dataSchema: StructType,
     context: TaskAttemptContext)
-  extends OutputWriter with SparkHadoopMapRedUtil with HiveInspectors {
+  extends OutputWriterInternal with SparkHadoopMapRedUtil with HiveInspectors {
 
   private val serializer = {
     val table = new Properties()
@@ -119,9 +119,9 @@ private[orc] class OrcOutputWriter(
     ).asInstanceOf[RecordWriter[NullWritable, Writable]]
   }
 
-  override def write(row: Row): Unit = {
+  override def writeInternal(row: InternalRow): Unit = {
     var i = 0
-    while (i < row.length) {
+    while (i < row.numFields) {
       reusableOutputBuffer(i) = wrappers(i)(row(i))
       i += 1
     }
@@ -192,7 +192,7 @@ private[sql] class OrcRelation(
       filters: Array[Filter],
       inputPaths: Array[FileStatus]): RDD[Row] = {
     val output = StructType(requiredColumns.map(dataSchema(_))).toAttributes
-    OrcTableScan(output, this, filters, inputPaths).execute().map(_.asInstanceOf[Row])
+    OrcTableScan(output, this, filters, inputPaths).execute().asInstanceOf[RDD[Row]]
   }
 
   override def prepareJobForWrite(job: Job): OutputWriterFactory = {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala
new file mode 100644
index 0000000000000..e976125b3706d
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/CommitFailureTestRelationSuite.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.sources
+
+import org.apache.hadoop.fs.Path
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.test.SQLTestUtils
+
+
+class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils {
+  override val sqlContext = TestHive
+
+  // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose.
+  val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName
+
+  test("SPARK-7684: commitTask() failure should fallback to abortTask()") {
+    withTempPath { file =>
+      // Here we coalesce partition number to 1 to ensure that only a single task is issued.  This
+      // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary`
+      // directory while committing/aborting the job.  See SPARK-8513 for more details.
+      val df = sqlContext.range(0, 10).coalesce(1)
+      intercept[SparkException] {
+        df.write.format(dataSourceName).save(file.getCanonicalPath)
+      }
+
+      val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf)
+      assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary")))
+    }
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
new file mode 100644
index 0000000000000..d280543a071d9
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.sources
+
+import java.io.File
+
+import com.google.common.io.Files
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.sql.{AnalysisException, SaveMode, parquet}
+import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
+
+
+class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
+  override val dataSourceName: String = classOf[parquet.DefaultSource].getCanonicalName
+
+  import sqlContext._
+  import sqlContext.implicits._
+
+  test("save()/load() - partitioned table - simple queries - partition columns in data") {
+    withTempDir { file =>
+      val basePath = new Path(file.getCanonicalPath)
+      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
+      val qualifiedBasePath = fs.makeQualified(basePath)
+
+      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
+        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
+        sparkContext
+          .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1))
+          .toDF("a", "b", "p1")
+          .write.parquet(partitionDir.toString)
+      }
+
+      val dataSchemaWithPartition =
+        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))
+
+      checkQueries(
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchemaWithPartition.json)
+          .load(file.getCanonicalPath))
+    }
+  }
+
+  test("SPARK-7868: _temporary directories should be ignored") {
+    withTempPath { dir =>
+      val df = Seq("a", "b", "c").zipWithIndex.toDF()
+
+      df.write
+        .format("parquet")
+        .save(dir.getCanonicalPath)
+
+      df.write
+        .format("parquet")
+        .save(s"${dir.getCanonicalPath}/_temporary")
+
+      checkAnswer(read.format("parquet").load(dir.getCanonicalPath), df.collect())
+    }
+  }
+
+  test("SPARK-8014: Avoid scanning output directory when SaveMode isn't SaveMode.Append") {
+    withTempDir { dir =>
+      val path = dir.getCanonicalPath
+      val df = Seq(1 -> "a").toDF()
+
+      // Creates an arbitrary file.  If this directory gets scanned, ParquetRelation2 will throw
+      // since it's not a valid Parquet file.
+      val emptyFile = new File(path, "empty")
+      Files.createParentDirs(emptyFile)
+      Files.touch(emptyFile)
+
+      // This shouldn't throw anything.
+      df.write.format("parquet").mode(SaveMode.Ignore).save(path)
+
+      // This should only complain that the destination directory already exists, rather than file
+      // "empty" is not a Parquet file.
+      assert {
+        intercept[AnalysisException] {
+          df.write.format("parquet").mode(SaveMode.ErrorIfExists).save(path)
+        }.getMessage.contains("already exists")
+      }
+
+      // This shouldn't throw anything.
+      df.write.format("parquet").mode(SaveMode.Overwrite).save(path)
+      checkAnswer(read.format("parquet").load(path), df)
+    }
+  }
+
+  test("SPARK-8079: Avoid NPE thrown from BaseWriterContainer.abortJob") {
+    withTempPath { dir =>
+      intercept[AnalysisException] {
+        // Parquet doesn't allow field names with spaces.  Here we are intentionally making an
+        // exception thrown from the `ParquetRelation2.prepareForWriteJob()` method to trigger
+        // the bug.  Please refer to spark-8079 for more details.
+        range(1, 10)
+          .withColumnRenamed("id", "a b")
+          .write
+          .format("parquet")
+          .save(dir.getCanonicalPath)
+      }
+    }
+  }
+
+  test("SPARK-8604: Parquet data source should write summary file while doing appending") {
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+      val df = sqlContext.range(0, 5)
+      df.write.mode(SaveMode.Overwrite).parquet(path)
+
+      val summaryPath = new Path(path, "_metadata")
+      val commonSummaryPath = new Path(path, "_common_metadata")
+
+      val fs = summaryPath.getFileSystem(configuration)
+      fs.delete(summaryPath, true)
+      fs.delete(commonSummaryPath, true)
+
+      df.write.mode(SaveMode.Append).parquet(path)
+      checkAnswer(sqlContext.read.parquet(path), df.unionAll(df))
+
+      assert(fs.exists(summaryPath))
+      assert(fs.exists(commonSummaryPath))
+    }
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala
new file mode 100644
index 0000000000000..d761909d60e21
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.sources
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
+
+/*
+This is commented out due a bug in the data source API (SPARK-9291).
+
+
+class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
+  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName
+
+  import sqlContext._
+
+  test("save()/load() - partitioned table - simple queries - partition columns in data") {
+    withTempDir { file =>
+      val basePath = new Path(file.getCanonicalPath)
+      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
+      val qualifiedBasePath = fs.makeQualified(basePath)
+
+      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
+        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
+        sparkContext
+          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
+          .saveAsTextFile(partitionDir.toString)
+      }
+
+      val dataSchemaWithPartition =
+        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))
+
+      checkQueries(
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchemaWithPartition.json)
+          .load(file.getCanonicalPath))
+    }
+  }
+}
+*/
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index 2a8748d913569..dd274023a1cf5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -17,18 +17,14 @@
 
 package org.apache.spark.sql.sources
 
-import java.io.File
-
 import scala.collection.JavaConversions._
 
-import com.google.common.io.Files
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 import org.apache.parquet.hadoop.ParquetOutputCommitter
 
-import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
 import org.apache.spark.sql.execution.datasources.LogicalRelation
@@ -581,165 +577,3 @@ class AlwaysFailParquetOutputCommitter(
     sys.error("Intentional job commitment failure for testing purpose.")
   }
 }
-
-class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
-  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName
-
-  import sqlContext._
-
-  test("save()/load() - partitioned table - simple queries - partition columns in data") {
-    withTempDir { file =>
-      val basePath = new Path(file.getCanonicalPath)
-      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
-      val qualifiedBasePath = fs.makeQualified(basePath)
-
-      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
-        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
-        sparkContext
-          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
-          .saveAsTextFile(partitionDir.toString)
-      }
-
-      val dataSchemaWithPartition =
-        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))
-
-      checkQueries(
-        read.format(dataSourceName)
-          .option("dataSchema", dataSchemaWithPartition.json)
-          .load(file.getCanonicalPath))
-    }
-  }
-}
-
-class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils {
-  override val sqlContext = TestHive
-
-  // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose.
-  val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName
-
-  test("SPARK-7684: commitTask() failure should fallback to abortTask()") {
-    withTempPath { file =>
-      // Here we coalesce partition number to 1 to ensure that only a single task is issued.  This
-      // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary`
-      // directory while committing/aborting the job.  See SPARK-8513 for more details.
-      val df = sqlContext.range(0, 10).coalesce(1)
-      intercept[SparkException] {
-        df.write.format(dataSourceName).save(file.getCanonicalPath)
-      }
-
-      val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf)
-      assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary")))
-    }
-  }
-}
-
-class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
-  override val dataSourceName: String = classOf[parquet.DefaultSource].getCanonicalName
-
-  import sqlContext._
-  import sqlContext.implicits._
-
-  test("save()/load() - partitioned table - simple queries - partition columns in data") {
-    withTempDir { file =>
-      val basePath = new Path(file.getCanonicalPath)
-      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
-      val qualifiedBasePath = fs.makeQualified(basePath)
-
-      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
-        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
-        sparkContext
-          .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1))
-          .toDF("a", "b", "p1")
-          .write.parquet(partitionDir.toString)
-      }
-
-      val dataSchemaWithPartition =
-        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))
-
-      checkQueries(
-        read.format(dataSourceName)
-          .option("dataSchema", dataSchemaWithPartition.json)
-          .load(file.getCanonicalPath))
-    }
-  }
-
-  test("SPARK-7868: _temporary directories should be ignored") {
-    withTempPath { dir =>
-      val df = Seq("a", "b", "c").zipWithIndex.toDF()
-
-      df.write
-        .format("parquet")
-        .save(dir.getCanonicalPath)
-
-      df.write
-        .format("parquet")
-        .save(s"${dir.getCanonicalPath}/_temporary")
-
-      checkAnswer(read.format("parquet").load(dir.getCanonicalPath), df.collect())
-    }
-  }
-
-  test("SPARK-8014: Avoid scanning output directory when SaveMode isn't SaveMode.Append") {
-    withTempDir { dir =>
-      val path = dir.getCanonicalPath
-      val df = Seq(1 -> "a").toDF()
-
-      // Creates an arbitrary file.  If this directory gets scanned, ParquetRelation2 will throw
-      // since it's not a valid Parquet file.
-      val emptyFile = new File(path, "empty")
-      Files.createParentDirs(emptyFile)
-      Files.touch(emptyFile)
-
-      // This shouldn't throw anything.
-      df.write.format("parquet").mode(SaveMode.Ignore).save(path)
-
-      // This should only complain that the destination directory already exists, rather than file
-      // "empty" is not a Parquet file.
-      assert {
-        intercept[AnalysisException] {
-          df.write.format("parquet").mode(SaveMode.ErrorIfExists).save(path)
-        }.getMessage.contains("already exists")
-      }
-
-      // This shouldn't throw anything.
-      df.write.format("parquet").mode(SaveMode.Overwrite).save(path)
-      checkAnswer(read.format("parquet").load(path), df)
-    }
-  }
-
-  test("SPARK-8079: Avoid NPE thrown from BaseWriterContainer.abortJob") {
-    withTempPath { dir =>
-      intercept[AnalysisException] {
-        // Parquet doesn't allow field names with spaces.  Here we are intentionally making an
-        // exception thrown from the `ParquetRelation2.prepareForWriteJob()` method to trigger
-        // the bug.  Please refer to spark-8079 for more details.
-        range(1, 10)
-          .withColumnRenamed("id", "a b")
-          .write
-          .format("parquet")
-          .save(dir.getCanonicalPath)
-      }
-    }
-  }
-
-  test("SPARK-8604: Parquet data source should write summary file while doing appending") {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-      val df = sqlContext.range(0, 5)
-      df.write.mode(SaveMode.Overwrite).parquet(path)
-
-      val summaryPath = new Path(path, "_metadata")
-      val commonSummaryPath = new Path(path, "_common_metadata")
-
-      val fs = summaryPath.getFileSystem(configuration)
-      fs.delete(summaryPath, true)
-      fs.delete(commonSummaryPath, true)
-
-      df.write.mode(SaveMode.Append).parquet(path)
-      checkAnswer(sqlContext.read.parquet(path), df.unionAll(df))
-
-      assert(fs.exists(summaryPath))
-      assert(fs.exists(commonSummaryPath))
-    }
-  }
-}

From c8d71a4183dfc83ff257047857af0b6d66c6b90d Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 24 Jul 2015 09:38:13 -0700
Subject: [PATCH 359/549] [SPARK-9305] Rename org.apache.spark.Row to Item.

It's a thing used in test cases, but named Row. Pretty annoying because everytime I search for Row, it shows up before the Spark SQL Row, which is what a developer wants most of the time.

Author: Reynold Xin <rxin@databricks.com>

Closes #7638 from rxin/remove-row and squashes the following commits:

aeda52d [Reynold Xin] [SPARK-9305] Rename org.apache.spark.Row to Item.
---
 .../scala/org/apache/spark/PartitioningSuite.scala     | 10 +++++-----
 .../org/apache/spark/sql/RandomDataGenerator.scala     |  3 +--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
index 3316f561a4949..aa8028792cb41 100644
--- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
+++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
@@ -91,13 +91,13 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva
 
   test("RangePartitioner for keys that are not Comparable (but with Ordering)") {
     // Row does not extend Comparable, but has an implicit Ordering defined.
-    implicit object RowOrdering extends Ordering[Row] {
-      override def compare(x: Row, y: Row): Int = x.value - y.value
+    implicit object RowOrdering extends Ordering[Item] {
+      override def compare(x: Item, y: Item): Int = x.value - y.value
     }
 
-    val rdd = sc.parallelize(1 to 4500).map(x => (Row(x), Row(x)))
+    val rdd = sc.parallelize(1 to 4500).map(x => (Item(x), Item(x)))
     val partitioner = new RangePartitioner(1500, rdd)
-    partitioner.getPartition(Row(100))
+    partitioner.getPartition(Item(100))
   }
 
   test("RangPartitioner.sketch") {
@@ -252,4 +252,4 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva
 }
 
 
-private sealed case class Row(value: Int)
+private sealed case class Item(value: Int)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
index b9f2ad7ec0481..75ae29d690770 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
@@ -69,8 +69,7 @@ object RandomDataGenerator {
    * Returns a function which generates random values for the given [[DataType]], or `None` if no
    * random data generator is defined for that data type. The generated values will use an external
    * representation of the data type; for example, the random generator for [[DateType]] will return
-   * instances of [[java.sql.Date]] and the generator for [[StructType]] will return a
-   * [[org.apache.spark.Row]].
+   * instances of [[java.sql.Date]] and the generator for [[StructType]] will return a [[Row]].
    *
    * @param dataType the type to generate values for
    * @param nullable whether null values should be generated

From c2b50d693e469558e3b3c9cbb9d76089d259b587 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 24 Jul 2015 09:49:50 -0700
Subject: [PATCH 360/549] [SPARK-9292] Analysis should check that join
 conditions' data types are BooleanType

This patch adds an analysis check to ensure that join conditions' data types are BooleanType. This check is necessary in order to report proper errors for non-boolean DataFrame join conditions.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7630 from JoshRosen/SPARK-9292 and squashes the following commits:

aec6c7b [Josh Rosen] Check condition type in resolved()
75a3ea6 [Josh Rosen] Fix SPARK-9292.
---
 .../apache/spark/sql/catalyst/analysis/CheckAnalysis.scala   | 5 +++++
 .../spark/sql/catalyst/plans/logical/basicOperators.scala    | 5 ++++-
 .../spark/sql/catalyst/analysis/AnalysisErrorSuite.scala     | 5 +++++
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index c203fcecf20fb..c23ab3c74338d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -83,6 +83,11 @@ trait CheckAnalysis {
               s"filter expression '${f.condition.prettyString}' " +
                 s"of type ${f.condition.dataType.simpleString} is not a boolean.")
 
+          case j @ Join(_, _, _, Some(condition)) if condition.dataType != BooleanType =>
+            failAnalysis(
+              s"join condition '${condition.prettyString}' " +
+                s"of type ${condition.dataType.simpleString} is not a boolean.")
+
           case Aggregate(groupingExprs, aggregateExprs, child) =>
             def checkValidAggregateExpression(expr: Expression): Unit = expr match {
               case _: AggregateExpression => // OK
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index 6aefa9f67556a..57a12820fa4c6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -128,7 +128,10 @@ case class Join(
 
   // Joins are only resolved if they don't introduce ambiguous expression ids.
   override lazy val resolved: Boolean = {
-    childrenResolved && expressions.forall(_.resolved) && selfJoinResolved
+    childrenResolved &&
+      expressions.forall(_.resolved) &&
+      selfJoinResolved &&
+      condition.forall(_.dataType == BooleanType)
   }
 }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
index dca8c881f21ab..7bf678ebf71ce 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -118,6 +118,11 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter {
     testRelation.where(Literal(1)),
     "filter" :: "'1'" :: "not a boolean" :: Literal(1).dataType.simpleString :: Nil)
 
+  errorTest(
+    "non-boolean join conditions",
+    testRelation.join(testRelation, condition = Some(Literal(1))),
+    "condition" :: "'1'" :: "not a boolean" :: Literal(1).dataType.simpleString :: Nil)
+
   errorTest(
     "missing group by",
     testRelation2.groupBy('a)('b),

From e25312451322969ad716dddf8248b8c17f68323b Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Fri, 24 Jul 2015 10:56:48 -0700
Subject: [PATCH 361/549] [SPARK-9222] [MLlib] Make class instantiation
 variables in DistributedLDAModel private[clustering]

This makes it easier to test all the class variables of the DistributedLDAmodel.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7573 from MechCoder/lda_test and squashes the following commits:

2f1a293 [MechCoder] [SPARK-9222] [MLlib] Make class instantiation variables in DistributedLDAModel private[clustering]
---
 .../apache/spark/mllib/clustering/LDAModel.scala  |  8 ++++----
 .../apache/spark/mllib/clustering/LDASuite.scala  | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 920b57756b625..31c1d520fd659 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -283,12 +283,12 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
  */
 @Experimental
 class DistributedLDAModel private (
-    private val graph: Graph[LDA.TopicCounts, LDA.TokenCount],
-    private val globalTopicTotals: LDA.TopicCounts,
+    private[clustering] val graph: Graph[LDA.TopicCounts, LDA.TokenCount],
+    private[clustering] val globalTopicTotals: LDA.TopicCounts,
     val k: Int,
     val vocabSize: Int,
-    private val docConcentration: Double,
-    private val topicConcentration: Double,
+    private[clustering] val docConcentration: Double,
+    private[clustering] val topicConcentration: Double,
     private[spark] val iterationTimes: Array[Double]) extends LDAModel {
 
   import LDA._
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index da70d9bd7c790..376a87f0511b4 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.mllib.clustering
 import breeze.linalg.{DenseMatrix => BDM}
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.graphx.Edge
 import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
@@ -318,6 +319,20 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(distributedModel.k === sameDistributedModel.k)
       assert(distributedModel.vocabSize === sameDistributedModel.vocabSize)
       assert(distributedModel.iterationTimes === sameDistributedModel.iterationTimes)
+      assert(distributedModel.docConcentration === sameDistributedModel.docConcentration)
+      assert(distributedModel.topicConcentration === sameDistributedModel.topicConcentration)
+      assert(distributedModel.globalTopicTotals === sameDistributedModel.globalTopicTotals)
+
+      val graph = distributedModel.graph
+      val sameGraph = sameDistributedModel.graph
+      assert(graph.vertices.sortByKey().collect() === sameGraph.vertices.sortByKey().collect())
+      val edge = graph.edges.map {
+        case Edge(sid: Long, did: Long, nos: Double) => (sid, did, nos)
+      }.sortBy(x => (x._1, x._2)).collect()
+      val sameEdge = sameGraph.edges.map {
+        case Edge(sid: Long, did: Long, nos: Double) => (sid, did, nos)
+      }.sortBy(x => (x._1, x._2)).collect()
+      assert(edge === sameEdge)
     } finally {
       Utils.deleteRecursively(tempDir1)
       Utils.deleteRecursively(tempDir2)

From 6aceaf3d62ee335570ddc07ccaf07e8c3776f517 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 24 Jul 2015 11:34:23 -0700
Subject: [PATCH 362/549] [SPARK-9295] Analysis should detect sorting on
 unsupported column types

This patch extends CheckAnalysis to throw errors for queries that try to sort on unsupported column types, such as ArrayType.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7633 from JoshRosen/SPARK-9295 and squashes the following commits:

23b2fbf [Josh Rosen] Embed function in foreach
bfe1451 [Josh Rosen] Update to allow sorting by null literals
2f1b802 [Josh Rosen] Add analysis rule to detect sorting on unsupported column types (SPARK-9295)
---
 .../spark/sql/catalyst/analysis/CheckAnalysis.scala    | 10 ++++++++++
 .../sql/catalyst/analysis/AnalysisErrorSuite.scala     |  5 +++++
 2 files changed, 15 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index c23ab3c74338d..81d473c1130f7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -103,6 +103,16 @@ trait CheckAnalysis {
 
             aggregateExprs.foreach(checkValidAggregateExpression)
 
+          case Sort(orders, _, _) =>
+            orders.foreach { order =>
+              order.dataType match {
+                case t: AtomicType => // OK
+                case NullType => // OK
+                case t =>
+                  failAnalysis(s"Sorting is not supported for columns of type ${t.simpleString}")
+              }
+            }
+
           case _ => // Fallbacks to the following checks
         }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
index 7bf678ebf71ce..2588df98246dd 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -113,6 +113,11 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter {
     testRelation.select(Literal(1).cast(BinaryType).as('badCast)),
     "cannot cast" :: Literal(1).dataType.simpleString :: BinaryType.simpleString :: Nil)
 
+  errorTest(
+    "sorting by unsupported column types",
+    listRelation.orderBy('list.asc),
+    "sorting" :: "type" :: "array<int>" :: Nil)
+
   errorTest(
     "non-boolean filters",
     testRelation.where(Literal(1)),

From 8399ba14873854ab2f80a0ccaf6adba499060365 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Fri, 24 Jul 2015 11:53:16 -0700
Subject: [PATCH 363/549] [SPARK-9261] [STREAMING] Avoid calling APIs that
 expose shaded classes.

Doing this may cause weird errors when tests are run on maven, depending
on the flags used. Instead, expose the needed functionality through methods
that do not expose shaded classes.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #7601 from vanzin/SPARK-9261 and squashes the following commits:

4f64a16 [Marcelo Vanzin] [SPARK-9261] [streaming] Avoid calling APIs that expose shaded classes.
---
 .../scala/org/apache/spark/ui/WebUI.scala     | 19 +++++++++++++++++++
 .../spark/streaming/ui/StreamingTab.scala     | 12 +++---------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/ui/WebUI.scala b/core/src/main/scala/org/apache/spark/ui/WebUI.scala
index 2c84e4485996e..61449847add3d 100644
--- a/core/src/main/scala/org/apache/spark/ui/WebUI.scala
+++ b/core/src/main/scala/org/apache/spark/ui/WebUI.scala
@@ -107,6 +107,25 @@ private[spark] abstract class WebUI(
     }
   }
 
+  /**
+   * Add a handler for static content.
+   *
+   * @param resourceBase Root of where to find resources to serve.
+   * @param path Path in UI where to mount the resources.
+   */
+  def addStaticHandler(resourceBase: String, path: String): Unit = {
+    attachHandler(JettyUtils.createStaticHandler(resourceBase, path))
+  }
+
+  /**
+   * Remove a static content handler.
+   *
+   * @param path Path in UI to unmount.
+   */
+  def removeStaticHandler(path: String): Unit = {
+    handlers.find(_.getContextPath() == path).foreach(detachHandler)
+  }
+
   /** Initialize all components of the server. */
   def initialize()
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala
index e0c0f57212f55..bc53f2a31f6d1 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala
@@ -17,11 +17,9 @@
 
 package org.apache.spark.streaming.ui
 
-import org.eclipse.jetty.servlet.ServletContextHandler
-
 import org.apache.spark.{Logging, SparkException}
 import org.apache.spark.streaming.StreamingContext
-import org.apache.spark.ui.{JettyUtils, SparkUI, SparkUITab}
+import org.apache.spark.ui.{SparkUI, SparkUITab}
 
 import StreamingTab._
 
@@ -42,18 +40,14 @@ private[spark] class StreamingTab(val ssc: StreamingContext)
   attachPage(new StreamingPage(this))
   attachPage(new BatchPage(this))
 
-  var staticHandler: ServletContextHandler = null
-
   def attach() {
     getSparkUI(ssc).attachTab(this)
-    staticHandler = JettyUtils.createStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming")
-    getSparkUI(ssc).attachHandler(staticHandler)
+    getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming")
   }
 
   def detach() {
     getSparkUI(ssc).detachTab(this)
-    getSparkUI(ssc).detachHandler(staticHandler)
-    staticHandler = null
+    getSparkUI(ssc).removeStaticHandler("/static/streaming")
   }
 }
 

From 9a11396113d4bb0e76e0520df4fc58e7a8ec9f69 Mon Sep 17 00:00:00 2001
From: Cheolsoo Park <cheolsoop@netflix.com>
Date: Fri, 24 Jul 2015 11:56:55 -0700
Subject: [PATCH 364/549] [SPARK-9270] [PYSPARK] allow --name option in pyspark

This is continuation of #7512 which added `--name` option to spark-shell. This PR adds the same option to pyspark.

Note that `--conf spark.app.name` in command-line has no effect in spark-shell and pyspark. Instead, `--name` must be used. This is in fact inconsistency with spark-sql which doesn't accept `--name` option while it accepts `--conf spark.app.name`. I am not fixing this inconsistency in this PR. IMO, one of `--name` and `--conf spark.app.name` is needed not both. But since I cannot decide which to choose, I am not making any change here.

Author: Cheolsoo Park <cheolsoop@netflix.com>

Closes #7610 from piaozhexiu/SPARK-9270 and squashes the following commits:

763e86d [Cheolsoo Park] Update windows script
400b7f9 [Cheolsoo Park] Allow --name option to pyspark
---
 bin/pyspark             | 2 +-
 bin/pyspark2.cmd        | 2 +-
 python/pyspark/shell.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/pyspark b/bin/pyspark
index f9dbddfa53560..8f2a3b5a7717b 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -82,4 +82,4 @@ fi
 
 export PYSPARK_DRIVER_PYTHON
 export PYSPARK_DRIVER_PYTHON_OPTS
-exec "$SPARK_HOME"/bin/spark-submit pyspark-shell-main "$@"
+exec "$SPARK_HOME"/bin/spark-submit pyspark-shell-main --name "PySparkShell" "$@"
diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd
index 45e9e3def5121..3c6169983e76b 100644
--- a/bin/pyspark2.cmd
+++ b/bin/pyspark2.cmd
@@ -35,4 +35,4 @@ set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.8.2.1-src.zip;%PYTHONPATH%
 set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
 set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py
 
-call %SPARK_HOME%\bin\spark-submit2.cmd pyspark-shell-main %*
+call %SPARK_HOME%\bin\spark-submit2.cmd pyspark-shell-main --name "PySparkShell" %*
diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py
index 144cdf0b0cdd5..99331297c19f0 100644
--- a/python/pyspark/shell.py
+++ b/python/pyspark/shell.py
@@ -40,7 +40,7 @@
 if os.environ.get("SPARK_EXECUTOR_URI"):
     SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])
 
-sc = SparkContext(appName="PySparkShell", pyFiles=add_files)
+sc = SparkContext(pyFiles=add_files)
 atexit.register(lambda: sc.stop())
 
 try:

From 64135cbb3363e3b74dad3c0498cb9959c047d381 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Fri, 24 Jul 2015 12:36:44 -0700
Subject: [PATCH 365/549] [SPARK-9067] [SQL] Close reader in NewHadoopRDD early
 if there is no more data

JIRA: https://issues.apache.org/jira/browse/SPARK-9067

According to the description of the JIRA ticket, calling `reader.close()` only after the task is finished will cause memory and file open limit problem since these resources are occupied even we don't need that anymore.

This PR simply closes the reader early when we know there is no more data to read.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7424 from viirya/close_reader and squashes the following commits:

3ff64e5 [Liang-Chi Hsieh] For comments.
3d20267 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into close_reader
e152182 [Liang-Chi Hsieh] For comments.
5116cbe [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into close_reader
3ceb755 [Liang-Chi Hsieh] For comments.
e34d98e [Liang-Chi Hsieh] For comments.
50ed729 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into close_reader
216912f [Liang-Chi Hsieh] Fix it.
f429016 [Liang-Chi Hsieh] Release reader if we don't need it.
a305621 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into close_reader
67569da [Liang-Chi Hsieh] Close reader early if there is no more data.
---
 .../org/apache/spark/rdd/NewHadoopRDD.scala   | 37 ++++++++++++-------
 .../spark/sql/execution/SqlNewHadoopRDD.scala | 36 +++++++++++-------
 2 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index f827270ee6a44..f83a051f5da11 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -128,7 +128,7 @@ class NewHadoopRDD[K, V](
           configurable.setConf(conf)
         case _ =>
       }
-      val reader = format.createRecordReader(
+      private var reader = format.createRecordReader(
         split.serializableHadoopSplit.value, hadoopAttemptContext)
       reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext)
 
@@ -141,6 +141,12 @@ class NewHadoopRDD[K, V](
       override def hasNext: Boolean = {
         if (!finished && !havePair) {
           finished = !reader.nextKeyValue
+          if (finished) {
+            // Close and release the reader here; close() will also be called when the task
+            // completes, but for tasks that read from many files, it helps to release the
+            // resources early.
+            close()
+          }
           havePair = !finished
         }
         !finished
@@ -159,18 +165,23 @@ class NewHadoopRDD[K, V](
 
       private def close() {
         try {
-          reader.close()
-          if (bytesReadCallback.isDefined) {
-            inputMetrics.updateBytesRead()
-          } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] ||
-                     split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) {
-            // If we can't get the bytes read from the FS stats, fall back to the split size,
-            // which may be inaccurate.
-            try {
-              inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength)
-            } catch {
-              case e: java.io.IOException =>
-                logWarning("Unable to get input size to set InputMetrics for task", e)
+          if (reader != null) {
+            // Close reader and release it
+            reader.close()
+            reader = null
+
+            if (bytesReadCallback.isDefined) {
+              inputMetrics.updateBytesRead()
+            } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] ||
+                       split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) {
+              // If we can't get the bytes read from the FS stats, fall back to the split size,
+              // which may be inaccurate.
+              try {
+                inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength)
+              } catch {
+                case e: java.io.IOException =>
+                  logWarning("Unable to get input size to set InputMetrics for task", e)
+              }
             }
           }
         } catch {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala
index e1c1a6c06268f..3d75b6a91def6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala
@@ -147,7 +147,7 @@ private[sql] class SqlNewHadoopRDD[K, V](
           configurable.setConf(conf)
         case _ =>
       }
-      val reader = format.createRecordReader(
+      private var reader = format.createRecordReader(
         split.serializableHadoopSplit.value, hadoopAttemptContext)
       reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext)
 
@@ -160,6 +160,12 @@ private[sql] class SqlNewHadoopRDD[K, V](
       override def hasNext: Boolean = {
         if (!finished && !havePair) {
           finished = !reader.nextKeyValue
+          if (finished) {
+            // Close and release the reader here; close() will also be called when the task
+            // completes, but for tasks that read from many files, it helps to release the
+            // resources early.
+            close()
+          }
           havePair = !finished
         }
         !finished
@@ -178,18 +184,22 @@ private[sql] class SqlNewHadoopRDD[K, V](
 
       private def close() {
         try {
-          reader.close()
-          if (bytesReadCallback.isDefined) {
-            inputMetrics.updateBytesRead()
-          } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] ||
-                     split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) {
-            // If we can't get the bytes read from the FS stats, fall back to the split size,
-            // which may be inaccurate.
-            try {
-              inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength)
-            } catch {
-              case e: java.io.IOException =>
-                logWarning("Unable to get input size to set InputMetrics for task", e)
+          if (reader != null) {
+            reader.close()
+            reader = null
+
+            if (bytesReadCallback.isDefined) {
+              inputMetrics.updateBytesRead()
+            } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] ||
+                       split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) {
+              // If we can't get the bytes read from the FS stats, fall back to the split size,
+              // which may be inaccurate.
+              try {
+                inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength)
+              } catch {
+                case e: java.io.IOException =>
+                  logWarning("Unable to get input size to set InputMetrics for task", e)
+              }
             }
           }
         } catch {

From a400ab516fa93185aa683a596f9d7c6c1a02f330 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Fri, 24 Jul 2015 14:58:07 -0700
Subject: [PATCH 366/549] [SPARK-7045] [MLLIB] Avoid intermediate
 representation when creating model

Word2Vec used to convert from an Array[Float] representation to a Map[String, Array[Float]] and then back to an Array[Float] through Word2VecModel.

This prevents this conversion while still supporting the older method of supplying a Map.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #5748 from MechCoder/spark-7045 and squashes the following commits:

e308913 [MechCoder] move docs
5703116 [MechCoder] minor
fa04313 [MechCoder] style fixes
b1d61c4 [MechCoder] better errors and tests
3b32c8c [MechCoder] [SPARK-7045] Avoid intermediate representation when creating model
---
 .../apache/spark/mllib/feature/Word2Vec.scala | 85 +++++++++++--------
 .../spark/mllib/feature/Word2VecSuite.scala   |  6 ++
 2 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index f087d06d2a46a..cbbd2b0c8d060 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -403,17 +403,8 @@ class Word2Vec extends Serializable with Logging {
     }
     newSentences.unpersist()
 
-    val word2VecMap = mutable.HashMap.empty[String, Array[Float]]
-    var i = 0
-    while (i < vocabSize) {
-      val word = bcVocab.value(i).word
-      val vector = new Array[Float](vectorSize)
-      Array.copy(syn0Global, i * vectorSize, vector, 0, vectorSize)
-      word2VecMap += word -> vector
-      i += 1
-    }
-
-    new Word2VecModel(word2VecMap.toMap)
+    val wordArray = vocab.map(_.word)
+    new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global)
   }
 
   /**
@@ -429,38 +420,42 @@ class Word2Vec extends Serializable with Logging {
 /**
  * :: Experimental ::
  * Word2Vec model
+ * @param wordIndex maps each word to an index, which can retrieve the corresponding
+ *                  vector from wordVectors
+ * @param wordVectors array of length numWords * vectorSize, vector corresponding
+ *                    to the word mapped with index i can be retrieved by the slice
+ *                    (i * vectorSize, i * vectorSize + vectorSize)
  */
 @Experimental
-class Word2VecModel private[spark] (
-    model: Map[String, Array[Float]]) extends Serializable with Saveable {
-
-  // wordList: Ordered list of words obtained from model.
-  private val wordList: Array[String] = model.keys.toArray
-
-  // wordIndex: Maps each word to an index, which can retrieve the corresponding
-  //            vector from wordVectors (see below).
-  private val wordIndex: Map[String, Int] = wordList.zip(0 until model.size).toMap
+class Word2VecModel private[mllib] (
+    private val wordIndex: Map[String, Int],
+    private val wordVectors: Array[Float]) extends Serializable with Saveable {
 
-  // vectorSize: Dimension of each word's vector.
-  private val vectorSize = model.head._2.size
   private val numWords = wordIndex.size
+  // vectorSize: Dimension of each word's vector.
+  private val vectorSize = wordVectors.length / numWords
+
+  // wordList: Ordered list of words obtained from wordIndex.
+  private val wordList: Array[String] = {
+    val (wl, _) = wordIndex.toSeq.sortBy(_._2).unzip
+    wl.toArray
+  }
 
-  // wordVectors: Array of length numWords * vectorSize, vector corresponding to the word
-  //              mapped with index i can be retrieved by the slice
-  //              (ind * vectorSize, ind * vectorSize + vectorSize)
   // wordVecNorms: Array of length numWords, each value being the Euclidean norm
   //               of the wordVector.
-  private val (wordVectors: Array[Float], wordVecNorms: Array[Double]) = {
-    val wordVectors = new Array[Float](vectorSize * numWords)
+  private val wordVecNorms: Array[Double] = {
     val wordVecNorms = new Array[Double](numWords)
     var i = 0
     while (i < numWords) {
-      val vec = model.get(wordList(i)).get
-      Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize)
+      val vec = wordVectors.slice(i * vectorSize, i * vectorSize + vectorSize)
       wordVecNorms(i) = blas.snrm2(vectorSize, vec, 1)
       i += 1
     }
-    (wordVectors, wordVecNorms)
+    wordVecNorms
+  }
+
+  def this(model: Map[String, Array[Float]]) = {
+    this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model))
   }
 
   private def cosineSimilarity(v1: Array[Float], v2: Array[Float]): Double = {
@@ -484,8 +479,9 @@ class Word2VecModel private[spark] (
    * @return vector representation of word
    */
   def transform(word: String): Vector = {
-    model.get(word) match {
-      case Some(vec) =>
+    wordIndex.get(word) match {
+      case Some(ind) =>
+        val vec = wordVectors.slice(ind * vectorSize, ind * vectorSize + vectorSize)
         Vectors.dense(vec.map(_.toDouble))
       case None =>
         throw new IllegalStateException(s"$word not in vocabulary")
@@ -511,7 +507,7 @@ class Word2VecModel private[spark] (
    */
   def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = {
     require(num > 0, "Number of similar words should > 0")
-
+    // TODO: optimize top-k
     val fVector = vector.toArray.map(_.toFloat)
     val cosineVec = Array.fill[Float](numWords)(0)
     val alpha: Float = 1
@@ -521,13 +517,13 @@ class Word2VecModel private[spark] (
       "T", vectorSize, numWords, alpha, wordVectors, vectorSize, fVector, 1, beta, cosineVec, 1)
 
     // Need not divide with the norm of the given vector since it is constant.
-    val updatedCosines = new Array[Double](numWords)
+    val cosVec = cosineVec.map(_.toDouble)
     var ind = 0
     while (ind < numWords) {
-      updatedCosines(ind) = cosineVec(ind) / wordVecNorms(ind)
+      cosVec(ind) /= wordVecNorms(ind)
       ind += 1
     }
-    wordList.zip(updatedCosines)
+    wordList.zip(cosVec)
       .toSeq
       .sortBy(- _._2)
       .take(num + 1)
@@ -548,6 +544,23 @@ class Word2VecModel private[spark] (
 @Experimental
 object Word2VecModel extends Loader[Word2VecModel] {
 
+  private def buildWordIndex(model: Map[String, Array[Float]]): Map[String, Int] = {
+    model.keys.zipWithIndex.toMap
+  }
+
+  private def buildWordVectors(model: Map[String, Array[Float]]): Array[Float] = {
+    require(model.nonEmpty, "Word2VecMap should be non-empty")
+    val (vectorSize, numWords) = (model.head._2.size, model.size)
+    val wordList = model.keys.toArray
+    val wordVectors = new Array[Float](vectorSize * numWords)
+    var i = 0
+    while (i < numWords) {
+      Array.copy(model(wordList(i)), 0, wordVectors, i * vectorSize, vectorSize)
+      i += 1
+    }
+    wordVectors
+  }
+
   private object SaveLoadV1_0 {
 
     val formatVersionV1_0 = "1.0"
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
index b6818369208d7..4cc8d1129b858 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
@@ -37,6 +37,12 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(syms.length == 2)
     assert(syms(0)._1 == "b")
     assert(syms(1)._1 == "c")
+
+    // Test that model built using Word2Vec, i.e wordVectors and wordIndec
+    // and a Word2VecMap give the same values.
+    val word2VecMap = model.getVectors
+    val newModel = new Word2VecModel(word2VecMap)
+    assert(newModel.getVectors.mapValues(_.toSeq) === word2VecMap.mapValues(_.toSeq))
   }
 
   test("Word2VecModel") {

From f99cb5615cbc0b469d52af6bd08f8bf888af58f3 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 24 Jul 2015 19:29:01 -0700
Subject: [PATCH 367/549] [SPARK-9330][SQL] Create specialized getStruct getter
 in InternalRow.

Also took the chance to rearrange some of the methods in UnsafeRow to group static/private/public things together.

Author: Reynold Xin <rxin@databricks.com>

Closes #7654 from rxin/getStruct and squashes the following commits:

b491a09 [Reynold Xin] Fixed typo.
48d77e5 [Reynold Xin] [SPARK-9330][SQL] Create specialized getStruct getter in InternalRow.
---
 .../sql/catalyst/expressions/UnsafeRow.java   | 87 ++++++++++++-------
 .../sql/catalyst/CatalystTypeConverters.scala |  2 +-
 .../spark/sql/catalyst/InternalRow.scala      | 22 +++--
 .../catalyst/expressions/BoundAttribute.scala |  1 +
 .../expressions/codegen/CodeGenerator.scala   |  5 +-
 5 files changed, 77 insertions(+), 40 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index a8986608855e2..225f6e6553d19 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -51,28 +51,9 @@
  */
 public final class UnsafeRow extends MutableRow {
 
-  private Object baseObject;
-  private long baseOffset;
-
-  public Object getBaseObject() { return baseObject; }
-  public long getBaseOffset() { return baseOffset; }
-  public int getSizeInBytes() { return sizeInBytes; }
-
-  /** The number of fields in this row, used for calculating the bitset width (and in assertions) */
-  private int numFields;
-
-  /** The size of this row's backing data, in bytes) */
-  private int sizeInBytes;
-
-  @Override
-  public int numFields() { return numFields; }
-
-  /** The width of the null tracking bit set, in bytes */
-  private int bitSetWidthInBytes;
-
-  private long getFieldOffset(int ordinal) {
-   return baseOffset + bitSetWidthInBytes + ordinal * 8L;
-  }
+  //////////////////////////////////////////////////////////////////////////////
+  // Static methods
+  //////////////////////////////////////////////////////////////////////////////
 
   public static int calculateBitSetWidthInBytes(int numFields) {
     return ((numFields / 64) + (numFields % 64 == 0 ? 0 : 1)) * 8;
@@ -103,7 +84,7 @@ public static int calculateBitSetWidthInBytes(int numFields) {
           DoubleType,
           DateType,
           TimestampType
-    })));
+        })));
 
     // We support get() on a superset of the types for which we support set():
     final Set<DataType> _readableFieldTypes = new HashSet<>(
@@ -115,12 +96,48 @@ public static int calculateBitSetWidthInBytes(int numFields) {
     readableFieldTypes = Collections.unmodifiableSet(_readableFieldTypes);
   }
 
+  //////////////////////////////////////////////////////////////////////////////
+  // Private fields and methods
+  //////////////////////////////////////////////////////////////////////////////
+
+  private Object baseObject;
+  private long baseOffset;
+
+  /** The number of fields in this row, used for calculating the bitset width (and in assertions) */
+  private int numFields;
+
+  /** The size of this row's backing data, in bytes) */
+  private int sizeInBytes;
+
+  private void setNotNullAt(int i) {
+    assertIndexIsValid(i);
+    BitSetMethods.unset(baseObject, baseOffset, i);
+  }
+
+  /** The width of the null tracking bit set, in bytes */
+  private int bitSetWidthInBytes;
+
+  private long getFieldOffset(int ordinal) {
+    return baseOffset + bitSetWidthInBytes + ordinal * 8L;
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Public methods
+  //////////////////////////////////////////////////////////////////////////////
+
   /**
    * Construct a new UnsafeRow. The resulting row won't be usable until `pointTo()` has been called,
    * since the value returned by this constructor is equivalent to a null pointer.
    */
   public UnsafeRow() { }
 
+  public Object getBaseObject() { return baseObject; }
+  public long getBaseOffset() { return baseOffset; }
+  public int getSizeInBytes() { return sizeInBytes; }
+
+  @Override
+  public int numFields() { return numFields; }
+
   /**
    * Update this UnsafeRow to point to different backing data.
    *
@@ -130,7 +147,7 @@ public UnsafeRow() { }
    * @param sizeInBytes the size of this row's backing data, in bytes
    */
   public void pointTo(Object baseObject, long baseOffset, int numFields, int sizeInBytes) {
-    assert numFields >= 0 : "numFields should >= 0";
+    assert numFields >= 0 : "numFields (" + numFields + ") should >= 0";
     this.bitSetWidthInBytes = calculateBitSetWidthInBytes(numFields);
     this.baseObject = baseObject;
     this.baseOffset = baseOffset;
@@ -153,11 +170,6 @@ public void setNullAt(int i) {
     PlatformDependent.UNSAFE.putLong(baseObject, getFieldOffset(i), 0);
   }
 
-  private void setNotNullAt(int i) {
-    assertIndexIsValid(i);
-    BitSetMethods.unset(baseObject, baseOffset, i);
-  }
-
   @Override
   public void update(int ordinal, Object value) {
     throw new UnsupportedOperationException();
@@ -316,6 +328,21 @@ public String getString(int i) {
     return getUTF8String(i).toString();
   }
 
+  @Override
+  public UnsafeRow getStruct(int i, int numFields) {
+    if (isNullAt(i)) {
+      return null;
+    } else {
+      assertIndexIsValid(i);
+      final long offsetAndSize = getLong(i);
+      final int offset = (int) (offsetAndSize >> 32);
+      final int size = (int) (offsetAndSize & ((1L << 32) - 1));
+      final UnsafeRow row = new UnsafeRow();
+      row.pointTo(baseObject, baseOffset + offset, numFields, size);
+      return row;
+    }
+  }
+
   /**
    * Copies this row, returning a self-contained UnsafeRow that stores its data in an internal
    * byte array rather than referencing data stored in a data page.
@@ -388,7 +415,7 @@ public boolean equals(Object other) {
    */
   public byte[] getBytes() {
     if (baseObject instanceof byte[] && baseOffset == PlatformDependent.BYTE_ARRAY_OFFSET
-        && (((byte[]) baseObject).length == sizeInBytes)) {
+      && (((byte[]) baseObject).length == sizeInBytes)) {
       return (byte[]) baseObject;
     } else {
       byte[] bytes = new byte[sizeInBytes];
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 5c3072a77aeba..7416ddbaef3fc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -271,7 +271,7 @@ object CatalystTypeConverters {
     }
 
     override def toScalaImpl(row: InternalRow, column: Int): Row =
-      toScala(row.get(column).asInstanceOf[InternalRow])
+      toScala(row.getStruct(column, structType.size))
   }
 
   private object StringConverter extends CatalystTypeConverter[Any, String, UTF8String] {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index efc4faea569b2..f248b1f338acc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -52,6 +52,21 @@ abstract class InternalRow extends Serializable {
 
   def getDouble(i: Int): Double = getAs[Double](i)
 
+  def getUTF8String(i: Int): UTF8String = getAs[UTF8String](i)
+
+  def getBinary(i: Int): Array[Byte] = getAs[Array[Byte]](i)
+
+  // This is only use for test
+  def getString(i: Int): String = getAs[UTF8String](i).toString
+
+  /**
+   * Returns a struct from ordinal position.
+   *
+   * @param ordinal position to get the struct from.
+   * @param numFields number of fields the struct type has
+   */
+  def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs[InternalRow](ordinal)
+
   override def toString: String = s"[${this.mkString(",")}]"
 
   /**
@@ -145,13 +160,6 @@ abstract class InternalRow extends Serializable {
    */
   def mkString(start: String, sep: String, end: String): String = toSeq.mkString(start, sep, end)
 
-  def getUTF8String(i: Int): UTF8String = getAs[UTF8String](i)
-
-  def getBinary(i: Int): Array[Byte] = getAs[Array[Byte]](i)
-
-  // This is only use for test
-  def getString(i: Int): String = getAs[UTF8String](i).toString
-
   // Custom hashCode function that matches the efficient code generated version.
   override def hashCode: Int = {
     var result: Int = 37
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 6aa4930cb8587..1f7adcd36ec14 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -48,6 +48,7 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
         case DoubleType => input.getDouble(ordinal)
         case StringType => input.getUTF8String(ordinal)
         case BinaryType => input.getBinary(ordinal)
+        case t: StructType => input.getStruct(ordinal, t.size)
         case _ => input.get(ordinal)
       }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 48225e1574600..4a90f1b559896 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -109,6 +109,7 @@ class CodeGenContext {
       case _ if isPrimitiveType(jt) => s"$row.get${primitiveTypeName(jt)}($ordinal)"
       case StringType => s"$row.getUTF8String($ordinal)"
       case BinaryType => s"$row.getBinary($ordinal)"
+      case t: StructType => s"$row.getStruct($ordinal, ${t.size})"
       case _ => s"($jt)$row.apply($ordinal)"
     }
   }
@@ -249,13 +250,13 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
   protected val mutableRowType: String = classOf[MutableRow].getName
   protected val genericMutableRowType: String = classOf[GenericMutableRow].getName
 
-  protected def declareMutableStates(ctx: CodeGenContext) = {
+  protected def declareMutableStates(ctx: CodeGenContext): String = {
     ctx.mutableStates.map { case (javaType, variableName, _) =>
       s"private $javaType $variableName;"
     }.mkString("\n      ")
   }
 
-  protected def initMutableStates(ctx: CodeGenContext) = {
+  protected def initMutableStates(ctx: CodeGenContext): String = {
     ctx.mutableStates.map(_._3).mkString("\n        ")
   }
 

From c84acd4aa4f8bee98baa550cd6801c797a8a7a25 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 24 Jul 2015 19:35:24 -0700
Subject: [PATCH 368/549] [SPARK-9331][SQL] Add a code formatter to auto-format
 generated code.

The generated expression code can be hard to read since they are not indented well. This patch adds a code formatter that formats code automatically when we output them to the screen.

Author: Reynold Xin <rxin@databricks.com>

Closes #7656 from rxin/codeformatter and squashes the following commits:

5ba0e90 [Reynold Xin] [SPARK-9331][SQL] Add a code formatter to auto-format generated code.
---
 .../expressions/codegen/CodeFormatter.scala   | 60 +++++++++++++++
 .../expressions/codegen/CodeGenerator.scala   |  2 +-
 .../codegen/GenerateMutableProjection.scala   | 11 +--
 .../codegen/GenerateOrdering.scala            |  2 +-
 .../codegen/GeneratePredicate.scala           |  2 +-
 .../codegen/GenerateProjection.scala          |  3 +-
 .../codegen/GenerateUnsafeProjection.scala    |  2 +-
 .../codegen/CodeFormatterSuite.scala          | 76 +++++++++++++++++++
 8 files changed, 148 insertions(+), 10 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala
new file mode 100644
index 0000000000000..2087cc7f109bc
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+/**
+ * An utility class that indents a block of code based on the curly braces.
+ *
+ * This is used to prettify generated code when in debug mode (or exceptions).
+ *
+ * Written by Matei Zaharia.
+ */
+object CodeFormatter {
+  def format(code: String): String = new CodeFormatter().addLines(code).result()
+}
+
+private class CodeFormatter {
+  private val code = new StringBuilder
+  private var indentLevel = 0
+  private val indentSize = 2
+  private var indentString = ""
+
+  private def addLine(line: String): Unit = {
+    val indentChange = line.count(_ == '{') - line.count(_ == '}')
+    val newIndentLevel = math.max(0, indentLevel + indentChange)
+    // Lines starting with '}' should be de-indented even if they contain '{' after;
+    // in addition, lines ending with ':' are typically labels
+    val thisLineIndent = if (line.startsWith("}") || line.endsWith(":")) {
+      " " * (indentSize * (indentLevel - 1))
+    } else {
+      indentString
+    }
+    code.append(thisLineIndent)
+    code.append(line)
+    code.append("\n")
+    indentLevel = newIndentLevel
+    indentString = " " * (indentSize * newIndentLevel)
+  }
+
+  private def addLines(code: String): CodeFormatter = {
+    code.split('\n').foreach(s => addLine(s.trim()))
+    this
+  }
+
+  private def result(): String = code.result()
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 4a90f1b559896..508882acbee5a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -299,7 +299,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
       evaluator.cook(code)
     } catch {
       case e: Exception =>
-        val msg = s"failed to compile:\n $code"
+        val msg = "failed to compile:\n " + CodeFormatter.format(code)
         logError(msg, e)
         throw new Exception(msg, e)
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
index d838268f46956..825031a4faf5e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala
@@ -17,11 +17,11 @@
 
 package org.apache.spark.sql.catalyst.expressions.codegen
 
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp
 
-import scala.collection.mutable.ArrayBuffer
-
 // MutableProjection is not accessible in Java
 abstract class BaseMutableProjection extends MutableProjection
 
@@ -45,10 +45,11 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
         val evaluationCode = e.gen(ctx)
         evaluationCode.code +
           s"""
-            if(${evaluationCode.isNull})
+            if (${evaluationCode.isNull}) {
               mutableRow.setNullAt($i);
-            else
+            } else {
               ${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)};
+            }
           """
     }
     // collect projections into blocks as function has 64kb codesize limit in JVM
@@ -119,7 +120,7 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
       }
     """
 
-    logDebug(s"code for ${expressions.mkString(",")}:\n$code")
+    logDebug(s"code for ${expressions.mkString(",")}:\n${CodeFormatter.format(code)}")
 
     val c = compile(code)
     () => {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
index 2e6f9e204d813..dbd4616d281c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateOrdering.scala
@@ -107,7 +107,7 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalR
         }
       }"""
 
-    logDebug(s"Generated Ordering: $code")
+    logDebug(s"Generated Ordering: ${CodeFormatter.format(code)}")
 
     compile(code).generate(ctx.references.toArray).asInstanceOf[BaseOrdering]
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
index 1dda5992c3654..dfd593fb7c064 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala
@@ -60,7 +60,7 @@ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Bool
         }
       }"""
 
-    logDebug(s"Generated predicate '$predicate':\n$code")
+    logDebug(s"Generated predicate '$predicate':\n${CodeFormatter.format(code)}")
 
     val p = compile(code).generate(ctx.references.toArray).asInstanceOf[Predicate]
     (r: InternalRow) => p.eval(r)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index f0efc4bff12ba..a361b216eb472 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -230,7 +230,8 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
     }
     """
 
-    logDebug(s"MutableRow, initExprs: ${expressions.mkString(",")} code:\n${code}")
+    logDebug(s"MutableRow, initExprs: ${expressions.mkString(",")} code:\n" +
+      CodeFormatter.format(code))
 
     compile(code).generate(ctx.references.toArray).asInstanceOf[Projection]
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index d65e5c38ebf5c..0320bcb827bf7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -114,7 +114,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
     }
     """
 
-    logDebug(s"code for ${expressions.mkString(",")}:\n$code")
+    logDebug(s"code for ${expressions.mkString(",")}:\n${CodeFormatter.format(code)}")
 
     val c = compile(code)
     c.generate(ctx.references.toArray).asInstanceOf[UnsafeProjection]
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala
new file mode 100644
index 0000000000000..478702fea6146
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+import org.apache.spark.SparkFunSuite
+
+
+class CodeFormatterSuite extends SparkFunSuite {
+
+  def testCase(name: String)(input: String)(expected: String): Unit = {
+    test(name) {
+      assert(CodeFormatter.format(input).trim === expected.trim)
+    }
+  }
+
+  testCase("basic example") {
+    """
+      |class A {
+      |blahblah;
+      |}
+    """.stripMargin
+  }{
+    """
+      |class A {
+      |  blahblah;
+      |}
+    """.stripMargin
+  }
+
+  testCase("nested example") {
+    """
+      |class A {
+      | if (c) {
+      |duh;
+      |}
+      |}
+    """.stripMargin
+  } {
+    """
+      |class A {
+      |  if (c) {
+      |    duh;
+      |  }
+      |}
+    """.stripMargin
+  }
+
+  testCase("single line") {
+    """
+      |class A {
+      | if (c) {duh;}
+      |}
+    """.stripMargin
+  }{
+    """
+      |class A {
+      |  if (c) {duh;}
+      |}
+    """.stripMargin
+  }
+}

From 19bcd6ab12bf355bc5d774905ec7fe3b5fc8e0e2 Mon Sep 17 00:00:00 2001
From: Patrick Wendell <patrick@databricks.com>
Date: Fri, 24 Jul 2015 22:57:01 -0700
Subject: [PATCH 369/549] [HOTFIX] - Disable Kinesis tests due to rate limits

---
 .../apache/spark/streaming/kinesis/KinesisStreamSuite.scala   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
index f9c952b9468bb..4992b041765e9 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
@@ -59,7 +59,7 @@ class KinesisStreamSuite extends KinesisFunSuite
     }
   }
 
-  test("KinesisUtils API") {
+  ignore("KinesisUtils API") {
     ssc = new StreamingContext(sc, Seconds(1))
     // Tests the API, does not actually test data receiving
     val kinesisStream1 = KinesisUtils.createStream(ssc, "mySparkStream",
@@ -83,7 +83,7 @@ class KinesisStreamSuite extends KinesisFunSuite
    * you must have AWS credentials available through the default AWS provider chain,
    * and you have to set the system environment variable RUN_KINESIS_TESTS=1 .
    */
-  testIfEnabled("basic operation") {
+  ignore("basic operation") {
     val kinesisTestUtils = new KinesisTestUtils()
     try {
       kinesisTestUtils.createStream()

From 723db13e0688bf20e2a5f02ad170397c3a287712 Mon Sep 17 00:00:00 2001
From: JD <jd@csh.rit.edu>
Date: Sat, 25 Jul 2015 00:34:59 -0700
Subject: [PATCH 370/549] [Spark-8668][SQL] Adding expr to functions

Author: JD <jd@csh.rit.edu>
Author: Joseph Batchik <josephbatchik@gmail.com>

Closes #7606 from JDrit/expr and squashes the following commits:

ad7f607 [Joseph Batchik] fixing python linter error
9d6daea [Joseph Batchik] removed order by per @rxin's comment
707d5c6 [Joseph Batchik] Added expr to fuctions.py
79df83c [JD] added example to the docs
b89eec8 [JD] moved function up as per @rxin's comment
4960909 [JD] updated per @JoshRosen's comment
2cb329c [JD] updated per @rxin's comment
9a9ad0c [JD] removing unused import
6dc26d0 [JD] removed split
7f2222c [JD] Adding expr function as per SPARK-8668
---
 python/pyspark/sql/functions.py                   | 10 ++++++++++
 python/pyspark/sql/tests.py                       |  7 +++++++
 .../scala/org/apache/spark/sql/functions.scala    | 15 +++++++++++++--
 .../org/apache/spark/sql/SQLQuerySuite.scala      | 11 +++++++++++
 4 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 719e623a1a11f..d930f7db25d25 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -541,6 +541,16 @@ def sparkPartitionId():
     return Column(sc._jvm.functions.sparkPartitionId())
 
 
+def expr(str):
+    """Parses the expression string into the column that it represents
+
+    >>> df.select(expr("length(name)")).collect()
+    [Row('length(name)=5), Row('length(name)=3)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.expr(str))
+
+
 @ignore_unicode_prefix
 @since(1.5)
 def length(col):
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index ea821f486f13a..5aa6135dc1ee7 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -846,6 +846,13 @@ def test_bitwise_operations(self):
         result = df.select(functions.bitwiseNOT(df.b)).collect()[0].asDict()
         self.assertEqual(~75, result['~b'])
 
+    def test_expr(self):
+        from pyspark.sql import functions
+        row = Row(a="length string", b=75)
+        df = self.sqlCtx.createDataFrame([row])
+        result = df.select(functions.expr("length(a)")).collect()[0].asDict()
+        self.assertEqual(13, result["'length(a)"])
+
     def test_replace(self):
         schema = StructType([
             StructField("name", StringType(), True),
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index bfeecbe8b2ab5..cab3db609dd4b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -22,7 +22,7 @@ import scala.reflect.runtime.universe.{TypeTag, typeTag}
 import scala.util.Try
 
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.sql.catalyst.ScalaReflection
+import org.apache.spark.sql.catalyst.{SqlParser, ScalaReflection}
 import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, Star}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.BroadcastHint
@@ -792,6 +792,18 @@ object functions {
    */
   def bitwiseNOT(e: Column): Column = BitwiseNot(e.expr)
 
+  /**
+   * Parses the expression string into the column that it represents, similar to
+   * DataFrame.selectExpr
+   * {{{
+   *   // get the number of words of each length
+   *   df.groupBy(expr("length(word)")).count()
+   * }}}
+   *
+   * @group normal_funcs
+   */
+  def expr(expr: String): Column = Column(new SqlParser().parseExpression(expr))
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   // Math Functions
   //////////////////////////////////////////////////////////////////////////////////////////////
@@ -2451,5 +2463,4 @@ object functions {
     }
     UnresolvedFunction(udfName, exprs, isDistinct = false)
   }
-
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 95a1106cf072d..cd386b7a3ecf9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -112,6 +112,17 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       Row("1", 1) :: Row("2", 1) :: Row("3", 1) :: Nil)
   }
 
+  test("SPARK-8668 expr function") {
+    checkAnswer(Seq((1, "Bobby G."))
+      .toDF("id", "name")
+      .select(expr("length(name)"), expr("abs(id)")), Row(8, 1))
+
+    checkAnswer(Seq((1, "building burrito tunnels"), (1, "major projects"))
+      .toDF("id", "saying")
+      .groupBy(expr("length(saying)"))
+      .count(), Row(24, 1) :: Row(14, 1) :: Nil)
+  }
+
   test("SQL Dialect Switching to a new SQL parser") {
     val newContext = new SQLContext(sqlContext.sparkContext)
     newContext.setConf("spark.sql.dialect", classOf[MyDialect].getCanonicalName())

From f0ebab3f6d3a9231474acf20110db72c0fb51882 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 25 Jul 2015 01:28:46 -0700
Subject: [PATCH 371/549] [SPARK-9336][SQL] Remove extra JoinedRows

They were added to improve performance (so JIT can inline the JoinedRow calls). However, we can also just improve it by projecting output out to UnsafeRow in Tungsten variant of the operators.

Author: Reynold Xin <rxin@databricks.com>

Closes #7659 from rxin/remove-joinedrows and squashes the following commits:

7510447 [Reynold Xin] [SPARK-9336][SQL] Remove extra JoinedRows
---
 .../sql/catalyst/expressions/Projection.scala | 494 +-----------------
 .../spark/sql/execution/Aggregate.scala       |   2 +-
 .../sql/execution/GeneratedAggregate.scala    |   2 +-
 .../apache/spark/sql/execution/Window.scala   |   2 +-
 .../aggregate/sortBasedIterators.scala        |   2 +-
 .../spark/sql/execution/joins/HashJoin.scala  |   2 +-
 .../sql/execution/joins/SortMergeJoin.scala   |   2 +-
 7 files changed, 8 insertions(+), 498 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index dbda05a792cbf..6023a2c564389 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -44,7 +44,7 @@ class InterpretedProjection(expressions: Seq[Expression]) extends Projection {
     new GenericInternalRow(outputArray)
   }
 
-  override def toString: String = s"Row => [${exprArray.mkString(",")}]"
+  override def toString(): String = s"Row => [${exprArray.mkString(",")}]"
 }
 
 /**
@@ -58,7 +58,7 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu
     this(expressions.map(BindReferences.bindReference(_, inputSchema)))
 
   private[this] val exprArray = expressions.toArray
-  private[this] var mutableRow: MutableRow = new GenericMutableRow(exprArray.size)
+  private[this] var mutableRow: MutableRow = new GenericMutableRow(exprArray.length)
   def currentValue: InternalRow = mutableRow
 
   override def target(row: MutableRow): MutableProjection = {
@@ -186,496 +186,6 @@ class JoinedRow extends InternalRow {
     if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
   }
 
-  override def get(i: Int): Any =
-    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
-
-  override def isNullAt(i: Int): Boolean =
-    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
-
-  override def getInt(i: Int): Int =
-    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
-
-  override def getLong(i: Int): Long =
-    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
-
-  override def getDouble(i: Int): Double =
-    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
-
-  override def getBoolean(i: Int): Boolean =
-    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
-
-  override def getShort(i: Int): Short =
-    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
-
-  override def getByte(i: Int): Byte =
-    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
-
-  override def getFloat(i: Int): Float =
-    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
-
-  override def copy(): InternalRow = {
-    val totalSize = row1.numFields + row2.numFields
-    val copiedValues = new Array[Any](totalSize)
-    var i = 0
-    while(i < totalSize) {
-      copiedValues(i) = get(i)
-      i += 1
-    }
-    new GenericInternalRow(copiedValues)
-  }
-
-  override def toString: String = {
-    // Make sure toString never throws NullPointerException.
-    if ((row1 eq null) && (row2 eq null)) {
-      "[ empty row ]"
-    } else if (row1 eq null) {
-      row2.mkString("[", ",", "]")
-    } else if (row2 eq null) {
-      row1.mkString("[", ",", "]")
-    } else {
-      mkString("[", ",", "]")
-    }
-  }
-}
-
-/**
- * JIT HACK: Replace with macros
- * The `JoinedRow` class is used in many performance critical situation.  Unfortunately, since there
- * are multiple different types of `Rows` that could be stored as `row1` and `row2` most of the
- * calls in the critical path are polymorphic.  By creating special versions of this class that are
- * used in only a single location of the code, we increase the chance that only a single type of
- * Row will be referenced, increasing the opportunity for the JIT to play tricks.  This sounds
- * crazy but in benchmarks it had noticeable effects.
- */
-class JoinedRow2 extends InternalRow {
-  private[this] var row1: InternalRow = _
-  private[this] var row2: InternalRow = _
-
-  def this(left: InternalRow, right: InternalRow) = {
-    this()
-    row1 = left
-    row2 = right
-  }
-
-  /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
-    row1 = r1
-    row2 = r2
-    this
-  }
-
-  /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: InternalRow): InternalRow = {
-    row1 = newLeft
-    this
-  }
-
-  /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: InternalRow): InternalRow = {
-    row2 = newRight
-    this
-  }
-
-  override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
-
-  override def numFields: Int = row1.numFields + row2.numFields
-
-  override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
-  }
-
-  override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
-  }
-
-  override def get(i: Int): Any =
-    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
-
-  override def isNullAt(i: Int): Boolean =
-    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
-
-  override def getInt(i: Int): Int =
-    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
-
-  override def getLong(i: Int): Long =
-    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
-
-  override def getDouble(i: Int): Double =
-    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
-
-  override def getBoolean(i: Int): Boolean =
-    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
-
-  override def getShort(i: Int): Short =
-    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
-
-  override def getByte(i: Int): Byte =
-    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
-
-  override def getFloat(i: Int): Float =
-    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
-
-  override def copy(): InternalRow = {
-    val totalSize = row1.numFields + row2.numFields
-    val copiedValues = new Array[Any](totalSize)
-    var i = 0
-    while(i < totalSize) {
-      copiedValues(i) = get(i)
-      i += 1
-    }
-    new GenericInternalRow(copiedValues)
-  }
-
-  override def toString: String = {
-    // Make sure toString never throws NullPointerException.
-    if ((row1 eq null) && (row2 eq null)) {
-      "[ empty row ]"
-    } else if (row1 eq null) {
-      row2.mkString("[", ",", "]")
-    } else if (row2 eq null) {
-      row1.mkString("[", ",", "]")
-    } else {
-      mkString("[", ",", "]")
-    }
-  }
-}
-
-/**
- * JIT HACK: Replace with macros
- */
-class JoinedRow3 extends InternalRow {
-  private[this] var row1: InternalRow = _
-  private[this] var row2: InternalRow = _
-
-  def this(left: InternalRow, right: InternalRow) = {
-    this()
-    row1 = left
-    row2 = right
-  }
-
-  /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
-    row1 = r1
-    row2 = r2
-    this
-  }
-
-  /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: InternalRow): InternalRow = {
-    row1 = newLeft
-    this
-  }
-
-  /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: InternalRow): InternalRow = {
-    row2 = newRight
-    this
-  }
-
-  override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
-
-  override def numFields: Int = row1.numFields + row2.numFields
-
-  override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
-  }
-
-  override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
-  }
-
-
-  override def get(i: Int): Any =
-    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
-
-  override def isNullAt(i: Int): Boolean =
-    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
-
-  override def getInt(i: Int): Int =
-    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
-
-  override def getLong(i: Int): Long =
-    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
-
-  override def getDouble(i: Int): Double =
-    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
-
-  override def getBoolean(i: Int): Boolean =
-    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
-
-  override def getShort(i: Int): Short =
-    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
-
-  override def getByte(i: Int): Byte =
-    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
-
-  override def getFloat(i: Int): Float =
-    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
-
-  override def copy(): InternalRow = {
-    val totalSize = row1.numFields + row2.numFields
-    val copiedValues = new Array[Any](totalSize)
-    var i = 0
-    while(i < totalSize) {
-      copiedValues(i) = get(i)
-      i += 1
-    }
-    new GenericInternalRow(copiedValues)
-  }
-
-  override def toString: String = {
-    // Make sure toString never throws NullPointerException.
-    if ((row1 eq null) && (row2 eq null)) {
-      "[ empty row ]"
-    } else if (row1 eq null) {
-      row2.mkString("[", ",", "]")
-    } else if (row2 eq null) {
-      row1.mkString("[", ",", "]")
-    } else {
-      mkString("[", ",", "]")
-    }
-  }
-}
-
-/**
- * JIT HACK: Replace with macros
- */
-class JoinedRow4 extends InternalRow {
-  private[this] var row1: InternalRow = _
-  private[this] var row2: InternalRow = _
-
-  def this(left: InternalRow, right: InternalRow) = {
-    this()
-    row1 = left
-    row2 = right
-  }
-
-  /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
-    row1 = r1
-    row2 = r2
-    this
-  }
-
-  /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: InternalRow): InternalRow = {
-    row1 = newLeft
-    this
-  }
-
-  /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: InternalRow): InternalRow = {
-    row2 = newRight
-    this
-  }
-
-  override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
-
-  override def numFields: Int = row1.numFields + row2.numFields
-
-  override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
-  }
-
-  override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
-  }
-
-
-  override def get(i: Int): Any =
-    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
-
-  override def isNullAt(i: Int): Boolean =
-    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
-
-  override def getInt(i: Int): Int =
-    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
-
-  override def getLong(i: Int): Long =
-    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
-
-  override def getDouble(i: Int): Double =
-    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
-
-  override def getBoolean(i: Int): Boolean =
-    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
-
-  override def getShort(i: Int): Short =
-    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
-
-  override def getByte(i: Int): Byte =
-    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
-
-  override def getFloat(i: Int): Float =
-    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
-
-  override def copy(): InternalRow = {
-    val totalSize = row1.numFields + row2.numFields
-    val copiedValues = new Array[Any](totalSize)
-    var i = 0
-    while(i < totalSize) {
-      copiedValues(i) = get(i)
-      i += 1
-    }
-    new GenericInternalRow(copiedValues)
-  }
-
-  override def toString: String = {
-    // Make sure toString never throws NullPointerException.
-    if ((row1 eq null) && (row2 eq null)) {
-      "[ empty row ]"
-    } else if (row1 eq null) {
-      row2.mkString("[", ",", "]")
-    } else if (row2 eq null) {
-      row1.mkString("[", ",", "]")
-    } else {
-      mkString("[", ",", "]")
-    }
-  }
-}
-
-/**
- * JIT HACK: Replace with macros
- */
-class JoinedRow5 extends InternalRow {
-  private[this] var row1: InternalRow = _
-  private[this] var row2: InternalRow = _
-
-  def this(left: InternalRow, right: InternalRow) = {
-    this()
-    row1 = left
-    row2 = right
-  }
-
-  /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
-    row1 = r1
-    row2 = r2
-    this
-  }
-
-  /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: InternalRow): InternalRow = {
-    row1 = newLeft
-    this
-  }
-
-  /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: InternalRow): InternalRow = {
-    row2 = newRight
-    this
-  }
-
-  override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
-
-  override def numFields: Int = row1.numFields + row2.numFields
-
-  override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
-  }
-
-  override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
-  }
-
-
-  override def get(i: Int): Any =
-    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
-
-  override def isNullAt(i: Int): Boolean =
-    if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
-
-  override def getInt(i: Int): Int =
-    if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
-
-  override def getLong(i: Int): Long =
-    if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
-
-  override def getDouble(i: Int): Double =
-    if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
-
-  override def getBoolean(i: Int): Boolean =
-    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
-
-  override def getShort(i: Int): Short =
-    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
-
-  override def getByte(i: Int): Byte =
-    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
-
-  override def getFloat(i: Int): Float =
-    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
-
-  override def copy(): InternalRow = {
-    val totalSize = row1.numFields + row2.numFields
-    val copiedValues = new Array[Any](totalSize)
-    var i = 0
-    while(i < totalSize) {
-      copiedValues(i) = get(i)
-      i += 1
-    }
-    new GenericInternalRow(copiedValues)
-  }
-
-  override def toString: String = {
-    // Make sure toString never throws NullPointerException.
-    if ((row1 eq null) && (row2 eq null)) {
-      "[ empty row ]"
-    } else if (row1 eq null) {
-      row2.mkString("[", ",", "]")
-    } else if (row2 eq null) {
-      row1.mkString("[", ",", "]")
-    } else {
-      mkString("[", ",", "]")
-    }
-  }
-}
-
-/**
- * JIT HACK: Replace with macros
- */
-class JoinedRow6 extends InternalRow {
-  private[this] var row1: InternalRow = _
-  private[this] var row2: InternalRow = _
-
-  def this(left: InternalRow, right: InternalRow) = {
-    this()
-    row1 = left
-    row2 = right
-  }
-
-  /** Updates this JoinedRow to used point at two new base rows.  Returns itself. */
-  def apply(r1: InternalRow, r2: InternalRow): InternalRow = {
-    row1 = r1
-    row2 = r2
-    this
-  }
-
-  /** Updates this JoinedRow by updating its left base row.  Returns itself. */
-  def withLeft(newLeft: InternalRow): InternalRow = {
-    row1 = newLeft
-    this
-  }
-
-  /** Updates this JoinedRow by updating its right base row.  Returns itself. */
-  def withRight(newRight: InternalRow): InternalRow = {
-    row2 = newRight
-    this
-  }
-
-  override def toSeq: Seq[Any] = row1.toSeq ++ row2.toSeq
-
-  override def numFields: Int = row1.numFields + row2.numFields
-
-  override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
-  }
-
-  override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
-  }
-
-
   override def get(i: Int): Any =
     if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
index c2c945321db95..e8c6a0f8f801d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala
@@ -172,7 +172,7 @@ case class Aggregate(
           private[this] val resultProjection =
             new InterpretedMutableProjection(
               resultExpressions, computedSchema ++ namedGroups.map(_._2))
-          private[this] val joinedRow = new JoinedRow4
+          private[this] val joinedRow = new JoinedRow
 
           override final def hasNext: Boolean = hashTableIter.hasNext
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 5ed158b3d2912..5ad4691a5ca07 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -269,7 +269,7 @@ case class GeneratedAggregate(
           namedGroups.map(_._2) ++ computationSchema)
       log.info(s"Result Projection: ${resultExpressions.mkString(",")}")
 
-      val joinedRow = new JoinedRow3
+      val joinedRow = new JoinedRow
 
       if (!iter.hasNext) {
         // This is an empty input, so return early so that we do not allocate data structures
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
index de04132eb1104..91c8a02e2b5bc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
@@ -298,7 +298,7 @@ case class Window(
         var rowsSize = 0
         override final def hasNext: Boolean = rowIndex < rowsSize || nextRowAvailable
 
-        val join = new JoinedRow6
+        val join = new JoinedRow
         val windowFunctionResult = new GenericMutableRow(unboundExpressions.size)
         override final def next(): InternalRow = {
           // Load the next partition if we need to.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
index b8e95a5a2a4da..1b89edafa8dad 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
@@ -106,7 +106,7 @@ private[sql] abstract class SortAggregationIterator(
     new GenericMutableRow(size)
   }
 
-  protected val joinedRow = new JoinedRow4
+  protected val joinedRow = new JoinedRow
 
   protected val placeholderExpressions = Seq.fill(initialBufferOffset)(NoOp)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
index ae34409bcfcca..46ab5b0d1cc6d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
@@ -69,7 +69,7 @@ trait HashJoin {
       private[this] var currentMatchPosition: Int = -1
 
       // Mutable per row objects.
-      private[this] val joinRow = new JoinedRow2
+      private[this] val joinRow = new JoinedRow
       private[this] val resultProjection: Projection = {
         if (supportUnsafe) {
           UnsafeProjection.create(self.schema)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala
index 981447eacad74..bb18b5403f8e8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala
@@ -66,7 +66,7 @@ case class SortMergeJoin(
     leftResults.zipPartitions(rightResults) { (leftIter, rightIter) =>
       new Iterator[InternalRow] {
         // Mutable per row objects.
-        private[this] val joinRow = new JoinedRow5
+        private[this] val joinRow = new JoinedRow
         private[this] var leftElement: InternalRow = _
         private[this] var rightElement: InternalRow = _
         private[this] var leftKey: InternalRow = _

From 215713e19924dff69d226a97f1860a5470464d15 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 25 Jul 2015 01:37:41 -0700
Subject: [PATCH 372/549] [SPARK-9334][SQL] Remove UnsafeRowConverter in favor
 of UnsafeProjection.

The two are redundant.

Once this patch is merged, I plan to remove the inbound conversions from unsafe aggregates.

Author: Reynold Xin <rxin@databricks.com>

Closes #7658 from rxin/unsafeconverters and squashes the following commits:

ed19e6c [Reynold Xin] Updated support types.
2a56d7e [Reynold Xin] [SPARK-9334][SQL] Remove UnsafeRowConverter in favor of UnsafeProjection.
---
 .../UnsafeFixedWidthAggregationMap.java       |  55 +---
 .../expressions/UnsafeRowWriters.java         |  83 ++++++
 .../sql/catalyst/expressions/Projection.scala |   4 +-
 .../expressions/UnsafeRowConverter.scala      | 276 ------------------
 .../codegen/GenerateUnsafeProjection.scala    | 131 ++++++---
 .../expressions/ExpressionEvalHelper.scala    |   2 +-
 .../expressions/UnsafeRowConverterSuite.scala |  79 ++---
 .../execution/UnsafeRowSerializerSuite.scala  |  17 +-
 .../apache/spark/unsafe/types/ByteArray.java  |  38 +++
 .../apache/spark/unsafe/types/UTF8String.java |  15 +
 10 files changed, 262 insertions(+), 438 deletions(-)
 create mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
 delete mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
 create mode 100644 unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
index 2f7e84a7f59e2..684de6e81d67c 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
@@ -47,7 +47,7 @@ public final class UnsafeFixedWidthAggregationMap {
   /**
    * Encodes grouping keys as UnsafeRows.
    */
-  private final UnsafeRowConverter groupingKeyToUnsafeRowConverter;
+  private final UnsafeProjection groupingKeyProjection;
 
   /**
    * A hashmap which maps from opaque bytearray keys to bytearray values.
@@ -59,14 +59,6 @@ public final class UnsafeFixedWidthAggregationMap {
    */
   private final UnsafeRow currentAggregationBuffer = new UnsafeRow();
 
-  /**
-   * Scratch space that is used when encoding grouping keys into UnsafeRow format.
-   *
-   * By default, this is a 8 kb array, but it will grow as necessary in case larger keys are
-   * encountered.
-   */
-  private byte[] groupingKeyConversionScratchSpace = new byte[1024 * 8];
-
   private final boolean enablePerfMetrics;
 
   /**
@@ -112,26 +104,17 @@ public UnsafeFixedWidthAggregationMap(
       TaskMemoryManager memoryManager,
       int initialCapacity,
       boolean enablePerfMetrics) {
-    this.emptyAggregationBuffer =
-      convertToUnsafeRow(emptyAggregationBuffer, aggregationBufferSchema);
     this.aggregationBufferSchema = aggregationBufferSchema;
-    this.groupingKeyToUnsafeRowConverter = new UnsafeRowConverter(groupingKeySchema);
+    this.groupingKeyProjection = UnsafeProjection.create(groupingKeySchema);
     this.groupingKeySchema = groupingKeySchema;
     this.map = new BytesToBytesMap(memoryManager, initialCapacity, enablePerfMetrics);
     this.enablePerfMetrics = enablePerfMetrics;
-  }
 
-  /**
-   * Convert a Java object row into an UnsafeRow, allocating it into a new byte array.
-   */
-  private static byte[] convertToUnsafeRow(InternalRow javaRow, StructType schema) {
-    final UnsafeRowConverter converter = new UnsafeRowConverter(schema);
-    final int size = converter.getSizeRequirement(javaRow);
-    final byte[] unsafeRow = new byte[size];
-    final int writtenLength =
-      converter.writeRow(javaRow, unsafeRow, PlatformDependent.BYTE_ARRAY_OFFSET, size);
-    assert (writtenLength == unsafeRow.length): "Size requirement calculation was wrong!";
-    return unsafeRow;
+    // Initialize the buffer for aggregation value
+    final UnsafeProjection valueProjection = UnsafeProjection.create(aggregationBufferSchema);
+    this.emptyAggregationBuffer = valueProjection.apply(emptyAggregationBuffer).getBytes();
+    assert(this.emptyAggregationBuffer.length == aggregationBufferSchema.length() * 8 +
+      UnsafeRow.calculateBitSetWidthInBytes(aggregationBufferSchema.length()));
   }
 
   /**
@@ -139,30 +122,20 @@ private static byte[] convertToUnsafeRow(InternalRow javaRow, StructType schema)
    * return the same object.
    */
   public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
-    final int groupingKeySize = groupingKeyToUnsafeRowConverter.getSizeRequirement(groupingKey);
-    // Make sure that the buffer is large enough to hold the key. If it's not, grow it:
-    if (groupingKeySize > groupingKeyConversionScratchSpace.length) {
-      groupingKeyConversionScratchSpace = new byte[groupingKeySize];
-    }
-    final int actualGroupingKeySize = groupingKeyToUnsafeRowConverter.writeRow(
-      groupingKey,
-      groupingKeyConversionScratchSpace,
-      PlatformDependent.BYTE_ARRAY_OFFSET,
-      groupingKeySize);
-    assert (groupingKeySize == actualGroupingKeySize) : "Size requirement calculation was wrong!";
+    final UnsafeRow unsafeGroupingKeyRow = this.groupingKeyProjection.apply(groupingKey);
 
     // Probe our map using the serialized key
     final BytesToBytesMap.Location loc = map.lookup(
-      groupingKeyConversionScratchSpace,
-      PlatformDependent.BYTE_ARRAY_OFFSET,
-      groupingKeySize);
+      unsafeGroupingKeyRow.getBaseObject(),
+      unsafeGroupingKeyRow.getBaseOffset(),
+      unsafeGroupingKeyRow.getSizeInBytes());
     if (!loc.isDefined()) {
       // This is the first time that we've seen this grouping key, so we'll insert a copy of the
       // empty aggregation buffer into the map:
       loc.putNewKey(
-        groupingKeyConversionScratchSpace,
-        PlatformDependent.BYTE_ARRAY_OFFSET,
-        groupingKeySize,
+        unsafeGroupingKeyRow.getBaseObject(),
+        unsafeGroupingKeyRow.getBaseOffset(),
+        unsafeGroupingKeyRow.getSizeInBytes(),
         emptyAggregationBuffer,
         PlatformDependent.BYTE_ARRAY_OFFSET,
         emptyAggregationBuffer.length
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
new file mode 100644
index 0000000000000..87521d1f23c99
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions;
+
+import org.apache.spark.unsafe.PlatformDependent;
+import org.apache.spark.unsafe.array.ByteArrayMethods;
+import org.apache.spark.unsafe.types.ByteArray;
+import org.apache.spark.unsafe.types.UTF8String;
+
+/**
+ * A set of helper methods to write data into {@link UnsafeRow}s,
+ * used by {@link org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection}.
+ */
+public class UnsafeRowWriters {
+
+  /** Writer for UTF8String. */
+  public static class UTF8StringWriter {
+
+    public static int getSize(UTF8String input) {
+      return ByteArrayMethods.roundNumberOfBytesToNearestWord(input.numBytes());
+    }
+
+    public static int write(UnsafeRow target, int ordinal, int cursor, UTF8String input) {
+      final long offset = target.getBaseOffset() + cursor;
+      final int numBytes = input.numBytes();
+
+      // zero-out the padding bytes
+      if ((numBytes & 0x07) > 0) {
+        PlatformDependent.UNSAFE.putLong(
+          target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L);
+      }
+
+      // Write the string to the variable length portion.
+      input.writeToMemory(target.getBaseObject(), offset);
+
+      // Set the fixed length portion.
+      target.setLong(ordinal, (((long) cursor) << 32) | ((long) numBytes));
+      return ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes);
+    }
+  }
+
+  /** Writer for bianry (byte array) type. */
+  public static class BinaryWriter {
+
+    public static int getSize(byte[] input) {
+      return ByteArrayMethods.roundNumberOfBytesToNearestWord(input.length);
+    }
+
+    public static int write(UnsafeRow target, int ordinal, int cursor, byte[] input) {
+      final long offset = target.getBaseOffset() + cursor;
+      final int numBytes = input.length;
+
+      // zero-out the padding bytes
+      if ((numBytes & 0x07) > 0) {
+        PlatformDependent.UNSAFE.putLong(
+          target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L);
+      }
+
+      // Write the string to the variable length portion.
+      ByteArray.writeToMemory(input, target.getBaseObject(), offset);
+
+      // Set the fixed length portion.
+      target.setLong(ordinal, (((long) cursor) << 32) | ((long) numBytes));
+      return ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes);
+    }
+  }
+
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index 6023a2c564389..fb873e7e99547 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -90,8 +90,10 @@ object UnsafeProjection {
    * Seq[Expression].
    */
   def canSupport(schema: StructType): Boolean = canSupport(schema.fields.map(_.dataType))
-  def canSupport(types: Array[DataType]): Boolean = types.forall(UnsafeColumnWriter.canEmbed(_))
   def canSupport(exprs: Seq[Expression]): Boolean = canSupport(exprs.map(_.dataType).toArray)
+  private def canSupport(types: Array[DataType]): Boolean = {
+    types.forall(GenerateUnsafeProjection.canSupport)
+  }
 
   /**
    * Returns an UnsafeProjection for given StructType.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
deleted file mode 100644
index c47b16c0f8585..0000000000000
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.expressions
-
-import scala.util.Try
-
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.PlatformDependent
-import org.apache.spark.unsafe.array.ByteArrayMethods
-import org.apache.spark.unsafe.types.UTF8String
-
-
-/**
- * Converts Rows into UnsafeRow format. This class is NOT thread-safe.
- *
- * @param fieldTypes the data types of the row's columns.
- */
-class UnsafeRowConverter(fieldTypes: Array[DataType]) {
-
-  def this(schema: StructType) {
-    this(schema.fields.map(_.dataType))
-  }
-
-  /** Re-used pointer to the unsafe row being written */
-  private[this] val unsafeRow = new UnsafeRow()
-
-  /** Functions for encoding each column */
-  private[this] val writers: Array[UnsafeColumnWriter] = {
-    fieldTypes.map(t => UnsafeColumnWriter.forType(t))
-  }
-
-  /** The size, in bytes, of the fixed-length portion of the row, including the null bitmap */
-  private[this] val fixedLengthSize: Int =
-    (8 * fieldTypes.length) + UnsafeRow.calculateBitSetWidthInBytes(fieldTypes.length)
-
-  /**
-   * Compute the amount of space, in bytes, required to encode the given row.
-   */
-  def getSizeRequirement(row: InternalRow): Int = {
-    var fieldNumber = 0
-    var variableLengthFieldSize: Int = 0
-    while (fieldNumber < writers.length) {
-      if (!row.isNullAt(fieldNumber)) {
-        variableLengthFieldSize += writers(fieldNumber).getSize(row, fieldNumber)
-      }
-      fieldNumber += 1
-    }
-    fixedLengthSize + variableLengthFieldSize
-  }
-
-  /**
-   * Convert the given row into UnsafeRow format.
-   *
-   * @param row the row to convert
-   * @param baseObject the base object of the destination address
-   * @param baseOffset the base offset of the destination address
-   * @param rowLengthInBytes the length calculated by `getSizeRequirement(row)`
-   * @return the number of bytes written. This should be equal to `getSizeRequirement(row)`.
-   */
-  def writeRow(
-      row: InternalRow,
-      baseObject: Object,
-      baseOffset: Long,
-      rowLengthInBytes: Int): Int = {
-    unsafeRow.pointTo(baseObject, baseOffset, writers.length, rowLengthInBytes)
-
-    if (writers.length > 0) {
-      // zero-out the bitset
-      var n = writers.length / 64
-      while (n >= 0) {
-        PlatformDependent.UNSAFE.putLong(
-          unsafeRow.getBaseObject,
-          unsafeRow.getBaseOffset + n * 8,
-          0L)
-        n -= 1
-      }
-    }
-
-    var fieldNumber = 0
-    var appendCursor: Int = fixedLengthSize
-    while (fieldNumber < writers.length) {
-      if (row.isNullAt(fieldNumber)) {
-        unsafeRow.setNullAt(fieldNumber)
-      } else {
-        appendCursor += writers(fieldNumber).write(row, unsafeRow, fieldNumber, appendCursor)
-      }
-      fieldNumber += 1
-    }
-    appendCursor
-  }
-
-}
-
-/**
- * Function for writing a column into an UnsafeRow.
- */
-private abstract class UnsafeColumnWriter {
-  /**
-   * Write a value into an UnsafeRow.
-   *
-   * @param source the row being converted
-   * @param target a pointer to the converted unsafe row
-   * @param column the column to write
-   * @param appendCursor the offset from the start of the unsafe row to the end of the row;
-   *                     used for calculating where variable-length data should be written
-   * @return the number of variable-length bytes written
-   */
-  def write(source: InternalRow, target: UnsafeRow, column: Int, appendCursor: Int): Int
-
-  /**
-   * Return the number of bytes that are needed to write this variable-length value.
-   */
-  def getSize(source: InternalRow, column: Int): Int
-}
-
-private object UnsafeColumnWriter {
-
-  def forType(dataType: DataType): UnsafeColumnWriter = {
-    dataType match {
-      case NullType => NullUnsafeColumnWriter
-      case BooleanType => BooleanUnsafeColumnWriter
-      case ByteType => ByteUnsafeColumnWriter
-      case ShortType => ShortUnsafeColumnWriter
-      case IntegerType | DateType => IntUnsafeColumnWriter
-      case LongType | TimestampType => LongUnsafeColumnWriter
-      case FloatType => FloatUnsafeColumnWriter
-      case DoubleType => DoubleUnsafeColumnWriter
-      case StringType => StringUnsafeColumnWriter
-      case BinaryType => BinaryUnsafeColumnWriter
-      case t =>
-        throw new UnsupportedOperationException(s"Do not know how to write columns of type $t")
-    }
-  }
-
-  /**
-   * Returns whether the dataType can be embedded into UnsafeRow (not using ObjectPool).
-   */
-  def canEmbed(dataType: DataType): Boolean = Try(forType(dataType)).isSuccess
-}
-
-// ------------------------------------------------------------------------------------------------
-
-private abstract class PrimitiveUnsafeColumnWriter extends UnsafeColumnWriter {
-  // Primitives don't write to the variable-length region:
-  def getSize(sourceRow: InternalRow, column: Int): Int = 0
-}
-
-private object NullUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    target.setNullAt(column)
-    0
-  }
-}
-
-private object BooleanUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    target.setBoolean(column, source.getBoolean(column))
-    0
-  }
-}
-
-private object ByteUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    target.setByte(column, source.getByte(column))
-    0
-  }
-}
-
-private object ShortUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    target.setShort(column, source.getShort(column))
-    0
-  }
-}
-
-private object IntUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    target.setInt(column, source.getInt(column))
-    0
-  }
-}
-
-private object LongUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    target.setLong(column, source.getLong(column))
-    0
-  }
-}
-
-private object FloatUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    target.setFloat(column, source.getFloat(column))
-    0
-  }
-}
-
-private object DoubleUnsafeColumnWriter extends PrimitiveUnsafeColumnWriter {
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    target.setDouble(column, source.getDouble(column))
-    0
-  }
-}
-
-private abstract class BytesUnsafeColumnWriter extends UnsafeColumnWriter {
-
-  protected[this] def isString: Boolean
-  protected[this] def getBytes(source: InternalRow, column: Int): Array[Byte]
-
-  override def getSize(source: InternalRow, column: Int): Int = {
-    val numBytes = getBytes(source, column).length
-    ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes)
-  }
-
-  override def write(source: InternalRow, target: UnsafeRow, column: Int, cursor: Int): Int = {
-    val bytes = getBytes(source, column)
-    write(target, bytes, column, cursor)
-  }
-
-  def write(target: UnsafeRow, bytes: Array[Byte], column: Int, cursor: Int): Int = {
-    val offset = target.getBaseOffset + cursor
-    val numBytes = bytes.length
-    if ((numBytes & 0x07) > 0) {
-      // zero-out the padding bytes
-      PlatformDependent.UNSAFE.putLong(target.getBaseObject, offset + ((numBytes >> 3) << 3), 0L)
-    }
-    PlatformDependent.copyMemory(
-      bytes,
-      PlatformDependent.BYTE_ARRAY_OFFSET,
-      target.getBaseObject,
-      offset,
-      numBytes
-    )
-    target.setLong(column, (cursor.toLong << 32) | numBytes.toLong)
-    ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes)
-  }
-}
-
-private object StringUnsafeColumnWriter extends BytesUnsafeColumnWriter {
-  protected[this] def isString: Boolean = true
-  def getBytes(source: InternalRow, column: Int): Array[Byte] = {
-    source.getAs[UTF8String](column).getBytes
-  }
-  // TODO(davies): refactor this
-  // specialized for codegen
-  def getSize(value: UTF8String): Int =
-    ByteArrayMethods.roundNumberOfBytesToNearestWord(value.numBytes())
-  def write(target: UnsafeRow, value: UTF8String, column: Int, cursor: Int): Int = {
-    write(target, value.getBytes, column, cursor)
-  }
-}
-
-private object BinaryUnsafeColumnWriter extends BytesUnsafeColumnWriter {
-  protected[this] override def isString: Boolean = false
-  override def getBytes(source: InternalRow, column: Int): Array[Byte] = {
-    source.getAs[Array[Byte]](column)
-  }
-  // specialized for codegen
-  def getSize(value: Array[Byte]): Int =
-    ByteArrayMethods.roundNumberOfBytesToNearestWord(value.length)
-}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index 0320bcb827bf7..afd0d9cfa1ddd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -18,8 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.{NullType, BinaryType, StringType}
-
+import org.apache.spark.sql.types._
 
 /**
  * Generates a [[Projection]] that returns an [[UnsafeRow]].
@@ -32,25 +31,43 @@ import org.apache.spark.sql.types.{NullType, BinaryType, StringType}
  */
 object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafeProjection] {
 
-  protected def canonicalize(in: Seq[Expression]): Seq[Expression] =
-    in.map(ExpressionCanonicalizer.execute)
+  private val StringWriter = classOf[UnsafeRowWriters.UTF8StringWriter].getName
+  private val BinaryWriter = classOf[UnsafeRowWriters.BinaryWriter].getName
 
-  protected def bind(in: Seq[Expression], inputSchema: Seq[Attribute]): Seq[Expression] =
-    in.map(BindReferences.bindReference(_, inputSchema))
+  /** Returns true iff we support this data type. */
+  def canSupport(dataType: DataType): Boolean = dataType match {
+    case t: AtomicType if !t.isInstanceOf[DecimalType] => true
+    case NullType => true
+    case _ => false
+  }
+
+  /**
+   * Generates the code to create an [[UnsafeRow]] object based on the input expressions.
+   * @param ctx context for code generation
+   * @param ev specifies the name of the variable for the output [[UnsafeRow]] object
+   * @param expressions input expressions
+   * @return generated code to put the expression output into an [[UnsafeRow]]
+   */
+  def createCode(ctx: CodeGenContext, ev: GeneratedExpressionCode, expressions: Seq[Expression])
+    : String = {
+
+    val ret = ev.primitive
+    ctx.addMutableState("UnsafeRow", ret, s"$ret = new UnsafeRow();")
+    val bufferTerm = ctx.freshName("buffer")
+    ctx.addMutableState("byte[]", bufferTerm, s"$bufferTerm = new byte[64];")
+    val cursorTerm = ctx.freshName("cursor")
+    val numBytesTerm = ctx.freshName("numBytes")
 
-  protected def create(expressions: Seq[Expression]): UnsafeProjection = {
-    val ctx = newCodeGenContext()
     val exprs = expressions.map(_.gen(ctx))
     val allExprs = exprs.map(_.code).mkString("\n")
     val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length)
-    val stringWriter = "org.apache.spark.sql.catalyst.expressions.StringUnsafeColumnWriter"
-    val binaryWriter = "org.apache.spark.sql.catalyst.expressions.BinaryUnsafeColumnWriter"
+
     val additionalSize = expressions.zipWithIndex.map { case (e, i) =>
       e.dataType match {
         case StringType =>
-          s" + (${exprs(i).isNull} ? 0 : $stringWriter.getSize(${exprs(i).primitive}))"
+          s" + (${exprs(i).isNull} ? 0 : $StringWriter.getSize(${exprs(i).primitive}))"
         case BinaryType =>
-          s" + (${exprs(i).isNull} ? 0 : $binaryWriter.getSize(${exprs(i).primitive}))"
+          s" + (${exprs(i).isNull} ? 0 : $BinaryWriter.getSize(${exprs(i).primitive}))"
         case _ => ""
       }
     }.mkString("")
@@ -58,63 +75,85 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
     val writers = expressions.zipWithIndex.map { case (e, i) =>
       val update = e.dataType match {
         case dt if ctx.isPrimitiveType(dt) =>
-          s"${ctx.setColumn("target", dt, i, exprs(i).primitive)}"
+          s"${ctx.setColumn(ret, dt, i, exprs(i).primitive)}"
         case StringType =>
-          s"cursor += $stringWriter.write(target, ${exprs(i).primitive}, $i, cursor)"
+          s"$cursorTerm += $StringWriter.write($ret, $i, $cursorTerm, ${exprs(i).primitive})"
         case BinaryType =>
-          s"cursor += $binaryWriter.write(target, ${exprs(i).primitive}, $i, cursor)"
+          s"$cursorTerm += $BinaryWriter.write($ret, $i, $cursorTerm, ${exprs(i).primitive})"
         case NullType => ""
         case _ =>
           throw new UnsupportedOperationException(s"Not supported DataType: ${e.dataType}")
       }
       s"""if (${exprs(i).isNull}) {
-            target.setNullAt($i);
+            $ret.setNullAt($i);
           } else {
             $update;
           }"""
     }.mkString("\n          ")
 
-    val code = s"""
-    private $exprType[] expressions;
+    s"""
+      $allExprs
+      int $numBytesTerm = $fixedSize $additionalSize;
+      if ($numBytesTerm > $bufferTerm.length) {
+        $bufferTerm = new byte[$numBytesTerm];
+      }
 
-    public Object generate($exprType[] expr) {
-      this.expressions = expr;
-      return new SpecificProjection();
-    }
+      $ret.pointTo(
+        $bufferTerm,
+        org.apache.spark.unsafe.PlatformDependent.BYTE_ARRAY_OFFSET,
+        ${expressions.size},
+        $numBytesTerm);
+      int $cursorTerm = $fixedSize;
 
-    class SpecificProjection extends ${classOf[UnsafeProjection].getName} {
 
-      private UnsafeRow target = new UnsafeRow();
-      private byte[] buffer = new byte[64];
-      ${declareMutableStates(ctx)}
+      $writers
+      boolean ${ev.isNull} = false;
+     """
+  }
 
-      public SpecificProjection() {
-        ${initMutableStates(ctx)}
-      }
+  protected def canonicalize(in: Seq[Expression]): Seq[Expression] =
+    in.map(ExpressionCanonicalizer.execute)
+
+  protected def bind(in: Seq[Expression], inputSchema: Seq[Attribute]): Seq[Expression] =
+    in.map(BindReferences.bindReference(_, inputSchema))
+
+  protected def create(expressions: Seq[Expression]): UnsafeProjection = {
+    val ctx = newCodeGenContext()
+
+    val isNull = ctx.freshName("retIsNull")
+    val primitive = ctx.freshName("retValue")
+    val eval = GeneratedExpressionCode("", isNull, primitive)
+    eval.code = createCode(ctx, eval, expressions)
 
-      // Scala.Function1 need this
-      public Object apply(Object row) {
-        return apply((InternalRow) row);
+    val code = s"""
+      private $exprType[] expressions;
+
+      public Object generate($exprType[] expr) {
+        this.expressions = expr;
+        return new SpecificProjection();
       }
 
-      public UnsafeRow apply(InternalRow i) {
-        $allExprs
+      class SpecificProjection extends ${classOf[UnsafeProjection].getName} {
+
+        ${declareMutableStates(ctx)}
+
+        public SpecificProjection() {
+          ${initMutableStates(ctx)}
+        }
+
+        // Scala.Function1 need this
+        public Object apply(Object row) {
+          return apply((InternalRow) row);
+        }
 
-        // additionalSize had '+' in the beginning
-        int numBytes = $fixedSize $additionalSize;
-        if (numBytes > buffer.length) {
-          buffer = new byte[numBytes];
+        public UnsafeRow apply(InternalRow i) {
+          ${eval.code}
+          return ${eval.primitive};
         }
-        target.pointTo(buffer, org.apache.spark.unsafe.PlatformDependent.BYTE_ARRAY_OFFSET,
-          ${expressions.size}, numBytes);
-        int cursor = $fixedSize;
-        $writers
-        return target;
       }
-    }
-    """
+      """
 
-    logDebug(s"code for ${expressions.mkString(",")}:\n${CodeFormatter.format(code)}")
+    logDebug(s"code for ${expressions.mkString(",")}:\n$code")
 
     val c = compile(code)
     c.generate(ctx.references.toArray).asInstanceOf[UnsafeProjection]
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 6e17ffcda9dc4..4930219aa63cb 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -43,7 +43,7 @@ trait ExpressionEvalHelper {
     checkEvaluationWithoutCodegen(expression, catalystValue, inputRow)
     checkEvaluationWithGeneratedMutableProjection(expression, catalystValue, inputRow)
     checkEvaluationWithGeneratedProjection(expression, catalystValue, inputRow)
-    if (UnsafeColumnWriter.canEmbed(expression.dataType)) {
+    if (GenerateUnsafeProjection.canSupport(expression.dataType)) {
       checkEvalutionWithUnsafeProjection(expression, catalystValue, inputRow)
     }
     checkEvaluationWithOptimization(expression, catalystValue, inputRow)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index a5d9806c20463..4606bcb57311d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -26,7 +26,6 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.array.ByteArrayMethods
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -34,22 +33,15 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
 
   test("basic conversion with only primitive types") {
     val fieldTypes: Array[DataType] = Array(LongType, LongType, IntegerType)
-    val converter = new UnsafeRowConverter(fieldTypes)
+    val converter = UnsafeProjection.create(fieldTypes)
 
     val row = new SpecificMutableRow(fieldTypes)
     row.setLong(0, 0)
     row.setLong(1, 1)
     row.setInt(2, 2)
 
-    val sizeRequired: Int = converter.getSizeRequirement(row)
-    assert(sizeRequired === 8 + (3 * 8))
-    val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
-    val numBytesWritten =
-      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired)
-    assert(numBytesWritten === sizeRequired)
-
-    val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired)
+    val unsafeRow: UnsafeRow = converter.apply(row)
+    assert(converter.apply(row).getSizeInBytes === 8 + (3 * 8))
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.getLong(1) === 1)
     assert(unsafeRow.getInt(2) === 2)
@@ -73,25 +65,18 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
 
   test("basic conversion with primitive, string and binary types") {
     val fieldTypes: Array[DataType] = Array(LongType, StringType, BinaryType)
-    val converter = new UnsafeRowConverter(fieldTypes)
+    val converter = UnsafeProjection.create(fieldTypes)
 
     val row = new SpecificMutableRow(fieldTypes)
     row.setLong(0, 0)
     row.update(1, UTF8String.fromString("Hello"))
     row.update(2, "World".getBytes)
 
-    val sizeRequired: Int = converter.getSizeRequirement(row)
-    assert(sizeRequired === 8 + (8 * 3) +
+    val unsafeRow: UnsafeRow = converter.apply(row)
+    assert(unsafeRow.getSizeInBytes === 8 + (8 * 3) +
       ByteArrayMethods.roundNumberOfBytesToNearestWord("Hello".getBytes.length) +
       ByteArrayMethods.roundNumberOfBytesToNearestWord("World".getBytes.length))
-    val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
-    val numBytesWritten = converter.writeRow(
-      row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired)
-    assert(numBytesWritten === sizeRequired)
-
-    val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(
-      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired)
+
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.getString(1) === "Hello")
     assert(unsafeRow.getBinary(2) === "World".getBytes)
@@ -99,7 +84,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
 
   test("basic conversion with primitive, string, date and timestamp types") {
     val fieldTypes: Array[DataType] = Array(LongType, StringType, DateType, TimestampType)
-    val converter = new UnsafeRowConverter(fieldTypes)
+    val converter = UnsafeProjection.create(fieldTypes)
 
     val row = new SpecificMutableRow(fieldTypes)
     row.setLong(0, 0)
@@ -107,17 +92,10 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     row.update(2, DateTimeUtils.fromJavaDate(Date.valueOf("1970-01-01")))
     row.update(3, DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-05-08 08:10:25")))
 
-    val sizeRequired: Int = converter.getSizeRequirement(row)
-    assert(sizeRequired === 8 + (8 * 4) +
+    val unsafeRow: UnsafeRow = converter.apply(row)
+    assert(unsafeRow.getSizeInBytes === 8 + (8 * 4) +
       ByteArrayMethods.roundNumberOfBytesToNearestWord("Hello".getBytes.length))
-    val buffer: Array[Long] = new Array[Long](sizeRequired / 8)
-    val numBytesWritten =
-      converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET, sizeRequired)
-    assert(numBytesWritten === sizeRequired)
-
-    val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(
-      buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired)
+
     assert(unsafeRow.getLong(0) === 0)
     assert(unsafeRow.getString(1) === "Hello")
     // Date is represented as Int in unsafeRow
@@ -148,26 +126,18 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       // DecimalType.Default,
       // ArrayType(IntegerType)
     )
-    val converter = new UnsafeRowConverter(fieldTypes)
+    val converter = UnsafeProjection.create(fieldTypes)
 
     val rowWithAllNullColumns: InternalRow = {
       val r = new SpecificMutableRow(fieldTypes)
-      for (i <- 0 to fieldTypes.length - 1) {
+      for (i <- fieldTypes.indices) {
         r.setNullAt(i)
       }
       r
     }
 
-    val sizeRequired: Int = converter.getSizeRequirement(rowWithAllNullColumns)
-    val createdFromNullBuffer: Array[Long] = new Array[Long](sizeRequired / 8)
-    val numBytesWritten = converter.writeRow(
-      rowWithAllNullColumns, createdFromNullBuffer, PlatformDependent.LONG_ARRAY_OFFSET,
-      sizeRequired)
-    assert(numBytesWritten === sizeRequired)
+    val createdFromNull: UnsafeRow = converter.apply(rowWithAllNullColumns)
 
-    val createdFromNull = new UnsafeRow()
-    createdFromNull.pointTo(
-      createdFromNullBuffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, sizeRequired)
     for (i <- fieldTypes.indices) {
       assert(createdFromNull.isNullAt(i))
     }
@@ -202,15 +172,8 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       // r.update(11, Array(11))
       r
     }
-    val setToNullAfterCreationBuffer: Array[Long] = new Array[Long](sizeRequired / 8 + 2)
-    converter.writeRow(
-      rowWithNoNullColumns, setToNullAfterCreationBuffer, PlatformDependent.LONG_ARRAY_OFFSET,
-      sizeRequired)
-    val setToNullAfterCreation = new UnsafeRow()
-    setToNullAfterCreation.pointTo(
-      setToNullAfterCreationBuffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length,
-      sizeRequired)
 
+    val setToNullAfterCreation = converter.apply(rowWithNoNullColumns)
     assert(setToNullAfterCreation.isNullAt(0) === rowWithNoNullColumns.isNullAt(0))
     assert(setToNullAfterCreation.getBoolean(1) === rowWithNoNullColumns.getBoolean(1))
     assert(setToNullAfterCreation.getByte(2) === rowWithNoNullColumns.getByte(2))
@@ -228,8 +191,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       setToNullAfterCreation.setNullAt(i)
     }
     // There are some garbage left in the var-length area
-    assert(Arrays.equals(createdFromNullBuffer,
-      java.util.Arrays.copyOf(setToNullAfterCreationBuffer, sizeRequired / 8)))
+    assert(Arrays.equals(createdFromNull.getBytes, setToNullAfterCreation.getBytes()))
 
     setToNullAfterCreation.setNullAt(0)
     setToNullAfterCreation.setBoolean(1, false)
@@ -269,12 +231,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     row2.setFloat(0, java.lang.Float.intBitsToFloat(0x7fffffff))
     row2.setDouble(1, java.lang.Double.longBitsToDouble(0x7fffffffffffffffL))
 
-    val converter = new UnsafeRowConverter(fieldTypes)
-    val row1Buffer = new Array[Byte](converter.getSizeRequirement(row1))
-    val row2Buffer = new Array[Byte](converter.getSizeRequirement(row2))
-    converter.writeRow(row1, row1Buffer, PlatformDependent.BYTE_ARRAY_OFFSET, row1Buffer.length)
-    converter.writeRow(row2, row2Buffer, PlatformDependent.BYTE_ARRAY_OFFSET, row2Buffer.length)
-
-    assert(row1Buffer.toSeq === row2Buffer.toSeq)
+    val converter = UnsafeProjection.create(fieldTypes)
+    assert(converter.apply(row1).getBytes === converter.apply(row2).getBytes)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala
index a1e1695717e23..40b47ae18d648 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala
@@ -22,29 +22,22 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
-import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeRowConverter}
+import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.PlatformDependent
 
 class UnsafeRowSerializerSuite extends SparkFunSuite {
 
   private def toUnsafeRow(row: Row, schema: Array[DataType]): UnsafeRow = {
     val internalRow = CatalystTypeConverters.convertToCatalyst(row).asInstanceOf[InternalRow]
-    val rowConverter = new UnsafeRowConverter(schema)
-    val rowSizeInBytes = rowConverter.getSizeRequirement(internalRow)
-    val byteArray = new Array[Byte](rowSizeInBytes)
-    rowConverter.writeRow(
-      internalRow, byteArray, PlatformDependent.BYTE_ARRAY_OFFSET, rowSizeInBytes)
-    val unsafeRow = new UnsafeRow()
-    unsafeRow.pointTo(byteArray, PlatformDependent.BYTE_ARRAY_OFFSET, row.length, rowSizeInBytes)
-    unsafeRow
+    val converter = UnsafeProjection.create(schema)
+    converter.apply(internalRow)
   }
 
-  ignore("toUnsafeRow() test helper method") {
+  test("toUnsafeRow() test helper method") {
     // This currently doesnt work because the generic getter throws an exception.
     val row = Row("Hello", 123)
     val unsafeRow = toUnsafeRow(row, Array(StringType, IntegerType))
-    assert(row.getString(0) === unsafeRow.get(0).toString)
+    assert(row.getString(0) === unsafeRow.getUTF8String(0).toString)
     assert(row.getInt(1) === unsafeRow.getInt(1))
   }
 
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java
new file mode 100644
index 0000000000000..69b0e206cef18
--- /dev/null
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.types;
+
+import org.apache.spark.unsafe.PlatformDependent;
+
+public class ByteArray {
+
+  /**
+   * Writes the content of a byte array into a memory address, identified by an object and an
+   * offset. The target memory address must already been allocated, and have enough space to
+   * hold all the bytes in this string.
+   */
+  public static void writeToMemory(byte[] src, Object target, long targetOffset) {
+    PlatformDependent.copyMemory(
+      src,
+      PlatformDependent.BYTE_ARRAY_OFFSET,
+      target,
+      targetOffset,
+      src.length
+    );
+  }
+}
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 6d8dcb1cbf876..85381cf0ef425 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -95,6 +95,21 @@ protected UTF8String(Object base, long offset, int size) {
     this.numBytes = size;
   }
 
+  /**
+   * Writes the content of this string into a memory address, identified by an object and an offset.
+   * The target memory address must already been allocated, and have enough space to hold all the
+   * bytes in this string.
+   */
+  public void writeToMemory(Object target, long targetOffset) {
+    PlatformDependent.copyMemory(
+      base,
+      offset,
+      target,
+      targetOffset,
+      numBytes
+    );
+  }
+
   /**
    * Returns the number of bytes for a code point with the first byte as `b`
    * @param b The first byte of a code point

From c980e20cf17f2980c564beab9b241022872e29ea Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Sat, 25 Jul 2015 11:05:08 +0100
Subject: [PATCH 373/549] [SPARK-9304] [BUILD] Improve backwards compatibility
 of SPARK-8401

Add back change-version-to-X.sh scripts, as wrappers for new script, for backwards compatibility

Author: Sean Owen <sowen@cloudera.com>

Closes #7639 from srowen/SPARK-9304 and squashes the following commits:

9ab2681 [Sean Owen] Add deprecation message to wrappers
3c8c202 [Sean Owen] Add back change-version-to-X.sh scripts, as wrappers for new script, for backwards compatibility
---
 dev/change-version-to-2.10.sh | 23 +++++++++++++++++++++++
 dev/change-version-to-2.11.sh | 23 +++++++++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100755 dev/change-version-to-2.10.sh
 create mode 100755 dev/change-version-to-2.11.sh

diff --git a/dev/change-version-to-2.10.sh b/dev/change-version-to-2.10.sh
new file mode 100755
index 0000000000000..0962d34c52f28
--- /dev/null
+++ b/dev/change-version-to-2.10.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script exists for backwards compability. Use change-scala-version.sh instead.
+echo "This script is deprecated. Please instead run: change-scala-version.sh 2.10"
+
+$(dirname $0)/change-scala-version.sh 2.10
diff --git a/dev/change-version-to-2.11.sh b/dev/change-version-to-2.11.sh
new file mode 100755
index 0000000000000..4ccfeef09fd04
--- /dev/null
+++ b/dev/change-version-to-2.11.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script exists for backwards compability. Use change-scala-version.sh instead.
+echo "This script is deprecated. Please instead run: change-scala-version.sh 2.11"
+
+$(dirname $0)/change-scala-version.sh 2.11

From e2ec018e37cb699077b5fa2bd662f2055cb42296 Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sat, 25 Jul 2015 11:42:49 -0700
Subject: [PATCH 374/549] [SPARK-9285] [SQL] Fixes Row/InternalRow conversion
 for HadoopFsRelation

This is a follow-up of #7626. It fixes `Row`/`InternalRow` conversion for data sources extending `HadoopFsRelation` with `needConversion` being `true`.

Author: Cheng Lian <lian@databricks.com>

Closes #7649 from liancheng/spark-9285-conversion-fix and squashes the following commits:

036a50c [Cheng Lian] Addresses PR comment
f6d7c6a [Cheng Lian] Fixes Row/InternalRow conversion for HadoopFsRelation
---
 .../apache/spark/sql/sources/interfaces.scala | 23 ++++++++++++++++---
 .../SimpleTextHadoopFsRelationSuite.scala     |  5 ----
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 119bac786d478..7126145ddc010 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -28,7 +28,7 @@ import org.apache.spark.{Logging, SparkContext}
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
 import org.apache.spark.sql.execution.RDDConversions
@@ -593,6 +593,11 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
    *
    * @since 1.4.0
    */
+  // TODO Tries to eliminate the extra Catalyst-to-Scala conversion when `needConversion` is true
+  //
+  // PR #7626 separated `Row` and `InternalRow` completely.  One of the consequences is that we can
+  // no longer treat an `InternalRow` containing Catalyst values as a `Row`.  Thus we have to
+  // introduce another row value conversion for data sources whose `needConversion` is true.
   def buildScan(requiredColumns: Array[String], inputFiles: Array[FileStatus]): RDD[Row] = {
     // Yeah, to workaround serialization...
     val dataSchema = this.dataSchema
@@ -611,14 +616,26 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
       } else {
         rdd.asInstanceOf[RDD[InternalRow]]
       }
+
     converted.mapPartitions { rows =>
       val buildProjection = if (codegenEnabled) {
         GenerateMutableProjection.generate(requiredOutput, dataSchema.toAttributes)
       } else {
         () => new InterpretedMutableProjection(requiredOutput, dataSchema.toAttributes)
       }
-      val mutableProjection = buildProjection()
-      rows.map(r => mutableProjection(r))
+
+      val projectedRows = {
+        val mutableProjection = buildProjection()
+        rows.map(r => mutableProjection(r))
+      }
+
+      if (needConversion) {
+        val requiredSchema = StructType(requiredColumns.map(dataSchema(_)))
+        val toScala = CatalystTypeConverters.createToScalaConverter(requiredSchema)
+        projectedRows.map(toScala(_).asInstanceOf[Row])
+      } else {
+        projectedRows
+      }
     }.asInstanceOf[RDD[Row]]
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala
index d761909d60e21..e8975e5f5cd08 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextHadoopFsRelationSuite.scala
@@ -22,10 +22,6 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
 
-/*
-This is commented out due a bug in the data source API (SPARK-9291).
-
-
 class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
   override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName
 
@@ -54,4 +50,3 @@ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
     }
   }
 }
-*/

From 2c94d0f24a37fa079b56d534b0b0a4574209215b Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sat, 25 Jul 2015 12:10:02 -0700
Subject: [PATCH 375/549] [SPARK-9192][SQL] add initialization phase for
 nondeterministic expression

Currently nondeterministic expression is broken without a explicit initialization phase.

Let me take `MonotonicallyIncreasingID` as an example. This expression need a mutable state to remember how many times it has been evaluated, so we use `transient var count: Long` there. By being transient, the `count` will be reset to 0 and **only** to 0 when serialize and deserialize it, as deserialize transient variable will result to default value. There is *no way* to use another initial value for `count`, until we add the explicit initialization phase.

Another use case is local execution for `LocalRelation`, there is no serialize and deserialize phase and thus we can't reset mutable states for it.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7535 from cloud-fan/init and squashes the following commits:

6c6f332 [Wenchen Fan] add test
ef68ff4 [Wenchen Fan] fix comments
9eac85e [Wenchen Fan] move init code to interpreted class
bb7d838 [Wenchen Fan] pulls out nondeterministic expressions into a project
b4a4fc7 [Wenchen Fan] revert a refactor
86fee36 [Wenchen Fan] add initialization phase for nondeterministic expression
---
 .../sql/catalyst/analysis/Analyzer.scala      |  35 +++++-
 .../sql/catalyst/analysis/CheckAnalysis.scala |  19 +++-
 .../sql/catalyst/expressions/Expression.scala |  21 +++-
 .../sql/catalyst/expressions/Projection.scala |  10 ++
 .../sql/catalyst/expressions/predicates.scala |   4 +
 .../sql/catalyst/expressions/random.scala     |  12 +-
 .../plans/logical/basicOperators.scala        |   3 +-
 .../sql/catalyst/analysis/AnalysisSuite.scala |  96 +++++++---------
 .../sql/catalyst/analysis/AnalysisTest.scala  | 105 ++++++++++++++++++
 .../expressions/ExpressionEvalHelper.scala    |   4 +
 .../MonotonicallyIncreasingID.scala           |  13 ++-
 .../expressions/SparkPartitionID.scala        |   8 +-
 12 files changed, 254 insertions(+), 76 deletions(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index e916887187dc8..a723e92114b32 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -19,10 +19,11 @@ package org.apache.spark.sql.catalyst.analysis
 
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, AggregateExpression2, AggregateFunction2}
-import org.apache.spark.sql.catalyst.{SimpleCatalystConf, CatalystConf}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.catalyst.trees.TreeNodeRef
+import org.apache.spark.sql.catalyst.{SimpleCatalystConf, CatalystConf}
 import org.apache.spark.sql.types._
 import scala.collection.mutable.ArrayBuffer
 
@@ -78,7 +79,9 @@ class Analyzer(
       GlobalAggregates ::
       UnresolvedHavingClauseAttributes ::
       HiveTypeCoercion.typeCoercionRules ++
-      extendedResolutionRules : _*)
+      extendedResolutionRules : _*),
+    Batch("Nondeterministic", Once,
+      PullOutNondeterministic)
   )
 
   /**
@@ -910,6 +913,34 @@ class Analyzer(
         Project(finalProjectList, withWindow)
     }
   }
+
+  /**
+   * Pulls out nondeterministic expressions from LogicalPlan which is not Project or Filter,
+   * put them into an inner Project and finally project them away at the outer Project.
+   */
+  object PullOutNondeterministic extends Rule[LogicalPlan] {
+    override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+      case p: Project => p
+      case f: Filter => f
+
+      // todo: It's hard to write a general rule to pull out nondeterministic expressions
+      // from LogicalPlan, currently we only do it for UnaryNode which has same output
+      // schema with its child.
+      case p: UnaryNode if p.output == p.child.output && p.expressions.exists(!_.deterministic) =>
+        val nondeterministicExprs = p.expressions.filterNot(_.deterministic).map { e =>
+          val ne = e match {
+            case n: NamedExpression => n
+            case _ => Alias(e, "_nondeterministic")()
+          }
+          new TreeNodeRef(e) -> ne
+        }.toMap
+        val newPlan = p.transformExpressions { case e =>
+          nondeterministicExprs.get(new TreeNodeRef(e)).map(_.toAttribute).getOrElse(e)
+        }
+        val newChild = Project(p.child.output ++ nondeterministicExprs.values, p.child)
+        Project(p.output, newPlan.withNewChildren(newChild :: Nil))
+    }
+  }
 }
 
 /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 81d473c1130f7..a373714832962 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.analysis
 
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression2
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._
 
@@ -38,10 +37,10 @@ trait CheckAnalysis {
     throw new AnalysisException(msg)
   }
 
-  def containsMultipleGenerators(exprs: Seq[Expression]): Boolean = {
+  protected def containsMultipleGenerators(exprs: Seq[Expression]): Boolean = {
     exprs.flatMap(_.collect {
-      case e: Generator => true
-    }).nonEmpty
+      case e: Generator => e
+    }).length > 1
   }
 
   def checkAnalysis(plan: LogicalPlan): Unit = {
@@ -137,13 +136,21 @@ trait CheckAnalysis {
               s"""
                  |Failure when resolving conflicting references in Join:
                  |$plan
-                  |Conflicting attributes: ${conflictingAttributes.mkString(",")}
-                  |""".stripMargin)
+                 |Conflicting attributes: ${conflictingAttributes.mkString(",")}
+                 |""".stripMargin)
 
           case o if !o.resolved =>
             failAnalysis(
               s"unresolved operator ${operator.simpleString}")
 
+          case o if o.expressions.exists(!_.deterministic) &&
+            !o.isInstanceOf[Project] && !o.isInstanceOf[Filter] =>
+            failAnalysis(
+              s"""nondeterministic expressions are only allowed in Project or Filter, found:
+                 | ${o.expressions.map(_.prettyString).mkString(",")}
+                 |in operator ${operator.simpleString}
+             """.stripMargin)
+
           case _ => // Analysis successful!
         }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 3f72e6e184db1..cb4c3f24b2721 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -196,7 +196,26 @@ trait Unevaluable extends Expression {
  * An expression that is nondeterministic.
  */
 trait Nondeterministic extends Expression {
-  override def deterministic: Boolean = false
+  final override def deterministic: Boolean = false
+  final override def foldable: Boolean = false
+
+  private[this] var initialized = false
+
+  final def initialize(): Unit = {
+    if (!initialized) {
+      initInternal()
+      initialized = true
+    }
+  }
+
+  protected def initInternal(): Unit
+
+  final override def eval(input: InternalRow = null): Any = {
+    require(initialized, "nondeterministic expression should be initialized before evaluate")
+    evalInternal(input)
+  }
+
+  protected def evalInternal(input: InternalRow): Any
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index fb873e7e99547..c1ed9cf7ed6a0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -31,6 +31,11 @@ class InterpretedProjection(expressions: Seq[Expression]) extends Projection {
   def this(expressions: Seq[Expression], inputSchema: Seq[Attribute]) =
     this(expressions.map(BindReferences.bindReference(_, inputSchema)))
 
+  expressions.foreach(_.foreach {
+    case n: Nondeterministic => n.initialize()
+    case _ =>
+  })
+
   // null check is required for when Kryo invokes the no-arg constructor.
   protected val exprArray = if (expressions != null) expressions.toArray else null
 
@@ -57,6 +62,11 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu
   def this(expressions: Seq[Expression], inputSchema: Seq[Attribute]) =
     this(expressions.map(BindReferences.bindReference(_, inputSchema)))
 
+  expressions.foreach(_.foreach {
+    case n: Nondeterministic => n.initialize()
+    case _ =>
+  })
+
   private[this] val exprArray = expressions.toArray
   private[this] var mutableRow: MutableRow = new GenericMutableRow(exprArray.length)
   def currentValue: InternalRow = mutableRow
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 3f1bd2a925fe7..5bfe1cad24a3e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -30,6 +30,10 @@ object InterpretedPredicate {
     create(BindReferences.bindReference(expression, inputSchema))
 
   def create(expression: Expression): (InternalRow => Boolean) = {
+    expression.foreach {
+      case n: Nondeterministic => n.initialize()
+      case _ =>
+    }
     (r: InternalRow) => expression.eval(r).asInstanceOf[Boolean]
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index aef24a5486466..8f30519697a37 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -38,9 +38,13 @@ abstract class RDG extends LeafExpression with Nondeterministic {
 
   /**
    * Record ID within each partition. By being transient, the Random Number Generator is
-   * reset every time we serialize and deserialize it.
+   * reset every time we serialize and deserialize and initialize it.
    */
-  @transient protected lazy val rng = new XORShiftRandom(seed + TaskContext.getPartitionId)
+  @transient protected var rng: XORShiftRandom = _
+
+  override protected def initInternal(): Unit = {
+    rng = new XORShiftRandom(seed + TaskContext.getPartitionId)
+  }
 
   override def nullable: Boolean = false
 
@@ -49,7 +53,7 @@ abstract class RDG extends LeafExpression with Nondeterministic {
 
 /** Generate a random column with i.i.d. uniformly distributed values in [0, 1). */
 case class Rand(seed: Long) extends RDG {
-  override def eval(input: InternalRow): Double = rng.nextDouble()
+  override protected def evalInternal(input: InternalRow): Double = rng.nextDouble()
 
   def this() = this(Utils.random.nextLong())
 
@@ -72,7 +76,7 @@ case class Rand(seed: Long) extends RDG {
 
 /** Generate a random column with i.i.d. gaussian random distribution. */
 case class Randn(seed: Long) extends RDG {
-  override def eval(input: InternalRow): Double = rng.nextGaussian()
+  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()
 
   def this() = this(Utils.random.nextLong())
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index 57a12820fa4c6..8e1a236e2988c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.catalyst.plans.logical
 
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression2
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
@@ -379,7 +378,7 @@ case class Limit(limitExpr: Expression, child: LogicalPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
 
   override lazy val statistics: Statistics = {
-    val limit = limitExpr.eval(null).asInstanceOf[Int]
+    val limit = limitExpr.eval().asInstanceOf[Int]
     val sizeInBytes = (limit: Long) * output.map(a => a.dataType.defaultSize).sum
     Statistics(sizeInBytes = sizeInBytes)
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 7e67427237a65..ed645b618dc9b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -17,10 +17,6 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
-import org.scalatest.BeforeAndAfter
-
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._
@@ -28,6 +24,7 @@ import org.apache.spark.sql.catalyst.SimpleCatalystConf
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 
+// todo: remove this and use AnalysisTest instead.
 object AnalysisSuite {
   val caseSensitiveConf = new SimpleCatalystConf(true)
   val caseInsensitiveConf = new SimpleCatalystConf(false)
@@ -55,7 +52,7 @@ object AnalysisSuite {
     AttributeReference("a", StringType)(),
     AttributeReference("b", StringType)(),
     AttributeReference("c", DoubleType)(),
-    AttributeReference("d", DecimalType.SYSTEM_DEFAULT)(),
+    AttributeReference("d", DecimalType(10, 2))(),
     AttributeReference("e", ShortType)())
 
   val nestedRelation = LocalRelation(
@@ -81,8 +78,7 @@ object AnalysisSuite {
 }
 
 
-class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
-  import AnalysisSuite._
+class AnalysisSuite extends AnalysisTest {
 
   test("union project *") {
     val plan = (1 to 100)
@@ -91,7 +87,7 @@ class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
         a.select(UnresolvedStar(None)).select('a).unionAll(b.select(UnresolvedStar(None)))
       }
 
-    assert(caseInsensitiveAnalyzer.execute(plan).resolved)
+    assertAnalysisSuccess(plan)
   }
 
   test("check project's resolved") {
@@ -106,61 +102,40 @@ class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
   }
 
   test("analyze project") {
-    assert(
-      caseSensitiveAnalyzer.execute(Project(Seq(UnresolvedAttribute("a")), testRelation)) ===
-        Project(testRelation.output, testRelation))
-
-    assert(
-      caseSensitiveAnalyzer.execute(
-        Project(Seq(UnresolvedAttribute("TbL.a")),
-          UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) ===
-        Project(testRelation.output, testRelation))
-
-    val e = intercept[AnalysisException] {
-      caseSensitiveAnalyze(
-        Project(Seq(UnresolvedAttribute("tBl.a")),
-          UnresolvedRelation(Seq("TaBlE"), Some("TbL"))))
-    }
-    assert(e.getMessage().toLowerCase.contains("cannot resolve"))
-
-    assert(
-      caseInsensitiveAnalyzer.execute(
-        Project(Seq(UnresolvedAttribute("TbL.a")),
-          UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) ===
-        Project(testRelation.output, testRelation))
-
-    assert(
-      caseInsensitiveAnalyzer.execute(
-        Project(Seq(UnresolvedAttribute("tBl.a")),
-          UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) ===
-        Project(testRelation.output, testRelation))
+    checkAnalysis(
+      Project(Seq(UnresolvedAttribute("a")), testRelation),
+      Project(testRelation.output, testRelation))
+
+    checkAnalysis(
+      Project(Seq(UnresolvedAttribute("TbL.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL"))),
+      Project(testRelation.output, testRelation))
+
+    assertAnalysisError(
+      Project(Seq(UnresolvedAttribute("tBl.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL"))),
+      Seq("cannot resolve"))
+
+    checkAnalysis(
+      Project(Seq(UnresolvedAttribute("TbL.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL"))),
+      Project(testRelation.output, testRelation),
+      caseSensitive = false)
+
+    checkAnalysis(
+      Project(Seq(UnresolvedAttribute("tBl.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL"))),
+      Project(testRelation.output, testRelation),
+      caseSensitive = false)
   }
 
   test("resolve relations") {
-    val e = intercept[RuntimeException] {
-      caseSensitiveAnalyze(UnresolvedRelation(Seq("tAbLe"), None))
-    }
-    assert(e.getMessage == "Table Not Found: tAbLe")
+    assertAnalysisError(UnresolvedRelation(Seq("tAbLe"), None), Seq("Table Not Found: tAbLe"))
 
-    assert(
-      caseSensitiveAnalyzer.execute(UnresolvedRelation(Seq("TaBlE"), None)) === testRelation)
+    checkAnalysis(UnresolvedRelation(Seq("TaBlE"), None), testRelation)
 
-    assert(
-      caseInsensitiveAnalyzer.execute(UnresolvedRelation(Seq("tAbLe"), None)) === testRelation)
+    checkAnalysis(UnresolvedRelation(Seq("tAbLe"), None), testRelation, caseSensitive = false)
 
-    assert(
-      caseInsensitiveAnalyzer.execute(UnresolvedRelation(Seq("TaBlE"), None)) === testRelation)
+    checkAnalysis(UnresolvedRelation(Seq("TaBlE"), None), testRelation, caseSensitive = false)
   }
 
-
   test("divide should be casted into fractional types") {
-    val testRelation2 = LocalRelation(
-      AttributeReference("a", StringType)(),
-      AttributeReference("b", StringType)(),
-      AttributeReference("c", DoubleType)(),
-      AttributeReference("d", DecimalType(10, 2))(),
-      AttributeReference("e", ShortType)())
-
     val plan = caseInsensitiveAnalyzer.execute(
       testRelation2.select(
         'a / Literal(2) as 'div1,
@@ -170,10 +145,21 @@ class AnalysisSuite extends SparkFunSuite with BeforeAndAfter {
         'e / 'e as 'div5))
     val pl = plan.asInstanceOf[Project].projectList
 
+    // StringType will be promoted into Double
     assert(pl(0).dataType == DoubleType)
     assert(pl(1).dataType == DoubleType)
     assert(pl(2).dataType == DoubleType)
-    assert(pl(3).dataType == DoubleType)  // StringType will be promoted into Double
+    assert(pl(3).dataType == DoubleType)
     assert(pl(4).dataType == DoubleType)
   }
+
+  test("pull out nondeterministic expressions from unary LogicalPlan") {
+    val plan = RepartitionByExpression(Seq(Rand(33)), testRelation)
+    val projected = Alias(Rand(33), "_nondeterministic")()
+    val expected =
+      Project(testRelation.output,
+        RepartitionByExpression(Seq(projected.toAttribute),
+          Project(testRelation.output :+ projected, testRelation)))
+    checkAnalysis(plan, expected)
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala
new file mode 100644
index 0000000000000..fdb4f28950daf
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.analysis
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.SimpleCatalystConf
+import org.apache.spark.sql.types._
+
+trait AnalysisTest extends PlanTest {
+  val testRelation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)())
+
+  val testRelation2 = LocalRelation(
+    AttributeReference("a", StringType)(),
+    AttributeReference("b", StringType)(),
+    AttributeReference("c", DoubleType)(),
+    AttributeReference("d", DecimalType(10, 2))(),
+    AttributeReference("e", ShortType)())
+
+  val nestedRelation = LocalRelation(
+    AttributeReference("top", StructType(
+      StructField("duplicateField", StringType) ::
+        StructField("duplicateField", StringType) ::
+        StructField("differentCase", StringType) ::
+        StructField("differentcase", StringType) :: Nil
+    ))())
+
+  val nestedRelation2 = LocalRelation(
+    AttributeReference("top", StructType(
+      StructField("aField", StringType) ::
+        StructField("bField", StringType) ::
+        StructField("cField", StringType) :: Nil
+    ))())
+
+  val listRelation = LocalRelation(
+    AttributeReference("list", ArrayType(IntegerType))())
+
+  val (caseSensitiveAnalyzer, caseInsensitiveAnalyzer) = {
+    val caseSensitiveConf = new SimpleCatalystConf(true)
+    val caseInsensitiveConf = new SimpleCatalystConf(false)
+
+    val caseSensitiveCatalog = new SimpleCatalog(caseSensitiveConf)
+    val caseInsensitiveCatalog = new SimpleCatalog(caseInsensitiveConf)
+
+    caseSensitiveCatalog.registerTable(Seq("TaBlE"), testRelation)
+    caseInsensitiveCatalog.registerTable(Seq("TaBlE"), testRelation)
+
+    new Analyzer(caseSensitiveCatalog, EmptyFunctionRegistry, caseSensitiveConf) {
+      override val extendedResolutionRules = EliminateSubQueries :: Nil
+    } ->
+    new Analyzer(caseInsensitiveCatalog, EmptyFunctionRegistry, caseInsensitiveConf) {
+      override val extendedResolutionRules = EliminateSubQueries :: Nil
+    }
+  }
+
+  protected def getAnalyzer(caseSensitive: Boolean) = {
+    if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer
+  }
+
+  protected def checkAnalysis(
+      inputPlan: LogicalPlan,
+      expectedPlan: LogicalPlan,
+      caseSensitive: Boolean = true): Unit = {
+    val analyzer = getAnalyzer(caseSensitive)
+    val actualPlan = analyzer.execute(inputPlan)
+    analyzer.checkAnalysis(actualPlan)
+    comparePlans(actualPlan, expectedPlan)
+  }
+
+  protected def assertAnalysisSuccess(
+      inputPlan: LogicalPlan,
+      caseSensitive: Boolean = true): Unit = {
+    val analyzer = getAnalyzer(caseSensitive)
+    analyzer.checkAnalysis(analyzer.execute(inputPlan))
+  }
+
+  protected def assertAnalysisError(
+      inputPlan: LogicalPlan,
+      expectedErrors: Seq[String],
+      caseSensitive: Boolean = true): Unit = {
+    val analyzer = getAnalyzer(caseSensitive)
+    // todo: make sure we throw AnalysisException during analysis
+    val e = intercept[Exception] {
+      analyzer.checkAnalysis(analyzer.execute(inputPlan))
+    }
+    expectedErrors.forall(e.getMessage.contains)
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 4930219aa63cb..852a8b235f127 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -64,6 +64,10 @@ trait ExpressionEvalHelper {
   }
 
   protected def evaluate(expression: Expression, inputRow: InternalRow = EmptyRow): Any = {
+    expression.foreach {
+      case n: Nondeterministic => n.initialize()
+      case _ =>
+    }
     expression.eval(inputRow)
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
index 2645eb1854bce..eca36b3274420 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
@@ -37,17 +37,22 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression with
 
   /**
    * Record ID within each partition. By being transient, count's value is reset to 0 every time
-   * we serialize and deserialize it.
+   * we serialize and deserialize and initialize it.
    */
-  @transient private[this] var count: Long = 0L
+  @transient private[this] var count: Long = _
 
-  @transient private lazy val partitionMask = TaskContext.getPartitionId().toLong << 33
+  @transient private[this] var partitionMask: Long = _
+
+  override protected def initInternal(): Unit = {
+    count = 0L
+    partitionMask = TaskContext.getPartitionId().toLong << 33
+  }
 
   override def nullable: Boolean = false
 
   override def dataType: DataType = LongType
 
-  override def eval(input: InternalRow): Long = {
+  override protected def evalInternal(input: InternalRow): Long = {
     val currentCount = count
     count += 1
     partitionMask + currentCount
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
index 53ddd47e3e0c1..61ef079d89af5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
@@ -33,9 +33,13 @@ private[sql] case object SparkPartitionID extends LeafExpression with Nondetermi
 
   override def dataType: DataType = IntegerType
 
-  @transient private lazy val partitionId = TaskContext.getPartitionId()
+  @transient private[this] var partitionId: Int = _
 
-  override def eval(input: InternalRow): Int = partitionId
+  override protected def initInternal(): Unit = {
+    partitionId = TaskContext.getPartitionId()
+  }
+
+  override protected def evalInternal(input: InternalRow): Int = partitionId
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val idTerm = ctx.freshName("partitionId")

From b1f4b4abfd8d038c3684685b245b5fd31b927da0 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 25 Jul 2015 18:41:51 -0700
Subject: [PATCH 376/549] [SPARK-9348][SQL] Remove apply method on InternalRow.

Author: Reynold Xin <rxin@databricks.com>

Closes #7665 from rxin/remove-row-apply and squashes the following commits:

0b43001 [Reynold Xin] support getString in UnsafeRow.
176d633 [Reynold Xin] apply -> get.
2941324 [Reynold Xin] [SPARK-9348][SQL] Remove apply method on InternalRow.
---
 .../sql/catalyst/expressions/UnsafeRow.java   | 88 +++++++++----------
 .../spark/sql/catalyst/InternalRow.scala      | 32 +++----
 .../expressions/codegen/CodeGenerator.scala   |  2 +-
 .../expressions/MathFunctionsSuite.scala      |  2 +-
 .../spark/sql/columnar/ColumnStats.scala      |  4 +-
 .../spark/sql/columnar/ColumnType.scala       | 16 ++--
 .../compression/compressionSchemes.scala      |  2 +-
 .../sql/execution/SparkSqlSerializer2.scala   |  4 +-
 .../datasources/DataSourceStrategy.scala      |  6 +-
 .../spark/sql/execution/debug/package.scala   |  2 +-
 .../spark/sql/execution/pythonUDFs.scala      |  2 +-
 .../sql/expressions/aggregate/udaf.scala      |  4 +-
 .../sql/parquet/ParquetTableOperations.scala  |  6 +-
 .../sql/parquet/ParquetTableSupport.scala     | 22 ++---
 .../scala/org/apache/spark/sql/RowSuite.scala |  4 +-
 .../spark/sql/columnar/ColumnStatsSuite.scala | 12 +--
 .../NullableColumnAccessorSuite.scala         |  2 +-
 .../columnar/NullableColumnBuilderSuite.scala |  2 +-
 .../compression/BooleanBitSetSuite.scala      |  2 +-
 .../spark/sql/hive/HiveInspectors.scala       |  6 +-
 .../hive/execution/InsertIntoHiveTable.scala  |  2 +-
 .../spark/sql/hive/orc/OrcRelation.scala      |  2 +-
 22 files changed, 113 insertions(+), 111 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 225f6e6553d19..9be9089493335 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -231,84 +231,89 @@ public void setFloat(int ordinal, float value) {
   }
 
   @Override
-  public Object get(int i) {
+  public Object get(int ordinal) {
     throw new UnsupportedOperationException();
   }
 
   @Override
-  public <T> T getAs(int i) {
+  public <T> T getAs(int ordinal) {
     throw new UnsupportedOperationException();
   }
 
   @Override
-  public boolean isNullAt(int i) {
-    assertIndexIsValid(i);
-    return BitSetMethods.isSet(baseObject, baseOffset, i);
+  public boolean isNullAt(int ordinal) {
+    assertIndexIsValid(ordinal);
+    return BitSetMethods.isSet(baseObject, baseOffset, ordinal);
   }
 
   @Override
-  public boolean getBoolean(int i) {
-    assertIndexIsValid(i);
-    return PlatformDependent.UNSAFE.getBoolean(baseObject, getFieldOffset(i));
+  public boolean getBoolean(int ordinal) {
+    assertIndexIsValid(ordinal);
+    return PlatformDependent.UNSAFE.getBoolean(baseObject, getFieldOffset(ordinal));
   }
 
   @Override
-  public byte getByte(int i) {
-    assertIndexIsValid(i);
-    return PlatformDependent.UNSAFE.getByte(baseObject, getFieldOffset(i));
+  public byte getByte(int ordinal) {
+    assertIndexIsValid(ordinal);
+    return PlatformDependent.UNSAFE.getByte(baseObject, getFieldOffset(ordinal));
   }
 
   @Override
-  public short getShort(int i) {
-    assertIndexIsValid(i);
-    return PlatformDependent.UNSAFE.getShort(baseObject, getFieldOffset(i));
+  public short getShort(int ordinal) {
+    assertIndexIsValid(ordinal);
+    return PlatformDependent.UNSAFE.getShort(baseObject, getFieldOffset(ordinal));
   }
 
   @Override
-  public int getInt(int i) {
-    assertIndexIsValid(i);
-    return PlatformDependent.UNSAFE.getInt(baseObject, getFieldOffset(i));
+  public int getInt(int ordinal) {
+    assertIndexIsValid(ordinal);
+    return PlatformDependent.UNSAFE.getInt(baseObject, getFieldOffset(ordinal));
   }
 
   @Override
-  public long getLong(int i) {
-    assertIndexIsValid(i);
-    return PlatformDependent.UNSAFE.getLong(baseObject, getFieldOffset(i));
+  public long getLong(int ordinal) {
+    assertIndexIsValid(ordinal);
+    return PlatformDependent.UNSAFE.getLong(baseObject, getFieldOffset(ordinal));
   }
 
   @Override
-  public float getFloat(int i) {
-    assertIndexIsValid(i);
-    if (isNullAt(i)) {
+  public float getFloat(int ordinal) {
+    assertIndexIsValid(ordinal);
+    if (isNullAt(ordinal)) {
       return Float.NaN;
     } else {
-      return PlatformDependent.UNSAFE.getFloat(baseObject, getFieldOffset(i));
+      return PlatformDependent.UNSAFE.getFloat(baseObject, getFieldOffset(ordinal));
     }
   }
 
   @Override
-  public double getDouble(int i) {
-    assertIndexIsValid(i);
-    if (isNullAt(i)) {
+  public double getDouble(int ordinal) {
+    assertIndexIsValid(ordinal);
+    if (isNullAt(ordinal)) {
       return Float.NaN;
     } else {
-      return PlatformDependent.UNSAFE.getDouble(baseObject, getFieldOffset(i));
+      return PlatformDependent.UNSAFE.getDouble(baseObject, getFieldOffset(ordinal));
     }
   }
 
   @Override
-  public UTF8String getUTF8String(int i) {
-    assertIndexIsValid(i);
-    return isNullAt(i) ? null : UTF8String.fromBytes(getBinary(i));
+  public UTF8String getUTF8String(int ordinal) {
+    assertIndexIsValid(ordinal);
+    return isNullAt(ordinal) ? null : UTF8String.fromBytes(getBinary(ordinal));
   }
 
   @Override
-  public byte[] getBinary(int i) {
-    if (isNullAt(i)) {
+  public String getString(int ordinal) {
+    return getUTF8String(ordinal).toString();
+  }
+
+  @Override
+  public byte[] getBinary(int ordinal) {
+    if (isNullAt(ordinal)) {
       return null;
     } else {
-      assertIndexIsValid(i);
-      final long offsetAndSize = getLong(i);
+      assertIndexIsValid(ordinal);
+      final long offsetAndSize = getLong(ordinal);
       final int offset = (int) (offsetAndSize >> 32);
       final int size = (int) (offsetAndSize & ((1L << 32) - 1));
       final byte[] bytes = new byte[size];
@@ -324,17 +329,12 @@ public byte[] getBinary(int i) {
   }
 
   @Override
-  public String getString(int i) {
-    return getUTF8String(i).toString();
-  }
-
-  @Override
-  public UnsafeRow getStruct(int i, int numFields) {
-    if (isNullAt(i)) {
+  public UnsafeRow getStruct(int ordinal, int numFields) {
+    if (isNullAt(ordinal)) {
       return null;
     } else {
-      assertIndexIsValid(i);
-      final long offsetAndSize = getLong(i);
+      assertIndexIsValid(ordinal);
+      final long offsetAndSize = getLong(ordinal);
       final int offset = (int) (offsetAndSize >> 32);
       final int size = (int) (offsetAndSize & ((1L << 32) - 1));
       final UnsafeRow row = new UnsafeRow();
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index f248b1f338acc..37f0f57e9e6d3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.types.Decimal
 import org.apache.spark.unsafe.types.UTF8String
 
 /**
@@ -29,35 +30,34 @@ abstract class InternalRow extends Serializable {
 
   def numFields: Int
 
-  def get(i: Int): Any
+  def get(ordinal: Int): Any
 
-  // TODO: Remove this.
-  def apply(i: Int): Any = get(i)
+  def getAs[T](ordinal: Int): T = get(ordinal).asInstanceOf[T]
 
-  def getAs[T](i: Int): T = get(i).asInstanceOf[T]
+  def isNullAt(ordinal: Int): Boolean = get(ordinal) == null
 
-  def isNullAt(i: Int): Boolean = get(i) == null
+  def getBoolean(ordinal: Int): Boolean = getAs[Boolean](ordinal)
 
-  def getBoolean(i: Int): Boolean = getAs[Boolean](i)
+  def getByte(ordinal: Int): Byte = getAs[Byte](ordinal)
 
-  def getByte(i: Int): Byte = getAs[Byte](i)
+  def getShort(ordinal: Int): Short = getAs[Short](ordinal)
 
-  def getShort(i: Int): Short = getAs[Short](i)
+  def getInt(ordinal: Int): Int = getAs[Int](ordinal)
 
-  def getInt(i: Int): Int = getAs[Int](i)
+  def getLong(ordinal: Int): Long = getAs[Long](ordinal)
 
-  def getLong(i: Int): Long = getAs[Long](i)
+  def getFloat(ordinal: Int): Float = getAs[Float](ordinal)
 
-  def getFloat(i: Int): Float = getAs[Float](i)
+  def getDouble(ordinal: Int): Double = getAs[Double](ordinal)
 
-  def getDouble(i: Int): Double = getAs[Double](i)
+  def getUTF8String(ordinal: Int): UTF8String = getAs[UTF8String](ordinal)
 
-  def getUTF8String(i: Int): UTF8String = getAs[UTF8String](i)
+  def getBinary(ordinal: Int): Array[Byte] = getAs[Array[Byte]](ordinal)
 
-  def getBinary(i: Int): Array[Byte] = getAs[Array[Byte]](i)
+  def getDecimal(ordinal: Int): Decimal = getAs[Decimal](ordinal)
 
-  // This is only use for test
-  def getString(i: Int): String = getAs[UTF8String](i).toString
+  // This is only use for test and will throw a null pointer exception if the position is null.
+  def getString(ordinal: Int): String = getAs[UTF8String](ordinal).toString
 
   /**
    * Returns a struct from ordinal position.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 508882acbee5a..2a1e288cb8377 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -110,7 +110,7 @@ class CodeGenContext {
       case StringType => s"$row.getUTF8String($ordinal)"
       case BinaryType => s"$row.getBinary($ordinal)"
       case t: StructType => s"$row.getStruct($ordinal, ${t.size})"
-      case _ => s"($jt)$row.apply($ordinal)"
+      case _ => s"($jt)$row.get($ordinal)"
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index a2b0fad7b7a04..6caf8baf24a81 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -158,7 +158,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)(),
       expression)
 
-    val actual = plan(inputRow).apply(0)
+    val actual = plan(inputRow).get(0)
     if (!actual.asInstanceOf[Double].isNaN) {
       fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: NaN")
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
index 00374d1fa3ef1..7c63179af6470 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
@@ -211,7 +211,7 @@ private[sql] class StringColumnStats extends ColumnStats {
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
-      val value = row(ordinal).asInstanceOf[UTF8String]
+      val value = row.getUTF8String(ordinal)
       if (upper == null || value.compareTo(upper) > 0) upper = value
       if (lower == null || value.compareTo(lower) < 0) lower = value
       sizeInBytes += STRING.actualSize(row, ordinal)
@@ -241,7 +241,7 @@ private[sql] class FixedDecimalColumnStats extends ColumnStats {
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
-      val value = row(ordinal).asInstanceOf[Decimal]
+      val value = row.getDecimal(ordinal)
       if (upper == null || value.compareTo(upper) > 0) upper = value
       if (lower == null || value.compareTo(lower) < 0) lower = value
       sizeInBytes += FIXED_DECIMAL.defaultSize
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
index ac42bde07c37d..c0ca52751b66c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
@@ -90,7 +90,7 @@ private[sql] sealed abstract class ColumnType[T <: DataType, JvmType](
    * boxing/unboxing costs whenever possible.
    */
   def copyField(from: InternalRow, fromOrdinal: Int, to: MutableRow, toOrdinal: Int): Unit = {
-    to(toOrdinal) = from(fromOrdinal)
+    to(toOrdinal) = from.get(fromOrdinal)
   }
 
   /**
@@ -329,11 +329,11 @@ private[sql] object STRING extends NativeColumnType(StringType, 7, 8) {
   }
 
   override def getField(row: InternalRow, ordinal: Int): UTF8String = {
-    row(ordinal).asInstanceOf[UTF8String]
+    row.getUTF8String(ordinal)
   }
 
   override def copyField(from: InternalRow, fromOrdinal: Int, to: MutableRow, toOrdinal: Int) {
-    to.update(toOrdinal, from(fromOrdinal))
+    to.update(toOrdinal, from.getUTF8String(fromOrdinal))
   }
 }
 
@@ -347,7 +347,7 @@ private[sql] object DATE extends NativeColumnType(DateType, 8, 4) {
   }
 
   override def getField(row: InternalRow, ordinal: Int): Int = {
-    row(ordinal).asInstanceOf[Int]
+    row.getInt(ordinal)
   }
 
   def setField(row: MutableRow, ordinal: Int, value: Int): Unit = {
@@ -365,7 +365,7 @@ private[sql] object TIMESTAMP extends NativeColumnType(TimestampType, 9, 8) {
   }
 
   override def getField(row: InternalRow, ordinal: Int): Long = {
-    row(ordinal).asInstanceOf[Long]
+    row.getLong(ordinal)
   }
 
   override def setField(row: MutableRow, ordinal: Int, value: Long): Unit = {
@@ -388,7 +388,7 @@ private[sql] case class FIXED_DECIMAL(precision: Int, scale: Int)
   }
 
   override def getField(row: InternalRow, ordinal: Int): Decimal = {
-    row(ordinal).asInstanceOf[Decimal]
+    row.getDecimal(ordinal)
   }
 
   override def setField(row: MutableRow, ordinal: Int, value: Decimal): Unit = {
@@ -427,7 +427,7 @@ private[sql] object BINARY extends ByteArrayColumnType[BinaryType.type](11, 16)
   }
 
   override def getField(row: InternalRow, ordinal: Int): Array[Byte] = {
-    row(ordinal).asInstanceOf[Array[Byte]]
+    row.getBinary(ordinal)
   }
 }
 
@@ -440,7 +440,7 @@ private[sql] object GENERIC extends ByteArrayColumnType[DataType](12, 16) {
   }
 
   override def getField(row: InternalRow, ordinal: Int): Array[Byte] = {
-    SparkSqlSerializer.serialize(row(ordinal))
+    SparkSqlSerializer.serialize(row.get(ordinal))
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala
index 5abc1259a19ab..6150df6930b32 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala
@@ -128,7 +128,7 @@ private[sql] case object RunLengthEncoding extends CompressionScheme {
         while (from.hasRemaining) {
           columnType.extract(from, value, 0)
 
-          if (value(0) == currentValue(0)) {
+          if (value.get(0) == currentValue.get(0)) {
             currentRun += 1
           } else {
             // Writes current run
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
index 83c4e8733f15f..6ee833c7b2c94 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
@@ -278,7 +278,7 @@ private[sql] object SparkSqlSerializer2 {
                 out.writeByte(NULL)
               } else {
                 out.writeByte(NOT_NULL)
-                val bytes = row.getAs[UTF8String](i).getBytes
+                val bytes = row.getUTF8String(i).getBytes
                 out.writeInt(bytes.length)
                 out.write(bytes)
               }
@@ -298,7 +298,7 @@ private[sql] object SparkSqlSerializer2 {
                 out.writeByte(NULL)
               } else {
                 out.writeByte(NOT_NULL)
-                val value = row.apply(i).asInstanceOf[Decimal]
+                val value = row.getAs[Decimal](i)
                 val javaBigDecimal = value.toJavaBigDecimal
                 // First, write out the unscaled value.
                 val bytes: Array[Byte] = javaBigDecimal.unscaledValue().toByteArray
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 7f452daef33c5..cdbe42381a7e4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -170,6 +170,8 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
     execution.PhysicalRDD(projections.map(_.toAttribute), unionedRows)
   }
 
+  // TODO: refactor this thing. It is very complicated because it does projection internally.
+  // We should just put a project on top of this.
   private def mergeWithPartitionValues(
       schema: StructType,
       requiredColumns: Array[String],
@@ -187,13 +189,13 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         if (i != -1) {
           // If yes, gets column value from partition values.
           (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => {
-            mutableRow(ordinal) = partitionValues(i)
+            mutableRow(ordinal) = partitionValues.get(i)
           }
         } else {
           // Otherwise, inherits the value from scanned data.
           val i = nonPartitionColumns.indexOf(name)
           (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => {
-            mutableRow(ordinal) = dataRow(i)
+            mutableRow(ordinal) = dataRow.get(i)
           }
         }
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index e6081cb05bc2d..1fdcc6a850602 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -136,7 +136,7 @@ package object debug {
             tupleCount += 1
             var i = 0
             while (i < numColumns) {
-              val value = currentRow(i)
+              val value = currentRow.get(i)
               if (value != null) {
                 columnStats(i).elementTypes += HashSet(value.getClass.getName)
               }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index 40bf03a3f1a62..970c40dc61a3c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -129,7 +129,7 @@ object EvaluatePython {
       val values = new Array[Any](row.numFields)
       var i = 0
       while (i < row.numFields) {
-        values(i) = toJava(row(i), struct.fields(i).dataType)
+        values(i) = toJava(row.get(i), struct.fields(i).dataType)
         i += 1
       }
       new GenericInternalRowWithSchema(values, struct)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
index 46f0fac861282..7a6e86779b185 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
@@ -121,7 +121,7 @@ class MutableAggregationBuffer private[sql] (
       throw new IllegalArgumentException(
         s"Could not access ${i}th value in this buffer because it only has $length values.")
     }
-    toScalaConverters(i)(underlyingBuffer(offsets(i)))
+    toScalaConverters(i)(underlyingBuffer.get(offsets(i)))
   }
 
   def update(i: Int, value: Any): Unit = {
@@ -157,7 +157,7 @@ class InputAggregationBuffer private[sql] (
         s"Could not access ${i}th value in this buffer because it only has $length values.")
     }
     // TODO: Use buffer schema to avoid using generic getter.
-    toScalaConverters(i)(underlyingInputBuffer(offsets(i)))
+    toScalaConverters(i)(underlyingInputBuffer.get(offsets(i)))
   }
 
   override def copy(): InputAggregationBuffer = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 8cab27d6e1c46..38bb1e3967642 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -159,7 +159,7 @@ private[sql] case class ParquetTableScan(
 
               // Parquet will leave partitioning columns empty, so we fill them in here.
               var i = 0
-              while (i < requestedPartitionOrdinals.size) {
+              while (i < requestedPartitionOrdinals.length) {
                 row(requestedPartitionOrdinals(i)._2) =
                   partitionRowValues(requestedPartitionOrdinals(i)._1)
                 i += 1
@@ -179,12 +179,12 @@ private[sql] case class ParquetTableScan(
 
               var i = 0
               while (i < row.numFields) {
-                mutableRow(i) = row(i)
+                mutableRow(i) = row.get(i)
                 i += 1
               }
               // Parquet will leave partitioning columns empty, so we fill them in here.
               i = 0
-              while (i < requestedPartitionOrdinals.size) {
+              while (i < requestedPartitionOrdinals.length) {
                 mutableRow(requestedPartitionOrdinals(i)._2) =
                   partitionRowValues(requestedPartitionOrdinals(i)._1)
                 i += 1
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index c7c58e69d42ef..2c23d4e8a8146 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -217,9 +217,9 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
     writer.startMessage()
     while(index < attributesSize) {
       // null values indicate optional fields but we do not check currently
-      if (record(index) != null) {
+      if (!record.isNullAt(index)) {
         writer.startField(attributes(index).name, index)
-        writeValue(attributes(index).dataType, record(index))
+        writeValue(attributes(index).dataType, record.get(index))
         writer.endField(attributes(index).name, index)
       }
       index = index + 1
@@ -277,10 +277,10 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
       val fields = schema.fields.toArray
       writer.startGroup()
       var i = 0
-      while(i < fields.size) {
-        if (struct(i) != null) {
+      while(i < fields.length) {
+        if (!struct.isNullAt(i)) {
           writer.startField(fields(i).name, i)
-          writeValue(fields(i).dataType, struct(i))
+          writeValue(fields(i).dataType, struct.get(i))
           writer.endField(fields(i).name, i)
         }
         i = i + 1
@@ -387,7 +387,7 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
     writer.startMessage()
     while(index < attributesSize) {
       // null values indicate optional fields but we do not check currently
-      if (record(index) != null && record(index) != Nil) {
+      if (!record.isNullAt(index) && !record.isNullAt(index)) {
         writer.startField(attributes(index).name, index)
         consumeType(attributes(index).dataType, record, index)
         writer.endField(attributes(index).name, index)
@@ -410,15 +410,15 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
       case TimestampType => writeTimestamp(record.getLong(index))
       case FloatType => writer.addFloat(record.getFloat(index))
       case DoubleType => writer.addDouble(record.getDouble(index))
-      case StringType => writer.addBinary(
-        Binary.fromByteArray(record(index).asInstanceOf[UTF8String].getBytes))
-      case BinaryType => writer.addBinary(
-        Binary.fromByteArray(record(index).asInstanceOf[Array[Byte]]))
+      case StringType =>
+        writer.addBinary(Binary.fromByteArray(record.getUTF8String(index).getBytes))
+      case BinaryType =>
+        writer.addBinary(Binary.fromByteArray(record.getBinary(index)))
       case d: DecimalType =>
         if (d.precision > 18) {
           sys.error(s"Unsupported datatype $d, cannot write to consumer")
         }
-        writeDecimal(record(index).asInstanceOf[Decimal], d.precision)
+        writeDecimal(record.getDecimal(index), d.precision)
       case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer")
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
index 0e5c5abff85f6..c6804e84827c0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
@@ -39,14 +39,14 @@ class RowSuite extends SparkFunSuite {
     assert(expected.getInt(0) === actual1.getInt(0))
     assert(expected.getString(1) === actual1.getString(1))
     assert(expected.getBoolean(2) === actual1.getBoolean(2))
-    assert(expected(3) === actual1(3))
+    assert(expected.get(3) === actual1.get(3))
 
     val actual2 = Row.fromSeq(Seq(2147483647, "this is a string", false, null))
     assert(expected.numFields === actual2.size)
     assert(expected.getInt(0) === actual2.getInt(0))
     assert(expected.getString(1) === actual2.getString(1))
     assert(expected.getBoolean(2) === actual2.getBoolean(2))
-    assert(expected(3) === actual2(3))
+    assert(expected.get(3) === actual2.get(3))
   }
 
   test("SpecificMutableRow.update with null") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
index 3333fee6711c0..31e7b0e72e510 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
@@ -58,15 +58,15 @@ class ColumnStatsSuite extends SparkFunSuite {
       val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
       rows.foreach(columnStats.gatherStats(_, 0))
 
-      val values = rows.take(10).map(_(0).asInstanceOf[T#InternalType])
+      val values = rows.take(10).map(_.get(0).asInstanceOf[T#InternalType])
       val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]]
       val stats = columnStats.collectedStatistics
 
-      assertResult(values.min(ordering), "Wrong lower bound")(stats(0))
-      assertResult(values.max(ordering), "Wrong upper bound")(stats(1))
-      assertResult(10, "Wrong null count")(stats(2))
-      assertResult(20, "Wrong row count")(stats(3))
-      assertResult(stats(4), "Wrong size in bytes") {
+      assertResult(values.min(ordering), "Wrong lower bound")(stats.get(0))
+      assertResult(values.max(ordering), "Wrong upper bound")(stats.get(1))
+      assertResult(10, "Wrong null count")(stats.get(2))
+      assertResult(20, "Wrong row count")(stats.get(3))
+      assertResult(stats.get(4), "Wrong size in bytes") {
         rows.map { row =>
           if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0)
         }.sum
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala
index 9eaa769846088..d421f4d8d091e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala
@@ -75,7 +75,7 @@ class NullableColumnAccessorSuite extends SparkFunSuite {
       (0 until 4).foreach { _ =>
         assert(accessor.hasNext)
         accessor.extractTo(row, 0)
-        assert(row(0) === randomRow(0))
+        assert(row.get(0) === randomRow.get(0))
 
         assert(accessor.hasNext)
         accessor.extractTo(row, 0)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala
index 17e9ae464bcc0..cd8bf75ff1752 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala
@@ -98,7 +98,7 @@ class NullableColumnBuilderSuite extends SparkFunSuite {
           columnType.extract(buffer)
         }
 
-        assert(actual === randomRow(0), "Extracted value didn't equal to the original one")
+        assert(actual === randomRow.get(0), "Extracted value didn't equal to the original one")
       }
 
       assert(!buffer.hasRemaining)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
index f606e2133bedc..33092c83a1a1c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
@@ -33,7 +33,7 @@ class BooleanBitSetSuite extends SparkFunSuite {
 
     val builder = TestCompressibleColumnBuilder(new NoopColumnStats, BOOLEAN, BooleanBitSet)
     val rows = Seq.fill[InternalRow](count)(makeRandomRow(BOOLEAN))
-    val values = rows.map(_(0))
+    val values = rows.map(_.get(0))
 
     rows.foreach(builder.appendFrom(_, 0))
     val buffer = builder.build()
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 592cfa0ee8380..16977ce30cfff 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -497,7 +497,7 @@ private[hive] trait HiveInspectors {
         x.setStructFieldData(
           result,
           fieldRefs.get(i),
-          wrap(row(i), fieldRefs.get(i).getFieldObjectInspector))
+          wrap(row.get(i), fieldRefs.get(i).getFieldObjectInspector))
         i += 1
       }
 
@@ -508,7 +508,7 @@ private[hive] trait HiveInspectors {
       val result = new java.util.ArrayList[AnyRef](fieldRefs.length)
       var i = 0
       while (i < fieldRefs.length) {
-        result.add(wrap(row(i), fieldRefs.get(i).getFieldObjectInspector))
+        result.add(wrap(row.get(i), fieldRefs.get(i).getFieldObjectInspector))
         i += 1
       }
 
@@ -536,7 +536,7 @@ private[hive] trait HiveInspectors {
       cache: Array[AnyRef]): Array[AnyRef] = {
     var i = 0
     while (i < inspectors.length) {
-      cache(i) = wrap(row(i), inspectors(i))
+      cache(i) = wrap(row.get(i), inspectors(i))
       i += 1
     }
     cache
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index 34b629403e128..f0e0ca05a8aad 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -102,7 +102,7 @@ case class InsertIntoHiveTable(
       iterator.foreach { row =>
         var i = 0
         while (i < fieldOIs.length) {
-          outputData(i) = if (row.isNullAt(i)) null else wrappers(i)(row(i))
+          outputData(i) = if (row.isNullAt(i)) null else wrappers(i)(row.get(i))
           i += 1
         }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 10623dc820316..58445095ad74f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -122,7 +122,7 @@ private[orc] class OrcOutputWriter(
   override def writeInternal(row: InternalRow): Unit = {
     var i = 0
     while (i < row.numFields) {
-      reusableOutputBuffer(i) = wrappers(i)(row(i))
+      reusableOutputBuffer(i) = wrappers(i)(row.get(i))
       i += 1
     }
 

From 41a7cdf85de2d583d8b8759941a9d6c6e98cae4d Mon Sep 17 00:00:00 2001
From: Nishkam Ravi <nravi@cloudera.com>
Date: Sat, 25 Jul 2015 22:56:25 -0700
Subject: [PATCH 377/549] [SPARK-8881] [SPARK-9260] Fix algorithm for
 scheduling executors on workers

Current scheduling algorithm allocates one core at a time and in doing so ends up ignoring spark.executor.cores. As a result, when spark.cores.max/spark.executor.cores (i.e, num_executors) < num_workers, executors are not launched and the app hangs. This PR fixes and refactors the scheduling algorithm.

andrewor14

Author: Nishkam Ravi <nravi@cloudera.com>
Author: nishkamravi2 <nishkamravi@gmail.com>

Closes #7274 from nishkamravi2/master_scheduler and squashes the following commits:

b998097 [nishkamravi2] Update Master.scala
da0f491 [Nishkam Ravi] Update Master.scala
79084e8 [Nishkam Ravi] Update Master.scala
1daf25f [Nishkam Ravi] Update Master.scala
f279cdf [Nishkam Ravi] Update Master.scala
adec84b [Nishkam Ravi] Update Master.scala
a06da76 [nishkamravi2] Update Master.scala
40c8f9f [nishkamravi2] Update Master.scala (to trigger retest)
c11c689 [nishkamravi2] Update EventLoggingListenerSuite.scala
5d6a19c [nishkamravi2] Update Master.scala (for the purpose of issuing a retest)
2d6371c [Nishkam Ravi] Update Master.scala
66362d5 [nishkamravi2] Update Master.scala
ee7cf0e [Nishkam Ravi] Improved scheduling algorithm for executors
---
 .../apache/spark/deploy/master/Master.scala   | 112 ++++++++++++------
 1 file changed, 75 insertions(+), 37 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
index 4615febf17d24..029f94d1020be 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -541,6 +541,7 @@ private[master] class Master(
 
   /**
    * Schedule executors to be launched on the workers.
+   * Returns an array containing number of cores assigned to each worker.
    *
    * There are two modes of launching executors. The first attempts to spread out an application's
    * executors on as many workers as possible, while the second does the opposite (i.e. launch them
@@ -551,39 +552,73 @@ private[master] class Master(
    * multiple executors from the same application may be launched on the same worker if the worker
    * has enough cores and memory. Otherwise, each executor grabs all the cores available on the
    * worker by default, in which case only one executor may be launched on each worker.
+   *
+   * It is important to allocate coresPerExecutor on each worker at a time (instead of 1 core
+   * at a time). Consider the following example: cluster has 4 workers with 16 cores each.
+   * User requests 3 executors (spark.cores.max = 48, spark.executor.cores = 16). If 1 core is
+   * allocated at a time, 12 cores from each worker would be assigned to each executor.
+   * Since 12 < 16, no executors would launch [SPARK-8881].
    */
-  private def startExecutorsOnWorkers(): Unit = {
-    // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
-    // in the queue, then the second app, etc.
-    if (spreadOutApps) {
-      // Try to spread out each app among all the workers, until it has all its cores
-      for (app <- waitingApps if app.coresLeft > 0) {
-        val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
-          .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB &&
-            worker.coresFree >= app.desc.coresPerExecutor.getOrElse(1))
-          .sortBy(_.coresFree).reverse
-        val numUsable = usableWorkers.length
-        val assigned = new Array[Int](numUsable) // Number of cores to give on each node
-        var toAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)
-        var pos = 0
-        while (toAssign > 0) {
-          if (usableWorkers(pos).coresFree - assigned(pos) > 0) {
-            toAssign -= 1
-            assigned(pos) += 1
+  private[master] def scheduleExecutorsOnWorkers(
+      app: ApplicationInfo,
+      usableWorkers: Array[WorkerInfo],
+      spreadOutApps: Boolean): Array[Int] = {
+    // If the number of cores per executor is not specified, then we can just schedule
+    // 1 core at a time since we expect a single executor to be launched on each worker
+    val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(1)
+    val memoryPerExecutor = app.desc.memoryPerExecutorMB
+    val numUsable = usableWorkers.length
+    val assignedCores = new Array[Int](numUsable) // Number of cores to give to each worker
+    val assignedMemory = new Array[Int](numUsable) // Amount of memory to give to each worker
+    var coresToAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)
+    var freeWorkers = (0 until numUsable).toIndexedSeq
+
+    def canLaunchExecutor(pos: Int): Boolean = {
+      usableWorkers(pos).coresFree - assignedCores(pos) >= coresPerExecutor &&
+      usableWorkers(pos).memoryFree - assignedMemory(pos) >= memoryPerExecutor
+    }
+
+    while (coresToAssign >= coresPerExecutor && freeWorkers.nonEmpty) {
+      freeWorkers = freeWorkers.filter(canLaunchExecutor)
+      freeWorkers.foreach { pos =>
+        var keepScheduling = true
+        while (keepScheduling && canLaunchExecutor(pos) && coresToAssign >= coresPerExecutor) {
+          coresToAssign -= coresPerExecutor
+          assignedCores(pos) += coresPerExecutor
+          assignedMemory(pos) += memoryPerExecutor
+
+          // Spreading out an application means spreading out its executors across as
+          // many workers as possible. If we are not spreading out, then we should keep
+          // scheduling executors on this worker until we use all of its resources.
+          // Otherwise, just move on to the next worker.
+          if (spreadOutApps) {
+            keepScheduling = false
           }
-          pos = (pos + 1) % numUsable
-        }
-        // Now that we've decided how many cores to give on each node, let's actually give them
-        for (pos <- 0 until numUsable if assigned(pos) > 0) {
-          allocateWorkerResourceToExecutors(app, assigned(pos), usableWorkers(pos))
         }
       }
-    } else {
-      // Pack each app into as few workers as possible until we've assigned all its cores
-      for (worker <- workers if worker.coresFree > 0 && worker.state == WorkerState.ALIVE) {
-        for (app <- waitingApps if app.coresLeft > 0) {
-          allocateWorkerResourceToExecutors(app, app.coresLeft, worker)
-        }
+    }
+    assignedCores
+  }
+
+  /**
+   * Schedule and launch executors on workers
+   */
+  private def startExecutorsOnWorkers(): Unit = {
+    // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first app
+    // in the queue, then the second app, etc.
+    for (app <- waitingApps if app.coresLeft > 0) {
+      val coresPerExecutor: Option[Int] = app.desc.coresPerExecutor
+      // Filter out workers that don't have enough resources to launch an executor
+      val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE)
+        .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB &&
+          worker.coresFree >= coresPerExecutor.getOrElse(1))
+        .sortBy(_.coresFree).reverse
+      val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps)
+
+      // Now that we've decided how many cores to allocate on each worker, let's allocate them
+      for (pos <- 0 until usableWorkers.length if assignedCores(pos) > 0) {
+        allocateWorkerResourceToExecutors(
+          app, assignedCores(pos), coresPerExecutor, usableWorkers(pos))
       }
     }
   }
@@ -591,19 +626,22 @@ private[master] class Master(
   /**
    * Allocate a worker's resources to one or more executors.
    * @param app the info of the application which the executors belong to
-   * @param coresToAllocate cores on this worker to be allocated to this application
+   * @param assignedCores number of cores on this worker for this application
+   * @param coresPerExecutor number of cores per executor
    * @param worker the worker info
    */
   private def allocateWorkerResourceToExecutors(
       app: ApplicationInfo,
-      coresToAllocate: Int,
+      assignedCores: Int,
+      coresPerExecutor: Option[Int],
       worker: WorkerInfo): Unit = {
-    val memoryPerExecutor = app.desc.memoryPerExecutorMB
-    val coresPerExecutor = app.desc.coresPerExecutor.getOrElse(coresToAllocate)
-    var coresLeft = coresToAllocate
-    while (coresLeft >= coresPerExecutor && worker.memoryFree >= memoryPerExecutor) {
-      val exec = app.addExecutor(worker, coresPerExecutor)
-      coresLeft -= coresPerExecutor
+    // If the number of cores per executor is specified, we divide the cores assigned
+    // to this worker evenly among the executors with no remainder.
+    // Otherwise, we launch a single executor that grabs all the assignedCores on this worker.
+    val numExecutors = coresPerExecutor.map { assignedCores / _ }.getOrElse(1)
+    val coresToAssign = coresPerExecutor.getOrElse(assignedCores)
+    for (i <- 1 to numExecutors) {
+      val exec = app.addExecutor(worker, coresToAssign)
       launchExecutor(worker, exec)
       app.state = ApplicationState.RUNNING
     }

From 4a01bfc2a2e664186028ea32095d32d29c9f9e38 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sat, 25 Jul 2015 23:52:37 -0700
Subject: [PATCH 378/549] [SPARK-9350][SQL] Introduce an InternalRow generic
 getter that requires a DataType

Currently UnsafeRow cannot support a generic getter. However, if the data type is known, we can support a generic getter.

Author: Reynold Xin <rxin@databricks.com>

Closes #7666 from rxin/generic-getter-with-datatype and squashes the following commits:

ee2874c [Reynold Xin] Add a default implementation for getStruct.
1e109a0 [Reynold Xin] [SPARK-9350][SQL] Introduce an InternalRow generic getter that requires a DataType.
033ee88 [Reynold Xin] Removed getAs in non test code.
---
 .../apache/spark/mllib/linalg/Matrices.scala  |  8 +++--
 .../apache/spark/mllib/linalg/Vectors.scala   |  9 ++++--
 .../sql/catalyst/expressions/UnsafeRow.java   |  5 ---
 .../sql/catalyst/CatalystTypeConverters.scala | 16 ++++++----
 .../spark/sql/catalyst/InternalRow.scala      | 32 +++++++++++--------
 .../catalyst/expressions/BoundAttribute.scala |  2 +-
 .../spark/sql/catalyst/expressions/Cast.scala |  5 +--
 .../sql/catalyst/expressions/Projection.scala |  8 +++++
 .../expressions/SpecificMutableRow.scala      | 10 +++---
 .../sql/catalyst/expressions/aggregates.scala |  2 +-
 .../expressions/complexTypeCreator.scala      |  2 +-
 .../expressions/complexTypeExtractors.scala   |  4 +--
 .../spark/sql/catalyst/expressions/rows.scala |  8 +++++
 .../expressions/ExpressionEvalHelper.scala    |  7 ++--
 .../expressions/MathFunctionsSuite.scala      |  2 +-
 .../expressions/UnsafeRowConverterSuite.scala |  2 +-
 .../sql/execution/SparkSqlSerializer2.scala   |  4 +--
 .../spark/sql/execution/basicOperators.scala  |  4 +--
 .../datasources/DataSourceStrategy.scala      |  4 +--
 .../spark/sql/execution/debug/package.scala   |  2 +-
 .../spark/sql/execution/pythonUDFs.scala      |  2 +-
 .../sql/execution/stat/FrequentItems.scala    |  8 ++---
 .../sql/expressions/aggregate/udaf.scala      | 10 ++++--
 .../sql/parquet/ParquetTableOperations.scala  |  2 +-
 .../sql/parquet/ParquetTableSupport.scala     |  4 +--
 .../scala/org/apache/spark/sql/RowSuite.scala | 11 ++++---
 .../hive/execution/InsertIntoHiveTable.scala  |  4 ++-
 .../spark/sql/hive/orc/OrcRelation.scala      |  2 +-
 28 files changed, 105 insertions(+), 74 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index b6e2c30fbf104..d82ba2456df1a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -179,12 +179,14 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
         val tpe = row.getByte(0)
         val numRows = row.getInt(1)
         val numCols = row.getInt(2)
-        val values = row.getAs[Iterable[Double]](5).toArray
+        val values = row.getAs[Seq[Double]](5, ArrayType(DoubleType, containsNull = false)).toArray
         val isTransposed = row.getBoolean(6)
         tpe match {
           case 0 =>
-            val colPtrs = row.getAs[Iterable[Int]](3).toArray
-            val rowIndices = row.getAs[Iterable[Int]](4).toArray
+            val colPtrs =
+              row.getAs[Seq[Int]](3, ArrayType(IntegerType, containsNull = false)).toArray
+            val rowIndices =
+              row.getAs[Seq[Int]](4, ArrayType(IntegerType, containsNull = false)).toArray
             new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed)
           case 1 =>
             new DenseMatrix(numRows, numCols, values, isTransposed)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index c884aad08889f..0cb28d78bec05 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -209,11 +209,14 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
         tpe match {
           case 0 =>
             val size = row.getInt(1)
-            val indices = row.getAs[Iterable[Int]](2).toArray
-            val values = row.getAs[Iterable[Double]](3).toArray
+            val indices =
+              row.getAs[Seq[Int]](2, ArrayType(IntegerType, containsNull = false)).toArray
+            val values =
+              row.getAs[Seq[Double]](3, ArrayType(DoubleType, containsNull = false)).toArray
             new SparseVector(size, indices, values)
           case 1 =>
-            val values = row.getAs[Iterable[Double]](3).toArray
+            val values =
+              row.getAs[Seq[Double]](3, ArrayType(DoubleType, containsNull = false)).toArray
             new DenseVector(values)
         }
     }
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 9be9089493335..87e5a89c19658 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -235,11 +235,6 @@ public Object get(int ordinal) {
     throw new UnsupportedOperationException();
   }
 
-  @Override
-  public <T> T getAs(int ordinal) {
-    throw new UnsupportedOperationException();
-  }
-
   @Override
   public boolean isNullAt(int ordinal) {
     assertIndexIsValid(ordinal);
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 7416ddbaef3fc..d1d89a1f48329 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -77,7 +77,7 @@ object CatalystTypeConverters {
       case LongType => LongConverter
       case FloatType => FloatConverter
       case DoubleType => DoubleConverter
-      case _ => IdentityConverter
+      case dataType: DataType => IdentityConverter(dataType)
     }
     converter.asInstanceOf[CatalystTypeConverter[Any, Any, Any]]
   }
@@ -137,17 +137,19 @@ object CatalystTypeConverters {
     protected def toScalaImpl(row: InternalRow, column: Int): ScalaOutputType
   }
 
-  private object IdentityConverter extends CatalystTypeConverter[Any, Any, Any] {
+  private case class IdentityConverter(dataType: DataType)
+    extends CatalystTypeConverter[Any, Any, Any] {
     override def toCatalystImpl(scalaValue: Any): Any = scalaValue
     override def toScala(catalystValue: Any): Any = catalystValue
-    override def toScalaImpl(row: InternalRow, column: Int): Any = row.get(column)
+    override def toScalaImpl(row: InternalRow, column: Int): Any = row.get(column, dataType)
   }
 
   private case class UDTConverter(
       udt: UserDefinedType[_]) extends CatalystTypeConverter[Any, Any, Any] {
     override def toCatalystImpl(scalaValue: Any): Any = udt.serialize(scalaValue)
     override def toScala(catalystValue: Any): Any = udt.deserialize(catalystValue)
-    override def toScalaImpl(row: InternalRow, column: Int): Any = toScala(row.get(column))
+    override def toScalaImpl(row: InternalRow, column: Int): Any =
+      toScala(row.get(column, udt.sqlType))
   }
 
   /** Converter for arrays, sequences, and Java iterables. */
@@ -184,7 +186,7 @@ object CatalystTypeConverters {
     }
 
     override def toScalaImpl(row: InternalRow, column: Int): Seq[Any] =
-      toScala(row.get(column).asInstanceOf[Seq[Any]])
+      toScala(row.get(column, ArrayType(elementType)).asInstanceOf[Seq[Any]])
   }
 
   private case class MapConverter(
@@ -227,7 +229,7 @@ object CatalystTypeConverters {
     }
 
     override def toScalaImpl(row: InternalRow, column: Int): Map[Any, Any] =
-      toScala(row.get(column).asInstanceOf[Map[Any, Any]])
+      toScala(row.get(column, MapType(keyType, valueType)).asInstanceOf[Map[Any, Any]])
   }
 
   private case class StructConverter(
@@ -311,7 +313,7 @@ object CatalystTypeConverters {
     }
     override def toScala(catalystValue: Decimal): JavaBigDecimal = catalystValue.toJavaBigDecimal
     override def toScalaImpl(row: InternalRow, column: Int): JavaBigDecimal =
-      row.get(column).asInstanceOf[Decimal].toJavaBigDecimal
+      row.getDecimal(column).toJavaBigDecimal
   }
 
   private abstract class PrimitiveConverter[T] extends CatalystTypeConverter[T, Any, Any] {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index 37f0f57e9e6d3..385d9671386dc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.Decimal
+import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
 /**
@@ -32,32 +32,36 @@ abstract class InternalRow extends Serializable {
 
   def get(ordinal: Int): Any
 
-  def getAs[T](ordinal: Int): T = get(ordinal).asInstanceOf[T]
+  def genericGet(ordinal: Int): Any = get(ordinal, null)
+
+  def get(ordinal: Int, dataType: DataType): Any = get(ordinal)
+
+  def getAs[T](ordinal: Int, dataType: DataType): T = get(ordinal, dataType).asInstanceOf[T]
 
   def isNullAt(ordinal: Int): Boolean = get(ordinal) == null
 
-  def getBoolean(ordinal: Int): Boolean = getAs[Boolean](ordinal)
+  def getBoolean(ordinal: Int): Boolean = getAs[Boolean](ordinal, BooleanType)
 
-  def getByte(ordinal: Int): Byte = getAs[Byte](ordinal)
+  def getByte(ordinal: Int): Byte = getAs[Byte](ordinal, ByteType)
 
-  def getShort(ordinal: Int): Short = getAs[Short](ordinal)
+  def getShort(ordinal: Int): Short = getAs[Short](ordinal, ShortType)
 
-  def getInt(ordinal: Int): Int = getAs[Int](ordinal)
+  def getInt(ordinal: Int): Int = getAs[Int](ordinal, IntegerType)
 
-  def getLong(ordinal: Int): Long = getAs[Long](ordinal)
+  def getLong(ordinal: Int): Long = getAs[Long](ordinal, LongType)
 
-  def getFloat(ordinal: Int): Float = getAs[Float](ordinal)
+  def getFloat(ordinal: Int): Float = getAs[Float](ordinal, FloatType)
 
-  def getDouble(ordinal: Int): Double = getAs[Double](ordinal)
+  def getDouble(ordinal: Int): Double = getAs[Double](ordinal, DoubleType)
 
-  def getUTF8String(ordinal: Int): UTF8String = getAs[UTF8String](ordinal)
+  def getUTF8String(ordinal: Int): UTF8String = getAs[UTF8String](ordinal, StringType)
 
-  def getBinary(ordinal: Int): Array[Byte] = getAs[Array[Byte]](ordinal)
+  def getBinary(ordinal: Int): Array[Byte] = getAs[Array[Byte]](ordinal, BinaryType)
 
-  def getDecimal(ordinal: Int): Decimal = getAs[Decimal](ordinal)
+  def getDecimal(ordinal: Int): Decimal = getAs[Decimal](ordinal, DecimalType.SYSTEM_DEFAULT)
 
   // This is only use for test and will throw a null pointer exception if the position is null.
-  def getString(ordinal: Int): String = getAs[UTF8String](ordinal).toString
+  def getString(ordinal: Int): String = getUTF8String(ordinal).toString
 
   /**
    * Returns a struct from ordinal position.
@@ -65,7 +69,7 @@ abstract class InternalRow extends Serializable {
    * @param ordinal position to get the struct from.
    * @param numFields number of fields the struct type has
    */
-  def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs[InternalRow](ordinal)
+  def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs[InternalRow](ordinal, null)
 
   override def toString: String = s"[${this.mkString(",")}]"
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 1f7adcd36ec14..6b5c450e3fb0a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -49,7 +49,7 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
         case StringType => input.getUTF8String(ordinal)
         case BinaryType => input.getBinary(ordinal)
         case t: StructType => input.getStruct(ordinal, t.size)
-        case _ => input.get(ordinal)
+        case dataType => input.get(ordinal, dataType)
       }
     }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 47ad3e089e4c7..e5b83cd31bf0f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -375,7 +375,7 @@ case class Cast(child: Expression, dataType: DataType)
   }
 
   private[this] def castStruct(from: StructType, to: StructType): Any => Any = {
-    val casts = from.fields.zip(to.fields).map {
+    val castFuncs: Array[(Any) => Any] = from.fields.zip(to.fields).map {
       case (fromField, toField) => cast(fromField.dataType, toField.dataType)
     }
     // TODO: Could be faster?
@@ -383,7 +383,8 @@ case class Cast(child: Expression, dataType: DataType)
     buildCast[InternalRow](_, row => {
       var i = 0
       while (i < row.numFields) {
-        newRow.update(i, if (row.isNullAt(i)) null else casts(i)(row.get(i)))
+        newRow.update(i,
+          if (row.isNullAt(i)) null else castFuncs(i)(row.get(i, from.apply(i).dataType)))
         i += 1
       }
       newRow.copy()
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index c1ed9cf7ed6a0..cc89d74146b34 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -225,6 +225,14 @@ class JoinedRow extends InternalRow {
   override def getFloat(i: Int): Float =
     if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
 
+  override def getStruct(i: Int, numFields: Int): InternalRow = {
+    if (i < row1.numFields) {
+      row1.getStruct(i, numFields)
+    } else {
+      row2.getStruct(i - row1.numFields, numFields)
+    }
+  }
+
   override def copy(): InternalRow = {
     val totalSize = row1.numFields + row2.numFields
     val copiedValues = new Array[Any](totalSize)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
index 4b4833bd06a3b..5953a093dc684 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
@@ -221,6 +221,10 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
 
   override def get(i: Int): Any = values(i).boxed
 
+  override def getStruct(ordinal: Int, numFields: Int): InternalRow = {
+    values(ordinal).boxed.asInstanceOf[InternalRow]
+  }
+
   override def isNullAt(i: Int): Boolean = values(i).isNull
 
   override def copy(): InternalRow = {
@@ -245,8 +249,6 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
   override def setString(ordinal: Int, value: String): Unit =
     update(ordinal, UTF8String.fromString(value))
 
-  override def getString(ordinal: Int): String = get(ordinal).toString
-
   override def setInt(ordinal: Int, value: Int): Unit = {
     val currentValue = values(ordinal).asInstanceOf[MutableInt]
     currentValue.isNull = false
@@ -316,8 +318,4 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
   override def getByte(i: Int): Byte = {
     values(i).asInstanceOf[MutableByte].value
   }
-
-  override def getAs[T](i: Int): T = {
-    values(i).boxed.asInstanceOf[T]
-  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index 62b6cc834c9c9..42343d4d8d79c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -685,7 +685,7 @@ case class CombineSetsAndSumFunction(
       null
     } else {
       Cast(Literal(
-        casted.iterator.map(f => f.get(0)).reduceLeft(
+        casted.iterator.map(f => f.genericGet(0)).reduceLeft(
           base.dataType.asInstanceOf[NumericType].numeric.asInstanceOf[Numeric[Any]].plus)),
         base.dataType).eval(null)
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 20b1eaab8e303..119168fa59f15 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -21,7 +21,7 @@ import scala.collection.mutable
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateUnsafeProjection, GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
index c91122cda2a41..6331a9eb603ca 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
@@ -110,7 +110,7 @@ case class GetStructField(child: Expression, field: StructField, ordinal: Int)
   override def toString: String = s"$child.${field.name}"
 
   protected override def nullSafeEval(input: Any): Any =
-    input.asInstanceOf[InternalRow].get(ordinal)
+    input.asInstanceOf[InternalRow].get(ordinal, field.dataType)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     nullSafeCodeGen(ctx, ev, eval => {
@@ -142,7 +142,7 @@ case class GetArrayStructFields(
 
   protected override def nullSafeEval(input: Any): Any = {
     input.asInstanceOf[Seq[InternalRow]].map { row =>
-      if (row == null) null else row.get(ordinal)
+      if (row == null) null else row.get(ordinal, field.dataType)
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index 53779dd4049d1..daeabe8e90f1d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -101,6 +101,10 @@ class GenericInternalRow(protected[sql] val values: Array[Any]) extends Internal
 
   override def get(i: Int): Any = values(i)
 
+  override def getStruct(ordinal: Int, numFields: Int): InternalRow = {
+    values(ordinal).asInstanceOf[InternalRow]
+  }
+
   override def copy(): InternalRow = this
 }
 
@@ -128,6 +132,10 @@ class GenericMutableRow(val values: Array[Any]) extends MutableRow {
 
   override def get(i: Int): Any = values(i)
 
+  override def getStruct(ordinal: Int, numFields: Int): InternalRow = {
+    values(ordinal).asInstanceOf[InternalRow]
+  }
+
   override def setNullAt(i: Int): Unit = { values(i) = null}
 
   override def update(i: Int, value: Any): Unit = { values(i) = value }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 852a8b235f127..8b0f90cf3a623 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -113,7 +113,7 @@ trait ExpressionEvalHelper {
       GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)(),
       expression)
 
-    val actual = plan(inputRow).get(0)
+    val actual = plan(inputRow).get(0, expression.dataType)
     if (!checkResult(actual, expected)) {
       val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
       fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
@@ -194,13 +194,14 @@ trait ExpressionEvalHelper {
     var plan = generateProject(
       GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil),
       expression)
-    var actual = plan(inputRow).get(0)
+    var actual = plan(inputRow).get(0, expression.dataType)
     assert(checkResult(actual, expected))
 
     plan = generateProject(
       GenerateUnsafeProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil),
       expression)
-    actual = FromUnsafeProjection(expression.dataType :: Nil)(plan(inputRow)).get(0)
+    actual = FromUnsafeProjection(expression.dataType :: Nil)(
+      plan(inputRow)).get(0, expression.dataType)
     assert(checkResult(actual, expected))
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 6caf8baf24a81..21459a7c69838 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -158,7 +158,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)(),
       expression)
 
-    val actual = plan(inputRow).get(0)
+    val actual = plan(inputRow).get(0, expression.dataType)
     if (!actual.asInstanceOf[Double].isNaN) {
       fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: NaN")
     }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index 4606bcb57311d..2834b54e8fb2e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -183,7 +183,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(setToNullAfterCreation.getFloat(6) === rowWithNoNullColumns.getFloat(6))
     assert(setToNullAfterCreation.getDouble(7) === rowWithNoNullColumns.getDouble(7))
     assert(setToNullAfterCreation.getString(8) === rowWithNoNullColumns.getString(8))
-    assert(setToNullAfterCreation.getBinary(9) === rowWithNoNullColumns.get(9))
+    assert(setToNullAfterCreation.getBinary(9) === rowWithNoNullColumns.getBinary(9))
     // assert(setToNullAfterCreation.get(10) === rowWithNoNullColumns.get(10))
     // assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11))
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
index 6ee833c7b2c94..c808442a4849b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
@@ -288,7 +288,7 @@ private[sql] object SparkSqlSerializer2 {
                 out.writeByte(NULL)
               } else {
                 out.writeByte(NOT_NULL)
-                val bytes = row.getAs[Array[Byte]](i)
+                val bytes = row.getBinary(i)
                 out.writeInt(bytes.length)
                 out.write(bytes)
               }
@@ -298,7 +298,7 @@ private[sql] object SparkSqlSerializer2 {
                 out.writeByte(NULL)
               } else {
                 out.writeByte(NOT_NULL)
-                val value = row.getAs[Decimal](i)
+                val value = row.getDecimal(i)
                 val javaBigDecimal = value.toJavaBigDecimal
                 // First, write out the unscaled value.
                 val bytes: Array[Byte] = javaBigDecimal.unscaledValue().toByteArray
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index fdd7ad59aba50..fe429d862a0a3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -17,16 +17,16 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.sql.types.StructType
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.{RDD, ShuffledRDD}
 import org.apache.spark.shuffle.sort.SortShuffleManager
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.plans.physical._
+import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.collection.ExternalSorter
 import org.apache.spark.util.collection.unsafe.sort.PrefixComparator
 import org.apache.spark.util.{CompletionIterator, MutablePair}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index cdbe42381a7e4..6b91e51ca52fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -189,13 +189,13 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
         if (i != -1) {
           // If yes, gets column value from partition values.
           (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => {
-            mutableRow(ordinal) = partitionValues.get(i)
+            mutableRow(ordinal) = partitionValues.genericGet(i)
           }
         } else {
           // Otherwise, inherits the value from scanned data.
           val i = nonPartitionColumns.indexOf(name)
           (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => {
-            mutableRow(ordinal) = dataRow.get(i)
+            mutableRow(ordinal) = dataRow.genericGet(i)
           }
         }
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index 1fdcc6a850602..aeeb0e45270dd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -136,7 +136,7 @@ package object debug {
             tupleCount += 1
             var i = 0
             while (i < numColumns) {
-              val value = currentRow.get(i)
+              val value = currentRow.get(i, output(i).dataType)
               if (value != null) {
                 columnStats(i).elementTypes += HashSet(value.getClass.getName)
               }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index 970c40dc61a3c..ec084a299649e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -129,7 +129,7 @@ object EvaluatePython {
       val values = new Array[Any](row.numFields)
       var i = 0
       while (i < row.numFields) {
-        values(i) = toJava(row.get(i), struct.fields(i).dataType)
+        values(i) = toJava(row.get(i, struct.fields(i).dataType), struct.fields(i).dataType)
         i += 1
       }
       new GenericInternalRowWithSchema(values, struct)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
index ec5c6950f37ad..78da2840dad69 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
@@ -22,7 +22,7 @@ import scala.collection.mutable.{Map => MutableMap}
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
-import org.apache.spark.sql.types.{ArrayType, StructField, StructType}
+import org.apache.spark.sql.types.{DataType, ArrayType, StructField, StructType}
 import org.apache.spark.sql.{Column, DataFrame}
 
 private[sql] object FrequentItems extends Logging {
@@ -85,17 +85,17 @@ private[sql] object FrequentItems extends Logging {
     val sizeOfMap = (1 / support).toInt
     val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
     val originalSchema = df.schema
-    val colInfo = cols.map { name =>
+    val colInfo: Array[(String, DataType)] = cols.map { name =>
       val index = originalSchema.fieldIndex(name)
       (name, originalSchema.fields(index).dataType)
-    }
+    }.toArray
 
     val freqItems = df.select(cols.map(Column(_)) : _*).queryExecution.toRdd.aggregate(countMaps)(
       seqOp = (counts, row) => {
         var i = 0
         while (i < numCols) {
           val thisMap = counts(i)
-          val key = row.get(i)
+          val key = row.get(i, colInfo(i)._2)
           thisMap.add(key, 1L)
           i += 1
         }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
index 7a6e86779b185..4ada9eca7a035 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
@@ -110,6 +110,7 @@ private[sql] abstract class AggregationBuffer(
  * A Mutable [[Row]] representing an mutable aggregation buffer.
  */
 class MutableAggregationBuffer private[sql] (
+    schema: StructType,
     toCatalystConverters: Array[Any => Any],
     toScalaConverters: Array[Any => Any],
     bufferOffset: Int,
@@ -121,7 +122,7 @@ class MutableAggregationBuffer private[sql] (
       throw new IllegalArgumentException(
         s"Could not access ${i}th value in this buffer because it only has $length values.")
     }
-    toScalaConverters(i)(underlyingBuffer.get(offsets(i)))
+    toScalaConverters(i)(underlyingBuffer.get(offsets(i), schema(i).dataType))
   }
 
   def update(i: Int, value: Any): Unit = {
@@ -134,6 +135,7 @@ class MutableAggregationBuffer private[sql] (
 
   override def copy(): MutableAggregationBuffer = {
     new MutableAggregationBuffer(
+      schema,
       toCatalystConverters,
       toScalaConverters,
       bufferOffset,
@@ -145,6 +147,7 @@ class MutableAggregationBuffer private[sql] (
  * A [[Row]] representing an immutable aggregation buffer.
  */
 class InputAggregationBuffer private[sql] (
+    schema: StructType,
     toCatalystConverters: Array[Any => Any],
     toScalaConverters: Array[Any => Any],
     bufferOffset: Int,
@@ -157,11 +160,12 @@ class InputAggregationBuffer private[sql] (
         s"Could not access ${i}th value in this buffer because it only has $length values.")
     }
     // TODO: Use buffer schema to avoid using generic getter.
-    toScalaConverters(i)(underlyingInputBuffer.get(offsets(i)))
+    toScalaConverters(i)(underlyingInputBuffer.get(offsets(i), schema(i).dataType))
   }
 
   override def copy(): InputAggregationBuffer = {
     new InputAggregationBuffer(
+      schema,
       toCatalystConverters,
       toScalaConverters,
       bufferOffset,
@@ -233,6 +237,7 @@ case class ScalaUDAF(
 
   lazy val inputAggregateBuffer: InputAggregationBuffer =
     new InputAggregationBuffer(
+      bufferSchema,
       bufferValuesToCatalystConverters,
       bufferValuesToScalaConverters,
       bufferOffset,
@@ -240,6 +245,7 @@ case class ScalaUDAF(
 
   lazy val mutableAggregateBuffer: MutableAggregationBuffer =
     new MutableAggregationBuffer(
+      bufferSchema,
       bufferValuesToCatalystConverters,
       bufferValuesToScalaConverters,
       bufferOffset,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 38bb1e3967642..75cbbde4f1512 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -179,7 +179,7 @@ private[sql] case class ParquetTableScan(
 
               var i = 0
               while (i < row.numFields) {
-                mutableRow(i) = row.get(i)
+                mutableRow(i) = row.genericGet(i)
                 i += 1
               }
               // Parquet will leave partitioning columns empty, so we fill them in here.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 2c23d4e8a8146..7b6a7f65d69db 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -219,7 +219,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
       // null values indicate optional fields but we do not check currently
       if (!record.isNullAt(index)) {
         writer.startField(attributes(index).name, index)
-        writeValue(attributes(index).dataType, record.get(index))
+        writeValue(attributes(index).dataType, record.get(index, attributes(index).dataType))
         writer.endField(attributes(index).name, index)
       }
       index = index + 1
@@ -280,7 +280,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
       while(i < fields.length) {
         if (!struct.isNullAt(i)) {
           writer.startField(fields(i).name, i)
-          writeValue(fields(i).dataType, struct.get(i))
+          writeValue(fields(i).dataType, struct.get(i, fields(i).dataType))
           writer.endField(fields(i).name, i)
         }
         i = i + 1
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
index c6804e84827c0..01b7c21e84159 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
@@ -30,23 +30,24 @@ class RowSuite extends SparkFunSuite {
 
   test("create row") {
     val expected = new GenericMutableRow(4)
-    expected.update(0, 2147483647)
+    expected.setInt(0, 2147483647)
     expected.setString(1, "this is a string")
-    expected.update(2, false)
-    expected.update(3, null)
+    expected.setBoolean(2, false)
+    expected.setNullAt(3)
+
     val actual1 = Row(2147483647, "this is a string", false, null)
     assert(expected.numFields === actual1.size)
     assert(expected.getInt(0) === actual1.getInt(0))
     assert(expected.getString(1) === actual1.getString(1))
     assert(expected.getBoolean(2) === actual1.getBoolean(2))
-    assert(expected.get(3) === actual1.get(3))
+    assert(expected.isNullAt(3) === actual1.isNullAt(3))
 
     val actual2 = Row.fromSeq(Seq(2147483647, "this is a string", false, null))
     assert(expected.numFields === actual2.size)
     assert(expected.getInt(0) === actual2.getInt(0))
     assert(expected.getString(1) === actual2.getString(1))
     assert(expected.getBoolean(2) === actual2.getBoolean(2))
-    assert(expected.get(3) === actual2.get(3))
+    assert(expected.isNullAt(3) === actual2.isNullAt(3))
   }
 
   test("SpecificMutableRow.update with null") {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index f0e0ca05a8aad..e4944caeff924 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.execution.{UnaryNode, SparkPlan}
 import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.hive._
+import org.apache.spark.sql.types.DataType
 import org.apache.spark.{SparkException, TaskContext}
 
 import scala.collection.JavaConversions._
@@ -96,13 +97,14 @@ case class InsertIntoHiveTable(
       val fieldOIs = standardOI.getAllStructFieldRefs.map(_.getFieldObjectInspector).toArray
       val wrappers = fieldOIs.map(wrapperFor)
       val outputData = new Array[Any](fieldOIs.length)
+      val dataTypes: Array[DataType] = child.output.map(_.dataType).toArray
 
       writerContainer.executorSideSetup(context.stageId, context.partitionId, context.attemptNumber)
 
       iterator.foreach { row =>
         var i = 0
         while (i < fieldOIs.length) {
-          outputData(i) = if (row.isNullAt(i)) null else wrappers(i)(row.get(i))
+          outputData(i) = if (row.isNullAt(i)) null else wrappers(i)(row.get(i, dataTypes(i)))
           i += 1
         }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 58445095ad74f..924f4d37ce21f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -122,7 +122,7 @@ private[orc] class OrcOutputWriter(
   override def writeInternal(row: InternalRow): Unit = {
     var i = 0
     while (i < row.numFields) {
-      reusableOutputBuffer(i) = wrappers(i)(row.get(i))
+      reusableOutputBuffer(i) = wrappers(i)(row.get(i, dataSchema(i).dataType))
       i += 1
     }
 

From b79bf1df6238c087c3ec524344f1fc179719c5de Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Sun, 26 Jul 2015 14:02:20 +0100
Subject: [PATCH 379/549] [SPARK-9337] [MLLIB] Add an ut for Word2Vec to verify
 the empty vocabulary check

jira: https://issues.apache.org/jira/browse/SPARK-9337

Word2Vec should throw exception when vocabulary is empty

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #7660 from hhbyyh/ut4Word2vec and squashes the following commits:

17a18cb [Yuhao Yang] add ut for word2vec
---
 .../org/apache/spark/mllib/feature/Word2VecSuite.scala | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
index 4cc8d1129b858..a864eec460f2b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
@@ -45,6 +45,16 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(newModel.getVectors.mapValues(_.toSeq) === word2VecMap.mapValues(_.toSeq))
   }
 
+  test("Word2Vec throws exception when vocabulary is empty") {
+    intercept[IllegalArgumentException] {
+      val sentence = "a b c"
+      val localDoc = Seq(sentence, sentence)
+      val doc = sc.parallelize(localDoc)
+        .map(line => line.split(" ").toSeq)
+      new Word2Vec().setMinCount(10).fit(doc)
+    }
+  }
+
   test("Word2VecModel") {
     val num = 2
     val word2VecMap = Map(

From 6c400b4f39be3fb5f473b8d2db11d239ea8ddf42 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 26 Jul 2015 10:27:39 -0700
Subject: [PATCH 380/549] [SPARK-9354][SQL] Remove InternalRow.get generic
 getter call in Hive integration code.

Replaced them with get(ordinal, datatype) so we can use UnsafeRow here.

I passed the data types throughout.

Author: Reynold Xin <rxin@databricks.com>

Closes #7669 from rxin/row-generic-getter-hive and squashes the following commits:

3467d8e [Reynold Xin] [SPARK-9354][SQL] Remove Internal.get generic getter call in Hive integration code.
---
 .../spark/sql/hive/HiveInspectors.scala       | 43 ++++++-----
 .../org/apache/spark/sql/hive/hiveUDFs.scala  | 74 ++++++++++++-------
 .../spark/sql/hive/HiveInspectorSuite.scala   | 53 +++++++------
 3 files changed, 102 insertions(+), 68 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 16977ce30cfff..f467500259c91 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -46,7 +46,7 @@ import scala.collection.JavaConversions._
  *     long / scala.Long
  *     short / scala.Short
  *     byte / scala.Byte
- *     org.apache.spark.sql.types.Decimal
+ *     [[org.apache.spark.sql.types.Decimal]]
  *     Array[Byte]
  *     java.sql.Date
  *     java.sql.Timestamp
@@ -54,7 +54,7 @@ import scala.collection.JavaConversions._
  *    Map: scala.collection.immutable.Map
  *    List: scala.collection.immutable.Seq
  *    Struct:
- *           org.apache.spark.sql.catalyst.expression.Row
+ *           [[org.apache.spark.sql.catalyst.InternalRow]]
  *    Union: NOT SUPPORTED YET
  *  The Complex types plays as a container, which can hold arbitrary data types.
  *
@@ -454,7 +454,7 @@ private[hive] trait HiveInspectors {
    *
    *  NOTICE: the complex data type requires recursive wrapping.
    */
-  def wrap(a: Any, oi: ObjectInspector): AnyRef = oi match {
+  def wrap(a: Any, oi: ObjectInspector, dataType: DataType): AnyRef = oi match {
     case x: ConstantObjectInspector => x.getWritableConstantValue
     case _ if a == null => null
     case x: PrimitiveObjectInspector => x match {
@@ -488,43 +488,50 @@ private[hive] trait HiveInspectors {
     }
     case x: SettableStructObjectInspector =>
       val fieldRefs = x.getAllStructFieldRefs
+      val structType = dataType.asInstanceOf[StructType]
       val row = a.asInstanceOf[InternalRow]
       // 1. create the pojo (most likely) object
       val result = x.create()
       var i = 0
       while (i < fieldRefs.length) {
         // 2. set the property for the pojo
+        val tpe = structType(i).dataType
         x.setStructFieldData(
           result,
           fieldRefs.get(i),
-          wrap(row.get(i), fieldRefs.get(i).getFieldObjectInspector))
+          wrap(row.get(i, tpe), fieldRefs.get(i).getFieldObjectInspector, tpe))
         i += 1
       }
 
       result
     case x: StructObjectInspector =>
       val fieldRefs = x.getAllStructFieldRefs
+      val structType = dataType.asInstanceOf[StructType]
       val row = a.asInstanceOf[InternalRow]
       val result = new java.util.ArrayList[AnyRef](fieldRefs.length)
       var i = 0
       while (i < fieldRefs.length) {
-        result.add(wrap(row.get(i), fieldRefs.get(i).getFieldObjectInspector))
+        val tpe = structType(i).dataType
+        result.add(wrap(row.get(i, tpe), fieldRefs.get(i).getFieldObjectInspector, tpe))
         i += 1
       }
 
       result
     case x: ListObjectInspector =>
       val list = new java.util.ArrayList[Object]
+      val tpe = dataType.asInstanceOf[ArrayType].elementType
       a.asInstanceOf[Seq[_]].foreach {
-        v => list.add(wrap(v, x.getListElementObjectInspector))
+        v => list.add(wrap(v, x.getListElementObjectInspector, tpe))
       }
       list
     case x: MapObjectInspector =>
+      val keyType = dataType.asInstanceOf[MapType].keyType
+      val valueType = dataType.asInstanceOf[MapType].valueType
       // Some UDFs seem to assume we pass in a HashMap.
       val hashMap = new java.util.HashMap[AnyRef, AnyRef]()
-      hashMap.putAll(a.asInstanceOf[Map[_, _]].map {
-        case (k, v) =>
-          wrap(k, x.getMapKeyObjectInspector) -> wrap(v, x.getMapValueObjectInspector)
+      hashMap.putAll(a.asInstanceOf[Map[_, _]].map { case (k, v) =>
+        wrap(k, x.getMapKeyObjectInspector, keyType) ->
+          wrap(v, x.getMapValueObjectInspector, valueType)
       })
 
       hashMap
@@ -533,22 +540,24 @@ private[hive] trait HiveInspectors {
   def wrap(
       row: InternalRow,
       inspectors: Seq[ObjectInspector],
-      cache: Array[AnyRef]): Array[AnyRef] = {
+      cache: Array[AnyRef],
+      dataTypes: Array[DataType]): Array[AnyRef] = {
     var i = 0
     while (i < inspectors.length) {
-      cache(i) = wrap(row.get(i), inspectors(i))
+      cache(i) = wrap(row.get(i, dataTypes(i)), inspectors(i), dataTypes(i))
       i += 1
     }
     cache
   }
 
   def wrap(
-    row: Seq[Any],
-    inspectors: Seq[ObjectInspector],
-    cache: Array[AnyRef]): Array[AnyRef] = {
+      row: Seq[Any],
+      inspectors: Seq[ObjectInspector],
+      cache: Array[AnyRef],
+      dataTypes: Array[DataType]): Array[AnyRef] = {
     var i = 0
     while (i < inspectors.length) {
-      cache(i) = wrap(row(i), inspectors(i))
+      cache(i) = wrap(row(i), inspectors(i), dataTypes(i))
       i += 1
     }
     cache
@@ -625,7 +634,7 @@ private[hive] trait HiveInspectors {
         ObjectInspectorFactory.getStandardConstantListObjectInspector(listObjectInspector, null)
       } else {
         val list = new java.util.ArrayList[Object]()
-        value.asInstanceOf[Seq[_]].foreach(v => list.add(wrap(v, listObjectInspector)))
+        value.asInstanceOf[Seq[_]].foreach(v => list.add(wrap(v, listObjectInspector, dt)))
         ObjectInspectorFactory.getStandardConstantListObjectInspector(listObjectInspector, list)
       }
     case Literal(value, MapType(keyType, valueType, _)) =>
@@ -636,7 +645,7 @@ private[hive] trait HiveInspectors {
       } else {
         val map = new java.util.HashMap[Object, Object]()
         value.asInstanceOf[Map[_, _]].foreach (entry => {
-          map.put(wrap(entry._1, keyOI), wrap(entry._2, valueOI))
+          map.put(wrap(entry._1, keyOI, keyType), wrap(entry._2, valueOI, valueType))
         })
         ObjectInspectorFactory.getStandardConstantMapObjectInspector(keyOI, valueOI, map)
       }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
index 3259b50acc765..54bf6bd67ff84 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
@@ -83,24 +83,22 @@ private[hive] class HiveFunctionRegistry(underlying: analysis.FunctionRegistry)
 private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
   extends Expression with HiveInspectors with CodegenFallback with Logging {
 
-  type UDFType = UDF
-
   override def deterministic: Boolean = isUDFDeterministic
 
   override def nullable: Boolean = true
 
   @transient
-  lazy val function = funcWrapper.createFunction[UDFType]()
+  lazy val function = funcWrapper.createFunction[UDF]()
 
   @transient
-  protected lazy val method =
+  private lazy val method =
     function.getResolver.getEvalMethod(children.map(_.dataType.toTypeInfo))
 
   @transient
-  protected lazy val arguments = children.map(toInspector).toArray
+  private lazy val arguments = children.map(toInspector).toArray
 
   @transient
-  protected lazy val isUDFDeterministic = {
+  private lazy val isUDFDeterministic = {
     val udfType = function.getClass().getAnnotation(classOf[HiveUDFType])
     udfType != null && udfType.deterministic()
   }
@@ -109,7 +107,7 @@ private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, childre
 
   // Create parameter converters
   @transient
-  protected lazy val conversionHelper = new ConversionHelper(method, arguments)
+  private lazy val conversionHelper = new ConversionHelper(method, arguments)
 
   @transient
   lazy val dataType = javaClassToDataType(method.getReturnType)
@@ -119,14 +117,19 @@ private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, childre
     method.getGenericReturnType(), ObjectInspectorOptions.JAVA)
 
   @transient
-  protected lazy val cached: Array[AnyRef] = new Array[AnyRef](children.length)
+  private lazy val cached: Array[AnyRef] = new Array[AnyRef](children.length)
+
+  @transient
+  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray
 
   // TODO: Finish input output types.
   override def eval(input: InternalRow): Any = {
-    unwrap(
-      FunctionRegistry.invoke(method, function, conversionHelper
-        .convertIfNecessary(wrap(children.map(c => c.eval(input)), arguments, cached): _*): _*),
-      returnInspector)
+    val inputs = wrap(children.map(c => c.eval(input)), arguments, cached, inputDataTypes)
+    val ret = FunctionRegistry.invoke(
+      method,
+      function,
+      conversionHelper.convertIfNecessary(inputs : _*): _*)
+    unwrap(ret, returnInspector)
   }
 
   override def toString: String = {
@@ -135,47 +138,48 @@ private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, childre
 }
 
 // Adapter from Catalyst ExpressionResult to Hive DeferredObject
-private[hive] class DeferredObjectAdapter(oi: ObjectInspector)
+private[hive] class DeferredObjectAdapter(oi: ObjectInspector, dataType: DataType)
   extends DeferredObject with HiveInspectors {
+
   private var func: () => Any = _
   def set(func: () => Any): Unit = {
     this.func = func
   }
   override def prepare(i: Int): Unit = {}
-  override def get(): AnyRef = wrap(func(), oi)
+  override def get(): AnyRef = wrap(func(), oi, dataType)
 }
 
 private[hive] case class HiveGenericUDF(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
   extends Expression with HiveInspectors with CodegenFallback with Logging {
-  type UDFType = GenericUDF
+
+  override def nullable: Boolean = true
 
   override def deterministic: Boolean = isUDFDeterministic
 
-  override def nullable: Boolean = true
+  override def foldable: Boolean =
+    isUDFDeterministic && returnInspector.isInstanceOf[ConstantObjectInspector]
 
   @transient
-  lazy val function = funcWrapper.createFunction[UDFType]()
+  lazy val function = funcWrapper.createFunction[GenericUDF]()
 
   @transient
-  protected lazy val argumentInspectors = children.map(toInspector)
+  private lazy val argumentInspectors = children.map(toInspector)
 
   @transient
-  protected lazy val returnInspector = {
+  private lazy val returnInspector = {
     function.initializeAndFoldConstants(argumentInspectors.toArray)
   }
 
   @transient
-  protected lazy val isUDFDeterministic = {
+  private lazy val isUDFDeterministic = {
     val udfType = function.getClass.getAnnotation(classOf[HiveUDFType])
     udfType != null && udfType.deterministic()
   }
 
-  override def foldable: Boolean =
-    isUDFDeterministic && returnInspector.isInstanceOf[ConstantObjectInspector]
-
   @transient
-  protected lazy val deferedObjects =
-    argumentInspectors.map(new DeferredObjectAdapter(_)).toArray[DeferredObject]
+  private lazy val deferedObjects = argumentInspectors.zip(children).map { case (inspect, child) =>
+    new DeferredObjectAdapter(inspect, child.dataType)
+  }.toArray[DeferredObject]
 
   lazy val dataType: DataType = inspectorToDataType(returnInspector)
 
@@ -354,6 +358,9 @@ private[hive] case class HiveWindowFunction(
   // Output buffer.
   private var outputBuffer: Any = _
 
+  @transient
+  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray
+
   override def init(): Unit = {
     evaluator.init(GenericUDAFEvaluator.Mode.COMPLETE, inputInspectors)
   }
@@ -368,8 +375,13 @@ private[hive] case class HiveWindowFunction(
   }
 
   override def prepareInputParameters(input: InternalRow): AnyRef = {
-    wrap(inputProjection(input), inputInspectors, new Array[AnyRef](children.length))
+    wrap(
+      inputProjection(input),
+      inputInspectors,
+      new Array[AnyRef](children.length),
+      inputDataTypes)
   }
+
   // Add input parameters for a single row.
   override def update(input: AnyRef): Unit = {
     evaluator.iterate(hiveEvaluatorBuffer, input.asInstanceOf[Array[AnyRef]])
@@ -510,12 +522,15 @@ private[hive] case class HiveGenericUDTF(
     field => (inspectorToDataType(field.getFieldObjectInspector), true)
   }
 
+  @transient
+  private lazy val inputDataTypes: Array[DataType] = children.map(_.dataType).toArray
+
   override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
     outputInspector // Make sure initialized.
 
     val inputProjection = new InterpretedProjection(children)
 
-    function.process(wrap(inputProjection(input), inputInspectors, udtInput))
+    function.process(wrap(inputProjection(input), inputInspectors, udtInput, inputDataTypes))
     collector.collectRows()
   }
 
@@ -584,9 +599,12 @@ private[hive] case class HiveUDAFFunction(
   @transient
   protected lazy val cached = new Array[AnyRef](exprs.length)
 
+  @transient
+  private lazy val inputDataTypes: Array[DataType] = exprs.map(_.dataType).toArray
+
   def update(input: InternalRow): Unit = {
     val inputs = inputProjection(input)
-    function.iterate(buffer, wrap(inputs, inspectors, cached))
+    function.iterate(buffer, wrap(inputs, inspectors, cached, inputDataTypes))
   }
 }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
index 8bb498a06fc9e..0330013f5325e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
@@ -48,7 +48,11 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
       ObjectInspectorOptions.JAVA).asInstanceOf[StructObjectInspector]
 
     val a = unwrap(state, soi).asInstanceOf[InternalRow]
-    val b = wrap(a, soi).asInstanceOf[UDAFPercentile.State]
+
+    val dt = new StructType()
+      .add("counts", MapType(LongType, LongType))
+      .add("percentiles", ArrayType(DoubleType))
+    val b = wrap(a, soi, dt).asInstanceOf[UDAFPercentile.State]
 
     val sfCounts = soi.getStructFieldRef("counts")
     val sfPercentiles = soi.getStructFieldRef("percentiles")
@@ -158,44 +162,45 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
     val writableOIs = dataTypes.map(toWritableInspector)
     val nullRow = data.map(d => null)
 
-    checkValues(nullRow, nullRow.zip(writableOIs).map {
-      case (d, oi) => unwrap(wrap(d, oi), oi)
+    checkValues(nullRow, nullRow.zip(writableOIs).zip(dataTypes).map {
+      case ((d, oi), dt) => unwrap(wrap(d, oi, dt), oi)
     })
 
     // struct couldn't be constant, sweep it out
     val constantExprs = data.filter(!_.dataType.isInstanceOf[StructType])
+    val constantTypes = constantExprs.map(_.dataType)
     val constantData = constantExprs.map(_.eval())
     val constantNullData = constantData.map(_ => null)
     val constantWritableOIs = constantExprs.map(e => toWritableInspector(e.dataType))
     val constantNullWritableOIs =
       constantExprs.map(e => toInspector(Literal.create(null, e.dataType)))
 
-    checkValues(constantData, constantData.zip(constantWritableOIs).map {
-      case (d, oi) => unwrap(wrap(d, oi), oi)
+    checkValues(constantData, constantData.zip(constantWritableOIs).zip(constantTypes).map {
+      case ((d, oi), dt) => unwrap(wrap(d, oi, dt), oi)
     })
 
-    checkValues(constantNullData, constantData.zip(constantNullWritableOIs).map {
-      case (d, oi) => unwrap(wrap(d, oi), oi)
+    checkValues(constantNullData, constantData.zip(constantNullWritableOIs).zip(constantTypes).map {
+      case ((d, oi), dt) => unwrap(wrap(d, oi, dt), oi)
     })
 
-    checkValues(constantNullData, constantNullData.zip(constantWritableOIs).map {
-      case (d, oi) => unwrap(wrap(d, oi), oi)
+    checkValues(constantNullData, constantNullData.zip(constantWritableOIs).zip(constantTypes).map {
+      case ((d, oi), dt) => unwrap(wrap(d, oi, dt), oi)
     })
   }
 
   test("wrap / unwrap primitive writable object inspector") {
     val writableOIs = dataTypes.map(toWritableInspector)
 
-    checkValues(row, row.zip(writableOIs).map {
-      case (data, oi) => unwrap(wrap(data, oi), oi)
+    checkValues(row, row.zip(writableOIs).zip(dataTypes).map {
+      case ((data, oi), dt) => unwrap(wrap(data, oi, dt), oi)
     })
   }
 
   test("wrap / unwrap primitive java object inspector") {
     val ois = dataTypes.map(toInspector)
 
-    checkValues(row, row.zip(ois).map {
-      case (data, oi) => unwrap(wrap(data, oi), oi)
+    checkValues(row, row.zip(ois).zip(dataTypes).map {
+      case ((data, oi), dt) => unwrap(wrap(data, oi, dt), oi)
     })
   }
 
@@ -205,31 +210,33 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
     })
     val inspector = toInspector(dt)
     checkValues(row,
-      unwrap(wrap(InternalRow.fromSeq(row), inspector), inspector).asInstanceOf[InternalRow])
-    checkValue(null, unwrap(wrap(null, toInspector(dt)), toInspector(dt)))
+      unwrap(wrap(InternalRow.fromSeq(row), inspector, dt), inspector).asInstanceOf[InternalRow])
+    checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt)))
   }
 
   test("wrap / unwrap Array Type") {
     val dt = ArrayType(dataTypes(0))
 
     val d = row(0) :: row(0) :: Nil
-    checkValue(d, unwrap(wrap(d, toInspector(dt)), toInspector(dt)))
-    checkValue(null, unwrap(wrap(null, toInspector(dt)), toInspector(dt)))
+    checkValue(d, unwrap(wrap(d, toInspector(dt), dt), toInspector(dt)))
+    checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt)))
     checkValue(d,
-      unwrap(wrap(d, toInspector(Literal.create(d, dt))), toInspector(Literal.create(d, dt))))
+      unwrap(wrap(d, toInspector(Literal.create(d, dt)), dt), toInspector(Literal.create(d, dt))))
     checkValue(d,
-      unwrap(wrap(null, toInspector(Literal.create(d, dt))), toInspector(Literal.create(d, dt))))
+      unwrap(wrap(null, toInspector(Literal.create(d, dt)), dt),
+        toInspector(Literal.create(d, dt))))
   }
 
   test("wrap / unwrap Map Type") {
     val dt = MapType(dataTypes(0), dataTypes(1))
 
     val d = Map(row(0) -> row(1))
-    checkValue(d, unwrap(wrap(d, toInspector(dt)), toInspector(dt)))
-    checkValue(null, unwrap(wrap(null, toInspector(dt)), toInspector(dt)))
+    checkValue(d, unwrap(wrap(d, toInspector(dt), dt), toInspector(dt)))
+    checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt)))
     checkValue(d,
-      unwrap(wrap(d, toInspector(Literal.create(d, dt))), toInspector(Literal.create(d, dt))))
+      unwrap(wrap(d, toInspector(Literal.create(d, dt)), dt), toInspector(Literal.create(d, dt))))
     checkValue(d,
-      unwrap(wrap(null, toInspector(Literal.create(d, dt))), toInspector(Literal.create(d, dt))))
+      unwrap(wrap(null, toInspector(Literal.create(d, dt)), dt),
+        toInspector(Literal.create(d, dt))))
   }
 }

From fb5d43fb2529d78d55f1fe8d365191c946153640 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Sun, 26 Jul 2015 10:29:22 -0700
Subject: [PATCH 381/549] [SPARK-9356][SQL]Remove the internal use of
 DecimalType.Unlimited

JIRA: https://issues.apache.org/jira/browse/SPARK-9356

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7671 from yjshen/deprecated_unlimit and squashes the following commits:

c707f56 [Yijie Shen] remove pattern matching in changePrecision
4a1823c [Yijie Shen] remove internal occurrence of Decimal.Unlimited
---
 .../spark/sql/catalyst/expressions/Cast.scala | 22 +++++++------------
 .../expressions/NullFunctionsSuite.scala      |  2 +-
 .../datasources/PartitioningUtils.scala       |  3 +--
 3 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index e5b83cd31bf0f..e208262da96dc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -507,20 +507,14 @@ case class Cast(child: Expression, dataType: DataType)
   }
 
   private[this] def changePrecision(d: String, decimalType: DecimalType,
-      evPrim: String, evNull: String): String = {
-    decimalType match {
-      case DecimalType.Unlimited =>
-        s"$evPrim = $d;"
-      case DecimalType.Fixed(precision, scale) =>
-        s"""
-          if ($d.changePrecision($precision, $scale)) {
-            $evPrim = $d;
-          } else {
-            $evNull = true;
-          }
-        """
-    }
-  }
+      evPrim: String, evNull: String): String =
+    s"""
+      if ($d.changePrecision(${decimalType.precision}, ${decimalType.scale})) {
+        $evPrim = $d;
+      } else {
+        $evNull = true;
+      }
+    """
 
   private[this] def castToDecimalCode(from: DataType, target: DecimalType): CastFunction = {
     from match {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
index 9efe44c83293d..ace6c15dc8418 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullFunctionsSuite.scala
@@ -92,7 +92,7 @@ class NullFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
     val nullOnly = Seq(Literal("x"),
       Literal.create(null, DoubleType),
-      Literal.create(null, DecimalType.Unlimited),
+      Literal.create(null, DecimalType.USER_DEFAULT),
       Literal(Float.MaxValue),
       Literal(false))
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
index 9d0fa894b9942..66dfcc308ceca 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
@@ -179,8 +179,7 @@ private[sql] object PartitioningUtils {
    * {{{
    *   NullType ->
    *   IntegerType -> LongType ->
-   *   DoubleType -> DecimalType.Unlimited ->
-   *   StringType
+   *   DoubleType -> StringType
    * }}}
    */
   private[sql] def resolvePartitions(

From 1cf19760d61a5a17bd175a906d34a2940141b76d Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Sun, 26 Jul 2015 13:03:13 -0700
Subject: [PATCH 382/549] [SPARK-9352] [SPARK-9353] Add tests for standalone
 scheduling code

This also fixes a small issue in the standalone Master that was uncovered by the new tests. For more detail, read the description of SPARK-9353.

Author: Andrew Or <andrew@databricks.com>

Closes #7668 from andrewor14/standalone-scheduling-tests and squashes the following commits:

d852faf [Andrew Or] Add tests + fix scheduling with memory limits
---
 .../apache/spark/deploy/master/Master.scala   |   8 +-
 .../spark/deploy/master/MasterSuite.scala     | 199 +++++++++++++++++-
 2 files changed, 202 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
index 029f94d1020be..51b3f0dead73e 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -559,7 +559,7 @@ private[master] class Master(
    * allocated at a time, 12 cores from each worker would be assigned to each executor.
    * Since 12 < 16, no executors would launch [SPARK-8881].
    */
-  private[master] def scheduleExecutorsOnWorkers(
+  private def scheduleExecutorsOnWorkers(
       app: ApplicationInfo,
       usableWorkers: Array[WorkerInfo],
       spreadOutApps: Boolean): Array[Int] = {
@@ -585,7 +585,11 @@ private[master] class Master(
         while (keepScheduling && canLaunchExecutor(pos) && coresToAssign >= coresPerExecutor) {
           coresToAssign -= coresPerExecutor
           assignedCores(pos) += coresPerExecutor
-          assignedMemory(pos) += memoryPerExecutor
+          // If cores per executor is not set, we are assigning 1 core at a time
+          // without actually meaning to launch 1 executor for each core assigned
+          if (app.desc.coresPerExecutor.isDefined) {
+            assignedMemory(pos) += memoryPerExecutor
+          }
 
           // Spreading out an application means spreading out its executors across as
           // many workers as possible. If we are not spreading out, then we should keep
diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
index a8fbaf1d9da0a..4d7016d1e594b 100644
--- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala
@@ -25,14 +25,15 @@ import scala.language.postfixOps
 
 import org.json4s._
 import org.json4s.jackson.JsonMethods._
-import org.scalatest.Matchers
+import org.scalatest.{Matchers, PrivateMethodTester}
 import org.scalatest.concurrent.Eventually
 import other.supplier.{CustomPersistenceEngine, CustomRecoveryModeFactory}
 
-import org.apache.spark.{SparkConf, SparkFunSuite}
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
 import org.apache.spark.deploy._
+import org.apache.spark.rpc.RpcEnv
 
-class MasterSuite extends SparkFunSuite with Matchers with Eventually {
+class MasterSuite extends SparkFunSuite with Matchers with Eventually with PrivateMethodTester {
 
   test("can use a custom recovery mode factory") {
     val conf = new SparkConf(loadDefaults = false)
@@ -142,4 +143,196 @@ class MasterSuite extends SparkFunSuite with Matchers with Eventually {
     }
   }
 
+  test("basic scheduling - spread out") {
+    testBasicScheduling(spreadOut = true)
+  }
+
+  test("basic scheduling - no spread out") {
+    testBasicScheduling(spreadOut = false)
+  }
+
+  test("scheduling with max cores - spread out") {
+    testSchedulingWithMaxCores(spreadOut = true)
+  }
+
+  test("scheduling with max cores - no spread out") {
+    testSchedulingWithMaxCores(spreadOut = false)
+  }
+
+  test("scheduling with cores per executor - spread out") {
+    testSchedulingWithCoresPerExecutor(spreadOut = true)
+  }
+
+  test("scheduling with cores per executor - no spread out") {
+    testSchedulingWithCoresPerExecutor(spreadOut = false)
+  }
+
+  test("scheduling with cores per executor AND max cores - spread out") {
+    testSchedulingWithCoresPerExecutorAndMaxCores(spreadOut = true)
+  }
+
+  test("scheduling with cores per executor AND max cores - no spread out") {
+    testSchedulingWithCoresPerExecutorAndMaxCores(spreadOut = false)
+  }
+
+  private def testBasicScheduling(spreadOut: Boolean): Unit = {
+    val master = makeMaster()
+    val appInfo = makeAppInfo(1024)
+    val workerInfo = makeWorkerInfo(4096, 10)
+    val workerInfos = Array(workerInfo, workerInfo, workerInfo)
+    val scheduledCores = master.invokePrivate(
+      _scheduleExecutorsOnWorkers(appInfo, workerInfos, spreadOut))
+    assert(scheduledCores.length === 3)
+    assert(scheduledCores(0) === 10)
+    assert(scheduledCores(1) === 10)
+    assert(scheduledCores(2) === 10)
+  }
+
+  private def testSchedulingWithMaxCores(spreadOut: Boolean): Unit = {
+    val master = makeMaster()
+    val appInfo1 = makeAppInfo(1024, maxCores = Some(8))
+    val appInfo2 = makeAppInfo(1024, maxCores = Some(16))
+    val workerInfo = makeWorkerInfo(4096, 10)
+    val workerInfos = Array(workerInfo, workerInfo, workerInfo)
+    var scheduledCores = master.invokePrivate(
+      _scheduleExecutorsOnWorkers(appInfo1, workerInfos, spreadOut))
+    assert(scheduledCores.length === 3)
+    // With spreading out, each worker should be assigned a few cores
+    if (spreadOut) {
+      assert(scheduledCores(0) === 3)
+      assert(scheduledCores(1) === 3)
+      assert(scheduledCores(2) === 2)
+    } else {
+      // Without spreading out, the cores should be concentrated on the first worker
+      assert(scheduledCores(0) === 8)
+      assert(scheduledCores(1) === 0)
+      assert(scheduledCores(2) === 0)
+    }
+    // Now test the same thing with max cores > cores per worker
+    scheduledCores = master.invokePrivate(
+      _scheduleExecutorsOnWorkers(appInfo2, workerInfos, spreadOut))
+    assert(scheduledCores.length === 3)
+    if (spreadOut) {
+      assert(scheduledCores(0) === 6)
+      assert(scheduledCores(1) === 5)
+      assert(scheduledCores(2) === 5)
+    } else {
+      // Without spreading out, the first worker should be fully booked,
+      // and the leftover cores should spill over to the second worker only.
+      assert(scheduledCores(0) === 10)
+      assert(scheduledCores(1) === 6)
+      assert(scheduledCores(2) === 0)
+    }
+  }
+
+  private def testSchedulingWithCoresPerExecutor(spreadOut: Boolean): Unit = {
+    val master = makeMaster()
+    val appInfo1 = makeAppInfo(1024, coresPerExecutor = Some(2))
+    val appInfo2 = makeAppInfo(256, coresPerExecutor = Some(2))
+    val appInfo3 = makeAppInfo(256, coresPerExecutor = Some(3))
+    val workerInfo = makeWorkerInfo(4096, 10)
+    val workerInfos = Array(workerInfo, workerInfo, workerInfo)
+    // Each worker should end up with 4 executors with 2 cores each
+    // This should be 4 because of the memory restriction on each worker
+    var scheduledCores = master.invokePrivate(
+      _scheduleExecutorsOnWorkers(appInfo1, workerInfos, spreadOut))
+    assert(scheduledCores.length === 3)
+    assert(scheduledCores(0) === 8)
+    assert(scheduledCores(1) === 8)
+    assert(scheduledCores(2) === 8)
+    // Now test the same thing without running into the worker memory limit
+    // Each worker should now end up with 5 executors with 2 cores each
+    scheduledCores = master.invokePrivate(
+      _scheduleExecutorsOnWorkers(appInfo2, workerInfos, spreadOut))
+    assert(scheduledCores.length === 3)
+    assert(scheduledCores(0) === 10)
+    assert(scheduledCores(1) === 10)
+    assert(scheduledCores(2) === 10)
+    // Now test the same thing with a cores per executor that 10 is not divisible by
+    scheduledCores = master.invokePrivate(
+      _scheduleExecutorsOnWorkers(appInfo3, workerInfos, spreadOut))
+    assert(scheduledCores.length === 3)
+    assert(scheduledCores(0) === 9)
+    assert(scheduledCores(1) === 9)
+    assert(scheduledCores(2) === 9)
+  }
+
+  // Sorry for the long method name!
+  private def testSchedulingWithCoresPerExecutorAndMaxCores(spreadOut: Boolean): Unit = {
+    val master = makeMaster()
+    val appInfo1 = makeAppInfo(256, coresPerExecutor = Some(2), maxCores = Some(4))
+    val appInfo2 = makeAppInfo(256, coresPerExecutor = Some(2), maxCores = Some(20))
+    val appInfo3 = makeAppInfo(256, coresPerExecutor = Some(3), maxCores = Some(20))
+    val workerInfo = makeWorkerInfo(4096, 10)
+    val workerInfos = Array(workerInfo, workerInfo, workerInfo)
+    // We should only launch two executors, each with exactly 2 cores
+    var scheduledCores = master.invokePrivate(
+      _scheduleExecutorsOnWorkers(appInfo1, workerInfos, spreadOut))
+    assert(scheduledCores.length === 3)
+    if (spreadOut) {
+      assert(scheduledCores(0) === 2)
+      assert(scheduledCores(1) === 2)
+      assert(scheduledCores(2) === 0)
+    } else {
+      assert(scheduledCores(0) === 4)
+      assert(scheduledCores(1) === 0)
+      assert(scheduledCores(2) === 0)
+    }
+    // Test max cores > number of cores per worker
+    scheduledCores = master.invokePrivate(
+      _scheduleExecutorsOnWorkers(appInfo2, workerInfos, spreadOut))
+    assert(scheduledCores.length === 3)
+    if (spreadOut) {
+      assert(scheduledCores(0) === 8)
+      assert(scheduledCores(1) === 6)
+      assert(scheduledCores(2) === 6)
+    } else {
+      assert(scheduledCores(0) === 10)
+      assert(scheduledCores(1) === 10)
+      assert(scheduledCores(2) === 0)
+    }
+    // Test max cores > number of cores per worker AND
+    // a cores per executor that is 10 is not divisible by
+    scheduledCores = master.invokePrivate(
+      _scheduleExecutorsOnWorkers(appInfo3, workerInfos, spreadOut))
+    assert(scheduledCores.length === 3)
+    if (spreadOut) {
+      assert(scheduledCores(0) === 6)
+      assert(scheduledCores(1) === 6)
+      assert(scheduledCores(2) === 6)
+    } else {
+      assert(scheduledCores(0) === 9)
+      assert(scheduledCores(1) === 9)
+      assert(scheduledCores(2) === 0)
+    }
+  }
+
+  // ===============================
+  // | Utility methods for testing |
+  // ===============================
+
+  private val _scheduleExecutorsOnWorkers = PrivateMethod[Array[Int]]('scheduleExecutorsOnWorkers)
+
+  private def makeMaster(conf: SparkConf = new SparkConf): Master = {
+    val securityMgr = new SecurityManager(conf)
+    val rpcEnv = RpcEnv.create(Master.SYSTEM_NAME, "localhost", 7077, conf, securityMgr)
+    val master = new Master(rpcEnv, rpcEnv.address, 8080, securityMgr, conf)
+    master
+  }
+
+  private def makeAppInfo(
+      memoryPerExecutorMb: Int,
+      coresPerExecutor: Option[Int] = None,
+      maxCores: Option[Int] = None): ApplicationInfo = {
+    val desc = new ApplicationDescription(
+      "test", maxCores, memoryPerExecutorMb, null, "", None, None, coresPerExecutor)
+    val appId = System.currentTimeMillis.toString
+    new ApplicationInfo(0, appId, desc, new Date, null, Int.MaxValue)
+  }
+
+  private def makeWorkerInfo(memoryMb: Int, cores: Int): WorkerInfo = {
+    val workerId = System.currentTimeMillis.toString
+    new WorkerInfo(workerId, "host", 100, cores, memoryMb, null, 101, "address")
+  }
+
 }

From 6b2baec04fa3d928f0ee84af8c2723ac03a4648c Mon Sep 17 00:00:00 2001
From: Kay Ousterhout <kayousterhout@gmail.com>
Date: Sun, 26 Jul 2015 13:35:16 -0700
Subject: [PATCH 383/549] [SPARK-9326] Close lock file used for file downloads.

A lock file is used to ensure multiple executors running on the
same machine don't download the same file concurrently. Spark never
closes these lock files (releasing the lock does not close the
underlying file); this commit fixes that.

cc vanzin (looks like you've been involved in various other fixes surrounding these lock files)

Author: Kay Ousterhout <kayousterhout@gmail.com>

Closes #7650 from kayousterhout/SPARK-9326 and squashes the following commits:

0401bd1 [Kay Ousterhout] Close lock file used for file downloads.
---
 core/src/main/scala/org/apache/spark/util/Utils.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index c5816949cd360..c4012d0e83f7d 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -443,11 +443,11 @@ private[spark] object Utils extends Logging {
       val lockFileName = s"${url.hashCode}${timestamp}_lock"
       val localDir = new File(getLocalDir(conf))
       val lockFile = new File(localDir, lockFileName)
-      val raf = new RandomAccessFile(lockFile, "rw")
+      val lockFileChannel = new RandomAccessFile(lockFile, "rw").getChannel()
       // Only one executor entry.
       // The FileLock is only used to control synchronization for executors download file,
       // it's always safe regardless of lock type (mandatory or advisory).
-      val lock = raf.getChannel().lock()
+      val lock = lockFileChannel.lock()
       val cachedFile = new File(localDir, cachedFileName)
       try {
         if (!cachedFile.exists()) {
@@ -455,6 +455,7 @@ private[spark] object Utils extends Logging {
         }
       } finally {
         lock.release()
+        lockFileChannel.close()
       }
       copyFile(
         url,

From c025c3d0a1fdfbc45b64db9c871176b40b4a7b9b Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Sun, 26 Jul 2015 16:49:19 -0700
Subject: [PATCH 384/549] [SPARK-9095] [SQL] Removes the old Parquet support

This PR removes the old Parquet support:

- Removes the old `ParquetRelation` together with related SQL configuration, plan nodes, strategies, utility classes, and test suites.

- Renames `ParquetRelation2` to `ParquetRelation`

- Renames `RowReadSupport` and `RowRecordMaterializer` to `CatalystReadSupport` and `CatalystRecordMaterializer` respectively, and moved them to separate files.

  This follows naming convention used in other Parquet data models implemented in parquet-mr. It should be easier for developers who are familiar with Parquet to follow.

There's still some other code that can be cleaned up. Especially `RowWriteSupport`. But I'd like to leave this part to SPARK-8848.

Author: Cheng Lian <lian@databricks.com>

Closes #7441 from liancheng/spark-9095 and squashes the following commits:

c7b6e38 [Cheng Lian] Removes WriteToFile
2d688d6 [Cheng Lian] Renames ParquetRelation2 to ParquetRelation
ca9e1b7 [Cheng Lian] Removes old Parquet support
---
 .../plans/logical/basicOperators.scala        |   6 -
 .../org/apache/spark/sql/DataFrame.scala      |   9 +-
 .../apache/spark/sql/DataFrameReader.scala    |   8 +-
 .../scala/org/apache/spark/sql/SQLConf.scala  |   6 -
 .../org/apache/spark/sql/SQLContext.scala     |   6 +-
 .../spark/sql/execution/SparkStrategies.scala |  58 +-
 .../sql/parquet/CatalystReadSupport.scala     | 153 ++++
 .../parquet/CatalystRecordMaterializer.scala  |  41 +
 .../sql/parquet/CatalystSchemaConverter.scala |   5 +
 .../spark/sql/parquet/ParquetConverter.scala  |   1 +
 .../spark/sql/parquet/ParquetRelation.scala   | 843 +++++++++++++++---
 .../sql/parquet/ParquetTableOperations.scala  | 492 ----------
 .../sql/parquet/ParquetTableSupport.scala     | 151 +---
 .../spark/sql/parquet/ParquetTypes.scala      |  42 +-
 .../apache/spark/sql/parquet/newParquet.scala | 732 ---------------
 .../sql/parquet/ParquetFilterSuite.scala      |  65 +-
 .../spark/sql/parquet/ParquetIOSuite.scala    |  37 +-
 .../ParquetPartitionDiscoverySuite.scala      |   2 +-
 .../spark/sql/parquet/ParquetQuerySuite.scala |  27 +-
 .../sql/parquet/ParquetSchemaSuite.scala      |  12 +-
 .../apache/spark/sql/hive/HiveContext.scala   |   2 -
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  22 +-
 .../spark/sql/hive/HiveStrategies.scala       | 141 +--
 .../spark/sql/hive/HiveParquetSuite.scala     |  86 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala  |  14 +-
 .../sql/hive/execution/SQLQuerySuite.scala    |  54 +-
 .../apache/spark/sql/hive/parquetSuites.scala | 174 +---
 27 files changed, 1037 insertions(+), 2152 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index 8e1a236e2988c..af68358daf5f1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -186,12 +186,6 @@ case class WithWindowDefinition(
   override def output: Seq[Attribute] = child.output
 }
 
-case class WriteToFile(
-    path: String,
-    child: LogicalPlan) extends UnaryNode {
-  override def output: Seq[Attribute] = child.output
-}
-
 /**
  * @param order  The ordering expressions
  * @param global True means global sorting apply for entire data set,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index fa942a1f8fd93..114ab91d10aa0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -139,8 +139,7 @@ class DataFrame private[sql](
     // happen right away to let these side effects take place eagerly.
     case _: Command |
          _: InsertIntoTable |
-         _: CreateTableUsingAsSelect |
-         _: WriteToFile =>
+         _: CreateTableUsingAsSelect =>
       LogicalRDD(queryExecution.analyzed.output, queryExecution.toRdd)(sqlContext)
     case _ =>
       queryExecution.analyzed
@@ -1615,11 +1614,7 @@ class DataFrame private[sql](
    */
   @deprecated("Use write.parquet(path)", "1.4.0")
   def saveAsParquetFile(path: String): Unit = {
-    if (sqlContext.conf.parquetUseDataSourceApi) {
-      write.format("parquet").mode(SaveMode.ErrorIfExists).save(path)
-    } else {
-      sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd
-    }
+    write.format("parquet").mode(SaveMode.ErrorIfExists).save(path)
   }
 
   /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index e9d782cdcd667..eb09807f9d9c2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -21,16 +21,16 @@ import java.util.Properties
 
 import org.apache.hadoop.fs.Path
 
-import org.apache.spark.{Logging, Partition}
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.execution.datasources.{ResolvedDataSource, LogicalRelation}
+import org.apache.spark.sql.execution.datasources.{LogicalRelation, ResolvedDataSource}
 import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation}
 import org.apache.spark.sql.json.JSONRelation
-import org.apache.spark.sql.parquet.ParquetRelation2
+import org.apache.spark.sql.parquet.ParquetRelation
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.{Logging, Partition}
 
 /**
  * :: Experimental ::
@@ -259,7 +259,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging {
       }.toArray
 
       sqlContext.baseRelationToDataFrame(
-        new ParquetRelation2(
+        new ParquetRelation(
           globbedPaths.map(_.toString), None, None, extraOptions.toMap)(sqlContext))
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 2a641b9d64a95..9b2dbd7442f5c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -276,10 +276,6 @@ private[spark] object SQLConf {
     defaultValue = Some(true),
     doc = "Enables Parquet filter push-down optimization when set to true.")
 
-  val PARQUET_USE_DATA_SOURCE_API = booleanConf("spark.sql.parquet.useDataSourceApi",
-    defaultValue = Some(true),
-    doc = "<TODO>")
-
   val PARQUET_FOLLOW_PARQUET_FORMAT_SPEC = booleanConf(
     key = "spark.sql.parquet.followParquetFormatSpec",
     defaultValue = Some(false),
@@ -456,8 +452,6 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
 
   private[spark] def parquetFilterPushDown: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_ENABLED)
 
-  private[spark] def parquetUseDataSourceApi: Boolean = getConf(PARQUET_USE_DATA_SOURCE_API)
-
   private[spark] def orcFilterPushDown: Boolean = getConf(ORC_FILTER_PUSHDOWN_ENABLED)
 
   private[spark] def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 49bfe74b680af..0e25e06e99ab2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -870,7 +870,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
       LeftSemiJoin ::
       HashJoin ::
       InMemoryScans ::
-      ParquetOperations ::
       BasicOperators ::
       CartesianProduct ::
       BroadcastNestedLoopJoin :: Nil)
@@ -1115,11 +1114,8 @@ class SQLContext(@transient val sparkContext: SparkContext)
   def parquetFile(paths: String*): DataFrame = {
     if (paths.isEmpty) {
       emptyDataFrame
-    } else if (conf.parquetUseDataSourceApi) {
-      read.parquet(paths : _*)
     } else {
-      DataFrame(this, parquet.ParquetRelation(
-        paths.mkString(","), Some(sparkContext.hadoopConfiguration), this))
+      read.parquet(paths : _*)
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index eb4be1900b153..e2c7e8006f3b1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -17,19 +17,18 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.sql.{SQLContext, Strategy, execution}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression2}
+import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression2
 import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical.{BroadcastHint, LogicalPlan}
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.columnar.{InMemoryColumnarTableScan, InMemoryRelation}
-import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand}
 import org.apache.spark.sql.execution.datasources.{CreateTableUsing, CreateTempTableUsing, DescribeCommand => LogicalDescribeCommand, _}
-import org.apache.spark.sql.parquet._
+import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand}
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.{SQLContext, Strategy, execution}
 
 private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   self: SQLContext#SparkPlanner =>
@@ -306,57 +305,6 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
     }
   }
 
-  object ParquetOperations extends Strategy {
-    def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
-      // TODO: need to support writing to other types of files.  Unify the below code paths.
-      case logical.WriteToFile(path, child) =>
-        val relation =
-          ParquetRelation.create(path, child, sparkContext.hadoopConfiguration, sqlContext)
-        // Note: overwrite=false because otherwise the metadata we just created will be deleted
-        InsertIntoParquetTable(relation, planLater(child), overwrite = false) :: Nil
-      case logical.InsertIntoTable(
-          table: ParquetRelation, partition, child, overwrite, ifNotExists) =>
-        InsertIntoParquetTable(table, planLater(child), overwrite) :: Nil
-      case PhysicalOperation(projectList, filters: Seq[Expression], relation: ParquetRelation) =>
-        val partitionColNames = relation.partitioningAttributes.map(_.name).toSet
-        val filtersToPush = filters.filter { pred =>
-            val referencedColNames = pred.references.map(_.name).toSet
-            referencedColNames.intersect(partitionColNames).isEmpty
-          }
-        val prunePushedDownFilters =
-          if (sqlContext.conf.parquetFilterPushDown) {
-            (predicates: Seq[Expression]) => {
-              // Note: filters cannot be pushed down to Parquet if they contain more complex
-              // expressions than simple "Attribute cmp Literal" comparisons. Here we remove all
-              // filters that have been pushed down. Note that a predicate such as "(A AND B) OR C"
-              // can result in "A OR C" being pushed down. Here we are conservative in the sense
-              // that even if "A" was pushed and we check for "A AND B" we still want to keep
-              // "A AND B" in the higher-level filter, not just "B".
-              predicates.map(p => p -> ParquetFilters.createFilter(p)).collect {
-                case (predicate, None) => predicate
-                // Filter needs to be applied above when it contains partitioning
-                // columns
-                case (predicate, _)
-                  if !predicate.references.map(_.name).toSet.intersect(partitionColNames).isEmpty =>
-                  predicate
-              }
-            }
-          } else {
-            identity[Seq[Expression]] _
-          }
-        pruneFilterProject(
-          projectList,
-          filters,
-          prunePushedDownFilters,
-          ParquetTableScan(
-            _,
-            relation,
-            if (sqlContext.conf.parquetFilterPushDown) filtersToPush else Nil)) :: Nil
-
-      case _ => Nil
-    }
-  }
-
   object InMemoryScans extends Strategy {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case PhysicalOperation(projectList, filters, mem: InMemoryRelation) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala
new file mode 100644
index 0000000000000..975fec101d9c2
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.parquet
+
+import java.util.{Map => JMap}
+
+import scala.collection.JavaConversions.{iterableAsScalaIterable, mapAsJavaMap, mapAsScalaMap}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
+import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
+import org.apache.parquet.io.api.RecordMaterializer
+import org.apache.parquet.schema.MessageType
+
+import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.types.StructType
+
+private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with Logging {
+  override def prepareForRead(
+      conf: Configuration,
+      keyValueMetaData: JMap[String, String],
+      fileSchema: MessageType,
+      readContext: ReadContext): RecordMaterializer[InternalRow] = {
+    log.debug(s"Preparing for read Parquet file with message type: $fileSchema")
+
+    val toCatalyst = new CatalystSchemaConverter(conf)
+    val parquetRequestedSchema = readContext.getRequestedSchema
+
+    val catalystRequestedSchema =
+      Option(readContext.getReadSupportMetadata).map(_.toMap).flatMap { metadata =>
+        metadata
+          // First tries to read requested schema, which may result from projections
+          .get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA)
+          // If not available, tries to read Catalyst schema from file metadata.  It's only
+          // available if the target file is written by Spark SQL.
+          .orElse(metadata.get(CatalystReadSupport.SPARK_METADATA_KEY))
+      }.map(StructType.fromString).getOrElse {
+        logDebug("Catalyst schema not available, falling back to Parquet schema")
+        toCatalyst.convert(parquetRequestedSchema)
+      }
+
+    logDebug(s"Catalyst schema used to read Parquet files: $catalystRequestedSchema")
+    new CatalystRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema)
+  }
+
+  override def init(context: InitContext): ReadContext = {
+    val conf = context.getConfiguration
+
+    // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst
+    // schema of this file from its the metadata.
+    val maybeRowSchema = Option(conf.get(RowWriteSupport.SPARK_ROW_SCHEMA))
+
+    // Optional schema of requested columns, in the form of a string serialized from a Catalyst
+    // `StructType` containing all requested columns.
+    val maybeRequestedSchema = Option(conf.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA))
+
+    // Below we construct a Parquet schema containing all requested columns.  This schema tells
+    // Parquet which columns to read.
+    //
+    // If `maybeRequestedSchema` is defined, we assemble an equivalent Parquet schema.  Otherwise,
+    // we have to fallback to the full file schema which contains all columns in the file.
+    // Obviously this may waste IO bandwidth since it may read more columns than requested.
+    //
+    // Two things to note:
+    //
+    // 1. It's possible that some requested columns don't exist in the target Parquet file.  For
+    //    example, in the case of schema merging, the globally merged schema may contain extra
+    //    columns gathered from other Parquet files.  These columns will be simply filled with nulls
+    //    when actually reading the target Parquet file.
+    //
+    // 2. When `maybeRequestedSchema` is available, we can't simply convert the Catalyst schema to
+    //    Parquet schema using `CatalystSchemaConverter`, because the mapping is not unique due to
+    //    non-standard behaviors of some Parquet libraries/tools.  For example, a Parquet file
+    //    containing a single integer array field `f1` may have the following legacy 2-level
+    //    structure:
+    //
+    //      message root {
+    //        optional group f1 (LIST) {
+    //          required INT32 element;
+    //        }
+    //      }
+    //
+    //    while `CatalystSchemaConverter` may generate a standard 3-level structure:
+    //
+    //      message root {
+    //        optional group f1 (LIST) {
+    //          repeated group list {
+    //            required INT32 element;
+    //          }
+    //        }
+    //      }
+    //
+    //    Apparently, we can't use the 2nd schema to read the target Parquet file as they have
+    //    different physical structures.
+    val parquetRequestedSchema =
+      maybeRequestedSchema.fold(context.getFileSchema) { schemaString =>
+        val toParquet = new CatalystSchemaConverter(conf)
+        val fileSchema = context.getFileSchema.asGroupType()
+        val fileFieldNames = fileSchema.getFields.map(_.getName).toSet
+
+        StructType
+          // Deserializes the Catalyst schema of requested columns
+          .fromString(schemaString)
+          .map { field =>
+            if (fileFieldNames.contains(field.name)) {
+              // If the field exists in the target Parquet file, extracts the field type from the
+              // full file schema and makes a single-field Parquet schema
+              new MessageType("root", fileSchema.getType(field.name))
+            } else {
+              // Otherwise, just resorts to `CatalystSchemaConverter`
+              toParquet.convert(StructType(Array(field)))
+            }
+          }
+          // Merges all single-field Parquet schemas to form a complete schema for all requested
+          // columns.  Note that it's possible that no columns are requested at all (e.g., count
+          // some partition column of a partitioned Parquet table). That's why `fold` is used here
+          // and always fallback to an empty Parquet schema.
+          .fold(new MessageType("root")) {
+            _ union _
+          }
+      }
+
+    val metadata =
+      Map.empty[String, String] ++
+        maybeRequestedSchema.map(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++
+        maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _)
+
+    logInfo(s"Going to read Parquet file with these requested columns: $parquetRequestedSchema")
+    new ReadContext(parquetRequestedSchema, metadata)
+  }
+}
+
+private[parquet] object CatalystReadSupport {
+  val SPARK_ROW_REQUESTED_SCHEMA = "org.apache.spark.sql.parquet.row.requested_schema"
+
+  val SPARK_METADATA_KEY = "org.apache.spark.sql.parquet.row.metadata"
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala
new file mode 100644
index 0000000000000..84f1dccfeb788
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.parquet
+
+import org.apache.parquet.io.api.{GroupConverter, RecordMaterializer}
+import org.apache.parquet.schema.MessageType
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.types.StructType
+
+/**
+ * A [[RecordMaterializer]] for Catalyst rows.
+ *
+ * @param parquetSchema Parquet schema of the records to be read
+ * @param catalystSchema Catalyst schema of the rows to be constructed
+ */
+private[parquet] class CatalystRecordMaterializer(
+    parquetSchema: MessageType, catalystSchema: StructType)
+  extends RecordMaterializer[InternalRow] {
+
+  private val rootConverter = new CatalystRowConverter(parquetSchema, catalystSchema, NoopUpdater)
+
+  override def getCurrentRecord: InternalRow = rootConverter.currentRow
+
+  override def getRootConverter: GroupConverter = rootConverter
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index 1d3a0d15d336e..e9ef01e2dba1b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -570,6 +570,11 @@ private[parquet] object CatalystSchemaConverter {
        """.stripMargin.split("\n").mkString(" "))
   }
 
+  def checkFieldNames(schema: StructType): StructType = {
+    schema.fieldNames.foreach(checkFieldName)
+    schema
+  }
+
   def analysisRequire(f: => Boolean, message: String): Unit = {
     if (!f) {
       throw new AnalysisException(message)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index be0a2029d233b..ea51650fe9039 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.parquet
 
 import org.apache.spark.sql.catalyst.InternalRow
 
+// TODO Removes this while fixing SPARK-8848
 private[sql] object CatalystConverter {
   // This is mostly Parquet convention (see, e.g., `ConversionPatterns`).
   // Note that "array" for the array elements is chosen by ParquetAvro.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
index 086559e9f7658..cc6fa2b88663f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -17,81 +17,720 @@
 
 package org.apache.spark.sql.parquet
 
-import java.io.IOException
+import java.net.URI
 import java.util.logging.{Level, Logger => JLogger}
+import java.util.{List => JList}
 
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.fs.permission.FsAction
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+import scala.util.{Failure, Try}
+
+import com.google.common.base.Objects
+import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.io.Writable
+import org.apache.hadoop.mapreduce._
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
+import org.apache.parquet.filter2.predicate.FilterApi
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
-import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat, ParquetRecordReader}
+import org.apache.parquet.hadoop.util.ContextUtil
+import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetRecordReader, _}
 import org.apache.parquet.schema.MessageType
 import org.apache.parquet.{Log => ParquetLog}
 
-import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, UnresolvedException}
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap}
-import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.{DataFrame, SQLContext}
-import org.apache.spark.util.Utils
+import org.apache.spark.{Logging, Partition => SparkPartition, SparkException}
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.RDD._
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.{SqlNewHadoopPartition, SqlNewHadoopRDD}
+import org.apache.spark.sql.execution.datasources.PartitionSpec
+import org.apache.spark.sql.sources._
+import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.util.{SerializableConfiguration, Utils}
+
+
+private[sql] class DefaultSource extends HadoopFsRelationProvider {
+  override def createRelation(
+      sqlContext: SQLContext,
+      paths: Array[String],
+      schema: Option[StructType],
+      partitionColumns: Option[StructType],
+      parameters: Map[String, String]): HadoopFsRelation = {
+    new ParquetRelation(paths, schema, None, partitionColumns, parameters)(sqlContext)
+  }
+}
+
+// NOTE: This class is instantiated and used on executor side only, no need to be serializable.
+private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext)
+  extends OutputWriterInternal {
+
+  private val recordWriter: RecordWriter[Void, InternalRow] = {
+    val outputFormat = {
+      new ParquetOutputFormat[InternalRow]() {
+        // Here we override `getDefaultWorkFile` for two reasons:
+        //
+        //  1. To allow appending.  We need to generate unique output file names to avoid
+        //     overwriting existing files (either exist before the write job, or are just written
+        //     by other tasks within the same write job).
+        //
+        //  2. To allow dynamic partitioning.  Default `getDefaultWorkFile` uses
+        //     `FileOutputCommitter.getWorkPath()`, which points to the base directory of all
+        //     partitions in the case of dynamic partitioning.
+        override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
+          val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
+          val split = context.getTaskAttemptID.getTaskID.getId
+          new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension")
+        }
+      }
+    }
+
+    outputFormat.getRecordWriter(context)
+  }
+
+  override def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row)
+
+  override def close(): Unit = recordWriter.close(context)
+}
+
+private[sql] class ParquetRelation(
+    override val paths: Array[String],
+    private val maybeDataSchema: Option[StructType],
+    // This is for metastore conversion.
+    private val maybePartitionSpec: Option[PartitionSpec],
+    override val userDefinedPartitionColumns: Option[StructType],
+    parameters: Map[String, String])(
+    val sqlContext: SQLContext)
+  extends HadoopFsRelation(maybePartitionSpec)
+  with Logging {
+
+  private[sql] def this(
+      paths: Array[String],
+      maybeDataSchema: Option[StructType],
+      maybePartitionSpec: Option[PartitionSpec],
+      parameters: Map[String, String])(
+      sqlContext: SQLContext) = {
+    this(
+      paths,
+      maybeDataSchema,
+      maybePartitionSpec,
+      maybePartitionSpec.map(_.partitionColumns),
+      parameters)(sqlContext)
+  }
+
+  // Should we merge schemas from all Parquet part-files?
+  private val shouldMergeSchemas =
+    parameters
+      .get(ParquetRelation.MERGE_SCHEMA)
+      .map(_.toBoolean)
+      .getOrElse(sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED))
+
+  private val maybeMetastoreSchema = parameters
+    .get(ParquetRelation.METASTORE_SCHEMA)
+    .map(DataType.fromJson(_).asInstanceOf[StructType])
+
+  private lazy val metadataCache: MetadataCache = {
+    val meta = new MetadataCache
+    meta.refresh()
+    meta
+  }
 
-/**
- * Relation that consists of data stored in a Parquet columnar format.
- *
- * Users should interact with parquet files though a [[DataFrame]], created by a [[SQLContext]]
- * instead of using this class directly.
- *
- * {{{
- *   val parquetRDD = sqlContext.parquetFile("path/to/parquet.file")
- * }}}
- *
- * @param path The path to the Parquet file.
- */
-private[sql] case class ParquetRelation(
-    path: String,
-    @transient conf: Option[Configuration],
-    @transient sqlContext: SQLContext,
-    partitioningAttributes: Seq[Attribute] = Nil)
-  extends LeafNode with MultiInstanceRelation {
-
-  /** Schema derived from ParquetFile */
-  def parquetSchema: MessageType =
-    ParquetTypesConverter
-      .readMetaData(new Path(path), conf)
-      .getFileMetaData
-      .getSchema
-
-  /** Attributes */
-  override val output =
-    partitioningAttributes ++
-    ParquetTypesConverter.readSchemaFromFile(
-      new Path(path.split(",").head),
-      conf,
-      sqlContext.conf.isParquetBinaryAsString,
-      sqlContext.conf.isParquetINT96AsTimestamp)
-  lazy val attributeMap = AttributeMap(output.map(o => o -> o))
-
-  override def newInstance(): this.type = {
-    ParquetRelation(path, conf, sqlContext).asInstanceOf[this.type]
-  }
-
-  // Equals must also take into account the output attributes so that we can distinguish between
-  // different instances of the same relation,
   override def equals(other: Any): Boolean = other match {
-    case p: ParquetRelation =>
-      p.path == path && p.output == output
+    case that: ParquetRelation =>
+      val schemaEquality = if (shouldMergeSchemas) {
+        this.shouldMergeSchemas == that.shouldMergeSchemas
+      } else {
+        this.dataSchema == that.dataSchema &&
+          this.schema == that.schema
+      }
+
+      this.paths.toSet == that.paths.toSet &&
+        schemaEquality &&
+        this.maybeDataSchema == that.maybeDataSchema &&
+        this.partitionColumns == that.partitionColumns
+
     case _ => false
   }
 
-  override def hashCode: Int = {
-    com.google.common.base.Objects.hashCode(path, output)
+  override def hashCode(): Int = {
+    if (shouldMergeSchemas) {
+      Objects.hashCode(
+        Boolean.box(shouldMergeSchemas),
+        paths.toSet,
+        maybeDataSchema,
+        partitionColumns)
+    } else {
+      Objects.hashCode(
+        Boolean.box(shouldMergeSchemas),
+        paths.toSet,
+        dataSchema,
+        schema,
+        maybeDataSchema,
+        partitionColumns)
+    }
+  }
+
+  /** Constraints on schema of dataframe to be stored. */
+  private def checkConstraints(schema: StructType): Unit = {
+    if (schema.fieldNames.length != schema.fieldNames.distinct.length) {
+      val duplicateColumns = schema.fieldNames.groupBy(identity).collect {
+        case (x, ys) if ys.length > 1 => "\"" + x + "\""
+      }.mkString(", ")
+      throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " +
+        s"cannot save to parquet format")
+    }
+  }
+
+  override def dataSchema: StructType = {
+    val schema = maybeDataSchema.getOrElse(metadataCache.dataSchema)
+    // check if schema satisfies the constraints
+    // before moving forward
+    checkConstraints(schema)
+    schema
+  }
+
+  override private[sql] def refresh(): Unit = {
+    super.refresh()
+    metadataCache.refresh()
+  }
+
+  // Parquet data source always uses Catalyst internal representations.
+  override val needConversion: Boolean = false
+
+  override def sizeInBytes: Long = metadataCache.dataStatuses.map(_.getLen).sum
+
+  override def prepareJobForWrite(job: Job): OutputWriterFactory = {
+    val conf = ContextUtil.getConfiguration(job)
+
+    val committerClass =
+      conf.getClass(
+        SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key,
+        classOf[ParquetOutputCommitter],
+        classOf[ParquetOutputCommitter])
+
+    if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) {
+      logInfo("Using default output committer for Parquet: " +
+        classOf[ParquetOutputCommitter].getCanonicalName)
+    } else {
+      logInfo("Using user defined output committer for Parquet: " + committerClass.getCanonicalName)
+    }
+
+    conf.setClass(
+      SQLConf.OUTPUT_COMMITTER_CLASS.key,
+      committerClass,
+      classOf[ParquetOutputCommitter])
+
+    // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override
+    // it in `ParquetOutputWriter` to support appending and dynamic partitioning.  The reason why
+    // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is
+    // bundled with `ParquetOutputFormat[Row]`.
+    job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]])
+
+    // TODO There's no need to use two kinds of WriteSupport
+    // We should unify them. `SpecificMutableRow` can process both atomic (primitive) types and
+    // complex types.
+    val writeSupportClass =
+      if (dataSchema.map(_.dataType).forall(ParquetTypesConverter.isPrimitiveType)) {
+        classOf[MutableRowWriteSupport]
+      } else {
+        classOf[RowWriteSupport]
+      }
+
+    ParquetOutputFormat.setWriteSupportClass(job, writeSupportClass)
+    RowWriteSupport.setSchema(dataSchema.toAttributes, conf)
+
+    // Sets compression scheme
+    conf.set(
+      ParquetOutputFormat.COMPRESSION,
+      ParquetRelation
+        .shortParquetCompressionCodecNames
+        .getOrElse(
+          sqlContext.conf.parquetCompressionCodec.toUpperCase,
+          CompressionCodecName.UNCOMPRESSED).name())
+
+    new OutputWriterFactory {
+      override def newInstance(
+          path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
+        new ParquetOutputWriter(path, context)
+      }
+    }
+  }
+
+  override def buildScan(
+      requiredColumns: Array[String],
+      filters: Array[Filter],
+      inputFiles: Array[FileStatus],
+      broadcastedConf: Broadcast[SerializableConfiguration]): RDD[Row] = {
+    val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA)
+    val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown
+    val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString
+    val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp
+    val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec
+
+    // Create the function to set variable Parquet confs at both driver and executor side.
+    val initLocalJobFuncOpt =
+      ParquetRelation.initializeLocalJobFunc(
+        requiredColumns,
+        filters,
+        dataSchema,
+        useMetadataCache,
+        parquetFilterPushDown,
+        assumeBinaryIsString,
+        assumeInt96IsTimestamp,
+        followParquetFormatSpec) _
+
+    // Create the function to set input paths at the driver side.
+    val setInputPaths = ParquetRelation.initializeDriverSideJobFunc(inputFiles) _
+
+    Utils.withDummyCallSite(sqlContext.sparkContext) {
+      new SqlNewHadoopRDD(
+        sc = sqlContext.sparkContext,
+        broadcastedConf = broadcastedConf,
+        initDriverSideJobFuncOpt = Some(setInputPaths),
+        initLocalJobFuncOpt = Some(initLocalJobFuncOpt),
+        inputFormatClass = classOf[ParquetInputFormat[InternalRow]],
+        keyClass = classOf[Void],
+        valueClass = classOf[InternalRow]) {
+
+        val cacheMetadata = useMetadataCache
+
+        @transient val cachedStatuses = inputFiles.map { f =>
+          // In order to encode the authority of a Path containing special characters such as '/'
+          // (which does happen in some S3N credentials), we need to use the string returned by the
+          // URI of the path to create a new Path.
+          val pathWithEscapedAuthority = escapePathUserInfo(f.getPath)
+          new FileStatus(
+            f.getLen, f.isDir, f.getReplication, f.getBlockSize, f.getModificationTime,
+            f.getAccessTime, f.getPermission, f.getOwner, f.getGroup, pathWithEscapedAuthority)
+        }.toSeq
+
+        private def escapePathUserInfo(path: Path): Path = {
+          val uri = path.toUri
+          new Path(new URI(
+            uri.getScheme, uri.getRawUserInfo, uri.getHost, uri.getPort, uri.getPath,
+            uri.getQuery, uri.getFragment))
+        }
+
+        // Overridden so we can inject our own cached files statuses.
+        override def getPartitions: Array[SparkPartition] = {
+          val inputFormat = new ParquetInputFormat[InternalRow] {
+            override def listStatus(jobContext: JobContext): JList[FileStatus] = {
+              if (cacheMetadata) cachedStatuses else super.listStatus(jobContext)
+            }
+          }
+
+          val jobContext = newJobContext(getConf(isDriverSide = true), jobId)
+          val rawSplits = inputFormat.getSplits(jobContext)
+
+          Array.tabulate[SparkPartition](rawSplits.size) { i =>
+            new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
+          }
+        }
+      }.values.asInstanceOf[RDD[Row]]  // type erasure hack to pass RDD[InternalRow] as RDD[Row]
+    }
   }
 
-  // TODO: Use data from the footers.
-  override lazy val statistics = Statistics(sizeInBytes = sqlContext.conf.defaultSizeInBytes)
+  private class MetadataCache {
+    // `FileStatus` objects of all "_metadata" files.
+    private var metadataStatuses: Array[FileStatus] = _
+
+    // `FileStatus` objects of all "_common_metadata" files.
+    private var commonMetadataStatuses: Array[FileStatus] = _
+
+    // `FileStatus` objects of all data files (Parquet part-files).
+    var dataStatuses: Array[FileStatus] = _
+
+    // Schema of the actual Parquet files, without partition columns discovered from partition
+    // directory paths.
+    var dataSchema: StructType = null
+
+    // Schema of the whole table, including partition columns.
+    var schema: StructType = _
+
+    // Cached leaves
+    var cachedLeaves: Set[FileStatus] = null
+
+    /**
+     * Refreshes `FileStatus`es, footers, partition spec, and table schema.
+     */
+    def refresh(): Unit = {
+      val currentLeafStatuses = cachedLeafStatuses()
+
+      // Check if cachedLeafStatuses is changed or not
+      val leafStatusesChanged = (cachedLeaves == null) ||
+        !cachedLeaves.equals(currentLeafStatuses)
+
+      if (leafStatusesChanged) {
+        cachedLeaves = currentLeafStatuses.toIterator.toSet
+
+        // Lists `FileStatus`es of all leaf nodes (files) under all base directories.
+        val leaves = currentLeafStatuses.filter { f =>
+          isSummaryFile(f.getPath) ||
+            !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith("."))
+        }.toArray
+
+        dataStatuses = leaves.filterNot(f => isSummaryFile(f.getPath))
+        metadataStatuses =
+          leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE)
+        commonMetadataStatuses =
+          leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)
+
+        dataSchema = {
+          val dataSchema0 = maybeDataSchema
+            .orElse(readSchema())
+            .orElse(maybeMetastoreSchema)
+            .getOrElse(throw new AnalysisException(
+              s"Failed to discover schema of Parquet file(s) in the following location(s):\n" +
+                paths.mkString("\n\t")))
+
+          // If this Parquet relation is converted from a Hive Metastore table, must reconcile case
+          // case insensitivity issue and possible schema mismatch (probably caused by schema
+          // evolution).
+          maybeMetastoreSchema
+            .map(ParquetRelation.mergeMetastoreParquetSchema(_, dataSchema0))
+            .getOrElse(dataSchema0)
+        }
+      }
+    }
+
+    private def isSummaryFile(file: Path): Boolean = {
+      file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
+        file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
+    }
+
+    private def readSchema(): Option[StructType] = {
+      // Sees which file(s) we need to touch in order to figure out the schema.
+      //
+      // Always tries the summary files first if users don't require a merged schema.  In this case,
+      // "_common_metadata" is more preferable than "_metadata" because it doesn't contain row
+      // groups information, and could be much smaller for large Parquet files with lots of row
+      // groups.  If no summary file is available, falls back to some random part-file.
+      //
+      // NOTE: Metadata stored in the summary files are merged from all part-files.  However, for
+      // user defined key-value metadata (in which we store Spark SQL schema), Parquet doesn't know
+      // how to merge them correctly if some key is associated with different values in different
+      // part-files.  When this happens, Parquet simply gives up generating the summary file.  This
+      // implies that if a summary file presents, then:
+      //
+      //   1. Either all part-files have exactly the same Spark SQL schema, or
+      //   2. Some part-files don't contain Spark SQL schema in the key-value metadata at all (thus
+      //      their schemas may differ from each other).
+      //
+      // Here we tend to be pessimistic and take the second case into account.  Basically this means
+      // we can't trust the summary files if users require a merged schema, and must touch all part-
+      // files to do the merge.
+      val filesToTouch =
+        if (shouldMergeSchemas) {
+          // Also includes summary files, 'cause there might be empty partition directories.
+          (metadataStatuses ++ commonMetadataStatuses ++ dataStatuses).toSeq
+        } else {
+          // Tries any "_common_metadata" first. Parquet files written by old versions or Parquet
+          // don't have this.
+          commonMetadataStatuses.headOption
+            // Falls back to "_metadata"
+            .orElse(metadataStatuses.headOption)
+            // Summary file(s) not found, the Parquet file is either corrupted, or different part-
+            // files contain conflicting user defined metadata (two or more values are associated
+            // with a same key in different files).  In either case, we fall back to any of the
+            // first part-file, and just assume all schemas are consistent.
+            .orElse(dataStatuses.headOption)
+            .toSeq
+        }
+
+      assert(
+        filesToTouch.nonEmpty || maybeDataSchema.isDefined || maybeMetastoreSchema.isDefined,
+        "No predefined schema found, " +
+          s"and no Parquet data files or summary files found under ${paths.mkString(", ")}.")
+
+      ParquetRelation.mergeSchemasInParallel(filesToTouch, sqlContext)
+    }
+  }
 }
 
-private[sql] object ParquetRelation {
+private[sql] object ParquetRelation extends Logging {
+  // Whether we should merge schemas collected from all Parquet part-files.
+  private[sql] val MERGE_SCHEMA = "mergeSchema"
+
+  // Hive Metastore schema, used when converting Metastore Parquet tables.  This option is only used
+  // internally.
+  private[sql] val METASTORE_SCHEMA = "metastoreSchema"
+
+  /** This closure sets various Parquet configurations at both driver side and executor side. */
+  private[parquet] def initializeLocalJobFunc(
+      requiredColumns: Array[String],
+      filters: Array[Filter],
+      dataSchema: StructType,
+      useMetadataCache: Boolean,
+      parquetFilterPushDown: Boolean,
+      assumeBinaryIsString: Boolean,
+      assumeInt96IsTimestamp: Boolean,
+      followParquetFormatSpec: Boolean)(job: Job): Unit = {
+    val conf = job.getConfiguration
+    conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[CatalystReadSupport].getName)
+
+    // Try to push down filters when filter push-down is enabled.
+    if (parquetFilterPushDown) {
+      filters
+        // Collects all converted Parquet filter predicates. Notice that not all predicates can be
+        // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
+        // is used here.
+        .flatMap(ParquetFilters.createFilter(dataSchema, _))
+        .reduceOption(FilterApi.and)
+        .foreach(ParquetInputFormat.setFilterPredicate(conf, _))
+    }
+
+    conf.set(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA, {
+      val requestedSchema = StructType(requiredColumns.map(dataSchema(_)))
+      CatalystSchemaConverter.checkFieldNames(requestedSchema).json
+    })
+
+    conf.set(
+      RowWriteSupport.SPARK_ROW_SCHEMA,
+      CatalystSchemaConverter.checkFieldNames(dataSchema).json)
+
+    // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata
+    conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache)
+
+    // Sets flags for Parquet schema conversion
+    conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key, assumeBinaryIsString)
+    conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, assumeInt96IsTimestamp)
+    conf.setBoolean(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, followParquetFormatSpec)
+  }
+
+  /** This closure sets input paths at the driver side. */
+  private[parquet] def initializeDriverSideJobFunc(
+      inputFiles: Array[FileStatus])(job: Job): Unit = {
+    // We side the input paths at the driver side.
+    logInfo(s"Reading Parquet file(s) from ${inputFiles.map(_.getPath).mkString(", ")}")
+    if (inputFiles.nonEmpty) {
+      FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*)
+    }
+  }
+
+  private[parquet] def readSchema(
+      footers: Seq[Footer], sqlContext: SQLContext): Option[StructType] = {
+
+    def parseParquetSchema(schema: MessageType): StructType = {
+      val converter = new CatalystSchemaConverter(
+        sqlContext.conf.isParquetBinaryAsString,
+        sqlContext.conf.isParquetBinaryAsString,
+        sqlContext.conf.followParquetFormatSpec)
+
+      converter.convert(schema)
+    }
+
+    val seen = mutable.HashSet[String]()
+    val finalSchemas: Seq[StructType] = footers.flatMap { footer =>
+      val metadata = footer.getParquetMetadata.getFileMetaData
+      val serializedSchema = metadata
+        .getKeyValueMetaData
+        .toMap
+        .get(CatalystReadSupport.SPARK_METADATA_KEY)
+      if (serializedSchema.isEmpty) {
+        // Falls back to Parquet schema if no Spark SQL schema found.
+        Some(parseParquetSchema(metadata.getSchema))
+      } else if (!seen.contains(serializedSchema.get)) {
+        seen += serializedSchema.get
+
+        // Don't throw even if we failed to parse the serialized Spark schema. Just fallback to
+        // whatever is available.
+        Some(Try(DataType.fromJson(serializedSchema.get))
+          .recover { case _: Throwable =>
+            logInfo(
+              s"Serialized Spark schema in Parquet key-value metadata is not in JSON format, " +
+                "falling back to the deprecated DataType.fromCaseClassString parser.")
+            DataType.fromCaseClassString(serializedSchema.get)
+          }
+          .recover { case cause: Throwable =>
+            logWarning(
+              s"""Failed to parse serialized Spark schema in Parquet key-value metadata:
+                 |\t$serializedSchema
+               """.stripMargin,
+              cause)
+          }
+          .map(_.asInstanceOf[StructType])
+          .getOrElse {
+            // Falls back to Parquet schema if Spark SQL schema can't be parsed.
+            parseParquetSchema(metadata.getSchema)
+          })
+      } else {
+        None
+      }
+    }
+
+    finalSchemas.reduceOption { (left, right) =>
+      try left.merge(right) catch { case e: Throwable =>
+        throw new SparkException(s"Failed to merge incompatible schemas $left and $right", e)
+      }
+    }
+  }
+
+  /**
+   * Reconciles Hive Metastore case insensitivity issue and data type conflicts between Metastore
+   * schema and Parquet schema.
+   *
+   * Hive doesn't retain case information, while Parquet is case sensitive. On the other hand, the
+   * schema read from Parquet files may be incomplete (e.g. older versions of Parquet doesn't
+   * distinguish binary and string).  This method generates a correct schema by merging Metastore
+   * schema data types and Parquet schema field names.
+   */
+  private[parquet] def mergeMetastoreParquetSchema(
+      metastoreSchema: StructType,
+      parquetSchema: StructType): StructType = {
+    def schemaConflictMessage: String =
+      s"""Converting Hive Metastore Parquet, but detected conflicting schemas. Metastore schema:
+         |${metastoreSchema.prettyJson}
+         |
+         |Parquet schema:
+         |${parquetSchema.prettyJson}
+       """.stripMargin
+
+    val mergedParquetSchema = mergeMissingNullableFields(metastoreSchema, parquetSchema)
+
+    assert(metastoreSchema.size <= mergedParquetSchema.size, schemaConflictMessage)
+
+    val ordinalMap = metastoreSchema.zipWithIndex.map {
+      case (field, index) => field.name.toLowerCase -> index
+    }.toMap
+
+    val reorderedParquetSchema = mergedParquetSchema.sortBy(f =>
+      ordinalMap.getOrElse(f.name.toLowerCase, metastoreSchema.size + 1))
+
+    StructType(metastoreSchema.zip(reorderedParquetSchema).map {
+      // Uses Parquet field names but retains Metastore data types.
+      case (mSchema, pSchema) if mSchema.name.toLowerCase == pSchema.name.toLowerCase =>
+        mSchema.copy(name = pSchema.name)
+      case _ =>
+        throw new SparkException(schemaConflictMessage)
+    })
+  }
+
+  /**
+   * Returns the original schema from the Parquet file with any missing nullable fields from the
+   * Hive Metastore schema merged in.
+   *
+   * When constructing a DataFrame from a collection of structured data, the resulting object has
+   * a schema corresponding to the union of the fields present in each element of the collection.
+   * Spark SQL simply assigns a null value to any field that isn't present for a particular row.
+   * In some cases, it is possible that a given table partition stored as a Parquet file doesn't
+   * contain a particular nullable field in its schema despite that field being present in the
+   * table schema obtained from the Hive Metastore. This method returns a schema representing the
+   * Parquet file schema along with any additional nullable fields from the Metastore schema
+   * merged in.
+   */
+  private[parquet] def mergeMissingNullableFields(
+      metastoreSchema: StructType,
+      parquetSchema: StructType): StructType = {
+    val fieldMap = metastoreSchema.map(f => f.name.toLowerCase -> f).toMap
+    val missingFields = metastoreSchema
+      .map(_.name.toLowerCase)
+      .diff(parquetSchema.map(_.name.toLowerCase))
+      .map(fieldMap(_))
+      .filter(_.nullable)
+    StructType(parquetSchema ++ missingFields)
+  }
+
+  /**
+   * Figures out a merged Parquet schema with a distributed Spark job.
+   *
+   * Note that locality is not taken into consideration here because:
+   *
+   *  1. For a single Parquet part-file, in most cases the footer only resides in the last block of
+   *     that file.  Thus we only need to retrieve the location of the last block.  However, Hadoop
+   *     `FileSystem` only provides API to retrieve locations of all blocks, which can be
+   *     potentially expensive.
+   *
+   *  2. This optimization is mainly useful for S3, where file metadata operations can be pretty
+   *     slow.  And basically locality is not available when using S3 (you can't run computation on
+   *     S3 nodes).
+   */
+  def mergeSchemasInParallel(
+      filesToTouch: Seq[FileStatus], sqlContext: SQLContext): Option[StructType] = {
+    val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString
+    val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp
+    val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec
+    val serializedConf = new SerializableConfiguration(sqlContext.sparkContext.hadoopConfiguration)
+
+    // HACK ALERT:
+    //
+    // Parquet requires `FileStatus`es to read footers.  Here we try to send cached `FileStatus`es
+    // to executor side to avoid fetching them again.  However, `FileStatus` is not `Serializable`
+    // but only `Writable`.  What makes it worth, for some reason, `FileStatus` doesn't play well
+    // with `SerializableWritable[T]` and always causes a weird `IllegalStateException`.  These
+    // facts virtually prevents us to serialize `FileStatus`es.
+    //
+    // Since Parquet only relies on path and length information of those `FileStatus`es to read
+    // footers, here we just extract them (which can be easily serialized), send them to executor
+    // side, and resemble fake `FileStatus`es there.
+    val partialFileStatusInfo = filesToTouch.map(f => (f.getPath.toString, f.getLen))
+
+    // Issues a Spark job to read Parquet schema in parallel.
+    val partiallyMergedSchemas =
+      sqlContext
+        .sparkContext
+        .parallelize(partialFileStatusInfo)
+        .mapPartitions { iterator =>
+          // Resembles fake `FileStatus`es with serialized path and length information.
+          val fakeFileStatuses = iterator.map { case (path, length) =>
+            new FileStatus(length, false, 0, 0, 0, 0, null, null, null, new Path(path))
+          }.toSeq
+
+          // Skips row group information since we only need the schema
+          val skipRowGroups = true
+
+          // Reads footers in multi-threaded manner within each task
+          val footers =
+            ParquetFileReader.readAllFootersInParallel(
+              serializedConf.value, fakeFileStatuses, skipRowGroups)
+
+          // Converter used to convert Parquet `MessageType` to Spark SQL `StructType`
+          val converter =
+            new CatalystSchemaConverter(
+              assumeBinaryIsString = assumeBinaryIsString,
+              assumeInt96IsTimestamp = assumeInt96IsTimestamp,
+              followParquetFormatSpec = followParquetFormatSpec)
+
+          footers.map { footer =>
+            ParquetRelation.readSchemaFromFooter(footer, converter)
+          }.reduceOption(_ merge _).iterator
+        }.collect()
+
+    partiallyMergedSchemas.reduceOption(_ merge _)
+  }
+
+  /**
+   * Reads Spark SQL schema from a Parquet footer.  If a valid serialized Spark SQL schema string
+   * can be found in the file metadata, returns the deserialized [[StructType]], otherwise, returns
+   * a [[StructType]] converted from the [[MessageType]] stored in this footer.
+   */
+  def readSchemaFromFooter(
+      footer: Footer, converter: CatalystSchemaConverter): StructType = {
+    val fileMetaData = footer.getParquetMetadata.getFileMetaData
+    fileMetaData
+      .getKeyValueMetaData
+      .toMap
+      .get(CatalystReadSupport.SPARK_METADATA_KEY)
+      .flatMap(deserializeSchemaString)
+      .getOrElse(converter.convert(fileMetaData.getSchema))
+  }
+
+  private def deserializeSchemaString(schemaString: String): Option[StructType] = {
+    // Tries to deserialize the schema string as JSON first, then falls back to the case class
+    // string parser (data generated by older versions of Spark SQL uses this format).
+    Try(DataType.fromJson(schemaString).asInstanceOf[StructType]).recover {
+      case _: Throwable =>
+        logInfo(
+          s"Serialized Spark schema in Parquet key-value metadata is not in JSON format, " +
+            "falling back to the deprecated DataType.fromCaseClassString parser.")
+        DataType.fromCaseClassString(schemaString).asInstanceOf[StructType]
+    }.recoverWith {
+      case cause: Throwable =>
+        logWarning(
+          "Failed to parse and ignored serialized Spark schema in " +
+            s"Parquet key-value metadata:\n\t$schemaString", cause)
+        Failure(cause)
+    }.toOption
+  }
 
   def enableLogForwarding() {
     // Note: the org.apache.parquet.Log class has a static initializer that
@@ -127,12 +766,6 @@ private[sql] object ParquetRelation {
     JLogger.getLogger(classOf[ParquetRecordReader[_]].getName).setLevel(Level.OFF)
   }
 
-  // The element type for the RDDs that this relation maps to.
-  type RowType = org.apache.spark.sql.catalyst.expressions.GenericMutableRow
-
-  // The compression type
-  type CompressionType = org.apache.parquet.hadoop.metadata.CompressionCodecName
-
   // The parquet compression short names
   val shortParquetCompressionCodecNames = Map(
     "NONE"         -> CompressionCodecName.UNCOMPRESSED,
@@ -140,82 +773,4 @@ private[sql] object ParquetRelation {
     "SNAPPY"       -> CompressionCodecName.SNAPPY,
     "GZIP"         -> CompressionCodecName.GZIP,
     "LZO"          -> CompressionCodecName.LZO)
-
-  /**
-   * Creates a new ParquetRelation and underlying Parquetfile for the given LogicalPlan. Note that
-   * this is used inside [[org.apache.spark.sql.execution.SparkStrategies SparkStrategies]] to
-   * create a resolved relation as a data sink for writing to a Parquetfile. The relation is empty
-   * but is initialized with ParquetMetadata and can be inserted into.
-   *
-   * @param pathString The directory the Parquetfile will be stored in.
-   * @param child The child node that will be used for extracting the schema.
-   * @param conf A configuration to be used.
-   * @return An empty ParquetRelation with inferred metadata.
-   */
-  def create(pathString: String,
-             child: LogicalPlan,
-             conf: Configuration,
-             sqlContext: SQLContext): ParquetRelation = {
-    if (!child.resolved) {
-      throw new UnresolvedException[LogicalPlan](
-        child,
-        "Attempt to create Parquet table from unresolved child (when schema is not available)")
-    }
-    createEmpty(pathString, child.output, false, conf, sqlContext)
-  }
-
-  /**
-   * Creates an empty ParquetRelation and underlying Parquetfile that only
-   * consists of the Metadata for the given schema.
-   *
-   * @param pathString The directory the Parquetfile will be stored in.
-   * @param attributes The schema of the relation.
-   * @param conf A configuration to be used.
-   * @return An empty ParquetRelation.
-   */
-  def createEmpty(pathString: String,
-                  attributes: Seq[Attribute],
-                  allowExisting: Boolean,
-                  conf: Configuration,
-                  sqlContext: SQLContext): ParquetRelation = {
-    val path = checkPath(pathString, allowExisting, conf)
-    conf.set(ParquetOutputFormat.COMPRESSION, shortParquetCompressionCodecNames.getOrElse(
-      sqlContext.conf.parquetCompressionCodec.toUpperCase, CompressionCodecName.UNCOMPRESSED)
-      .name())
-    ParquetRelation.enableLogForwarding()
-    // This is a hack. We always set nullable/containsNull/valueContainsNull to true
-    // for the schema of a parquet data.
-    val schema = StructType.fromAttributes(attributes).asNullable
-    val newAttributes = schema.toAttributes
-    ParquetTypesConverter.writeMetaData(newAttributes, path, conf)
-    new ParquetRelation(path.toString, Some(conf), sqlContext) {
-      override val output = newAttributes
-    }
-  }
-
-  private def checkPath(pathStr: String, allowExisting: Boolean, conf: Configuration): Path = {
-    if (pathStr == null) {
-      throw new IllegalArgumentException("Unable to create ParquetRelation: path is null")
-    }
-    val origPath = new Path(pathStr)
-    val fs = origPath.getFileSystem(conf)
-    if (fs == null) {
-      throw new IllegalArgumentException(
-        s"Unable to create ParquetRelation: incorrectly formatted path $pathStr")
-    }
-    val path = origPath.makeQualified(fs)
-    if (!allowExisting && fs.exists(path)) {
-      sys.error(s"File $pathStr already exists.")
-    }
-
-    if (fs.exists(path) &&
-        !fs.getFileStatus(path)
-        .getPermission
-        .getUserAction
-        .implies(FsAction.READ_WRITE)) {
-      throw new IOException(
-        s"Unable to create ParquetRelation: path $path not read-writable")
-    }
-    path
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
deleted file mode 100644
index 75cbbde4f1512..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ /dev/null
@@ -1,492 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.parquet
-
-import java.io.IOException
-import java.text.{NumberFormat, SimpleDateFormat}
-import java.util.concurrent.TimeUnit
-import java.util.Date
-
-import scala.collection.JavaConversions._
-import scala.util.Try
-
-import com.google.common.cache.CacheBuilder
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{BlockLocation, FileStatus, Path}
-import org.apache.hadoop.mapreduce._
-import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat}
-import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, FileOutputFormat => NewFileOutputFormat}
-import org.apache.parquet.hadoop._
-import org.apache.parquet.hadoop.api.ReadSupport
-import org.apache.parquet.hadoop.util.ContextUtil
-import org.apache.parquet.schema.MessageType
-
-import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.mapred.SparkHadoopMapRedUtil
-import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.SQLConf
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, _}
-import org.apache.spark.sql.execution.{LeafNode, SparkPlan, UnaryNode}
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.{Logging, TaskContext}
-import org.apache.spark.util.SerializableConfiguration
-
-/**
- * :: DeveloperApi ::
- * Parquet table scan operator. Imports the file that backs the given
- * [[org.apache.spark.sql.parquet.ParquetRelation]] as a ``RDD[InternalRow]``.
- */
-private[sql] case class ParquetTableScan(
-    attributes: Seq[Attribute],
-    relation: ParquetRelation,
-    columnPruningPred: Seq[Expression])
-  extends LeafNode {
-
-  // The resolution of Parquet attributes is case sensitive, so we resolve the original attributes
-  // by exprId. note: output cannot be transient, see
-  // https://issues.apache.org/jira/browse/SPARK-1367
-  val output = attributes.map(relation.attributeMap)
-
-  // A mapping of ordinals partitionRow -> finalOutput.
-  val requestedPartitionOrdinals = {
-    val partitionAttributeOrdinals = AttributeMap(relation.partitioningAttributes.zipWithIndex)
-
-    attributes.zipWithIndex.flatMap {
-      case (attribute, finalOrdinal) =>
-        partitionAttributeOrdinals.get(attribute).map(_ -> finalOrdinal)
-    }
-  }.toArray
-
-  protected override def doExecute(): RDD[InternalRow] = {
-    import org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat
-
-    val sc = sqlContext.sparkContext
-    val job = new Job(sc.hadoopConfiguration)
-    ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport])
-
-    val conf: Configuration = ContextUtil.getConfiguration(job)
-
-    relation.path.split(",").foreach { curPath =>
-      val qualifiedPath = {
-        val path = new Path(curPath)
-        path.getFileSystem(conf).makeQualified(path)
-      }
-      NewFileInputFormat.addInputPath(job, qualifiedPath)
-    }
-
-    // Store both requested and original schema in `Configuration`
-    conf.set(
-      RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA,
-      ParquetTypesConverter.convertToString(output))
-    conf.set(
-      RowWriteSupport.SPARK_ROW_SCHEMA,
-      ParquetTypesConverter.convertToString(relation.output))
-
-    // Store record filtering predicate in `Configuration`
-    // Note 1: the input format ignores all predicates that cannot be expressed
-    // as simple column predicate filters in Parquet. Here we just record
-    // the whole pruning predicate.
-    ParquetFilters
-      .createRecordFilter(columnPruningPred)
-      .map(_.asInstanceOf[FilterPredicateCompat].getFilterPredicate)
-      // Set this in configuration of ParquetInputFormat, needed for RowGroupFiltering
-      .foreach(ParquetInputFormat.setFilterPredicate(conf, _))
-
-    // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata
-    conf.setBoolean(
-      SQLConf.PARQUET_CACHE_METADATA.key,
-      sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, true))
-
-    // Use task side metadata in parquet
-    conf.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, true)
-
-    val baseRDD =
-      new org.apache.spark.rdd.NewHadoopRDD(
-        sc,
-        classOf[FilteringParquetRowInputFormat],
-        classOf[Void],
-        classOf[InternalRow],
-        conf)
-
-    if (requestedPartitionOrdinals.nonEmpty) {
-      // This check is based on CatalystConverter.createRootConverter.
-      val primitiveRow = output.forall(a => ParquetTypesConverter.isPrimitiveType(a.dataType))
-
-      // Uses temporary variable to avoid the whole `ParquetTableScan` object being captured into
-      // the `mapPartitionsWithInputSplit` closure below.
-      val outputSize = output.size
-
-      baseRDD.mapPartitionsWithInputSplit { case (split, iter) =>
-        val partValue = "([^=]+)=([^=]+)".r
-        val partValues =
-          split.asInstanceOf[org.apache.parquet.hadoop.ParquetInputSplit]
-            .getPath
-            .toString
-            .split("/")
-            .flatMap {
-              case partValue(key, value) => Some(key -> value)
-              case _ => None
-            }.toMap
-
-        // Convert the partitioning attributes into the correct types
-        val partitionRowValues =
-          relation.partitioningAttributes
-            .map(a => Cast(Literal(partValues(a.name)), a.dataType).eval(EmptyRow))
-
-        if (primitiveRow) {
-          new Iterator[InternalRow] {
-            def hasNext: Boolean = iter.hasNext
-            def next(): InternalRow = {
-              // We are using CatalystPrimitiveRowConverter and it returns a SpecificMutableRow.
-              val row = iter.next()._2.asInstanceOf[SpecificMutableRow]
-
-              // Parquet will leave partitioning columns empty, so we fill them in here.
-              var i = 0
-              while (i < requestedPartitionOrdinals.length) {
-                row(requestedPartitionOrdinals(i)._2) =
-                  partitionRowValues(requestedPartitionOrdinals(i)._1)
-                i += 1
-              }
-              row
-            }
-          }
-        } else {
-          // Create a mutable row since we need to fill in values from partition columns.
-          val mutableRow = new GenericMutableRow(outputSize)
-          new Iterator[InternalRow] {
-            def hasNext: Boolean = iter.hasNext
-            def next(): InternalRow = {
-              // We are using CatalystGroupConverter and it returns a GenericRow.
-              // Since GenericRow is not mutable, we just cast it to a Row.
-              val row = iter.next()._2.asInstanceOf[InternalRow]
-
-              var i = 0
-              while (i < row.numFields) {
-                mutableRow(i) = row.genericGet(i)
-                i += 1
-              }
-              // Parquet will leave partitioning columns empty, so we fill them in here.
-              i = 0
-              while (i < requestedPartitionOrdinals.length) {
-                mutableRow(requestedPartitionOrdinals(i)._2) =
-                  partitionRowValues(requestedPartitionOrdinals(i)._1)
-                i += 1
-              }
-              mutableRow
-            }
-          }
-        }
-      }
-    } else {
-      baseRDD.map(_._2)
-    }
-  }
-
-  /**
-   * Applies a (candidate) projection.
-   *
-   * @param prunedAttributes The list of attributes to be used in the projection.
-   * @return Pruned TableScan.
-   */
-  def pruneColumns(prunedAttributes: Seq[Attribute]): ParquetTableScan = {
-    val success = validateProjection(prunedAttributes)
-    if (success) {
-      ParquetTableScan(prunedAttributes, relation, columnPruningPred)
-    } else {
-      sys.error("Warning: Could not validate Parquet schema projection in pruneColumns")
-    }
-  }
-
-  /**
-   * Evaluates a candidate projection by checking whether the candidate is a subtype
-   * of the original type.
-   *
-   * @param projection The candidate projection.
-   * @return True if the projection is valid, false otherwise.
-   */
-  private def validateProjection(projection: Seq[Attribute]): Boolean = {
-    val original: MessageType = relation.parquetSchema
-    val candidate: MessageType = ParquetTypesConverter.convertFromAttributes(projection)
-    Try(original.checkContains(candidate)).isSuccess
-  }
-}
-
-/**
- * :: DeveloperApi ::
- * Operator that acts as a sink for queries on RDDs and can be used to
- * store the output inside a directory of Parquet files. This operator
- * is similar to Hive's INSERT INTO TABLE operation in the sense that
- * one can choose to either overwrite or append to a directory. Note
- * that consecutive insertions to the same table must have compatible
- * (source) schemas.
- *
- * WARNING: EXPERIMENTAL! InsertIntoParquetTable with overwrite=false may
- * cause data corruption in the case that multiple users try to append to
- * the same table simultaneously. Inserting into a table that was
- * previously generated by other means (e.g., by creating an HDFS
- * directory and importing Parquet files generated by other tools) may
- * cause unpredicted behaviour and therefore results in a RuntimeException
- * (only detected via filename pattern so will not catch all cases).
- */
-@DeveloperApi
-private[sql] case class InsertIntoParquetTable(
-    relation: ParquetRelation,
-    child: SparkPlan,
-    overwrite: Boolean = false)
-  extends UnaryNode with SparkHadoopMapReduceUtil {
-
-  /**
-   * Inserts all rows into the Parquet file.
-   */
-  protected override def doExecute(): RDD[InternalRow] = {
-    // TODO: currently we do not check whether the "schema"s are compatible
-    // That means if one first creates a table and then INSERTs data with
-    // and incompatible schema the execution will fail. It would be nice
-    // to catch this early one, maybe having the planner validate the schema
-    // before calling execute().
-
-    val childRdd = child.execute()
-    assert(childRdd != null)
-
-    val job = new Job(sqlContext.sparkContext.hadoopConfiguration)
-
-    val writeSupport =
-      if (child.output.map(_.dataType).forall(ParquetTypesConverter.isPrimitiveType)) {
-        log.debug("Initializing MutableRowWriteSupport")
-        classOf[org.apache.spark.sql.parquet.MutableRowWriteSupport]
-      } else {
-        classOf[org.apache.spark.sql.parquet.RowWriteSupport]
-      }
-
-    ParquetOutputFormat.setWriteSupportClass(job, writeSupport)
-
-    val conf = ContextUtil.getConfiguration(job)
-    // This is a hack. We always set nullable/containsNull/valueContainsNull to true
-    // for the schema of a parquet data.
-    val schema = StructType.fromAttributes(relation.output).asNullable
-    RowWriteSupport.setSchema(schema.toAttributes, conf)
-
-    val fspath = new Path(relation.path)
-    val fs = fspath.getFileSystem(conf)
-
-    if (overwrite) {
-      try {
-        fs.delete(fspath, true)
-      } catch {
-        case e: IOException =>
-          throw new IOException(
-            s"Unable to clear output directory ${fspath.toString} prior"
-              + s" to InsertIntoParquetTable:\n${e.toString}")
-      }
-    }
-    saveAsHadoopFile(childRdd, relation.path.toString, conf)
-
-    // We return the child RDD to allow chaining (alternatively, one could return nothing).
-    childRdd
-  }
-
-  override def output: Seq[Attribute] = child.output
-
-  /**
-   * Stores the given Row RDD as a Hadoop file.
-   *
-   * Note: We cannot use ``saveAsNewAPIHadoopFile`` from [[org.apache.spark.rdd.PairRDDFunctions]]
-   * together with [[org.apache.spark.util.MutablePair]] because ``PairRDDFunctions`` uses
-   * ``Tuple2`` and not ``Product2``. Also, we want to allow appending files to an existing
-   * directory and need to determine which was the largest written file index before starting to
-   * write.
-   *
-   * @param rdd The [[org.apache.spark.rdd.RDD]] to writer
-   * @param path The directory to write to.
-   * @param conf A [[org.apache.hadoop.conf.Configuration]].
-   */
-  private def saveAsHadoopFile(
-      rdd: RDD[InternalRow],
-      path: String,
-      conf: Configuration) {
-    val job = new Job(conf)
-    val keyType = classOf[Void]
-    job.setOutputKeyClass(keyType)
-    job.setOutputValueClass(classOf[InternalRow])
-    NewFileOutputFormat.setOutputPath(job, new Path(path))
-    val wrappedConf = new SerializableConfiguration(job.getConfiguration)
-    val formatter = new SimpleDateFormat("yyyyMMddHHmm")
-    val jobtrackerID = formatter.format(new Date())
-    val stageId = sqlContext.sparkContext.newRddId()
-
-    val taskIdOffset =
-      if (overwrite) {
-        1
-      } else {
-        FileSystemHelper
-          .findMaxTaskId(NewFileOutputFormat.getOutputPath(job).toString, job.getConfiguration) + 1
-      }
-
-    def writeShard(context: TaskContext, iter: Iterator[InternalRow]): Int = {
-      /* "reduce task" <split #> <attempt # = spark task #> */
-      val attemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = false, context.partitionId,
-        context.attemptNumber)
-      val hadoopContext = newTaskAttemptContext(wrappedConf.value, attemptId)
-      val format = new AppendingParquetOutputFormat(taskIdOffset)
-      val committer = format.getOutputCommitter(hadoopContext)
-      committer.setupTask(hadoopContext)
-      val writer = format.getRecordWriter(hadoopContext)
-      try {
-        while (iter.hasNext) {
-          val row = iter.next()
-          writer.write(null, row)
-        }
-      } finally {
-        writer.close(hadoopContext)
-      }
-      SparkHadoopMapRedUtil.commitTask(committer, hadoopContext, context)
-      1
-    }
-    val jobFormat = new AppendingParquetOutputFormat(taskIdOffset)
-    /* apparently we need a TaskAttemptID to construct an OutputCommitter;
-     * however we're only going to use this local OutputCommitter for
-     * setupJob/commitJob, so we just use a dummy "map" task.
-     */
-    val jobAttemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = true, 0, 0)
-    val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId)
-    val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext)
-    jobCommitter.setupJob(jobTaskContext)
-    sqlContext.sparkContext.runJob(rdd, writeShard _)
-    jobCommitter.commitJob(jobTaskContext)
-  }
-}
-
-/**
- * TODO: this will be able to append to directories it created itself, not necessarily
- * to imported ones.
- */
-private[parquet] class AppendingParquetOutputFormat(offset: Int)
-  extends org.apache.parquet.hadoop.ParquetOutputFormat[InternalRow] {
-  // override to accept existing directories as valid output directory
-  override def checkOutputSpecs(job: JobContext): Unit = {}
-  var committer: OutputCommitter = null
-
-  // override to choose output filename so not overwrite existing ones
-  override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
-    val numfmt = NumberFormat.getInstance()
-    numfmt.setMinimumIntegerDigits(5)
-    numfmt.setGroupingUsed(false)
-
-    val taskId: TaskID = getTaskAttemptID(context).getTaskID
-    val partition: Int = taskId.getId
-    val filename = "part-r-" + numfmt.format(partition + offset) + ".parquet"
-    val committer: FileOutputCommitter =
-      getOutputCommitter(context).asInstanceOf[FileOutputCommitter]
-    new Path(committer.getWorkPath, filename)
-  }
-
-  // The TaskAttemptContext is a class in hadoop-1 but is an interface in hadoop-2.
-  // The signatures of the method TaskAttemptContext.getTaskAttemptID for the both versions
-  // are the same, so the method calls are source-compatible but NOT binary-compatible because
-  // the opcode of method call for class is INVOKEVIRTUAL and for interface is INVOKEINTERFACE.
-  private def getTaskAttemptID(context: TaskAttemptContext): TaskAttemptID = {
-    context.getClass.getMethod("getTaskAttemptID").invoke(context).asInstanceOf[TaskAttemptID]
-  }
-
-  // override to create output committer from configuration
-  override def getOutputCommitter(context: TaskAttemptContext): OutputCommitter = {
-    if (committer == null) {
-      val output = getOutputPath(context)
-      val cls = context.getConfiguration.getClass("spark.sql.parquet.output.committer.class",
-        classOf[ParquetOutputCommitter], classOf[ParquetOutputCommitter])
-      val ctor = cls.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext])
-      committer = ctor.newInstance(output, context).asInstanceOf[ParquetOutputCommitter]
-    }
-    committer
-  }
-
-  // FileOutputFormat.getOutputPath takes JobConf in hadoop-1 but JobContext in hadoop-2
-  private def getOutputPath(context: TaskAttemptContext): Path = {
-    context.getConfiguration().get("mapred.output.dir") match {
-      case null => null
-      case name => new Path(name)
-    }
-  }
-}
-
-// TODO Removes this class after removing old Parquet support code
-/**
- * We extend ParquetInputFormat in order to have more control over which
- * RecordFilter we want to use.
- */
-private[parquet] class FilteringParquetRowInputFormat
-  extends org.apache.parquet.hadoop.ParquetInputFormat[InternalRow] with Logging {
-
-  override def createRecordReader(
-      inputSplit: InputSplit,
-      taskAttemptContext: TaskAttemptContext): RecordReader[Void, InternalRow] = {
-
-    import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter
-
-    val readSupport: ReadSupport[InternalRow] = new RowReadSupport()
-
-    val filter = ParquetInputFormat.getFilter(ContextUtil.getConfiguration(taskAttemptContext))
-    if (!filter.isInstanceOf[NoOpFilter]) {
-      new ParquetRecordReader[InternalRow](
-        readSupport,
-        filter)
-    } else {
-      new ParquetRecordReader[InternalRow](readSupport)
-    }
-  }
-
-}
-
-private[parquet] object FileSystemHelper {
-  def listFiles(pathStr: String, conf: Configuration): Seq[Path] = {
-    val origPath = new Path(pathStr)
-    val fs = origPath.getFileSystem(conf)
-    if (fs == null) {
-      throw new IllegalArgumentException(
-        s"ParquetTableOperations: Path $origPath is incorrectly formatted")
-    }
-    val path = origPath.makeQualified(fs)
-    if (!fs.exists(path) || !fs.getFileStatus(path).isDir) {
-      throw new IllegalArgumentException(
-        s"ParquetTableOperations: path $path does not exist or is not a directory")
-    }
-    fs.globStatus(path)
-      .flatMap { status => if (status.isDir) fs.listStatus(status.getPath) else List(status) }
-      .map(_.getPath)
-  }
-
-    /**
-     * Finds the maximum taskid in the output file names at the given path.
-     */
-  def findMaxTaskId(pathStr: String, conf: Configuration): Int = {
-    val files = FileSystemHelper.listFiles(pathStr, conf)
-    // filename pattern is part-r-<int>.parquet
-    val nameP = new scala.util.matching.Regex("""part-.-(\d{1,}).*""", "taskid")
-    val hiddenFileP = new scala.util.matching.Regex("_.*")
-    files.map(_.getName).map {
-      case nameP(taskid) => taskid.toInt
-      case hiddenFileP() => 0
-      case other: String =>
-        sys.error("ERROR: attempting to append to set of Parquet files and found file" +
-          s"that does not match name pattern: $other")
-      case _ => 0
-    }.reduceOption(_ max _).getOrElse(0)
-  }
-}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 7b6a7f65d69db..fc9f61a636768 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -18,18 +18,13 @@
 package org.apache.spark.sql.parquet
 
 import java.nio.{ByteBuffer, ByteOrder}
-import java.util
 import java.util.{HashMap => JHashMap}
 
-import scala.collection.JavaConversions._
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.parquet.column.ParquetProperties
 import org.apache.parquet.hadoop.ParquetOutputFormat
-import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
-import org.apache.parquet.hadoop.api.{InitContext, ReadSupport, WriteSupport}
+import org.apache.parquet.hadoop.api.WriteSupport
 import org.apache.parquet.io.api._
-import org.apache.parquet.schema.MessageType
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.InternalRow
@@ -38,147 +33,6 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
-/**
- * A [[RecordMaterializer]] for Catalyst rows.
- *
- * @param parquetSchema Parquet schema of the records to be read
- * @param catalystSchema Catalyst schema of the rows to be constructed
- */
-private[parquet] class RowRecordMaterializer(parquetSchema: MessageType, catalystSchema: StructType)
-  extends RecordMaterializer[InternalRow] {
-
-  private val rootConverter = new CatalystRowConverter(parquetSchema, catalystSchema, NoopUpdater)
-
-  override def getCurrentRecord: InternalRow = rootConverter.currentRow
-
-  override def getRootConverter: GroupConverter = rootConverter
-}
-
-private[parquet] class RowReadSupport extends ReadSupport[InternalRow] with Logging {
-  override def prepareForRead(
-      conf: Configuration,
-      keyValueMetaData: util.Map[String, String],
-      fileSchema: MessageType,
-      readContext: ReadContext): RecordMaterializer[InternalRow] = {
-    log.debug(s"Preparing for read Parquet file with message type: $fileSchema")
-
-    val toCatalyst = new CatalystSchemaConverter(conf)
-    val parquetRequestedSchema = readContext.getRequestedSchema
-
-    val catalystRequestedSchema =
-      Option(readContext.getReadSupportMetadata).map(_.toMap).flatMap { metadata =>
-        metadata
-          // First tries to read requested schema, which may result from projections
-          .get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA)
-          // If not available, tries to read Catalyst schema from file metadata.  It's only
-          // available if the target file is written by Spark SQL.
-          .orElse(metadata.get(RowReadSupport.SPARK_METADATA_KEY))
-      }.map(StructType.fromString).getOrElse {
-        logDebug("Catalyst schema not available, falling back to Parquet schema")
-        toCatalyst.convert(parquetRequestedSchema)
-      }
-
-    logDebug(s"Catalyst schema used to read Parquet files: $catalystRequestedSchema")
-    new RowRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema)
-  }
-
-  override def init(context: InitContext): ReadContext = {
-    val conf = context.getConfiguration
-
-    // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst
-    // schema of this file from its the metadata.
-    val maybeRowSchema = Option(conf.get(RowWriteSupport.SPARK_ROW_SCHEMA))
-
-    // Optional schema of requested columns, in the form of a string serialized from a Catalyst
-    // `StructType` containing all requested columns.
-    val maybeRequestedSchema = Option(conf.get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA))
-
-    // Below we construct a Parquet schema containing all requested columns.  This schema tells
-    // Parquet which columns to read.
-    //
-    // If `maybeRequestedSchema` is defined, we assemble an equivalent Parquet schema.  Otherwise,
-    // we have to fallback to the full file schema which contains all columns in the file.
-    // Obviously this may waste IO bandwidth since it may read more columns than requested.
-    //
-    // Two things to note:
-    //
-    // 1. It's possible that some requested columns don't exist in the target Parquet file.  For
-    //    example, in the case of schema merging, the globally merged schema may contain extra
-    //    columns gathered from other Parquet files.  These columns will be simply filled with nulls
-    //    when actually reading the target Parquet file.
-    //
-    // 2. When `maybeRequestedSchema` is available, we can't simply convert the Catalyst schema to
-    //    Parquet schema using `CatalystSchemaConverter`, because the mapping is not unique due to
-    //    non-standard behaviors of some Parquet libraries/tools.  For example, a Parquet file
-    //    containing a single integer array field `f1` may have the following legacy 2-level
-    //    structure:
-    //
-    //      message root {
-    //        optional group f1 (LIST) {
-    //          required INT32 element;
-    //        }
-    //      }
-    //
-    //    while `CatalystSchemaConverter` may generate a standard 3-level structure:
-    //
-    //      message root {
-    //        optional group f1 (LIST) {
-    //          repeated group list {
-    //            required INT32 element;
-    //          }
-    //        }
-    //      }
-    //
-    //    Apparently, we can't use the 2nd schema to read the target Parquet file as they have
-    //    different physical structures.
-    val parquetRequestedSchema =
-      maybeRequestedSchema.fold(context.getFileSchema) { schemaString =>
-        val toParquet = new CatalystSchemaConverter(conf)
-        val fileSchema = context.getFileSchema.asGroupType()
-        val fileFieldNames = fileSchema.getFields.map(_.getName).toSet
-
-        StructType
-          // Deserializes the Catalyst schema of requested columns
-          .fromString(schemaString)
-          .map { field =>
-            if (fileFieldNames.contains(field.name)) {
-              // If the field exists in the target Parquet file, extracts the field type from the
-              // full file schema and makes a single-field Parquet schema
-              new MessageType("root", fileSchema.getType(field.name))
-            } else {
-              // Otherwise, just resorts to `CatalystSchemaConverter`
-              toParquet.convert(StructType(Array(field)))
-            }
-          }
-          // Merges all single-field Parquet schemas to form a complete schema for all requested
-          // columns.  Note that it's possible that no columns are requested at all (e.g., count
-          // some partition column of a partitioned Parquet table). That's why `fold` is used here
-          // and always fallback to an empty Parquet schema.
-          .fold(new MessageType("root")) {
-            _ union _
-          }
-      }
-
-    val metadata =
-      Map.empty[String, String] ++
-        maybeRequestedSchema.map(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++
-        maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _)
-
-    logInfo(s"Going to read Parquet file with these requested columns: $parquetRequestedSchema")
-    new ReadContext(parquetRequestedSchema, metadata)
-  }
-}
-
-private[parquet] object RowReadSupport {
-  val SPARK_ROW_REQUESTED_SCHEMA = "org.apache.spark.sql.parquet.row.requested_schema"
-  val SPARK_METADATA_KEY = "org.apache.spark.sql.parquet.row.metadata"
-
-  private def getRequestedSchema(configuration: Configuration): Seq[Attribute] = {
-    val schemaString = configuration.get(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA)
-    if (schemaString == null) null else ParquetTypesConverter.convertFromString(schemaString)
-  }
-}
-
 /**
  * A `parquet.hadoop.api.WriteSupport` for Row objects.
  */
@@ -190,7 +44,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
   override def init(configuration: Configuration): WriteSupport.WriteContext = {
     val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
     val metadata = new JHashMap[String, String]()
-    metadata.put(RowReadSupport.SPARK_METADATA_KEY, origAttributesStr)
+    metadata.put(CatalystReadSupport.SPARK_METADATA_KEY, origAttributesStr)
 
     if (attributes == null) {
       attributes = ParquetTypesConverter.convertFromString(origAttributesStr).toArray
@@ -443,4 +297,3 @@ private[parquet] object RowWriteSupport {
       ParquetProperties.WriterVersion.PARQUET_1_0.toString)
   }
 }
-
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
index e748bd7857bd8..3854f5bd39fb1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
@@ -53,15 +53,6 @@ private[parquet] object ParquetTypesConverter extends Logging {
     length
   }
 
-  def convertToAttributes(
-      parquetSchema: MessageType,
-      isBinaryAsString: Boolean,
-      isInt96AsTimestamp: Boolean): Seq[Attribute] = {
-    val converter = new CatalystSchemaConverter(
-      isBinaryAsString, isInt96AsTimestamp, followParquetFormatSpec = false)
-    converter.convert(parquetSchema).toAttributes
-  }
-
   def convertFromAttributes(attributes: Seq[Attribute]): MessageType = {
     val converter = new CatalystSchemaConverter()
     converter.convert(StructType.fromAttributes(attributes))
@@ -103,7 +94,7 @@ private[parquet] object ParquetTypesConverter extends Logging {
     }
     val extraMetadata = new java.util.HashMap[String, String]()
     extraMetadata.put(
-      RowReadSupport.SPARK_METADATA_KEY,
+      CatalystReadSupport.SPARK_METADATA_KEY,
       ParquetTypesConverter.convertToString(attributes))
     // TODO: add extra data, e.g., table name, date, etc.?
 
@@ -165,35 +156,4 @@ private[parquet] object ParquetTypesConverter extends Logging {
       .getOrElse(
         throw new IllegalArgumentException(s"Could not find Parquet metadata at path $path"))
   }
-
-  /**
-   * Reads in Parquet Metadata from the given path and tries to extract the schema
-   * (Catalyst attributes) from the application-specific key-value map. If this
-   * is empty it falls back to converting from the Parquet file schema which
-   * may lead to an upcast of types (e.g., {byte, short} to int).
-   *
-   * @param origPath The path at which we expect one (or more) Parquet files.
-   * @param conf The Hadoop configuration to use.
-   * @return A list of attributes that make up the schema.
-   */
-  def readSchemaFromFile(
-      origPath: Path,
-      conf: Option[Configuration],
-      isBinaryAsString: Boolean,
-      isInt96AsTimestamp: Boolean): Seq[Attribute] = {
-    val keyValueMetadata: java.util.Map[String, String] =
-      readMetaData(origPath, conf)
-        .getFileMetaData
-        .getKeyValueMetaData
-    if (keyValueMetadata.get(RowReadSupport.SPARK_METADATA_KEY) != null) {
-      convertFromString(keyValueMetadata.get(RowReadSupport.SPARK_METADATA_KEY))
-    } else {
-      val attributes = convertToAttributes(
-        readMetaData(origPath, conf).getFileMetaData.getSchema,
-        isBinaryAsString,
-        isInt96AsTimestamp)
-      log.info(s"Falling back to schema conversion from Parquet types; result: $attributes")
-      attributes
-    }
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
deleted file mode 100644
index 8ec228c2b25bc..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ /dev/null
@@ -1,732 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.parquet
-
-import java.net.URI
-import java.util.{List => JList}
-
-import scala.collection.JavaConversions._
-import scala.collection.mutable
-import scala.util.{Failure, Try}
-
-import com.google.common.base.Objects
-import org.apache.hadoop.fs.{FileStatus, Path}
-import org.apache.hadoop.io.Writable
-import org.apache.hadoop.mapreduce._
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
-import org.apache.parquet.filter2.predicate.FilterApi
-import org.apache.parquet.hadoop._
-import org.apache.parquet.hadoop.metadata.CompressionCodecName
-import org.apache.parquet.hadoop.util.ContextUtil
-import org.apache.parquet.schema.MessageType
-
-import org.apache.spark.{Logging, Partition => SparkPartition, SparkException}
-import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.rdd.RDD
-import org.apache.spark.rdd.RDD._
-import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.execution.{SqlNewHadoopPartition, SqlNewHadoopRDD}
-import org.apache.spark.sql.execution.datasources.PartitionSpec
-import org.apache.spark.sql.sources._
-import org.apache.spark.sql.types.{DataType, StructType}
-import org.apache.spark.util.{SerializableConfiguration, Utils}
-
-
-private[sql] class DefaultSource extends HadoopFsRelationProvider {
-  override def createRelation(
-      sqlContext: SQLContext,
-      paths: Array[String],
-      schema: Option[StructType],
-      partitionColumns: Option[StructType],
-      parameters: Map[String, String]): HadoopFsRelation = {
-    new ParquetRelation2(paths, schema, None, partitionColumns, parameters)(sqlContext)
-  }
-}
-
-// NOTE: This class is instantiated and used on executor side only, no need to be serializable.
-private[sql] class ParquetOutputWriter(path: String, context: TaskAttemptContext)
-  extends OutputWriterInternal {
-
-  private val recordWriter: RecordWriter[Void, InternalRow] = {
-    val outputFormat = {
-      new ParquetOutputFormat[InternalRow]() {
-        // Here we override `getDefaultWorkFile` for two reasons:
-        //
-        //  1. To allow appending.  We need to generate unique output file names to avoid
-        //     overwriting existing files (either exist before the write job, or are just written
-        //     by other tasks within the same write job).
-        //
-        //  2. To allow dynamic partitioning.  Default `getDefaultWorkFile` uses
-        //     `FileOutputCommitter.getWorkPath()`, which points to the base directory of all
-        //     partitions in the case of dynamic partitioning.
-        override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
-          val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
-          val split = context.getTaskAttemptID.getTaskID.getId
-          new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension")
-        }
-      }
-    }
-
-    outputFormat.getRecordWriter(context)
-  }
-
-  override def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row)
-
-  override def close(): Unit = recordWriter.close(context)
-}
-
-private[sql] class ParquetRelation2(
-    override val paths: Array[String],
-    private val maybeDataSchema: Option[StructType],
-    // This is for metastore conversion.
-    private val maybePartitionSpec: Option[PartitionSpec],
-    override val userDefinedPartitionColumns: Option[StructType],
-    parameters: Map[String, String])(
-    val sqlContext: SQLContext)
-  extends HadoopFsRelation(maybePartitionSpec)
-  with Logging {
-
-  private[sql] def this(
-      paths: Array[String],
-      maybeDataSchema: Option[StructType],
-      maybePartitionSpec: Option[PartitionSpec],
-      parameters: Map[String, String])(
-      sqlContext: SQLContext) = {
-    this(
-      paths,
-      maybeDataSchema,
-      maybePartitionSpec,
-      maybePartitionSpec.map(_.partitionColumns),
-      parameters)(sqlContext)
-  }
-
-  // Should we merge schemas from all Parquet part-files?
-  private val shouldMergeSchemas =
-    parameters
-      .get(ParquetRelation2.MERGE_SCHEMA)
-      .map(_.toBoolean)
-      .getOrElse(sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED))
-
-  private val maybeMetastoreSchema = parameters
-    .get(ParquetRelation2.METASTORE_SCHEMA)
-    .map(DataType.fromJson(_).asInstanceOf[StructType])
-
-  private lazy val metadataCache: MetadataCache = {
-    val meta = new MetadataCache
-    meta.refresh()
-    meta
-  }
-
-  override def equals(other: Any): Boolean = other match {
-    case that: ParquetRelation2 =>
-      val schemaEquality = if (shouldMergeSchemas) {
-        this.shouldMergeSchemas == that.shouldMergeSchemas
-      } else {
-        this.dataSchema == that.dataSchema &&
-          this.schema == that.schema
-      }
-
-      this.paths.toSet == that.paths.toSet &&
-        schemaEquality &&
-        this.maybeDataSchema == that.maybeDataSchema &&
-        this.partitionColumns == that.partitionColumns
-
-    case _ => false
-  }
-
-  override def hashCode(): Int = {
-    if (shouldMergeSchemas) {
-      Objects.hashCode(
-        Boolean.box(shouldMergeSchemas),
-        paths.toSet,
-        maybeDataSchema,
-        partitionColumns)
-    } else {
-      Objects.hashCode(
-        Boolean.box(shouldMergeSchemas),
-        paths.toSet,
-        dataSchema,
-        schema,
-        maybeDataSchema,
-        partitionColumns)
-    }
-  }
-
-  /** Constraints on schema of dataframe to be stored. */
-  private def checkConstraints(schema: StructType): Unit = {
-    if (schema.fieldNames.length != schema.fieldNames.distinct.length) {
-      val duplicateColumns = schema.fieldNames.groupBy(identity).collect {
-        case (x, ys) if ys.length > 1 => "\"" + x + "\""
-      }.mkString(", ")
-      throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " +
-        s"cannot save to parquet format")
-    }
-  }
-
-  override def dataSchema: StructType = {
-    val schema = maybeDataSchema.getOrElse(metadataCache.dataSchema)
-    // check if schema satisfies the constraints
-    // before moving forward
-    checkConstraints(schema)
-    schema
-  }
-
-  override private[sql] def refresh(): Unit = {
-    super.refresh()
-    metadataCache.refresh()
-  }
-
-  // Parquet data source always uses Catalyst internal representations.
-  override val needConversion: Boolean = false
-
-  override def sizeInBytes: Long = metadataCache.dataStatuses.map(_.getLen).sum
-
-  override def prepareJobForWrite(job: Job): OutputWriterFactory = {
-    val conf = ContextUtil.getConfiguration(job)
-
-    val committerClass =
-      conf.getClass(
-        SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key,
-        classOf[ParquetOutputCommitter],
-        classOf[ParquetOutputCommitter])
-
-    if (conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) == null) {
-      logInfo("Using default output committer for Parquet: " +
-        classOf[ParquetOutputCommitter].getCanonicalName)
-    } else {
-      logInfo("Using user defined output committer for Parquet: " + committerClass.getCanonicalName)
-    }
-
-    conf.setClass(
-      SQLConf.OUTPUT_COMMITTER_CLASS.key,
-      committerClass,
-      classOf[ParquetOutputCommitter])
-
-    // We're not really using `ParquetOutputFormat[Row]` for writing data here, because we override
-    // it in `ParquetOutputWriter` to support appending and dynamic partitioning.  The reason why
-    // we set it here is to setup the output committer class to `ParquetOutputCommitter`, which is
-    // bundled with `ParquetOutputFormat[Row]`.
-    job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]])
-
-    // TODO There's no need to use two kinds of WriteSupport
-    // We should unify them. `SpecificMutableRow` can process both atomic (primitive) types and
-    // complex types.
-    val writeSupportClass =
-      if (dataSchema.map(_.dataType).forall(ParquetTypesConverter.isPrimitiveType)) {
-        classOf[MutableRowWriteSupport]
-      } else {
-        classOf[RowWriteSupport]
-      }
-
-    ParquetOutputFormat.setWriteSupportClass(job, writeSupportClass)
-    RowWriteSupport.setSchema(dataSchema.toAttributes, conf)
-
-    // Sets compression scheme
-    conf.set(
-      ParquetOutputFormat.COMPRESSION,
-      ParquetRelation
-        .shortParquetCompressionCodecNames
-        .getOrElse(
-          sqlContext.conf.parquetCompressionCodec.toUpperCase,
-          CompressionCodecName.UNCOMPRESSED).name())
-
-    new OutputWriterFactory {
-      override def newInstance(
-          path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
-        new ParquetOutputWriter(path, context)
-      }
-    }
-  }
-
-  override def buildScan(
-      requiredColumns: Array[String],
-      filters: Array[Filter],
-      inputFiles: Array[FileStatus],
-      broadcastedConf: Broadcast[SerializableConfiguration]): RDD[Row] = {
-    val useMetadataCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA)
-    val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown
-    val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString
-    val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp
-    val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec
-
-    // Create the function to set variable Parquet confs at both driver and executor side.
-    val initLocalJobFuncOpt =
-      ParquetRelation2.initializeLocalJobFunc(
-        requiredColumns,
-        filters,
-        dataSchema,
-        useMetadataCache,
-        parquetFilterPushDown,
-        assumeBinaryIsString,
-        assumeInt96IsTimestamp,
-        followParquetFormatSpec) _
-
-    // Create the function to set input paths at the driver side.
-    val setInputPaths = ParquetRelation2.initializeDriverSideJobFunc(inputFiles) _
-
-    Utils.withDummyCallSite(sqlContext.sparkContext) {
-      new SqlNewHadoopRDD(
-        sc = sqlContext.sparkContext,
-        broadcastedConf = broadcastedConf,
-        initDriverSideJobFuncOpt = Some(setInputPaths),
-        initLocalJobFuncOpt = Some(initLocalJobFuncOpt),
-        inputFormatClass = classOf[ParquetInputFormat[InternalRow]],
-        keyClass = classOf[Void],
-        valueClass = classOf[InternalRow]) {
-
-        val cacheMetadata = useMetadataCache
-
-        @transient val cachedStatuses = inputFiles.map { f =>
-          // In order to encode the authority of a Path containing special characters such as '/'
-          // (which does happen in some S3N credentials), we need to use the string returned by the
-          // URI of the path to create a new Path.
-          val pathWithEscapedAuthority = escapePathUserInfo(f.getPath)
-          new FileStatus(
-            f.getLen, f.isDir, f.getReplication, f.getBlockSize, f.getModificationTime,
-            f.getAccessTime, f.getPermission, f.getOwner, f.getGroup, pathWithEscapedAuthority)
-        }.toSeq
-
-        private def escapePathUserInfo(path: Path): Path = {
-          val uri = path.toUri
-          new Path(new URI(
-            uri.getScheme, uri.getRawUserInfo, uri.getHost, uri.getPort, uri.getPath,
-            uri.getQuery, uri.getFragment))
-        }
-
-        // Overridden so we can inject our own cached files statuses.
-        override def getPartitions: Array[SparkPartition] = {
-          val inputFormat = new ParquetInputFormat[InternalRow] {
-            override def listStatus(jobContext: JobContext): JList[FileStatus] = {
-              if (cacheMetadata) cachedStatuses else super.listStatus(jobContext)
-            }
-          }
-
-          val jobContext = newJobContext(getConf(isDriverSide = true), jobId)
-          val rawSplits = inputFormat.getSplits(jobContext)
-
-          Array.tabulate[SparkPartition](rawSplits.size) { i =>
-            new SqlNewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
-          }
-        }
-      }.values.asInstanceOf[RDD[Row]]  // type erasure hack to pass RDD[InternalRow] as RDD[Row]
-    }
-  }
-
-  private class MetadataCache {
-    // `FileStatus` objects of all "_metadata" files.
-    private var metadataStatuses: Array[FileStatus] = _
-
-    // `FileStatus` objects of all "_common_metadata" files.
-    private var commonMetadataStatuses: Array[FileStatus] = _
-
-    // `FileStatus` objects of all data files (Parquet part-files).
-    var dataStatuses: Array[FileStatus] = _
-
-    // Schema of the actual Parquet files, without partition columns discovered from partition
-    // directory paths.
-    var dataSchema: StructType = null
-
-    // Schema of the whole table, including partition columns.
-    var schema: StructType = _
-
-    // Cached leaves
-    var cachedLeaves: Set[FileStatus] = null
-
-    /**
-     * Refreshes `FileStatus`es, footers, partition spec, and table schema.
-     */
-    def refresh(): Unit = {
-      val currentLeafStatuses = cachedLeafStatuses()
-
-      // Check if cachedLeafStatuses is changed or not
-      val leafStatusesChanged = (cachedLeaves == null) ||
-        !cachedLeaves.equals(currentLeafStatuses)
-
-      if (leafStatusesChanged) {
-        cachedLeaves = currentLeafStatuses.toIterator.toSet
-
-        // Lists `FileStatus`es of all leaf nodes (files) under all base directories.
-        val leaves = currentLeafStatuses.filter { f =>
-          isSummaryFile(f.getPath) ||
-            !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith("."))
-        }.toArray
-
-        dataStatuses = leaves.filterNot(f => isSummaryFile(f.getPath))
-        metadataStatuses =
-          leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE)
-        commonMetadataStatuses =
-          leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)
-
-        dataSchema = {
-          val dataSchema0 = maybeDataSchema
-            .orElse(readSchema())
-            .orElse(maybeMetastoreSchema)
-            .getOrElse(throw new AnalysisException(
-              s"Failed to discover schema of Parquet file(s) in the following location(s):\n" +
-                paths.mkString("\n\t")))
-
-          // If this Parquet relation is converted from a Hive Metastore table, must reconcile case
-          // case insensitivity issue and possible schema mismatch (probably caused by schema
-          // evolution).
-          maybeMetastoreSchema
-            .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0))
-            .getOrElse(dataSchema0)
-        }
-      }
-    }
-
-    private def isSummaryFile(file: Path): Boolean = {
-      file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
-        file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
-    }
-
-    private def readSchema(): Option[StructType] = {
-      // Sees which file(s) we need to touch in order to figure out the schema.
-      //
-      // Always tries the summary files first if users don't require a merged schema.  In this case,
-      // "_common_metadata" is more preferable than "_metadata" because it doesn't contain row
-      // groups information, and could be much smaller for large Parquet files with lots of row
-      // groups.  If no summary file is available, falls back to some random part-file.
-      //
-      // NOTE: Metadata stored in the summary files are merged from all part-files.  However, for
-      // user defined key-value metadata (in which we store Spark SQL schema), Parquet doesn't know
-      // how to merge them correctly if some key is associated with different values in different
-      // part-files.  When this happens, Parquet simply gives up generating the summary file.  This
-      // implies that if a summary file presents, then:
-      //
-      //   1. Either all part-files have exactly the same Spark SQL schema, or
-      //   2. Some part-files don't contain Spark SQL schema in the key-value metadata at all (thus
-      //      their schemas may differ from each other).
-      //
-      // Here we tend to be pessimistic and take the second case into account.  Basically this means
-      // we can't trust the summary files if users require a merged schema, and must touch all part-
-      // files to do the merge.
-      val filesToTouch =
-        if (shouldMergeSchemas) {
-          // Also includes summary files, 'cause there might be empty partition directories.
-          (metadataStatuses ++ commonMetadataStatuses ++ dataStatuses).toSeq
-        } else {
-          // Tries any "_common_metadata" first. Parquet files written by old versions or Parquet
-          // don't have this.
-          commonMetadataStatuses.headOption
-            // Falls back to "_metadata"
-            .orElse(metadataStatuses.headOption)
-            // Summary file(s) not found, the Parquet file is either corrupted, or different part-
-            // files contain conflicting user defined metadata (two or more values are associated
-            // with a same key in different files).  In either case, we fall back to any of the
-            // first part-file, and just assume all schemas are consistent.
-            .orElse(dataStatuses.headOption)
-            .toSeq
-        }
-
-      assert(
-        filesToTouch.nonEmpty || maybeDataSchema.isDefined || maybeMetastoreSchema.isDefined,
-        "No predefined schema found, " +
-          s"and no Parquet data files or summary files found under ${paths.mkString(", ")}.")
-
-      ParquetRelation2.mergeSchemasInParallel(filesToTouch, sqlContext)
-    }
-  }
-}
-
-private[sql] object ParquetRelation2 extends Logging {
-  // Whether we should merge schemas collected from all Parquet part-files.
-  private[sql] val MERGE_SCHEMA = "mergeSchema"
-
-  // Hive Metastore schema, used when converting Metastore Parquet tables.  This option is only used
-  // internally.
-  private[sql] val METASTORE_SCHEMA = "metastoreSchema"
-
-  /** This closure sets various Parquet configurations at both driver side and executor side. */
-  private[parquet] def initializeLocalJobFunc(
-      requiredColumns: Array[String],
-      filters: Array[Filter],
-      dataSchema: StructType,
-      useMetadataCache: Boolean,
-      parquetFilterPushDown: Boolean,
-      assumeBinaryIsString: Boolean,
-      assumeInt96IsTimestamp: Boolean,
-      followParquetFormatSpec: Boolean)(job: Job): Unit = {
-    val conf = job.getConfiguration
-    conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[RowReadSupport].getName)
-
-    // Try to push down filters when filter push-down is enabled.
-    if (parquetFilterPushDown) {
-      filters
-        // Collects all converted Parquet filter predicates. Notice that not all predicates can be
-        // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap`
-        // is used here.
-        .flatMap(ParquetFilters.createFilter(dataSchema, _))
-        .reduceOption(FilterApi.and)
-        .foreach(ParquetInputFormat.setFilterPredicate(conf, _))
-    }
-
-    conf.set(RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, {
-      val requestedSchema = StructType(requiredColumns.map(dataSchema(_)))
-      ParquetTypesConverter.convertToString(requestedSchema.toAttributes)
-    })
-
-    conf.set(
-      RowWriteSupport.SPARK_ROW_SCHEMA,
-      ParquetTypesConverter.convertToString(dataSchema.toAttributes))
-
-    // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata
-    conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache)
-
-    // Sets flags for Parquet schema conversion
-    conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key, assumeBinaryIsString)
-    conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, assumeInt96IsTimestamp)
-    conf.setBoolean(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, followParquetFormatSpec)
-  }
-
-  /** This closure sets input paths at the driver side. */
-  private[parquet] def initializeDriverSideJobFunc(
-      inputFiles: Array[FileStatus])(job: Job): Unit = {
-    // We side the input paths at the driver side.
-    logInfo(s"Reading Parquet file(s) from ${inputFiles.map(_.getPath).mkString(", ")}")
-    if (inputFiles.nonEmpty) {
-      FileInputFormat.setInputPaths(job, inputFiles.map(_.getPath): _*)
-    }
-  }
-
-  private[parquet] def readSchema(
-      footers: Seq[Footer], sqlContext: SQLContext): Option[StructType] = {
-
-    def parseParquetSchema(schema: MessageType): StructType = {
-      StructType.fromAttributes(
-        // TODO Really no need to use `Attribute` here, we only need to know the data type.
-        ParquetTypesConverter.convertToAttributes(
-          schema,
-          sqlContext.conf.isParquetBinaryAsString,
-          sqlContext.conf.isParquetINT96AsTimestamp))
-    }
-
-    val seen = mutable.HashSet[String]()
-    val finalSchemas: Seq[StructType] = footers.flatMap { footer =>
-      val metadata = footer.getParquetMetadata.getFileMetaData
-      val serializedSchema = metadata
-        .getKeyValueMetaData
-        .toMap
-        .get(RowReadSupport.SPARK_METADATA_KEY)
-      if (serializedSchema.isEmpty) {
-        // Falls back to Parquet schema if no Spark SQL schema found.
-        Some(parseParquetSchema(metadata.getSchema))
-      } else if (!seen.contains(serializedSchema.get)) {
-        seen += serializedSchema.get
-
-        // Don't throw even if we failed to parse the serialized Spark schema. Just fallback to
-        // whatever is available.
-        Some(Try(DataType.fromJson(serializedSchema.get))
-          .recover { case _: Throwable =>
-            logInfo(
-              s"Serialized Spark schema in Parquet key-value metadata is not in JSON format, " +
-                "falling back to the deprecated DataType.fromCaseClassString parser.")
-            DataType.fromCaseClassString(serializedSchema.get)
-          }
-          .recover { case cause: Throwable =>
-            logWarning(
-              s"""Failed to parse serialized Spark schema in Parquet key-value metadata:
-                 |\t$serializedSchema
-               """.stripMargin,
-              cause)
-          }
-          .map(_.asInstanceOf[StructType])
-          .getOrElse {
-            // Falls back to Parquet schema if Spark SQL schema can't be parsed.
-            parseParquetSchema(metadata.getSchema)
-          })
-      } else {
-        None
-      }
-    }
-
-    finalSchemas.reduceOption { (left, right) =>
-      try left.merge(right) catch { case e: Throwable =>
-        throw new SparkException(s"Failed to merge incompatible schemas $left and $right", e)
-      }
-    }
-  }
-
-  /**
-   * Reconciles Hive Metastore case insensitivity issue and data type conflicts between Metastore
-   * schema and Parquet schema.
-   *
-   * Hive doesn't retain case information, while Parquet is case sensitive. On the other hand, the
-   * schema read from Parquet files may be incomplete (e.g. older versions of Parquet doesn't
-   * distinguish binary and string).  This method generates a correct schema by merging Metastore
-   * schema data types and Parquet schema field names.
-   */
-  private[parquet] def mergeMetastoreParquetSchema(
-      metastoreSchema: StructType,
-      parquetSchema: StructType): StructType = {
-    def schemaConflictMessage: String =
-      s"""Converting Hive Metastore Parquet, but detected conflicting schemas. Metastore schema:
-         |${metastoreSchema.prettyJson}
-         |
-         |Parquet schema:
-         |${parquetSchema.prettyJson}
-       """.stripMargin
-
-    val mergedParquetSchema = mergeMissingNullableFields(metastoreSchema, parquetSchema)
-
-    assert(metastoreSchema.size <= mergedParquetSchema.size, schemaConflictMessage)
-
-    val ordinalMap = metastoreSchema.zipWithIndex.map {
-      case (field, index) => field.name.toLowerCase -> index
-    }.toMap
-
-    val reorderedParquetSchema = mergedParquetSchema.sortBy(f =>
-      ordinalMap.getOrElse(f.name.toLowerCase, metastoreSchema.size + 1))
-
-    StructType(metastoreSchema.zip(reorderedParquetSchema).map {
-      // Uses Parquet field names but retains Metastore data types.
-      case (mSchema, pSchema) if mSchema.name.toLowerCase == pSchema.name.toLowerCase =>
-        mSchema.copy(name = pSchema.name)
-      case _ =>
-        throw new SparkException(schemaConflictMessage)
-    })
-  }
-
-  /**
-   * Returns the original schema from the Parquet file with any missing nullable fields from the
-   * Hive Metastore schema merged in.
-   *
-   * When constructing a DataFrame from a collection of structured data, the resulting object has
-   * a schema corresponding to the union of the fields present in each element of the collection.
-   * Spark SQL simply assigns a null value to any field that isn't present for a particular row.
-   * In some cases, it is possible that a given table partition stored as a Parquet file doesn't
-   * contain a particular nullable field in its schema despite that field being present in the
-   * table schema obtained from the Hive Metastore. This method returns a schema representing the
-   * Parquet file schema along with any additional nullable fields from the Metastore schema
-   * merged in.
-   */
-  private[parquet] def mergeMissingNullableFields(
-      metastoreSchema: StructType,
-      parquetSchema: StructType): StructType = {
-    val fieldMap = metastoreSchema.map(f => f.name.toLowerCase -> f).toMap
-    val missingFields = metastoreSchema
-      .map(_.name.toLowerCase)
-      .diff(parquetSchema.map(_.name.toLowerCase))
-      .map(fieldMap(_))
-      .filter(_.nullable)
-    StructType(parquetSchema ++ missingFields)
-  }
-
-  /**
-   * Figures out a merged Parquet schema with a distributed Spark job.
-   *
-   * Note that locality is not taken into consideration here because:
-   *
-   *  1. For a single Parquet part-file, in most cases the footer only resides in the last block of
-   *     that file.  Thus we only need to retrieve the location of the last block.  However, Hadoop
-   *     `FileSystem` only provides API to retrieve locations of all blocks, which can be
-   *     potentially expensive.
-   *
-   *  2. This optimization is mainly useful for S3, where file metadata operations can be pretty
-   *     slow.  And basically locality is not available when using S3 (you can't run computation on
-   *     S3 nodes).
-   */
-  def mergeSchemasInParallel(
-      filesToTouch: Seq[FileStatus], sqlContext: SQLContext): Option[StructType] = {
-    val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString
-    val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp
-    val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec
-    val serializedConf = new SerializableConfiguration(sqlContext.sparkContext.hadoopConfiguration)
-
-    // HACK ALERT:
-    //
-    // Parquet requires `FileStatus`es to read footers.  Here we try to send cached `FileStatus`es
-    // to executor side to avoid fetching them again.  However, `FileStatus` is not `Serializable`
-    // but only `Writable`.  What makes it worth, for some reason, `FileStatus` doesn't play well
-    // with `SerializableWritable[T]` and always causes a weird `IllegalStateException`.  These
-    // facts virtually prevents us to serialize `FileStatus`es.
-    //
-    // Since Parquet only relies on path and length information of those `FileStatus`es to read
-    // footers, here we just extract them (which can be easily serialized), send them to executor
-    // side, and resemble fake `FileStatus`es there.
-    val partialFileStatusInfo = filesToTouch.map(f => (f.getPath.toString, f.getLen))
-
-    // Issues a Spark job to read Parquet schema in parallel.
-    val partiallyMergedSchemas =
-      sqlContext
-        .sparkContext
-        .parallelize(partialFileStatusInfo)
-        .mapPartitions { iterator =>
-          // Resembles fake `FileStatus`es with serialized path and length information.
-          val fakeFileStatuses = iterator.map { case (path, length) =>
-            new FileStatus(length, false, 0, 0, 0, 0, null, null, null, new Path(path))
-          }.toSeq
-
-          // Skips row group information since we only need the schema
-          val skipRowGroups = true
-
-          // Reads footers in multi-threaded manner within each task
-          val footers =
-            ParquetFileReader.readAllFootersInParallel(
-              serializedConf.value, fakeFileStatuses, skipRowGroups)
-
-          // Converter used to convert Parquet `MessageType` to Spark SQL `StructType`
-          val converter =
-            new CatalystSchemaConverter(
-              assumeBinaryIsString = assumeBinaryIsString,
-              assumeInt96IsTimestamp = assumeInt96IsTimestamp,
-              followParquetFormatSpec = followParquetFormatSpec)
-
-          footers.map { footer =>
-            ParquetRelation2.readSchemaFromFooter(footer, converter)
-          }.reduceOption(_ merge _).iterator
-        }.collect()
-
-    partiallyMergedSchemas.reduceOption(_ merge _)
-  }
-
-  /**
-   * Reads Spark SQL schema from a Parquet footer.  If a valid serialized Spark SQL schema string
-   * can be found in the file metadata, returns the deserialized [[StructType]], otherwise, returns
-   * a [[StructType]] converted from the [[MessageType]] stored in this footer.
-   */
-  def readSchemaFromFooter(
-      footer: Footer, converter: CatalystSchemaConverter): StructType = {
-    val fileMetaData = footer.getParquetMetadata.getFileMetaData
-    fileMetaData
-      .getKeyValueMetaData
-      .toMap
-      .get(RowReadSupport.SPARK_METADATA_KEY)
-      .flatMap(deserializeSchemaString)
-      .getOrElse(converter.convert(fileMetaData.getSchema))
-  }
-
-  private def deserializeSchemaString(schemaString: String): Option[StructType] = {
-    // Tries to deserialize the schema string as JSON first, then falls back to the case class
-    // string parser (data generated by older versions of Spark SQL uses this format).
-    Try(DataType.fromJson(schemaString).asInstanceOf[StructType]).recover {
-      case _: Throwable =>
-        logInfo(
-          s"Serialized Spark schema in Parquet key-value metadata is not in JSON format, " +
-            "falling back to the deprecated DataType.fromCaseClassString parser.")
-        DataType.fromCaseClassString(schemaString).asInstanceOf[StructType]
-    }.recoverWith {
-      case cause: Throwable =>
-        logWarning(
-          "Failed to parse and ignored serialized Spark schema in " +
-            s"Parquet key-value metadata:\n\t$schemaString", cause)
-        Failure(cause)
-    }.toOption
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
index 23df102cd951d..b6a7c4fbddbdc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.parquet
 
-import org.scalatest.BeforeAndAfterAll
 import org.apache.parquet.filter2.predicate.Operators._
 import org.apache.parquet.filter2.predicate.{FilterPredicate, Operators}
 
@@ -40,7 +39,7 @@ import org.apache.spark.sql.{Column, DataFrame, QueryTest, Row, SQLConf}
  * 2. `Tuple1(Option(x))` is used together with `AnyVal` types like `Int` to ensure the inferred
  *    data type is nullable.
  */
-class ParquetFilterSuiteBase extends QueryTest with ParquetTest {
+class ParquetFilterSuite extends QueryTest with ParquetTest {
   lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext
 
   private def checkFilterPredicate(
@@ -56,17 +55,9 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest {
         .select(output.map(e => Column(e)): _*)
         .where(Column(predicate))
 
-      val maybeAnalyzedPredicate = {
-        val forParquetTableScan = query.queryExecution.executedPlan.collect {
-          case plan: ParquetTableScan => plan.columnPruningPred
-        }.flatten.reduceOption(_ && _)
-
-        val forParquetDataSource = query.queryExecution.optimizedPlan.collect {
-          case PhysicalOperation(_, filters, LogicalRelation(_: ParquetRelation2)) => filters
-        }.flatten.reduceOption(_ && _)
-
-        forParquetTableScan.orElse(forParquetDataSource)
-      }
+      val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect {
+        case PhysicalOperation(_, filters, LogicalRelation(_: ParquetRelation)) => filters
+      }.flatten.reduceOption(_ && _)
 
       assert(maybeAnalyzedPredicate.isDefined)
       maybeAnalyzedPredicate.foreach { pred =>
@@ -98,7 +89,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest {
       (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Seq[Row])
       (implicit df: DataFrame): Unit = {
     def checkBinaryAnswer(df: DataFrame, expected: Seq[Row]) = {
-      assertResult(expected.map(_.getAs[Array[Byte]](0).mkString(",")).toSeq.sorted) {
+      assertResult(expected.map(_.getAs[Array[Byte]](0).mkString(",")).sorted) {
         df.map(_.getAs[Array[Byte]](0).mkString(",")).collect().toSeq.sorted
       }
     }
@@ -308,18 +299,6 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest {
         '_1 < 2.b || '_1 > 3.b, classOf[Operators.Or], Seq(Row(1.b), Row(4.b)))
     }
   }
-}
-
-class ParquetDataSourceOnFilterSuite extends ParquetFilterSuiteBase with BeforeAndAfterAll {
-  lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
-
-  override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
-  }
-
-  override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
-  }
 
   test("SPARK-6554: don't push down predicates which reference partition columns") {
     import sqlContext.implicits._
@@ -338,37 +317,3 @@ class ParquetDataSourceOnFilterSuite extends ParquetFilterSuiteBase with BeforeA
     }
   }
 }
-
-class ParquetDataSourceOffFilterSuite extends ParquetFilterSuiteBase with BeforeAndAfterAll {
-  lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
-
-  override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
-  }
-
-  override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
-  }
-
-  test("SPARK-6742: don't push down predicates which reference partition columns") {
-    import sqlContext.implicits._
-
-    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
-      withTempPath { dir =>
-        val path = s"${dir.getCanonicalPath}/part=1"
-        (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(path)
-
-        // If the "part = 1" filter gets pushed down, this query will throw an exception since
-        // "part" is not a valid column in the actual Parquet file
-        val df = DataFrame(sqlContext, org.apache.spark.sql.parquet.ParquetRelation(
-          path,
-          Some(sqlContext.sparkContext.hadoopConfiguration), sqlContext,
-          Seq(AttributeReference("part", IntegerType, false)()) ))
-
-        checkAnswer(
-          df.filter("a = 1 or part = 1"),
-          (1 to 3).map(i => Row(1, i, i.toString)))
-      }
-    }
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
index 3a5b860484e86..b5314a3dd92e5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
@@ -32,7 +32,6 @@ import org.apache.parquet.hadoop.metadata.{CompressionCodecName, FileMetaData, P
 import org.apache.parquet.hadoop.{Footer, ParquetFileWriter, ParquetOutputCommitter, ParquetWriter}
 import org.apache.parquet.io.api.RecordConsumer
 import org.apache.parquet.schema.{MessageType, MessageTypeParser}
-import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.SparkException
 import org.apache.spark.sql._
@@ -63,7 +62,7 @@ private[parquet] class TestGroupWriteSupport(schema: MessageType) extends WriteS
 /**
  * A test suite that tests basic Parquet I/O.
  */
-class ParquetIOSuiteBase extends QueryTest with ParquetTest {
+class ParquetIOSuite extends QueryTest with ParquetTest {
   lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext
   import sqlContext.implicits._
 
@@ -357,7 +356,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
       """.stripMargin)
 
     withTempPath { location =>
-      val extraMetadata = Map(RowReadSupport.SPARK_METADATA_KEY -> sparkSchema.toString)
+      val extraMetadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> sparkSchema.toString)
       val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, "Spark")
       val path = new Path(location.getCanonicalPath)
 
@@ -422,26 +421,6 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest {
       }
     }
   }
-}
-
-class BogusParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
-  extends ParquetOutputCommitter(outputPath, context) {
-
-  override def commitJob(jobContext: JobContext): Unit = {
-    sys.error("Intentional exception for testing purposes")
-  }
-}
-
-class ParquetDataSourceOnIOSuite extends ParquetIOSuiteBase with BeforeAndAfterAll {
-  private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
-
-  override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
-  }
-
-  override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API.key, originalConf.toString)
-  }
 
   test("SPARK-6330 regression test") {
     // In 1.3.0, save to fs other than file: without configuring core-site.xml would get:
@@ -456,14 +435,10 @@ class ParquetDataSourceOnIOSuite extends ParquetIOSuiteBase with BeforeAndAfterA
   }
 }
 
-class ParquetDataSourceOffIOSuite extends ParquetIOSuiteBase with BeforeAndAfterAll {
-  private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
-
-  override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
-  }
+class BogusParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
+  extends ParquetOutputCommitter(outputPath, context) {
 
-  override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
+  override def commitJob(jobContext: JobContext): Unit = {
+    sys.error("Intentional exception for testing purposes")
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
index 7f16b1125c7a5..2eef10189f11c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala
@@ -467,7 +467,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest {
       (1 to 10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath)
       val queryExecution = sqlContext.read.parquet(dir.getCanonicalPath).queryExecution
       queryExecution.analyzed.collectFirst {
-        case LogicalRelation(relation: ParquetRelation2) =>
+        case LogicalRelation(relation: ParquetRelation) =>
           assert(relation.partitionSpec === PartitionSpec.emptySpec)
       }.getOrElse {
         fail(s"Expecting a ParquetRelation2, but got:\n$queryExecution")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
index 21007d95ed752..c037faf4cfd92 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.parquet
 
 import org.apache.hadoop.fs.Path
-import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{QueryTest, Row, SQLConf}
@@ -26,7 +25,7 @@ import org.apache.spark.sql.{QueryTest, Row, SQLConf}
 /**
  * A test suite that tests various Parquet queries.
  */
-class ParquetQuerySuiteBase extends QueryTest with ParquetTest {
+class ParquetQuerySuite extends QueryTest with ParquetTest {
   lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext
   import sqlContext.sql
 
@@ -164,27 +163,3 @@ class ParquetQuerySuiteBase extends QueryTest with ParquetTest {
     }
   }
 }
-
-class ParquetDataSourceOnQuerySuite extends ParquetQuerySuiteBase with BeforeAndAfterAll {
-  private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
-
-  override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
-  }
-
-  override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
-  }
-}
-
-class ParquetDataSourceOffQuerySuite extends ParquetQuerySuiteBase with BeforeAndAfterAll {
-  private lazy val originalConf = sqlContext.conf.parquetUseDataSourceApi
-
-  override protected def beforeAll(): Unit = {
-    sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
-  }
-
-  override protected def afterAll(): Unit = {
-    sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
index fa629392674bd..4a0b3b60f419d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
@@ -378,7 +378,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         StructField("lowerCase", StringType),
         StructField("UPPERCase", DoubleType, nullable = false)))) {
 
-      ParquetRelation2.mergeMetastoreParquetSchema(
+      ParquetRelation.mergeMetastoreParquetSchema(
         StructType(Seq(
           StructField("lowercase", StringType),
           StructField("uppercase", DoubleType, nullable = false))),
@@ -393,7 +393,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
       StructType(Seq(
         StructField("UPPERCase", DoubleType, nullable = false)))) {
 
-      ParquetRelation2.mergeMetastoreParquetSchema(
+      ParquetRelation.mergeMetastoreParquetSchema(
         StructType(Seq(
           StructField("uppercase", DoubleType, nullable = false))),
 
@@ -404,7 +404,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
 
     // Metastore schema contains additional non-nullable fields.
     assert(intercept[Throwable] {
-      ParquetRelation2.mergeMetastoreParquetSchema(
+      ParquetRelation.mergeMetastoreParquetSchema(
         StructType(Seq(
           StructField("uppercase", DoubleType, nullable = false),
           StructField("lowerCase", BinaryType, nullable = false))),
@@ -415,7 +415,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
 
     // Conflicting non-nullable field names
     intercept[Throwable] {
-      ParquetRelation2.mergeMetastoreParquetSchema(
+      ParquetRelation.mergeMetastoreParquetSchema(
         StructType(Seq(StructField("lower", StringType, nullable = false))),
         StructType(Seq(StructField("lowerCase", BinaryType))))
     }
@@ -429,7 +429,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         StructField("firstField", StringType, nullable = true),
         StructField("secondField", StringType, nullable = true),
         StructField("thirdfield", StringType, nullable = true)))) {
-      ParquetRelation2.mergeMetastoreParquetSchema(
+      ParquetRelation.mergeMetastoreParquetSchema(
         StructType(Seq(
           StructField("firstfield", StringType, nullable = true),
           StructField("secondfield", StringType, nullable = true),
@@ -442,7 +442,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     // Merge should fail if the Metastore contains any additional fields that are not
     // nullable.
     assert(intercept[Throwable] {
-      ParquetRelation2.mergeMetastoreParquetSchema(
+      ParquetRelation.mergeMetastoreParquetSchema(
         StructType(Seq(
           StructField("firstfield", StringType, nullable = true),
           StructField("secondfield", StringType, nullable = true),
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 4cdb83c5116f9..1b8edefef4093 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -444,9 +444,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging {
       HiveDDLStrategy,
       DDLStrategy,
       TakeOrderedAndProject,
-      ParquetOperations,
       InMemoryScans,
-      ParquetConversion, // Must be before HiveTableScans
       HiveTableScans,
       DataSinks,
       Scripts,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 0a2121c955871..262923531216f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -21,7 +21,6 @@ import scala.collection.JavaConversions._
 
 import com.google.common.base.Objects
 import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}
-
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.hive.common.StatsSetupConst
 import org.apache.hadoop.hive.metastore.Warehouse
@@ -30,7 +29,6 @@ import org.apache.hadoop.hive.ql.metadata._
 import org.apache.hadoop.hive.ql.plan.TableDesc
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{Catalog, MultiInstanceRelation, OverrideCatalog}
 import org.apache.spark.sql.catalyst.expressions._
@@ -39,10 +37,11 @@ import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.execution.datasources
-import org.apache.spark.sql.execution.datasources.{Partition => ParquetPartition, PartitionSpec, CreateTableUsingAsSelect, ResolvedDataSource, LogicalRelation}
+import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation, Partition => ParquetPartition, PartitionSpec, ResolvedDataSource}
 import org.apache.spark.sql.hive.client._
-import org.apache.spark.sql.parquet.ParquetRelation2
+import org.apache.spark.sql.parquet.ParquetRelation
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode}
 
 
 private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: HiveContext)
@@ -260,8 +259,8 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     // serialize the Metastore schema to JSON and pass it as a data source option because of the
     // evil case insensitivity issue, which is reconciled within `ParquetRelation2`.
     val parquetOptions = Map(
-      ParquetRelation2.METASTORE_SCHEMA -> metastoreSchema.json,
-      ParquetRelation2.MERGE_SCHEMA -> mergeSchema.toString)
+      ParquetRelation.METASTORE_SCHEMA -> metastoreSchema.json,
+      ParquetRelation.MERGE_SCHEMA -> mergeSchema.toString)
     val tableIdentifier =
       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)
 
@@ -272,7 +271,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
         partitionSpecInMetastore: Option[PartitionSpec]): Option[LogicalRelation] = {
       cachedDataSourceTables.getIfPresent(tableIdentifier) match {
         case null => None // Cache miss
-        case logical@LogicalRelation(parquetRelation: ParquetRelation2) =>
+        case logical@LogicalRelation(parquetRelation: ParquetRelation) =>
           // If we have the same paths, same schema, and same partition spec,
           // we will use the cached Parquet Relation.
           val useCached =
@@ -317,7 +316,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
       val cached = getCached(tableIdentifier, paths, metastoreSchema, Some(partitionSpec))
       val parquetRelation = cached.getOrElse {
         val created = LogicalRelation(
-          new ParquetRelation2(
+          new ParquetRelation(
             paths.toArray, None, Some(partitionSpec), parquetOptions)(hive))
         cachedDataSourceTables.put(tableIdentifier, created)
         created
@@ -330,7 +329,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
       val cached = getCached(tableIdentifier, paths, metastoreSchema, None)
       val parquetRelation = cached.getOrElse {
         val created = LogicalRelation(
-          new ParquetRelation2(paths.toArray, None, None, parquetOptions)(hive))
+          new ParquetRelation(paths.toArray, None, None, parquetOptions)(hive))
         cachedDataSourceTables.put(tableIdentifier, created)
         created
       }
@@ -370,8 +369,6 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
   /**
    * When scanning or writing to non-partitioned Metastore Parquet tables, convert them to Parquet
    * data source relations for better performance.
-   *
-   * This rule can be considered as [[HiveStrategies.ParquetConversion]] done right.
    */
   object ParquetConversions extends Rule[LogicalPlan] {
     override def apply(plan: LogicalPlan): LogicalPlan = {
@@ -386,7 +383,6 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
             // Inserting into partitioned table is not supported in Parquet data source (yet).
             if !relation.hiveQlTable.isPartitioned &&
               hive.convertMetastoreParquet &&
-              conf.parquetUseDataSourceApi &&
               relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") =>
           val parquetRelation = convertToParquetRelation(relation)
           val attributedRewrites = relation.output.zip(parquetRelation.output)
@@ -397,7 +393,6 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
           // Inserting into partitioned table is not supported in Parquet data source (yet).
           if !relation.hiveQlTable.isPartitioned &&
             hive.convertMetastoreParquet &&
-            conf.parquetUseDataSourceApi &&
             relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") =>
           val parquetRelation = convertToParquetRelation(relation)
           val attributedRewrites = relation.output.zip(parquetRelation.output)
@@ -406,7 +401,6 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
         // Read path
         case p @ PhysicalOperation(_, _, relation: MetastoreRelation)
             if hive.convertMetastoreParquet &&
-              conf.parquetUseDataSourceApi &&
               relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") =>
           val parquetRelation = convertToParquetRelation(relation)
           val attributedRewrites = relation.output.zip(parquetRelation.output)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index a22c3292eff94..cd6cd322c94ed 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -17,23 +17,14 @@
 
 package org.apache.spark.sql.hive
 
-import scala.collection.JavaConversions._
-
-import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
-import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand, _}
 import org.apache.spark.sql.execution.datasources.{CreateTableUsing, CreateTableUsingAsSelect, DescribeCommand}
+import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand, _}
 import org.apache.spark.sql.hive.execution._
-import org.apache.spark.sql.parquet.ParquetRelation
-import org.apache.spark.sql.types.StringType
 
 
 private[hive] trait HiveStrategies {
@@ -42,136 +33,6 @@ private[hive] trait HiveStrategies {
 
   val hiveContext: HiveContext
 
-  /**
-   * :: Experimental ::
-   * Finds table scans that would use the Hive SerDe and replaces them with our own native parquet
-   * table scan operator.
-   *
-   * TODO: Much of this logic is duplicated in HiveTableScan.  Ideally we would do some refactoring
-   * but since this is after the code freeze for 1.1 all logic is here to minimize disruption.
-   *
-   * Other issues:
-   *  - Much of this logic assumes case insensitive resolution.
-   */
-  @Experimental
-  object ParquetConversion extends Strategy {
-    implicit class LogicalPlanHacks(s: DataFrame) {
-      def lowerCase: DataFrame = DataFrame(s.sqlContext, s.logicalPlan)
-
-      def addPartitioningAttributes(attrs: Seq[Attribute]): DataFrame = {
-        // Don't add the partitioning key if its already present in the data.
-        if (attrs.map(_.name).toSet.subsetOf(s.logicalPlan.output.map(_.name).toSet)) {
-          s
-        } else {
-          DataFrame(
-            s.sqlContext,
-            s.logicalPlan transform {
-              case p: ParquetRelation => p.copy(partitioningAttributes = attrs)
-            })
-        }
-      }
-    }
-
-    implicit class PhysicalPlanHacks(originalPlan: SparkPlan) {
-      def fakeOutput(newOutput: Seq[Attribute]): OutputFaker =
-        OutputFaker(
-          originalPlan.output.map(a =>
-            newOutput.find(a.name.toLowerCase == _.name.toLowerCase)
-              .getOrElse(
-                sys.error(s"Can't find attribute $a to fake in set ${newOutput.mkString(",")}"))),
-          originalPlan)
-    }
-
-    def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
-      case PhysicalOperation(projectList, predicates, relation: MetastoreRelation)
-          if relation.tableDesc.getSerdeClassName.contains("Parquet") &&
-             hiveContext.convertMetastoreParquet &&
-             !hiveContext.conf.parquetUseDataSourceApi =>
-
-        // Filter out all predicates that only deal with partition keys
-        val partitionsKeys = AttributeSet(relation.partitionKeys)
-        val (pruningPredicates, otherPredicates) = predicates.partition {
-          _.references.subsetOf(partitionsKeys)
-        }
-
-        // We are going to throw the predicates and projection back at the whole optimization
-        // sequence so lets unresolve all the attributes, allowing them to be rebound to the
-        // matching parquet attributes.
-        val unresolvedOtherPredicates = Column(otherPredicates.map(_ transform {
-          case a: AttributeReference => UnresolvedAttribute(a.name)
-        }).reduceOption(And).getOrElse(Literal(true)))
-
-        val unresolvedProjection: Seq[Column] = projectList.map(_ transform {
-          case a: AttributeReference => UnresolvedAttribute(a.name)
-        }).map(Column(_))
-
-        try {
-          if (relation.hiveQlTable.isPartitioned) {
-            val rawPredicate = pruningPredicates.reduceOption(And).getOrElse(Literal(true))
-            // Translate the predicate so that it automatically casts the input values to the
-            // correct data types during evaluation.
-            val castedPredicate = rawPredicate transform {
-              case a: AttributeReference =>
-                val idx = relation.partitionKeys.indexWhere(a.exprId == _.exprId)
-                val key = relation.partitionKeys(idx)
-                Cast(BoundReference(idx, StringType, nullable = true), key.dataType)
-            }
-
-            val inputData = new GenericMutableRow(relation.partitionKeys.size)
-            val pruningCondition =
-              if (codegenEnabled) {
-                GeneratePredicate.generate(castedPredicate)
-              } else {
-                InterpretedPredicate.create(castedPredicate)
-              }
-
-            val partitions = relation.getHiveQlPartitions(pruningPredicates).filter { part =>
-              val partitionValues = part.getValues
-              var i = 0
-              while (i < partitionValues.size()) {
-                inputData(i) = CatalystTypeConverters.convertToCatalyst(partitionValues(i))
-                i += 1
-              }
-              pruningCondition(inputData)
-            }
-
-            val partitionLocations = partitions.map(_.getLocation)
-
-            if (partitionLocations.isEmpty) {
-              PhysicalRDD(plan.output, sparkContext.emptyRDD[InternalRow]) :: Nil
-            } else {
-              hiveContext
-                .read.parquet(partitionLocations: _*)
-                .addPartitioningAttributes(relation.partitionKeys)
-                .lowerCase
-                .where(unresolvedOtherPredicates)
-                .select(unresolvedProjection: _*)
-                .queryExecution
-                .executedPlan
-                .fakeOutput(projectList.map(_.toAttribute)) :: Nil
-            }
-
-          } else {
-            hiveContext
-              .read.parquet(relation.hiveQlTable.getDataLocation.toString)
-              .lowerCase
-              .where(unresolvedOtherPredicates)
-              .select(unresolvedProjection: _*)
-              .queryExecution
-              .executedPlan
-              .fakeOutput(projectList.map(_.toAttribute)) :: Nil
-          }
-        } catch {
-          // parquetFile will throw an exception when there is no data.
-          // TODO: Remove this hack for Spark 1.3.
-          case iae: java.lang.IllegalArgumentException
-              if iae.getMessage.contains("Can not create a Path from an empty string") =>
-            PhysicalRDD(plan.output, sparkContext.emptyRDD[InternalRow]) :: Nil
-        }
-      case _ => Nil
-    }
-  }
-
   object Scripts extends Strategy {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case logical.ScriptTransformation(input, script, output, child, schema: HiveScriptIOSchema) =>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
index af68615e8e9d6..a45c2d957278f 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.hive
 
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.parquet.ParquetTest
-import org.apache.spark.sql.{QueryTest, Row, SQLConf}
+import org.apache.spark.sql.{QueryTest, Row}
 
 case class Cases(lower: String, UPPER: String)
 
@@ -28,64 +28,54 @@ class HiveParquetSuite extends QueryTest with ParquetTest {
 
   import sqlContext._
 
-  def run(prefix: String): Unit = {
-    test(s"$prefix: Case insensitive attribute names") {
-      withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") {
-        val expected = (1 to 4).map(i => Row(i.toString))
-        checkAnswer(sql("SELECT upper FROM cases"), expected)
-        checkAnswer(sql("SELECT LOWER FROM cases"), expected)
-      }
+  test("Case insensitive attribute names") {
+    withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") {
+      val expected = (1 to 4).map(i => Row(i.toString))
+      checkAnswer(sql("SELECT upper FROM cases"), expected)
+      checkAnswer(sql("SELECT LOWER FROM cases"), expected)
     }
+  }
 
-    test(s"$prefix: SELECT on Parquet table") {
-      val data = (1 to 4).map(i => (i, s"val_$i"))
-      withParquetTable(data, "t") {
-        checkAnswer(sql("SELECT * FROM t"), data.map(Row.fromTuple))
-      }
+  test("SELECT on Parquet table") {
+    val data = (1 to 4).map(i => (i, s"val_$i"))
+    withParquetTable(data, "t") {
+      checkAnswer(sql("SELECT * FROM t"), data.map(Row.fromTuple))
     }
+  }
 
-    test(s"$prefix: Simple column projection + filter on Parquet table") {
-      withParquetTable((1 to 4).map(i => (i % 2 == 0, i, s"val_$i")), "t") {
-        checkAnswer(
-          sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"),
-          Seq(Row(true, "val_2"), Row(true, "val_4")))
-      }
+  test("Simple column projection + filter on Parquet table") {
+    withParquetTable((1 to 4).map(i => (i % 2 == 0, i, s"val_$i")), "t") {
+      checkAnswer(
+        sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"),
+        Seq(Row(true, "val_2"), Row(true, "val_4")))
     }
+  }
 
-    test(s"$prefix: Converting Hive to Parquet Table via saveAsParquetFile") {
-      withTempPath { dir =>
-        sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath)
-        read.parquet(dir.getCanonicalPath).registerTempTable("p")
-        withTempTable("p") {
-          checkAnswer(
-            sql("SELECT * FROM src ORDER BY key"),
-            sql("SELECT * from p ORDER BY key").collect().toSeq)
-        }
+  test("Converting Hive to Parquet Table via saveAsParquetFile") {
+    withTempPath { dir =>
+      sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath)
+      read.parquet(dir.getCanonicalPath).registerTempTable("p")
+      withTempTable("p") {
+        checkAnswer(
+          sql("SELECT * FROM src ORDER BY key"),
+          sql("SELECT * from p ORDER BY key").collect().toSeq)
       }
     }
+  }
 
-    test(s"$prefix: INSERT OVERWRITE TABLE Parquet table") {
-      withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") {
-        withTempPath { file =>
-          sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath)
-          read.parquet(file.getCanonicalPath).registerTempTable("p")
-          withTempTable("p") {
-            // let's do three overwrites for good measure
-            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
-            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
-            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
-            checkAnswer(sql("SELECT * FROM p"), sql("SELECT * FROM t").collect().toSeq)
-          }
+  test("INSERT OVERWRITE TABLE Parquet table") {
+    withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") {
+      withTempPath { file =>
+        sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath)
+        read.parquet(file.getCanonicalPath).registerTempTable("p")
+        withTempTable("p") {
+          // let's do three overwrites for good measure
+          sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
+          sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
+          sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
+          checkAnswer(sql("SELECT * FROM p"), sql("SELECT * FROM t").collect().toSeq)
         }
       }
     }
   }
-
-  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API.key -> "true") {
-    run("Parquet data source enabled")
-  }
-
-  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API.key -> "false") {
-    run("Parquet data source disabled")
-  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index e403f32efaf91..4fdf774ead75e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -21,10 +21,9 @@ import java.io.File
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.scalatest.BeforeAndAfterAll
-
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapred.InvalidInputException
+import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.Logging
 import org.apache.spark.sql._
@@ -33,7 +32,7 @@ import org.apache.spark.sql.hive.client.{HiveTable, ManagedTable}
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
-import org.apache.spark.sql.parquet.ParquetRelation2
+import org.apache.spark.sql.parquet.ParquetRelation
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -564,10 +563,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
   }
 
   test("scan a parquet table created through a CTAS statement") {
-    withSQLConf(
-      HiveContext.CONVERT_METASTORE_PARQUET.key -> "true",
-      SQLConf.PARQUET_USE_DATA_SOURCE_API.key -> "true") {
-
+    withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "true") {
       withTempTable("jt") {
         (1 to 10).map(i => i -> s"str$i").toDF("a", "b").registerTempTable("jt")
 
@@ -582,9 +578,9 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA
             Row(3) :: Row(4) :: Nil)
 
           table("test_parquet_ctas").queryExecution.optimizedPlan match {
-            case LogicalRelation(p: ParquetRelation2) => // OK
+            case LogicalRelation(p: ParquetRelation) => // OK
             case _ =>
-              fail(s"test_parquet_ctas should have be converted to ${classOf[ParquetRelation2]}")
+              fail(s"test_parquet_ctas should have be converted to ${classOf[ParquetRelation]}")
           }
         }
       }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 03428265422e6..ff42fdefaa62a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -28,7 +28,8 @@ import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.hive.{HiveContext, HiveQLDialect, MetastoreRelation}
-import org.apache.spark.sql.parquet.ParquetRelation2
+import org.apache.spark.sql.parquet.ParquetRelation
+import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 
 case class Nested1(f1: Nested2)
@@ -61,7 +62,9 @@ class MyDialect extends DefaultParserDialect
  * Hive to generate them (in contrast to HiveQuerySuite).  Often this is because the query is
  * valid, but Hive currently cannot execute it.
  */
-class SQLQuerySuite extends QueryTest {
+class SQLQuerySuite extends QueryTest with SQLTestUtils {
+  override def sqlContext: SQLContext = TestHive
+
   test("SPARK-6835: udtf in lateral view") {
     val df = Seq((1, 1)).toDF("c1", "c2")
     df.registerTempTable("table1")
@@ -195,17 +198,17 @@ class SQLQuerySuite extends QueryTest {
     def checkRelation(tableName: String, isDataSourceParquet: Boolean): Unit = {
       val relation = EliminateSubQueries(catalog.lookupRelation(Seq(tableName)))
       relation match {
-        case LogicalRelation(r: ParquetRelation2) =>
+        case LogicalRelation(r: ParquetRelation) =>
           if (!isDataSourceParquet) {
             fail(
               s"${classOf[MetastoreRelation].getCanonicalName} is expected, but found " +
-              s"${ParquetRelation2.getClass.getCanonicalName}.")
+              s"${ParquetRelation.getClass.getCanonicalName}.")
           }
 
         case r: MetastoreRelation =>
           if (isDataSourceParquet) {
             fail(
-              s"${ParquetRelation2.getClass.getCanonicalName} is expected, but found " +
+              s"${ParquetRelation.getClass.getCanonicalName} is expected, but found " +
               s"${classOf[MetastoreRelation].getCanonicalName}.")
           }
       }
@@ -350,33 +353,26 @@ class SQLQuerySuite extends QueryTest {
       "serde_p1=p1", "serde_p2=p2", "tbl_p1=p11", "tbl_p2=p22", "MANAGED_TABLE"
     )
 
-    val origUseParquetDataSource = conf.parquetUseDataSourceApi
-    try {
-      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
-      sql(
-        """CREATE TABLE ctas5
-          | STORED AS parquet AS
-          |   SELECT key, value
-          |   FROM src
-          |   ORDER BY key, value""".stripMargin).collect()
-
-      checkExistence(sql("DESC EXTENDED ctas5"), true,
-        "name:key", "type:string", "name:value", "ctas5",
-        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
-        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
-        "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
-        "MANAGED_TABLE"
-      )
-
-      val default = convertMetastoreParquet
-      // use the Hive SerDe for parquet tables
-      sql("set spark.sql.hive.convertMetastoreParquet = false")
+    sql(
+      """CREATE TABLE ctas5
+        | STORED AS parquet AS
+        |   SELECT key, value
+        |   FROM src
+        |   ORDER BY key, value""".stripMargin).collect()
+
+    checkExistence(sql("DESC EXTENDED ctas5"), true,
+      "name:key", "type:string", "name:value", "ctas5",
+      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
+      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
+      "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
+      "MANAGED_TABLE"
+    )
+
+    // use the Hive SerDe for parquet tables
+    withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "false") {
       checkAnswer(
         sql("SELECT key, value FROM ctas5 ORDER BY key, value"),
         sql("SELECT key, value FROM src ORDER BY key, value").collect().toSeq)
-      sql(s"set spark.sql.hive.convertMetastoreParquet = $default")
-    } finally {
-      setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, origUseParquetDataSource)
     }
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 82a8daf8b4b09..f56fb96c52d37 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -22,13 +22,13 @@ import java.io.File
 import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.sql._
-import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD}
 import org.apache.spark.sql.execution.datasources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation}
+import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD}
 import org.apache.spark.sql.hive.execution.HiveTableScan
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
-import org.apache.spark.sql.parquet.{ParquetRelation2, ParquetTableScan}
+import org.apache.spark.sql.parquet.ParquetRelation
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -57,7 +57,7 @@ case class ParquetDataWithKeyAndComplexTypes(
  * A suite to test the automatic conversion of metastore tables with parquet data to use the
  * built in parquet support.
  */
-class ParquetMetastoreSuiteBase extends ParquetPartitioningTest {
+class ParquetMetastoreSuite extends ParquetPartitioningTest {
   override def beforeAll(): Unit = {
     super.beforeAll()
 
@@ -134,6 +134,19 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest {
       LOCATION '${partitionedTableDirWithKeyAndComplexTypes.getCanonicalPath}'
     """)
 
+    sql(
+      """
+        |create table test_parquet
+        |(
+        |  intField INT,
+        |  stringField STRING
+        |)
+        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+        |STORED AS
+        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+      """.stripMargin)
+
     (1 to 10).foreach { p =>
       sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)")
     }
@@ -166,6 +179,7 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest {
     sql("DROP TABLE normal_parquet")
     sql("DROP TABLE IF EXISTS jt")
     sql("DROP TABLE IF EXISTS jt_array")
+    sql("DROP TABLE IF EXISTS test_parquet")
     setConf(HiveContext.CONVERT_METASTORE_PARQUET, false)
   }
 
@@ -176,40 +190,9 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest {
       }.isEmpty)
     assert(
       sql("SELECT * FROM normal_parquet").queryExecution.executedPlan.collect {
-        case _: ParquetTableScan => true
         case _: PhysicalRDD => true
       }.nonEmpty)
   }
-}
-
-class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
-  val originalConf = conf.parquetUseDataSourceApi
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-
-    sql(
-      """
-        |create table test_parquet
-        |(
-        |  intField INT,
-        |  stringField STRING
-        |)
-        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
-        |STORED AS
-        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
-        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
-      """.stripMargin)
-
-    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
-  }
-
-  override def afterAll(): Unit = {
-    super.afterAll()
-    sql("DROP TABLE IF EXISTS test_parquet")
-
-    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
-  }
 
   test("scan an empty parquet table") {
     checkAnswer(sql("SELECT count(*) FROM test_parquet"), Row(0))
@@ -292,10 +275,10 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
     )
 
     table("test_parquet_ctas").queryExecution.optimizedPlan match {
-      case LogicalRelation(_: ParquetRelation2) => // OK
+      case LogicalRelation(_: ParquetRelation) => // OK
       case _ => fail(
         "test_parquet_ctas should be converted to " +
-          s"${classOf[ParquetRelation2].getCanonicalName}")
+          s"${classOf[ParquetRelation].getCanonicalName}")
     }
 
     sql("DROP TABLE IF EXISTS test_parquet_ctas")
@@ -316,9 +299,9 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
 
     val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt")
     df.queryExecution.executedPlan match {
-      case ExecutedCommand(InsertIntoHadoopFsRelation(_: ParquetRelation2, _, _)) => // OK
+      case ExecutedCommand(InsertIntoHadoopFsRelation(_: ParquetRelation, _, _)) => // OK
       case o => fail("test_insert_parquet should be converted to a " +
-        s"${classOf[ParquetRelation2].getCanonicalName} and " +
+        s"${classOf[ParquetRelation].getCanonicalName} and " +
         s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan. " +
         s"However, found a ${o.toString} ")
     }
@@ -346,9 +329,9 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
 
     val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt_array")
     df.queryExecution.executedPlan match {
-      case ExecutedCommand(InsertIntoHadoopFsRelation(r: ParquetRelation2, _, _)) => // OK
+      case ExecutedCommand(InsertIntoHadoopFsRelation(r: ParquetRelation, _, _)) => // OK
       case o => fail("test_insert_parquet should be converted to a " +
-        s"${classOf[ParquetRelation2].getCanonicalName} and " +
+        s"${classOf[ParquetRelation].getCanonicalName} and " +
         s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan." +
         s"However, found a ${o.toString} ")
     }
@@ -379,17 +362,17 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
 
     assertResult(2) {
       analyzed.collect {
-        case r @ LogicalRelation(_: ParquetRelation2) => r
+        case r @ LogicalRelation(_: ParquetRelation) => r
       }.size
     }
 
     sql("DROP TABLE ms_convert")
   }
 
-  def collectParquetRelation(df: DataFrame): ParquetRelation2 = {
+  def collectParquetRelation(df: DataFrame): ParquetRelation = {
     val plan = df.queryExecution.analyzed
     plan.collectFirst {
-      case LogicalRelation(r: ParquetRelation2) => r
+      case LogicalRelation(r: ParquetRelation) => r
     }.getOrElse {
       fail(s"Expecting a ParquetRelation2, but got:\n$plan")
     }
@@ -439,7 +422,7 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
       // Converted test_parquet should be cached.
       catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) match {
         case null => fail("Converted test_parquet should be cached in the cache.")
-        case logical @ LogicalRelation(parquetRelation: ParquetRelation2) => // OK
+        case logical @ LogicalRelation(parquetRelation: ParquetRelation) => // OK
         case other =>
           fail(
             "The cached test_parquet should be a Parquet Relation. " +
@@ -543,81 +526,10 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
   }
 }
 
-class ParquetDataSourceOffMetastoreSuite extends ParquetMetastoreSuiteBase {
-  val originalConf = conf.parquetUseDataSourceApi
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
-  }
-
-  override def afterAll(): Unit = {
-    super.afterAll()
-    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
-  }
-
-  test("MetastoreRelation in InsertIntoTable will not be converted") {
-    sql(
-      """
-        |create table test_insert_parquet
-        |(
-        |  intField INT
-        |)
-        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
-        |STORED AS
-        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
-        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
-      """.stripMargin)
-
-    val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt")
-    df.queryExecution.executedPlan match {
-      case insert: execution.InsertIntoHiveTable => // OK
-      case o => fail(s"The SparkPlan should be ${classOf[InsertIntoHiveTable].getCanonicalName}. " +
-        s"However, found ${o.toString}.")
-    }
-
-    checkAnswer(
-      sql("SELECT intField FROM test_insert_parquet WHERE test_insert_parquet.intField > 5"),
-      sql("SELECT a FROM jt WHERE jt.a > 5").collect()
-    )
-
-    sql("DROP TABLE IF EXISTS test_insert_parquet")
-  }
-
-  // TODO: enable it after the fix of SPARK-5950.
-  ignore("MetastoreRelation in InsertIntoHiveTable will not be converted") {
-    sql(
-      """
-        |create table test_insert_parquet
-        |(
-        |  int_array array<int>
-        |)
-        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
-        |STORED AS
-        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
-        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
-      """.stripMargin)
-
-    val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt_array")
-    df.queryExecution.executedPlan match {
-      case insert: execution.InsertIntoHiveTable => // OK
-      case o => fail(s"The SparkPlan should be ${classOf[InsertIntoHiveTable].getCanonicalName}. " +
-        s"However, found ${o.toString}.")
-    }
-
-    checkAnswer(
-      sql("SELECT int_array FROM test_insert_parquet"),
-      sql("SELECT a FROM jt_array").collect()
-    )
-
-    sql("DROP TABLE IF EXISTS test_insert_parquet")
-  }
-}
-
 /**
  * A suite of tests for the Parquet support through the data sources API.
  */
-class ParquetSourceSuiteBase extends ParquetPartitioningTest {
+class ParquetSourceSuite extends ParquetPartitioningTest {
   override def beforeAll(): Unit = {
     super.beforeAll()
 
@@ -712,20 +624,6 @@ class ParquetSourceSuiteBase extends ParquetPartitioningTest {
       }
     }
   }
-}
-
-class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase {
-  val originalConf = conf.parquetUseDataSourceApi
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, true)
-  }
-
-  override def afterAll(): Unit = {
-    super.afterAll()
-    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
-  }
 
   test("values in arrays and maps stored in parquet are always nullable") {
     val df = createDataFrame(Tuple2(Map(2 -> 3), Seq(4, 5, 6)) :: Nil).toDF("m", "a")
@@ -734,7 +632,7 @@ class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase {
     val expectedSchema1 =
       StructType(
         StructField("m", mapType1, nullable = true) ::
-        StructField("a", arrayType1, nullable = true) :: Nil)
+          StructField("a", arrayType1, nullable = true) :: Nil)
     assert(df.schema === expectedSchema1)
 
     df.write.format("parquet").saveAsTable("alwaysNullable")
@@ -772,20 +670,6 @@ class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase {
   }
 }
 
-class ParquetDataSourceOffSourceSuite extends ParquetSourceSuiteBase {
-  val originalConf = conf.parquetUseDataSourceApi
-
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, false)
-  }
-
-  override def afterAll(): Unit = {
-    super.afterAll()
-    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf)
-  }
-}
-
 /**
  * A collection of tests for parquet data with various forms of partitioning.
  */

From 1efe97dc9ed31e3b8727b81be633b7e96dd3cd34 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Sun, 26 Jul 2015 18:34:19 -0700
Subject: [PATCH 385/549] [SPARK-8867][SQL] Support list / describe function
 usage

As Hive does, we need to list all of the registered UDF and its usage for user.

We add the annotation to describe a UDF, so we can get the literal description info while registering the UDF.
e.g.
```scala
ExpressionDescription(
    usage = "_FUNC_(expr) - Returns the absolute value of the numeric value",
    extended = """> SELECT _FUNC_('-1')
                  1""")
 case class Abs(child: Expression) extends UnaryArithmetic {
...
```

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7259 from chenghao-intel/desc_function and squashes the following commits:

cf29bba [Cheng Hao] fixing the code style issue
5193855 [Cheng Hao] Add more powerful parser for show functions
c645a6b [Cheng Hao] fix bug in unit test
78d40f1 [Cheng Hao] update the padding issue for usage
48ee4b3 [Cheng Hao] update as feedback
70eb4e9 [Cheng Hao] add show/describe function support
---
 .../expressions/ExpressionDescription.java    | 43 +++++++++++
 .../catalyst/expressions/ExpressionInfo.java  | 55 +++++++++++++
 .../catalyst/analysis/FunctionRegistry.scala  | 56 +++++++++++---
 .../sql/catalyst/expressions/arithmetic.scala |  3 +
 .../expressions/stringOperations.scala        |  6 ++
 .../sql/catalyst/plans/logical/commands.scala | 28 ++++++-
 .../org/apache/spark/sql/SparkSQLParser.scala | 28 ++++++-
 .../spark/sql/execution/SparkStrategies.scala |  5 ++
 .../apache/spark/sql/execution/commands.scala | 77 ++++++++++++++++++-
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 26 +++++++
 .../org/apache/spark/sql/hive/hiveUDFs.scala  | 28 ++++++-
 .../hive/execution/HiveComparisonTest.scala   |  6 +-
 .../sql/hive/execution/SQLQuerySuite.scala    | 48 +++++++++++-
 13 files changed, 389 insertions(+), 20 deletions(-)
 create mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java
 create mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java
new file mode 100644
index 0000000000000..9e10f27d59d55
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionDescription.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions;
+
+import org.apache.spark.annotation.DeveloperApi;
+
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+
+/**
+ * ::DeveloperApi::
+
+ * A function description type which can be recognized by FunctionRegistry, and will be used to
+ * show the usage of the function in human language.
+ *
+ * `usage()` will be used for the function usage in brief way.
+ * `extended()` will be used for the function usage in verbose way, suppose
+ *              an example will be provided.
+ *
+ *  And we can refer the function name by `_FUNC_`, in `usage` and `extended`, as it's
+ *  registered in `FunctionRegistry`.
+ */
+@DeveloperApi
+@Retention(RetentionPolicy.RUNTIME)
+public @interface ExpressionDescription {
+    String usage() default "_FUNC_ is undocumented";
+    String extended() default "No example for _FUNC_.";
+}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
new file mode 100644
index 0000000000000..ba8e9cb4be28b
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions;
+
+/**
+ * Expression information, will be used to describe a expression.
+ */
+public class ExpressionInfo {
+    private String className;
+    private String usage;
+    private String name;
+    private String extended;
+
+    public String getClassName() {
+        return className;
+    }
+
+    public String getUsage() {
+        return usage;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public String getExtended() {
+        return extended;
+    }
+
+    public ExpressionInfo(String className, String name, String usage, String extended) {
+        this.className = className;
+        this.name = name;
+        this.usage = usage;
+        this.extended = extended;
+    }
+
+    public ExpressionInfo(String className, String name) {
+        this(className, name, null, null);
+    }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 9c349838c28a1..aa05f448d12bc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -30,26 +30,44 @@ import org.apache.spark.sql.catalyst.util.StringKeyHashMap
 /** A catalog for looking up user defined functions, used by an [[Analyzer]]. */
 trait FunctionRegistry {
 
-  def registerFunction(name: String, builder: FunctionBuilder): Unit
+  final def registerFunction(name: String, builder: FunctionBuilder): Unit = {
+    registerFunction(name, new ExpressionInfo(builder.getClass.getCanonicalName, name), builder)
+  }
+
+  def registerFunction(name: String, info: ExpressionInfo, builder: FunctionBuilder): Unit
 
   @throws[AnalysisException]("If function does not exist")
   def lookupFunction(name: String, children: Seq[Expression]): Expression
+
+  /* List all of the registered function names. */
+  def listFunction(): Seq[String]
+
+  /* Get the class of the registered function by specified name. */
+  def lookupFunction(name: String): Option[ExpressionInfo]
 }
 
 class SimpleFunctionRegistry extends FunctionRegistry {
 
-  private val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive = false)
+  private val functionBuilders =
+    StringKeyHashMap[(ExpressionInfo, FunctionBuilder)](caseSensitive = false)
 
-  override def registerFunction(name: String, builder: FunctionBuilder): Unit = {
-    functionBuilders.put(name, builder)
+  override def registerFunction(name: String, info: ExpressionInfo, builder: FunctionBuilder)
+  : Unit = {
+    functionBuilders.put(name, (info, builder))
   }
 
   override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
-    val func = functionBuilders.get(name).getOrElse {
+    val func = functionBuilders.get(name).map(_._2).getOrElse {
       throw new AnalysisException(s"undefined function $name")
     }
     func(children)
   }
+
+  override def listFunction(): Seq[String] = functionBuilders.iterator.map(_._1).toList.sorted
+
+  override def lookupFunction(name: String): Option[ExpressionInfo] = {
+    functionBuilders.get(name).map(_._1)
+  }
 }
 
 /**
@@ -57,13 +75,22 @@ class SimpleFunctionRegistry extends FunctionRegistry {
  * functions are already filled in and the analyzer needs only to resolve attribute references.
  */
 object EmptyFunctionRegistry extends FunctionRegistry {
-  override def registerFunction(name: String, builder: FunctionBuilder): Unit = {
+  override def registerFunction(name: String, info: ExpressionInfo, builder: FunctionBuilder)
+  : Unit = {
     throw new UnsupportedOperationException
   }
 
   override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
     throw new UnsupportedOperationException
   }
+
+  override def listFunction(): Seq[String] = {
+    throw new UnsupportedOperationException
+  }
+
+  override def lookupFunction(name: String): Option[ExpressionInfo] = {
+    throw new UnsupportedOperationException
+  }
 }
 
 
@@ -71,7 +98,7 @@ object FunctionRegistry {
 
   type FunctionBuilder = Seq[Expression] => Expression
 
-  val expressions: Map[String, FunctionBuilder] = Map(
+  val expressions: Map[String, (ExpressionInfo, FunctionBuilder)] = Map(
     // misc non-aggregate functions
     expression[Abs]("abs"),
     expression[CreateArray]("array"),
@@ -205,13 +232,13 @@ object FunctionRegistry {
 
   val builtin: FunctionRegistry = {
     val fr = new SimpleFunctionRegistry
-    expressions.foreach { case (name, builder) => fr.registerFunction(name, builder) }
+    expressions.foreach { case (name, (info, builder)) => fr.registerFunction(name, info, builder) }
     fr
   }
 
   /** See usage above. */
   private def expression[T <: Expression](name: String)
-      (implicit tag: ClassTag[T]): (String, FunctionBuilder) = {
+      (implicit tag: ClassTag[T]): (String, (ExpressionInfo, FunctionBuilder)) = {
 
     // See if we can find a constructor that accepts Seq[Expression]
     val varargCtor = Try(tag.runtimeClass.getDeclaredConstructor(classOf[Seq[_]])).toOption
@@ -237,6 +264,15 @@ object FunctionRegistry {
         }
       }
     }
-    (name, builder)
+
+    val clazz = tag.runtimeClass
+    val df = clazz.getAnnotation(classOf[ExpressionDescription])
+    if (df != null) {
+      (name,
+        (new ExpressionInfo(clazz.getCanonicalName, name, df.usage(), df.extended()),
+        builder))
+    } else {
+      (name, (new ExpressionInfo(clazz.getCanonicalName, name), builder))
+    }
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 7c254a8750a9f..b37f530ec6814 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -65,6 +65,9 @@ case class UnaryPositive(child: Expression) extends UnaryExpression with Expects
 /**
  * A function that get the absolute value of the numeric value.
  */
+@ExpressionDescription(
+  usage = "_FUNC_(expr) - Returns the absolute value of the numeric value",
+  extended = "> SELECT _FUNC_('-1');\n1")
 case class Abs(child: Expression)
   extends UnaryExpression with ExpectsInputTypes with CodegenFallback {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index cf187ad5a0a9f..38b0fb37dee3b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -214,6 +214,9 @@ trait String2StringExpression extends ImplicitCastInputTypes {
 /**
  * A function that converts the characters of a string to uppercase.
  */
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns str with all characters changed to uppercase",
+  extended = "> SELECT _FUNC_('SparkSql');\n 'SPARKSQL'")
 case class Upper(child: Expression)
   extends UnaryExpression with String2StringExpression {
 
@@ -227,6 +230,9 @@ case class Upper(child: Expression)
 /**
  * A function that converts the characters of a string to lowercase.
  */
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns str with all characters changed to lowercase",
+  extended = "> SELECT _FUNC_('SparkSql');\n'sparksql'")
 case class Lower(child: Expression) extends UnaryExpression with String2StringExpression {
 
   override def convert(v: UTF8String): UTF8String = v.toLowerCase
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala
index 246f4d7e34d3d..e6621e0f50a9e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala
@@ -17,7 +17,8 @@
 
 package org.apache.spark.sql.catalyst.plans.logical
 
-import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Attribute}
+import org.apache.spark.sql.types.StringType
 
 /**
  * A logical node that represents a non-query command to be executed by the system.  For example,
@@ -25,3 +26,28 @@ import org.apache.spark.sql.catalyst.expressions.Attribute
  * eagerly executed.
  */
 trait Command
+
+/**
+ * Returned for the "DESCRIBE [EXTENDED] FUNCTION functionName" command.
+ * @param functionName The function to be described.
+ * @param isExtended True if "DESCRIBE EXTENDED" is used. Otherwise, false.
+ */
+private[sql] case class DescribeFunction(
+    functionName: String,
+    isExtended: Boolean) extends LogicalPlan with Command {
+
+  override def children: Seq[LogicalPlan] = Seq.empty
+  override val output: Seq[Attribute] = Seq(
+    AttributeReference("function_desc", StringType, nullable = false)())
+}
+
+/**
+ * Returned for the "SHOW FUNCTIONS" command, which will list all of the
+ * registered function list.
+ */
+private[sql] case class ShowFunctions(
+    db: Option[String], pattern: Option[String]) extends LogicalPlan with Command {
+  override def children: Seq[LogicalPlan] = Seq.empty
+  override val output: Seq[Attribute] = Seq(
+    AttributeReference("function", StringType, nullable = false)())
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala
index e59fa6e162900..ea8fce6ca9cf2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala
@@ -21,7 +21,7 @@ import scala.util.parsing.combinator.RegexParsers
 
 import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.{DescribeFunction, LogicalPlan, ShowFunctions}
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.types.StringType
 
@@ -57,6 +57,10 @@ private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends Abstr
   protected val AS = Keyword("AS")
   protected val CACHE = Keyword("CACHE")
   protected val CLEAR = Keyword("CLEAR")
+  protected val DESCRIBE = Keyword("DESCRIBE")
+  protected val EXTENDED = Keyword("EXTENDED")
+  protected val FUNCTION = Keyword("FUNCTION")
+  protected val FUNCTIONS = Keyword("FUNCTIONS")
   protected val IN = Keyword("IN")
   protected val LAZY = Keyword("LAZY")
   protected val SET = Keyword("SET")
@@ -65,7 +69,8 @@ private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends Abstr
   protected val TABLES = Keyword("TABLES")
   protected val UNCACHE = Keyword("UNCACHE")
 
-  override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | show | others
+  override protected lazy val start: Parser[LogicalPlan] =
+    cache | uncache | set | show | desc | others
 
   private lazy val cache: Parser[LogicalPlan] =
     CACHE ~> LAZY.? ~ (TABLE ~> ident) ~ (AS ~> restInput).? ^^ {
@@ -85,9 +90,24 @@ private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends Abstr
       case input => SetCommandParser(input)
     }
 
+  // It can be the following patterns:
+  // SHOW FUNCTIONS;
+  // SHOW FUNCTIONS mydb.func1;
+  // SHOW FUNCTIONS func1;
+  // SHOW FUNCTIONS `mydb.a`.`func1.aa`;
   private lazy val show: Parser[LogicalPlan] =
-    SHOW ~> TABLES ~ (IN ~> ident).? ^^ {
-      case _ ~ dbName => ShowTablesCommand(dbName)
+    ( SHOW ~> TABLES ~ (IN ~> ident).? ^^ {
+        case _ ~ dbName => ShowTablesCommand(dbName)
+      }
+    | SHOW ~ FUNCTIONS ~> ((ident <~ ".").? ~ (ident | stringLit)).? ^^ {
+        case Some(f) => ShowFunctions(f._1, Some(f._2))
+        case None => ShowFunctions(None, None)
+      }
+    )
+
+  private lazy val desc: Parser[LogicalPlan] =
+    DESCRIBE ~ FUNCTION ~> EXTENDED.? ~ (ident | stringLit) ^^ {
+      case isExtended ~ functionName => DescribeFunction(functionName, isExtended.isDefined)
     }
 
   private lazy val others: Parser[LogicalPlan] =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index e2c7e8006f3b1..deeea3900c241 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -428,6 +428,11 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         ExecutedCommand(
           RunnableDescribeCommand(resultPlan, describe.output, isExtended)) :: Nil
 
+      case logical.ShowFunctions(db, pattern) => ExecutedCommand(ShowFunctions(db, pattern)) :: Nil
+
+      case logical.DescribeFunction(function, extended) =>
+        ExecutedCommand(DescribeFunction(function, extended)) :: Nil
+
       case _ => Nil
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
index bace3f8a9c8d4..6b83025d5a153 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala
@@ -24,7 +24,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
+import org.apache.spark.sql.catalyst.expressions.{ExpressionDescription, Expression, Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.types._
@@ -298,3 +298,78 @@ case class ShowTablesCommand(databaseName: Option[String]) extends RunnableComma
     rows
   }
 }
+
+/**
+ * A command for users to list all of the registered functions.
+ * The syntax of using this command in SQL is:
+ * {{{
+ *    SHOW FUNCTIONS
+ * }}}
+ * TODO currently we are simply ignore the db
+ */
+case class ShowFunctions(db: Option[String], pattern: Option[String]) extends RunnableCommand {
+  override val output: Seq[Attribute] = {
+    val schema = StructType(
+      StructField("function", StringType, nullable = false) :: Nil)
+
+    schema.toAttributes
+  }
+
+  override def run(sqlContext: SQLContext): Seq[Row] = pattern match {
+    case Some(p) =>
+      try {
+        val regex = java.util.regex.Pattern.compile(p)
+        sqlContext.functionRegistry.listFunction().filter(regex.matcher(_).matches()).map(Row(_))
+      } catch {
+        // probably will failed in the regex that user provided, then returns empty row.
+        case _: Throwable => Seq.empty[Row]
+      }
+    case None =>
+      sqlContext.functionRegistry.listFunction().map(Row(_))
+  }
+}
+
+/**
+ * A command for users to get the usage of a registered function.
+ * The syntax of using this command in SQL is
+ * {{{
+ *   DESCRIBE FUNCTION [EXTENDED] upper;
+ * }}}
+ */
+case class DescribeFunction(
+    functionName: String,
+    isExtended: Boolean) extends RunnableCommand {
+
+  override val output: Seq[Attribute] = {
+    val schema = StructType(
+      StructField("function_desc", StringType, nullable = false) :: Nil)
+
+    schema.toAttributes
+  }
+
+  private def replaceFunctionName(usage: String, functionName: String): String = {
+    if (usage == null) {
+      "To be added."
+    } else {
+      usage.replaceAll("_FUNC_", functionName)
+    }
+  }
+
+  override def run(sqlContext: SQLContext): Seq[Row] = {
+    sqlContext.functionRegistry.lookupFunction(functionName) match {
+      case Some(info) =>
+        val result =
+          Row(s"Function: ${info.getName}") ::
+          Row(s"Class: ${info.getClassName}") ::
+          Row(s"Usage: ${replaceFunctionName(info.getUsage(), info.getName)}") :: Nil
+
+        if (isExtended) {
+          result :+ Row(s"Extended Usage:\n${replaceFunctionName(info.getExtended, info.getName)}")
+        } else {
+          result
+        }
+
+      case None => Seq(Row(s"Function: $functionName is not found."))
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index cd386b7a3ecf9..8cef0b39f87dc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql
 
+import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
 import org.scalatest.BeforeAndAfterAll
 
 import java.sql.Timestamp
@@ -58,6 +59,31 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     checkAnswer(queryCoalesce, Row("1") :: Nil)
   }
 
+  test("show functions") {
+    checkAnswer(sql("SHOW functions"), FunctionRegistry.builtin.listFunction().sorted.map(Row(_)))
+  }
+
+  test("describe functions") {
+    checkExistence(sql("describe function extended upper"), true,
+      "Function: upper",
+      "Class: org.apache.spark.sql.catalyst.expressions.Upper",
+      "Usage: upper(str) - Returns str with all characters changed to uppercase",
+      "Extended Usage:",
+      "> SELECT upper('SparkSql');",
+      "'SPARKSQL'")
+
+    checkExistence(sql("describe functioN Upper"), true,
+      "Function: upper",
+      "Class: org.apache.spark.sql.catalyst.expressions.Upper",
+      "Usage: upper(str) - Returns str with all characters changed to uppercase")
+
+    checkExistence(sql("describe functioN Upper"), false,
+      "Extended Usage")
+
+    checkExistence(sql("describe functioN abcadf"), true,
+      "Function: abcadf is not found.")
+  }
+
   test("SPARK-6743: no columns from cache") {
     Seq(
       (83, 0, 38),
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
index 54bf6bd67ff84..8732e9abf8d31 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
@@ -76,8 +76,32 @@ private[hive] class HiveFunctionRegistry(underlying: analysis.FunctionRegistry)
     }
   }
 
-  override def registerFunction(name: String, builder: FunctionBuilder): Unit =
-    underlying.registerFunction(name, builder)
+  override def registerFunction(name: String, info: ExpressionInfo, builder: FunctionBuilder)
+  : Unit = underlying.registerFunction(name, info, builder)
+
+  /* List all of the registered function names. */
+  override def listFunction(): Seq[String] = {
+    val a = FunctionRegistry.getFunctionNames ++ underlying.listFunction()
+    a.toList.sorted
+  }
+
+  /* Get the class of the registered function by specified name. */
+  override def lookupFunction(name: String): Option[ExpressionInfo] = {
+    underlying.lookupFunction(name).orElse(
+    Try {
+      val info = FunctionRegistry.getFunctionInfo(name)
+      val annotation = info.getFunctionClass.getAnnotation(classOf[Description])
+      if (annotation != null) {
+        Some(new ExpressionInfo(
+          info.getFunctionClass.getCanonicalName,
+          annotation.name(),
+          annotation.value(),
+          annotation.extended()))
+      } else {
+        None
+      }
+    }.getOrElse(None))
+  }
 }
 
 private[hive] case class HiveSimpleUDF(funcWrapper: HiveFunctionWrapper, children: Seq[Expression])
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index efb04bf3d5097..638b9c810372a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -370,7 +370,11 @@ abstract class HiveComparisonTest
             // Check that the results match unless its an EXPLAIN query.
             val preparedHive = prepareAnswer(hiveQuery, hive)
 
-            if ((!hiveQuery.logical.isInstanceOf[ExplainCommand]) && preparedHive != catalyst) {
+            // We will ignore the ExplainCommand, ShowFunctions, DescribeFunction
+            if ((!hiveQuery.logical.isInstanceOf[ExplainCommand]) &&
+                (!hiveQuery.logical.isInstanceOf[ShowFunctions]) &&
+                (!hiveQuery.logical.isInstanceOf[DescribeFunction]) &&
+                preparedHive != catalyst) {
 
               val hivePrintOut = s"== HIVE - ${preparedHive.size} row(s) ==" +: preparedHive
               val catalystPrintOut = s"== CATALYST - ${catalyst.size} row(s) ==" +: catalyst
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index ff42fdefaa62a..013936377b24c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -19,9 +19,11 @@ package org.apache.spark.sql.hive.execution
 
 import java.sql.{Date, Timestamp}
 
+import scala.collection.JavaConversions._
+
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.DefaultParserDialect
-import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
+import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, EliminateSubQueries}
 import org.apache.spark.sql.catalyst.errors.DialectException
 import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.hive.test.TestHive
@@ -138,6 +140,50 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils {
       (1 to 6).map(_ => Row("CA", 20151)))
   }
 
+  test("show functions") {
+    val allFunctions =
+      (FunctionRegistry.builtin.listFunction().toSet[String] ++
+        org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionNames).toList.sorted
+    checkAnswer(sql("SHOW functions"), allFunctions.map(Row(_)))
+    checkAnswer(sql("SHOW functions abs"), Row("abs"))
+    checkAnswer(sql("SHOW functions 'abs'"), Row("abs"))
+    checkAnswer(sql("SHOW functions abc.abs"), Row("abs"))
+    checkAnswer(sql("SHOW functions `abc`.`abs`"), Row("abs"))
+    checkAnswer(sql("SHOW functions `abc`.`abs`"), Row("abs"))
+    checkAnswer(sql("SHOW functions `~`"), Row("~"))
+    checkAnswer(sql("SHOW functions `a function doens't exist`"), Nil)
+    checkAnswer(sql("SHOW functions `weekofyea.*`"), Row("weekofyear"))
+    // this probably will failed if we add more function with `sha` prefixing.
+    checkAnswer(sql("SHOW functions `sha.*`"), Row("sha") :: Row("sha1") :: Row("sha2") :: Nil)
+  }
+
+  test("describe functions") {
+    // The Spark SQL built-in functions
+    checkExistence(sql("describe function extended upper"), true,
+      "Function: upper",
+      "Class: org.apache.spark.sql.catalyst.expressions.Upper",
+      "Usage: upper(str) - Returns str with all characters changed to uppercase",
+      "Extended Usage:",
+      "> SELECT upper('SparkSql')",
+      "'SPARKSQL'")
+
+    checkExistence(sql("describe functioN Upper"), true,
+      "Function: upper",
+      "Class: org.apache.spark.sql.catalyst.expressions.Upper",
+      "Usage: upper(str) - Returns str with all characters changed to uppercase")
+
+    checkExistence(sql("describe functioN Upper"), false,
+      "Extended Usage")
+
+    checkExistence(sql("describe functioN abcadf"), true,
+      "Function: abcadf is not found.")
+
+    checkExistence(sql("describe functioN  `~`"), true,
+      "Function: ~",
+      "Class: org.apache.hadoop.hive.ql.udf.UDFOPBitNot",
+      "Usage: ~ n - Bitwise not")
+  }
+
   test("SPARK-5371: union with null and sum") {
     val df = Seq((1, 1)).toDF("c1", "c2")
     df.registerTempTable("table1")

From 945d8bcbf67032edd7bdd201cf9f88c75b3464f7 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Sun, 26 Jul 2015 22:13:37 -0700
Subject: [PATCH 386/549] [SPARK-9306] [SQL] Don't use SortMergeJoin when
 joining on unsortable columns

JIRA: https://issues.apache.org/jira/browse/SPARK-9306

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7645 from viirya/smj_unsortable and squashes the following commits:

a240707 [Liang-Chi Hsieh] Use forall instead of exists for readability.
55221fa [Liang-Chi Hsieh] Shouldn't use SortMergeJoin when joining on unsortable columns.
---
 .../sql/catalyst/planning/patterns.scala      |  2 +-
 .../spark/sql/execution/SparkStrategies.scala | 19 +++++++++++++++----
 .../org/apache/spark/sql/JoinSuite.scala      | 12 ++++++++++++
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
index b8e3b0d53a505..1e7b2a536ac12 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
@@ -184,7 +184,7 @@ object PartialAggregation {
  * A pattern that finds joins with equality conditions that can be evaluated using equi-join.
  */
 object ExtractEquiJoinKeys extends Logging with PredicateHelper {
-  /** (joinType, rightKeys, leftKeys, condition, leftChild, rightChild) */
+  /** (joinType, leftKeys, rightKeys, condition, leftChild, rightChild) */
   type ReturnType =
     (JoinType, Seq[Expression], Seq[Expression], Option[Expression], LogicalPlan, LogicalPlan)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index deeea3900c241..306bbfec624c0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -35,9 +35,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
 
   object LeftSemiJoin extends Strategy with PredicateHelper {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
-      case ExtractEquiJoinKeys(LeftSemi, leftKeys, rightKeys, condition, left, right)
-        if sqlContext.conf.autoBroadcastJoinThreshold > 0 &&
-          right.statistics.sizeInBytes <= sqlContext.conf.autoBroadcastJoinThreshold =>
+      case ExtractEquiJoinKeys(
+             LeftSemi, leftKeys, rightKeys, condition, left, CanBroadcast(right)) =>
         joins.BroadcastLeftSemiJoinHash(
           leftKeys, rightKeys, planLater(left), planLater(right), condition) :: Nil
       // Find left semi joins where at least some predicates can be evaluated by matching join keys
@@ -90,6 +89,18 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       condition.map(Filter(_, broadcastHashJoin)).getOrElse(broadcastHashJoin) :: Nil
     }
 
+    private[this] def isValidSort(
+        leftKeys: Seq[Expression],
+        rightKeys: Seq[Expression]): Boolean = {
+      leftKeys.zip(rightKeys).forall { keys =>
+        (keys._1.dataType, keys._2.dataType) match {
+          case (l: AtomicType, r: AtomicType) => true
+          case (NullType, NullType) => true
+          case _ => false
+        }
+      }
+    }
+
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, CanBroadcast(right)) =>
         makeBroadcastHashJoin(leftKeys, rightKeys, left, right, condition, joins.BuildRight)
@@ -100,7 +111,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       // If the sort merge join option is set, we want to use sort merge join prior to hashjoin
       // for now let's support inner join first, then add outer join
       case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right)
-        if sqlContext.conf.sortMergeJoinEnabled =>
+        if sqlContext.conf.sortMergeJoinEnabled && isValidSort(leftKeys, rightKeys) =>
         val mergeJoin =
           joins.SortMergeJoin(leftKeys, rightKeys, planLater(left), planLater(right))
         condition.map(Filter(_, mergeJoin)).getOrElse(mergeJoin) :: Nil
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index 8953889d1fae9..dfb2a7e099748 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -108,6 +108,18 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
     }
   }
 
+  test("SortMergeJoin shouldn't work on unsortable columns") {
+    val SORTMERGEJOIN_ENABLED: Boolean = ctx.conf.sortMergeJoinEnabled
+    try {
+      ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, true)
+      Seq(
+        ("SELECT * FROM arrayData JOIN complexData ON data = a", classOf[ShuffledHashJoin])
+      ).foreach { case (query, joinClass) => assertJoin(query, joinClass) }
+    } finally {
+      ctx.conf.setConf(SQLConf.SORTMERGE_JOIN, SORTMERGEJOIN_ENABLED)
+    }
+  }
+
   test("broadcasted hash join operator selection") {
     ctx.cacheManager.clearCache()
     ctx.sql("CACHE TABLE testData")

From aa80c64fcf9626b3720ee000a653db9266b74839 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Sun, 26 Jul 2015 23:01:04 -0700
Subject: [PATCH 387/549] [SPARK-9368][SQL] Support get(ordinal, dataType)
 generic getter in UnsafeRow.

Author: Reynold Xin <rxin@databricks.com>

Closes #7682 from rxin/unsaferow-generic-getter and squashes the following commits:

3063788 [Reynold Xin] Reset the change for real this time.
0f57c55 [Reynold Xin] Reset the changes in ExpressionEvalHelper.
fb6ca30 [Reynold Xin] Support BinaryType.
24a3e46 [Reynold Xin] Added support for DateType/TimestampType.
9989064 [Reynold Xin] JoinedRow.
11f80a3 [Reynold Xin] [SPARK-9368][SQL] Support get(ordinal, dataType) generic getter in UnsafeRow.
---
 .../sql/catalyst/expressions/UnsafeRow.java   | 52 ++++++++++++++++++-
 .../spark/sql/catalyst/InternalRow.scala      |  4 +-
 .../sql/catalyst/expressions/Projection.scala |  2 +-
 .../expressions/SpecificMutableRow.scala      |  2 +-
 .../codegen/GenerateProjection.scala          |  2 +-
 .../spark/sql/catalyst/expressions/rows.scala |  4 +-
 6 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 87e5a89c19658..0fb33dd5a15a0 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -24,7 +24,7 @@
 import java.util.HashSet;
 import java.util.Set;
 
-import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.*;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.bitset.BitSetMethods;
@@ -235,6 +235,41 @@ public Object get(int ordinal) {
     throw new UnsupportedOperationException();
   }
 
+  @Override
+  public Object get(int ordinal, DataType dataType) {
+    if (dataType instanceof NullType) {
+      return null;
+    } else if (dataType instanceof BooleanType) {
+      return getBoolean(ordinal);
+    } else if (dataType instanceof ByteType) {
+      return getByte(ordinal);
+    } else if (dataType instanceof ShortType) {
+      return getShort(ordinal);
+    } else if (dataType instanceof IntegerType) {
+      return getInt(ordinal);
+    } else if (dataType instanceof LongType) {
+      return getLong(ordinal);
+    } else if (dataType instanceof FloatType) {
+      return getFloat(ordinal);
+    } else if (dataType instanceof DoubleType) {
+      return getDouble(ordinal);
+    } else if (dataType instanceof DecimalType) {
+      return getDecimal(ordinal);
+    } else if (dataType instanceof DateType) {
+      return getInt(ordinal);
+    } else if (dataType instanceof TimestampType) {
+      return getLong(ordinal);
+    } else if (dataType instanceof BinaryType) {
+      return getBinary(ordinal);
+    } else if (dataType instanceof StringType) {
+      return getUTF8String(ordinal);
+    } else if (dataType instanceof StructType) {
+      return getStruct(ordinal, ((StructType) dataType).size());
+    } else {
+      throw new UnsupportedOperationException("Unsupported data type " + dataType.simpleString());
+    }
+  }
+
   @Override
   public boolean isNullAt(int ordinal) {
     assertIndexIsValid(ordinal);
@@ -436,4 +471,19 @@ public String toString() {
   public boolean anyNull() {
     return BitSetMethods.anySet(baseObject, baseOffset, bitSetWidthInBytes / 8);
   }
+
+  /**
+   * Writes the content of this row into a memory address, identified by an object and an offset.
+   * The target memory address must already been allocated, and have enough space to hold all the
+   * bytes in this string.
+   */
+  public void writeToMemory(Object target, long targetOffset) {
+    PlatformDependent.copyMemory(
+      baseObject,
+      baseOffset,
+      target,
+      targetOffset,
+      sizeInBytes
+    );
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index 385d9671386dc..ad3977281d1a9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -30,11 +30,11 @@ abstract class InternalRow extends Serializable {
 
   def numFields: Int
 
-  def get(ordinal: Int): Any
+  def get(ordinal: Int): Any = get(ordinal, null)
 
   def genericGet(ordinal: Int): Any = get(ordinal, null)
 
-  def get(ordinal: Int, dataType: DataType): Any = get(ordinal)
+  def get(ordinal: Int, dataType: DataType): Any
 
   def getAs[T](ordinal: Int, dataType: DataType): T = get(ordinal, dataType).asInstanceOf[T]
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index cc89d74146b34..27d6ff587ab71 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -198,7 +198,7 @@ class JoinedRow extends InternalRow {
     if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
   }
 
-  override def get(i: Int): Any =
+  override def get(i: Int, dataType: DataType): Any =
     if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
 
   override def isNullAt(i: Int): Boolean =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
index 5953a093dc684..b877ce47c083f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
@@ -219,7 +219,7 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
     values(i).isNull = true
   }
 
-  override def get(i: Int): Any = values(i).boxed
+  override def get(i: Int, dataType: DataType): Any = values(i).boxed
 
   override def getStruct(ordinal: Int, numFields: Int): InternalRow = {
     values(ordinal).boxed.asInstanceOf[InternalRow]
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index a361b216eb472..35920147105ff 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -183,7 +183,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
         public void setNullAt(int i) { nullBits[i] = true; }
         public boolean isNullAt(int i) { return nullBits[i]; }
 
-        public Object get(int i) {
+        public Object get(int i, ${classOf[DataType].getName} dataType) {
           if (isNullAt(i)) return null;
           switch (i) {
           $getCases
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index daeabe8e90f1d..b7c4ece4a16fe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -99,7 +99,7 @@ class GenericInternalRow(protected[sql] val values: Array[Any]) extends Internal
 
   override def numFields: Int = values.length
 
-  override def get(i: Int): Any = values(i)
+  override def get(i: Int, dataType: DataType): Any = values(i)
 
   override def getStruct(ordinal: Int, numFields: Int): InternalRow = {
     values(ordinal).asInstanceOf[InternalRow]
@@ -130,7 +130,7 @@ class GenericMutableRow(val values: Array[Any]) extends MutableRow {
 
   override def numFields: Int = values.length
 
-  override def get(i: Int): Any = values(i)
+  override def get(i: Int, dataType: DataType): Any = values(i)
 
   override def getStruct(ordinal: Int, numFields: Int): InternalRow = {
     values(ordinal).asInstanceOf[InternalRow]

From 4ffd3a1db5ecff653b02aa325786e734351c8bd2 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sun, 26 Jul 2015 23:58:03 -0700
Subject: [PATCH 388/549] [SPARK-9371][SQL] fix the support for special chars
 in column names for hive context

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7684 from cloud-fan/hive and squashes the following commits:

da21ffe [Wenchen Fan] fix the support for special chars in column names for hive context
---
 .../src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 6 +++---
 .../apache/spark/sql/hive/execution/SQLQuerySuite.scala   | 8 ++++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 620b8a44d8a9b..2f79b0aad045c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -1321,11 +1321,11 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
     /* Attribute References */
     case Token("TOK_TABLE_OR_COL",
            Token(name, Nil) :: Nil) =>
-      UnresolvedAttribute(cleanIdentifier(name))
+      UnresolvedAttribute.quoted(cleanIdentifier(name))
     case Token(".", qualifier :: Token(attr, Nil) :: Nil) =>
       nodeToExpr(qualifier) match {
-        case UnresolvedAttribute(qualifierName) =>
-          UnresolvedAttribute(qualifierName :+ cleanIdentifier(attr))
+        case UnresolvedAttribute(nameParts) =>
+          UnresolvedAttribute(nameParts :+ cleanIdentifier(attr))
         case other => UnresolvedExtractValue(other, Literal(attr))
       }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 013936377b24c..8371dd0716c06 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -1067,4 +1067,12 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils {
     )
     TestHive.dropTempTable("test_SPARK8588")
   }
+
+  test("SPARK-9371: fix the support for special chars in column names for hive context") {
+    TestHive.read.json(TestHive.sparkContext.makeRDD(
+      """{"a": {"c.b": 1}, "b.$q": [{"a@!.q": 1}], "q.w": {"w.i&": [1]}}""" :: Nil))
+      .registerTempTable("t")
+
+    checkAnswer(sql("SELECT a.`c.b`, `b.$q`[0].`a@!.q`, `q.w`.`w.i&`[0] FROM t"), Row(1, 1, 1))
+  }
 }

From 72981bc8f0d421e2563e2543a8c16a8cc76ad3aa Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Mon, 27 Jul 2015 17:15:35 +0800
Subject: [PATCH 389/549] [SPARK-7943] [SPARK-8105] [SPARK-8435] [SPARK-8714]
 [SPARK-8561] Fixes multi-database support

This PR fixes a set of issues related to multi-database. A new data structure `TableIdentifier` is introduced to identify a table among multiple databases. We should stop using a single `String` (table name without database name), or `Seq[String]` (optional database name plus table name) to identify tables internally.

Author: Cheng Lian <lian@databricks.com>

Closes #7623 from liancheng/spark-8131-multi-db and squashes the following commits:

f3bcd4b [Cheng Lian] Addresses PR comments
e0eb76a [Cheng Lian] Fixes styling issues
41e2207 [Cheng Lian] Fixes multi-database support
d4d1ec2 [Cheng Lian] Adds multi-database test cases
---
 .../apache/spark/sql/catalyst/SqlParser.scala |  14 ++
 .../spark/sql/catalyst/TableIdentifier.scala  |  31 ++++
 .../spark/sql/catalyst/analysis/Catalog.scala |   9 +-
 .../apache/spark/sql/DataFrameWriter.scala    |  83 ++++-----
 .../org/apache/spark/sql/SQLContext.scala     |   6 +-
 .../spark/sql/execution/datasources/ddl.scala |  15 +-
 .../spark/sql/parquet/ParquetTest.scala       |   4 +-
 .../apache/spark/sql/test/SQLTestUtils.scala  |  29 +++-
 .../apache/spark/sql/hive/HiveContext.scala   |   5 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  31 +++-
 .../spark/sql/hive/MultiDatabaseSuite.scala   | 159 ++++++++++++++++++
 .../apache/spark/sql/hive/orc/OrcTest.scala   |   7 +-
 12 files changed, 327 insertions(+), 66 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index c494e5d704213..b423f0fa04f69 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -48,6 +48,15 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     }
   }
 
+  def parseTableIdentifier(input: String): TableIdentifier = {
+    // Initialize the Keywords.
+    initLexical
+    phrase(tableIdentifier)(new lexical.Scanner(input)) match {
+      case Success(ident, _) => ident
+      case failureOrError => sys.error(failureOrError.toString)
+    }
+  }
+
   // Keyword is a convention with AbstractSparkSQLParser, which will scan all of the `Keyword`
   // properties via reflection the class in runtime for constructing the SqlLexical object
   protected val ALL = Keyword("ALL")
@@ -444,4 +453,9 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     (ident <~ ".") ~ ident ~ rep("." ~> ident) ^^ {
       case i1 ~ i2 ~ rest => UnresolvedAttribute(Seq(i1, i2) ++ rest)
     }
+
+  protected lazy val tableIdentifier: Parser[TableIdentifier] =
+    (ident <~ ".").? ~ ident ^^ {
+      case maybeDbName ~ tableName => TableIdentifier(tableName, maybeDbName)
+    }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala
new file mode 100644
index 0000000000000..aebcdeb9d070f
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst
+
+/**
+ * Identifies a `table` in `database`.  If `database` is not defined, the current database is used.
+ */
+private[sql] case class TableIdentifier(table: String, database: Option[String] = None) {
+  def withDatabase(database: String): TableIdentifier = this.copy(database = Some(database))
+
+  def toSeq: Seq[String] = database.toSeq :+ table
+
+  override def toString: String = toSeq.map("`" + _ + "`").mkString(".")
+
+  def unquotedString: String = toSeq.mkString(".")
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala
index 1541491608b24..5766e6a2dd51a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala
@@ -23,8 +23,7 @@ import scala.collection.JavaConversions._
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.spark.sql.catalyst.CatalystConf
-import org.apache.spark.sql.catalyst.EmptyConf
+import org.apache.spark.sql.catalyst.{TableIdentifier, CatalystConf, EmptyConf}
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery}
 
 /**
@@ -54,7 +53,7 @@ trait Catalog {
    */
   def getTables(databaseName: Option[String]): Seq[(String, Boolean)]
 
-  def refreshTable(databaseName: String, tableName: String): Unit
+  def refreshTable(tableIdent: TableIdentifier): Unit
 
   def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit
 
@@ -132,7 +131,7 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog {
     result
   }
 
-  override def refreshTable(databaseName: String, tableName: String): Unit = {
+  override def refreshTable(tableIdent: TableIdentifier): Unit = {
     throw new UnsupportedOperationException
   }
 }
@@ -241,7 +240,7 @@ object EmptyCatalog extends Catalog {
 
   override def unregisterAllTables(): Unit = {}
 
-  override def refreshTable(databaseName: String, tableName: String): Unit = {
+  override def refreshTable(tableIdent: TableIdentifier): Unit = {
     throw new UnsupportedOperationException
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 05da05d7b8050..7e3318cefe62c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql
 import java.util.Properties
 
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.catalyst.{SqlParser, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
 import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, ResolvedDataSource}
@@ -159,15 +160,19 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    * @since 1.4.0
    */
   def insertInto(tableName: String): Unit = {
-    val partitions =
-      partitioningColumns.map(_.map(col => col -> (None: Option[String])).toMap)
-    val overwrite = (mode == SaveMode.Overwrite)
-    df.sqlContext.executePlan(InsertIntoTable(
-      UnresolvedRelation(Seq(tableName)),
-      partitions.getOrElse(Map.empty[String, Option[String]]),
-      df.logicalPlan,
-      overwrite,
-      ifNotExists = false)).toRdd
+    insertInto(new SqlParser().parseTableIdentifier(tableName))
+  }
+
+  private def insertInto(tableIdent: TableIdentifier): Unit = {
+    val partitions = partitioningColumns.map(_.map(col => col -> (None: Option[String])).toMap)
+    val overwrite = mode == SaveMode.Overwrite
+    df.sqlContext.executePlan(
+      InsertIntoTable(
+        UnresolvedRelation(tableIdent.toSeq),
+        partitions.getOrElse(Map.empty[String, Option[String]]),
+        df.logicalPlan,
+        overwrite,
+        ifNotExists = false)).toRdd
   }
 
   /**
@@ -183,35 +188,37 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    * @since 1.4.0
    */
   def saveAsTable(tableName: String): Unit = {
-    if (df.sqlContext.catalog.tableExists(tableName :: Nil) && mode != SaveMode.Overwrite) {
-      mode match {
-        case SaveMode.Ignore =>
-          // Do nothing
-
-        case SaveMode.ErrorIfExists =>
-          throw new AnalysisException(s"Table $tableName already exists.")
-
-        case SaveMode.Append =>
-          // If it is Append, we just ask insertInto to handle it. We will not use insertInto
-          // to handle saveAsTable with Overwrite because saveAsTable can change the schema of
-          // the table. But, insertInto with Overwrite requires the schema of data be the same
-          // the schema of the table.
-          insertInto(tableName)
-
-        case SaveMode.Overwrite =>
-          throw new UnsupportedOperationException("overwrite mode unsupported.")
-      }
-    } else {
-      val cmd =
-        CreateTableUsingAsSelect(
-          tableName,
-          source,
-          temporary = false,
-          partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
-          mode,
-          extraOptions.toMap,
-          df.logicalPlan)
-      df.sqlContext.executePlan(cmd).toRdd
+    saveAsTable(new SqlParser().parseTableIdentifier(tableName))
+  }
+
+  private def saveAsTable(tableIdent: TableIdentifier): Unit = {
+    val tableExists = df.sqlContext.catalog.tableExists(tableIdent.toSeq)
+
+    (tableExists, mode) match {
+      case (true, SaveMode.Ignore) =>
+        // Do nothing
+
+      case (true, SaveMode.ErrorIfExists) =>
+        throw new AnalysisException(s"Table $tableIdent already exists.")
+
+      case (true, SaveMode.Append) =>
+        // If it is Append, we just ask insertInto to handle it. We will not use insertInto
+        // to handle saveAsTable with Overwrite because saveAsTable can change the schema of
+        // the table. But, insertInto with Overwrite requires the schema of data be the same
+        // the schema of the table.
+        insertInto(tableIdent)
+
+      case _ =>
+        val cmd =
+          CreateTableUsingAsSelect(
+            tableIdent.unquotedString,
+            source,
+            temporary = false,
+            partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
+            mode,
+            extraOptions.toMap,
+            df.logicalPlan)
+        df.sqlContext.executePlan(cmd).toRdd
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 0e25e06e99ab2..dbb2a09846548 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -798,8 +798,10 @@ class SQLContext(@transient val sparkContext: SparkContext)
    * @group ddl_ops
    * @since 1.3.0
    */
-  def table(tableName: String): DataFrame =
-    DataFrame(this, catalog.lookupRelation(Seq(tableName)))
+  def table(tableName: String): DataFrame = {
+    val tableIdent = new SqlParser().parseTableIdentifier(tableName)
+    DataFrame(this, catalog.lookupRelation(tableIdent.toSeq))
+  }
 
   /**
    * Returns a [[DataFrame]] containing names of existing tables in the current database.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
index 1f2797ec5527a..e73b3704d4dfe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
@@ -21,16 +21,17 @@ import scala.language.{existentials, implicitConversions}
 import scala.util.matching.Regex
 
 import org.apache.hadoop.fs.Path
+
 import org.apache.spark.Logging
 import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SQLContext, SaveMode}
-import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.{AbstractSparkSQLParser, TableIdentifier}
 import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SQLContext, SaveMode}
 import org.apache.spark.util.Utils
 
 /**
@@ -151,7 +152,7 @@ private[sql] class DDLParser(
   protected lazy val refreshTable: Parser[LogicalPlan] =
     REFRESH ~> TABLE ~> (ident <~ ".").? ~ ident ^^ {
       case maybeDatabaseName ~ tableName =>
-        RefreshTable(maybeDatabaseName.getOrElse("default"), tableName)
+        RefreshTable(TableIdentifier(tableName, maybeDatabaseName))
     }
 
   protected lazy val options: Parser[Map[String, String]] =
@@ -442,16 +443,16 @@ private[sql] case class CreateTempTableUsingAsSelect(
   }
 }
 
-private[sql] case class RefreshTable(databaseName: String, tableName: String)
+private[sql] case class RefreshTable(tableIdent: TableIdentifier)
   extends RunnableCommand {
 
   override def run(sqlContext: SQLContext): Seq[Row] = {
     // Refresh the given table's metadata first.
-    sqlContext.catalog.refreshTable(databaseName, tableName)
+    sqlContext.catalog.refreshTable(tableIdent)
 
     // If this table is cached as a InMemoryColumnarRelation, drop the original
     // cached version and make the new version cached lazily.
-    val logicalPlan = sqlContext.catalog.lookupRelation(Seq(databaseName, tableName))
+    val logicalPlan = sqlContext.catalog.lookupRelation(tableIdent.toSeq)
     // Use lookupCachedData directly since RefreshTable also takes databaseName.
     val isCached = sqlContext.cacheManager.lookupCachedData(logicalPlan).nonEmpty
     if (isCached) {
@@ -461,7 +462,7 @@ private[sql] case class RefreshTable(databaseName: String, tableName: String)
       // Uncache the logicalPlan.
       sqlContext.cacheManager.tryUncacheQuery(df, blocking = true)
       // Cache it again.
-      sqlContext.cacheManager.cacheQuery(df, Some(tableName))
+      sqlContext.cacheManager.cacheQuery(df, Some(tableIdent.table))
     }
 
     Seq.empty[Row]
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala
index eb15a1609f1d0..64e94056f209a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala
@@ -22,6 +22,7 @@ import java.io.File
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.{DataFrame, SaveMode}
 
@@ -32,8 +33,7 @@ import org.apache.spark.sql.{DataFrame, SaveMode}
  * convenient to use tuples rather than special case classes when writing test cases/suites.
  * Especially, `Tuple1.apply` can be used to easily wrap a single type/value.
  */
-private[sql] trait ParquetTest extends SQLTestUtils {
-
+private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite =>
   /**
    * Writes `data` to a Parquet file, which is then passed to `f` and will be deleted after `f`
    * returns.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
index fa01823e9417c..4c11acdab9ec0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala
@@ -18,13 +18,15 @@
 package org.apache.spark.sql.test
 
 import java.io.File
+import java.util.UUID
 
 import scala.util.Try
 
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.SQLContext
 import org.apache.spark.util.Utils
 
-trait SQLTestUtils {
+trait SQLTestUtils { this: SparkFunSuite =>
   def sqlContext: SQLContext
 
   protected def configuration = sqlContext.sparkContext.hadoopConfiguration
@@ -87,4 +89,29 @@ trait SQLTestUtils {
       }
     }
   }
+
+  /**
+   * Creates a temporary database and switches current database to it before executing `f`.  This
+   * database is dropped after `f` returns.
+   */
+  protected def withTempDatabase(f: String => Unit): Unit = {
+    val dbName = s"db_${UUID.randomUUID().toString.replace('-', '_')}"
+
+    try {
+      sqlContext.sql(s"CREATE DATABASE $dbName")
+    } catch { case cause: Throwable =>
+      fail("Failed to create temporary database", cause)
+    }
+
+    try f(dbName) finally sqlContext.sql(s"DROP DATABASE $dbName CASCADE")
+  }
+
+  /**
+   * Activates database `db` before executing `f`, then switches back to `default` database after
+   * `f` returns.
+   */
+  protected def activateDatabase(db: String)(f: => Unit): Unit = {
+    sqlContext.sql(s"USE $db")
+    try f finally sqlContext.sql(s"USE default")
+  }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 1b8edefef4093..110f51a305861 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -40,7 +40,7 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql._
 import org.apache.spark.sql.SQLConf.SQLConfEntry
 import org.apache.spark.sql.SQLConf.SQLConfEntry._
-import org.apache.spark.sql.catalyst.ParserDialect
+import org.apache.spark.sql.catalyst.{TableIdentifier, ParserDialect}
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUDFs, SetCommand}
@@ -267,7 +267,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging {
    * @since 1.3.0
    */
   def refreshTable(tableName: String): Unit = {
-    catalog.refreshTable(catalog.client.currentDatabase, tableName)
+    val tableIdent = TableIdentifier(tableName).withDatabase(catalog.client.currentDatabase)
+    catalog.refreshTable(tableIdent)
   }
 
   protected[hive] def invalidateTable(tableName: String): Unit = {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 262923531216f..9c707a7a2eca1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -29,13 +29,13 @@ import org.apache.hadoop.hive.ql.metadata._
 import org.apache.hadoop.hive.ql.plan.TableDesc
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{Catalog, MultiInstanceRelation, OverrideCatalog}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.catalyst.{InternalRow, SqlParser, TableIdentifier}
 import org.apache.spark.sql.execution.datasources
 import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation, Partition => ParquetPartition, PartitionSpec, ResolvedDataSource}
 import org.apache.spark.sql.hive.client._
@@ -43,7 +43,6 @@ import org.apache.spark.sql.parquet.ParquetRelation
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode}
 
-
 private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: HiveContext)
   extends Catalog with Logging {
 
@@ -115,7 +114,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     CacheBuilder.newBuilder().maximumSize(1000).build(cacheLoader)
   }
 
-  override def refreshTable(databaseName: String, tableName: String): Unit = {
+  override def refreshTable(tableIdent: TableIdentifier): Unit = {
     // refreshTable does not eagerly reload the cache. It just invalidate the cache.
     // Next time when we use the table, it will be populated in the cache.
     // Since we also cache ParquetRelations converted from Hive Parquet tables and
@@ -124,7 +123,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     // it is better at here to invalidate the cache to avoid confusing waring logs from the
     // cache loader (e.g. cannot find data source provider, which is only defined for
     // data source table.).
-    invalidateTable(databaseName, tableName)
+    invalidateTable(tableIdent.database.getOrElse(client.currentDatabase), tableIdent.table)
   }
 
   def invalidateTable(databaseName: String, tableName: String): Unit = {
@@ -144,7 +143,27 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
       provider: String,
       options: Map[String, String],
       isExternal: Boolean): Unit = {
-    val (dbName, tblName) = processDatabaseAndTableName(client.currentDatabase, tableName)
+    createDataSourceTable(
+      new SqlParser().parseTableIdentifier(tableName),
+      userSpecifiedSchema,
+      partitionColumns,
+      provider,
+      options,
+      isExternal)
+  }
+
+  private def createDataSourceTable(
+      tableIdent: TableIdentifier,
+      userSpecifiedSchema: Option[StructType],
+      partitionColumns: Array[String],
+      provider: String,
+      options: Map[String, String],
+      isExternal: Boolean): Unit = {
+    val (dbName, tblName) = {
+      val database = tableIdent.database.getOrElse(client.currentDatabase)
+      processDatabaseAndTableName(database, tableIdent.table)
+    }
+
     val tableProperties = new scala.collection.mutable.HashMap[String, String]
     tableProperties.put("spark.sql.sources.provider", provider)
 
@@ -177,7 +196,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
         // partitions when we load the table. However, if there are specified partition columns,
         // we simplily ignore them and provide a warning message..
         logWarning(
-          s"The schema and partitions of table $tableName will be inferred when it is loaded. " +
+          s"The schema and partitions of table $tableIdent will be inferred when it is loaded. " +
             s"Specified partition columns (${partitionColumns.mkString(",")}) will be ignored.")
       }
       Seq.empty[HiveColumn]
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala
new file mode 100644
index 0000000000000..73852f13ad20d
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql.{QueryTest, SQLContext, SaveMode}
+
+class MultiDatabaseSuite extends QueryTest with SQLTestUtils {
+  override val sqlContext: SQLContext = TestHive
+
+  import sqlContext.sql
+
+  private val df = sqlContext.range(10).coalesce(1)
+
+  test(s"saveAsTable() to non-default database - with USE - Overwrite") {
+    withTempDatabase { db =>
+      activateDatabase(db) {
+        df.write.mode(SaveMode.Overwrite).saveAsTable("t")
+        assert(sqlContext.tableNames().contains("t"))
+        checkAnswer(sqlContext.table("t"), df)
+      }
+
+      assert(sqlContext.tableNames(db).contains("t"))
+      checkAnswer(sqlContext.table(s"$db.t"), df)
+    }
+  }
+
+  test(s"saveAsTable() to non-default database - without USE - Overwrite") {
+    withTempDatabase { db =>
+      df.write.mode(SaveMode.Overwrite).saveAsTable(s"$db.t")
+      assert(sqlContext.tableNames(db).contains("t"))
+      checkAnswer(sqlContext.table(s"$db.t"), df)
+    }
+  }
+
+  test(s"saveAsTable() to non-default database - with USE - Append") {
+    withTempDatabase { db =>
+      activateDatabase(db) {
+        df.write.mode(SaveMode.Overwrite).saveAsTable("t")
+        df.write.mode(SaveMode.Append).saveAsTable("t")
+        assert(sqlContext.tableNames().contains("t"))
+        checkAnswer(sqlContext.table("t"), df.unionAll(df))
+      }
+
+      assert(sqlContext.tableNames(db).contains("t"))
+      checkAnswer(sqlContext.table(s"$db.t"), df.unionAll(df))
+    }
+  }
+
+  test(s"saveAsTable() to non-default database - without USE - Append") {
+    withTempDatabase { db =>
+      df.write.mode(SaveMode.Overwrite).saveAsTable(s"$db.t")
+      df.write.mode(SaveMode.Append).saveAsTable(s"$db.t")
+      assert(sqlContext.tableNames(db).contains("t"))
+      checkAnswer(sqlContext.table(s"$db.t"), df.unionAll(df))
+    }
+  }
+
+  test(s"insertInto() non-default database - with USE") {
+    withTempDatabase { db =>
+      activateDatabase(db) {
+        df.write.mode(SaveMode.Overwrite).saveAsTable("t")
+        assert(sqlContext.tableNames().contains("t"))
+
+        df.write.insertInto(s"$db.t")
+        checkAnswer(sqlContext.table(s"$db.t"), df.unionAll(df))
+      }
+    }
+  }
+
+  test(s"insertInto() non-default database - without USE") {
+    withTempDatabase { db =>
+      activateDatabase(db) {
+        df.write.mode(SaveMode.Overwrite).saveAsTable("t")
+        assert(sqlContext.tableNames().contains("t"))
+      }
+
+      assert(sqlContext.tableNames(db).contains("t"))
+
+      df.write.insertInto(s"$db.t")
+      checkAnswer(sqlContext.table(s"$db.t"), df.unionAll(df))
+    }
+  }
+
+  test("Looks up tables in non-default database") {
+    withTempDatabase { db =>
+      activateDatabase(db) {
+        sql("CREATE TABLE t (key INT)")
+        checkAnswer(sqlContext.table("t"), sqlContext.emptyDataFrame)
+      }
+
+      checkAnswer(sqlContext.table(s"$db.t"), sqlContext.emptyDataFrame)
+    }
+  }
+
+  test("Drops a table in a non-default database") {
+    withTempDatabase { db =>
+      activateDatabase(db) {
+        sql(s"CREATE TABLE t (key INT)")
+        assert(sqlContext.tableNames().contains("t"))
+        assert(!sqlContext.tableNames("default").contains("t"))
+      }
+
+      assert(!sqlContext.tableNames().contains("t"))
+      assert(sqlContext.tableNames(db).contains("t"))
+
+      activateDatabase(db) {
+        sql(s"DROP TABLE t")
+        assert(!sqlContext.tableNames().contains("t"))
+        assert(!sqlContext.tableNames("default").contains("t"))
+      }
+
+      assert(!sqlContext.tableNames().contains("t"))
+      assert(!sqlContext.tableNames(db).contains("t"))
+    }
+  }
+
+  test("Refreshes a table in a non-default database") {
+    import org.apache.spark.sql.functions.lit
+
+    withTempDatabase { db =>
+      withTempPath { dir =>
+        val path = dir.getCanonicalPath
+
+        activateDatabase(db) {
+          sql(
+            s"""CREATE EXTERNAL TABLE t (id BIGINT)
+               |PARTITIONED BY (p INT)
+               |STORED AS PARQUET
+               |LOCATION '$path'
+             """.stripMargin)
+
+          checkAnswer(sqlContext.table("t"), sqlContext.emptyDataFrame)
+
+          df.write.parquet(s"$path/p=1")
+          sql("ALTER TABLE t ADD PARTITION (p=1)")
+          sql("REFRESH TABLE t")
+          checkAnswer(sqlContext.table("t"), df.withColumn("p", lit(1)))
+        }
+      }
+    }
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
index 9d76d6503a3e6..145965388da01 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala
@@ -22,14 +22,15 @@ import java.io.File
 import scala.reflect.ClassTag
 import scala.reflect.runtime.universe.TypeTag
 
-import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql._
+import org.apache.spark.sql.test.SQLTestUtils
 
-private[sql] trait OrcTest extends SQLTestUtils {
+private[sql] trait OrcTest extends SQLTestUtils { this: SparkFunSuite =>
   lazy val sqlContext = org.apache.spark.sql.hive.test.TestHive
 
-  import sqlContext.sparkContext
   import sqlContext.implicits._
+  import sqlContext.sparkContext
 
   /**
    * Writes `data` to a Orc file, which is then passed to `f` and will be deleted after `f`

From 622838165756e9669cbf7af13eccbc719638f40b Mon Sep 17 00:00:00 2001
From: Carson Wang <carson.wang@intel.com>
Date: Mon, 27 Jul 2015 08:02:40 -0500
Subject: [PATCH 390/549] [SPARK-8405] [DOC] Add how to view logs on Web UI
 when yarn log aggregation is enabled

Some users may not be aware that the logs are available on Web UI even if Yarn log aggregation is enabled. Update the doc to make this clear and what need to be configured.

Author: Carson Wang <carson.wang@intel.com>

Closes #7463 from carsonwang/YarnLogDoc and squashes the following commits:

274c054 [Carson Wang] Minor text fix
74df3a1 [Carson Wang] address comments
5a95046 [Carson Wang] Update the text in the doc
e5775c1 [Carson Wang] Update doc about how to view the logs on Web UI when yarn log aggregation is enabled
---
 docs/running-on-yarn.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index de22ab557cacf..cac08a91b97d9 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -68,9 +68,9 @@ In YARN terminology, executors and application masters run inside "containers".
 
     yarn logs -applicationId <app ID>
     
-will print out the contents of all log files from all containers from the given application. You can also view the container log files directly in HDFS using the HDFS shell or API. The directory where they are located can be found by looking at your YARN configs (`yarn.nodemanager.remote-app-log-dir` and `yarn.nodemanager.remote-app-log-dir-suffix`).
+will print out the contents of all log files from all containers from the given application. You can also view the container log files directly in HDFS using the HDFS shell or API. The directory where they are located can be found by looking at your YARN configs (`yarn.nodemanager.remote-app-log-dir` and `yarn.nodemanager.remote-app-log-dir-suffix`). The logs are also available on the Spark Web UI under the Executors Tab. You need to have both the Spark history server and the MapReduce history server running and configure `yarn.log.server.url` in `yarn-site.xml` properly. The log URL on the Spark history server UI will redirect you to the MapReduce history server to show the aggregated logs.
 
-When log aggregation isn't turned on, logs are retained locally on each machine under `YARN_APP_LOGS_DIR`, which is usually configured to `/tmp/logs` or `$HADOOP_HOME/logs/userlogs` depending on the Hadoop version and installation. Viewing logs for a container requires going to the host that contains them and looking in this directory.  Subdirectories organize log files by application ID and container ID.
+When log aggregation isn't turned on, logs are retained locally on each machine under `YARN_APP_LOGS_DIR`, which is usually configured to `/tmp/logs` or `$HADOOP_HOME/logs/userlogs` depending on the Hadoop version and installation. Viewing logs for a container requires going to the host that contains them and looking in this directory.  Subdirectories organize log files by application ID and container ID. The logs are also available on the Spark Web UI under the Executors Tab and doesn't require running the MapReduce history server.
 
 To review per-container launch environment, increase `yarn.nodemanager.delete.debug-delay-sec` to a
 large value (e.g. 36000), and then access the application cache through `yarn.nodemanager.local-dirs`

From aa19c696e25ebb07fd3df110cfcbcc69954ce335 Mon Sep 17 00:00:00 2001
From: Rene Treffer <treffer+github@measite.de>
Date: Mon, 27 Jul 2015 23:29:40 +0800
Subject: [PATCH 391/549] [SPARK-4176] [SQL] Supports decimal types with
 precision > 18 in Parquet

This PR is based on #6796 authored by rtreffer.

To support large decimal precisions (> 18), we do the following things in this PR:

1. Making `CatalystSchemaConverter` support large decimal precision

   Decimal types with large precision are always converted to fixed-length byte array.

2. Making `CatalystRowConverter` support reading decimal values with large precision

   When the precision is > 18, constructs `Decimal` values with an unscaled `BigInteger` rather than an unscaled `Long`.

3. Making `RowWriteSupport` support writing decimal values with large precision

   In this PR we always write decimals as fixed-length byte array, because Parquet write path hasn't been refactored to conform Parquet format spec (see SPARK-6774 & SPARK-8848).

Two follow-up tasks should be done in future PRs:

- [ ] Writing decimals as `INT32`, `INT64` when possible while fixing SPARK-8848
- [ ] Adding compatibility tests as part of SPARK-5463

Author: Cheng Lian <lian@databricks.com>

Closes #7455 from liancheng/spark-4176 and squashes the following commits:

a543d10 [Cheng Lian] Fixes errors introduced while rebasing
9e31cdf [Cheng Lian] Supports decimals with precision > 18 for Parquet
---
 .../sql/parquet/CatalystRowConverter.scala    | 25 +++++---
 .../sql/parquet/CatalystSchemaConverter.scala | 46 +++++++------
 .../sql/parquet/ParquetTableSupport.scala     | 64 +++++++++++++------
 .../spark/sql/parquet/ParquetIOSuite.scala    | 10 +--
 4 files changed, 85 insertions(+), 60 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
index b5e4263008f56..e00bd90edb3dd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.parquet
 
+import java.math.{BigDecimal, BigInteger}
 import java.nio.ByteOrder
 
 import scala.collection.JavaConversions._
@@ -263,17 +264,23 @@ private[parquet] class CatalystRowConverter(
       val scale = decimalType.scale
       val bytes = value.getBytes
 
-      var unscaled = 0L
-      var i = 0
+      if (precision <= 8) {
+        // Constructs a `Decimal` with an unscaled `Long` value if possible.
+        var unscaled = 0L
+        var i = 0
 
-      while (i < bytes.length) {
-        unscaled = (unscaled << 8) | (bytes(i) & 0xff)
-        i += 1
-      }
+        while (i < bytes.length) {
+          unscaled = (unscaled << 8) | (bytes(i) & 0xff)
+          i += 1
+        }
 
-      val bits = 8 * bytes.length
-      unscaled = (unscaled << (64 - bits)) >> (64 - bits)
-      Decimal(unscaled, precision, scale)
+        val bits = 8 * bytes.length
+        unscaled = (unscaled << (64 - bits)) >> (64 - bits)
+        Decimal(unscaled, precision, scale)
+      } else {
+        // Otherwise, resorts to an unscaled `BigInteger` instead.
+        Decimal(new BigDecimal(new BigInteger(bytes), scale), precision, scale)
+      }
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
index e9ef01e2dba1b..d43ca95b4eea0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
@@ -387,24 +387,18 @@ private[parquet] class CatalystSchemaConverter(
       // =====================================
 
       // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and
-      // always store decimals in fixed-length byte arrays.
-      case DecimalType.Fixed(precision, scale)
-        if precision <= maxPrecisionForBytes(8) && !followParquetFormatSpec =>
+      // always store decimals in fixed-length byte arrays.  To keep compatibility with these older
+      // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated
+      // by `DECIMAL`.
+      case DecimalType.Fixed(precision, scale) if !followParquetFormatSpec =>
         Types
           .primitive(FIXED_LEN_BYTE_ARRAY, repetition)
           .as(DECIMAL)
           .precision(precision)
           .scale(scale)
-          .length(minBytesForPrecision(precision))
+          .length(CatalystSchemaConverter.minBytesForPrecision(precision))
           .named(field.name)
 
-      case dec @ DecimalType() if !followParquetFormatSpec =>
-        throw new AnalysisException(
-          s"Data type $dec is not supported. " +
-            s"When ${SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key} is set to false," +
-            "decimal precision and scale must be specified, " +
-            "and precision must be less than or equal to 18.")
-
       // =====================================
       // Decimals (follow Parquet format spec)
       // =====================================
@@ -436,7 +430,7 @@ private[parquet] class CatalystSchemaConverter(
           .as(DECIMAL)
           .precision(precision)
           .scale(scale)
-          .length(minBytesForPrecision(precision))
+          .length(CatalystSchemaConverter.minBytesForPrecision(precision))
           .named(field.name)
 
       // ===================================================
@@ -548,15 +542,6 @@ private[parquet] class CatalystSchemaConverter(
         Math.pow(2, 8 * numBytes - 1) - 1)))  // max value stored in numBytes
       .asInstanceOf[Int]
   }
-
-  // Min byte counts needed to store decimals with various precisions
-  private val minBytesForPrecision: Array[Int] = Array.tabulate(38) { precision =>
-    var numBytes = 1
-    while (math.pow(2.0, 8 * numBytes - 1) < math.pow(10.0, precision)) {
-      numBytes += 1
-    }
-    numBytes
-  }
 }
 
 
@@ -580,4 +565,23 @@ private[parquet] object CatalystSchemaConverter {
       throw new AnalysisException(message)
     }
   }
+
+  private def computeMinBytesForPrecision(precision : Int) : Int = {
+    var numBytes = 1
+    while (math.pow(2.0, 8 * numBytes - 1) < math.pow(10.0, precision)) {
+      numBytes += 1
+    }
+    numBytes
+  }
+
+  private val MIN_BYTES_FOR_PRECISION = Array.tabulate[Int](39)(computeMinBytesForPrecision)
+
+  // Returns the minimum number of bytes needed to store a decimal with a given `precision`.
+  def minBytesForPrecision(precision : Int) : Int = {
+    if (precision < MIN_BYTES_FOR_PRECISION.length) {
+      MIN_BYTES_FOR_PRECISION(precision)
+    } else {
+      computeMinBytesForPrecision(precision)
+    }
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index fc9f61a636768..78ecfad1d57c6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.parquet
 
+import java.math.BigInteger
 import java.nio.{ByteBuffer, ByteOrder}
 import java.util.{HashMap => JHashMap}
 
@@ -114,11 +115,8 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
           Binary.fromByteArray(value.asInstanceOf[UTF8String].getBytes))
         case BinaryType => writer.addBinary(
           Binary.fromByteArray(value.asInstanceOf[Array[Byte]]))
-        case d: DecimalType =>
-          if (d.precision > 18) {
-            sys.error(s"Unsupported datatype $d, cannot write to consumer")
-          }
-          writeDecimal(value.asInstanceOf[Decimal], d.precision)
+        case DecimalType.Fixed(precision, _) =>
+          writeDecimal(value.asInstanceOf[Decimal], precision)
         case _ => sys.error(s"Do not know how to writer $schema to consumer")
       }
     }
@@ -199,20 +197,47 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
     writer.endGroup()
   }
 
-  // Scratch array used to write decimals as fixed-length binary
-  private[this] val scratchBytes = new Array[Byte](8)
+  // Scratch array used to write decimals as fixed-length byte array
+  private[this] var reusableDecimalBytes = new Array[Byte](16)
 
   private[parquet] def writeDecimal(decimal: Decimal, precision: Int): Unit = {
-    val numBytes = ParquetTypesConverter.BYTES_FOR_PRECISION(precision)
-    val unscaledLong = decimal.toUnscaledLong
-    var i = 0
-    var shift = 8 * (numBytes - 1)
-    while (i < numBytes) {
-      scratchBytes(i) = (unscaledLong >> shift).toByte
-      i += 1
-      shift -= 8
+    val numBytes = CatalystSchemaConverter.minBytesForPrecision(precision)
+
+    def longToBinary(unscaled: Long): Binary = {
+      var i = 0
+      var shift = 8 * (numBytes - 1)
+      while (i < numBytes) {
+        reusableDecimalBytes(i) = (unscaled >> shift).toByte
+        i += 1
+        shift -= 8
+      }
+      Binary.fromByteArray(reusableDecimalBytes, 0, numBytes)
     }
-    writer.addBinary(Binary.fromByteArray(scratchBytes, 0, numBytes))
+
+    def bigIntegerToBinary(unscaled: BigInteger): Binary = {
+      unscaled.toByteArray match {
+        case bytes if bytes.length == numBytes =>
+          Binary.fromByteArray(bytes)
+
+        case bytes if bytes.length <= reusableDecimalBytes.length =>
+          val signedByte = (if (bytes.head < 0) -1 else 0).toByte
+          java.util.Arrays.fill(reusableDecimalBytes, 0, numBytes - bytes.length, signedByte)
+          System.arraycopy(bytes, 0, reusableDecimalBytes, numBytes - bytes.length, bytes.length)
+          Binary.fromByteArray(reusableDecimalBytes, 0, numBytes)
+
+        case bytes =>
+          reusableDecimalBytes = new Array[Byte](bytes.length)
+          bigIntegerToBinary(unscaled)
+      }
+    }
+
+    val binary = if (numBytes <= 8) {
+      longToBinary(decimal.toUnscaledLong)
+    } else {
+      bigIntegerToBinary(decimal.toJavaBigDecimal.unscaledValue())
+    }
+
+    writer.addBinary(binary)
   }
 
   // array used to write Timestamp as Int96 (fixed-length binary)
@@ -268,11 +293,8 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
         writer.addBinary(Binary.fromByteArray(record.getUTF8String(index).getBytes))
       case BinaryType =>
         writer.addBinary(Binary.fromByteArray(record.getBinary(index)))
-      case d: DecimalType =>
-        if (d.precision > 18) {
-          sys.error(s"Unsupported datatype $d, cannot write to consumer")
-        }
-        writeDecimal(record.getDecimal(index), d.precision)
+      case DecimalType.Fixed(precision, _) =>
+        writeDecimal(record.getDecimal(index), precision)
       case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer")
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
index b5314a3dd92e5..b415da5b8c136 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala
@@ -106,21 +106,13 @@ class ParquetIOSuite extends QueryTest with ParquetTest {
         // Parquet doesn't allow column names with spaces, have to add an alias here
         .select($"_1" cast decimal as "dec")
 
-    for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17))) {
+    for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17), (19, 0), (38, 37))) {
       withTempPath { dir =>
         val data = makeDecimalRDD(DecimalType(precision, scale))
         data.write.parquet(dir.getCanonicalPath)
         checkAnswer(sqlContext.read.parquet(dir.getCanonicalPath), data.collect().toSeq)
       }
     }
-
-    // Decimals with precision above 18 are not yet supported
-    intercept[Throwable] {
-      withTempPath { dir =>
-        makeDecimalRDD(DecimalType(19, 10)).write.parquet(dir.getCanonicalPath)
-        sqlContext.read.parquet(dir.getCanonicalPath).collect()
-      }
-    }
   }
 
   test("date type") {

From 90006f3c51f8cf9535854246050e27bb76b043f0 Mon Sep 17 00:00:00 2001
From: Alexander Ulanov <nashb@yandex.ru>
Date: Tue, 28 Jul 2015 01:33:31 +0900
Subject: [PATCH 392/549] Pregel example type fix

Pregel example to express single source shortest path from https://spark.apache.org/docs/latest/graphx-programming-guide.html#pregel-api does not work due to incorrect type. The reason is that `GraphGenerators.logNormalGraph` returns the graph with `Long` vertices. Fixing `val graph: Graph[Int, Double]` to `val graph: Graph[Long, Double]`.

Author: Alexander Ulanov <nashb@yandex.ru>

Closes #7695 from avulanov/SPARK-9380-pregel-doc and squashes the following commits:

c269429 [Alexander Ulanov] Pregel example type fix
---
 docs/graphx-programming-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md
index 3f10cb2dc3d2a..99f8c827f767f 100644
--- a/docs/graphx-programming-guide.md
+++ b/docs/graphx-programming-guide.md
@@ -800,7 +800,7 @@ import org.apache.spark.graphx._
 // Import random graph generation library
 import org.apache.spark.graphx.util.GraphGenerators
 // A graph with edge attributes containing distances
-val graph: Graph[Int, Double] =
+val graph: Graph[Long, Double] =
   GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)
 val sourceId: VertexId = 42 // The ultimate source
 // Initialize the graph such that all vertices except the root have distance infinity.

From ecad9d4346ec158746e61aebdf1590215a77f369 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Mon, 27 Jul 2015 09:34:49 -0700
Subject: [PATCH 393/549] [SPARK-9364] Fix array out of bounds and
 use-after-free bugs in UnsafeExternalSorter

This patch fixes two bugs in UnsafeExternalSorter and UnsafeExternalRowSorter:

- UnsafeExternalSorter does not properly update freeSpaceInCurrentPage, which can cause it to write past the end of memory pages and trigger segfaults.
- UnsafeExternalRowSorter has a use-after-free bug when returning the last row from an iterator.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7680 from JoshRosen/SPARK-9364 and squashes the following commits:

590f311 [Josh Rosen] null out row
f4cf91d [Josh Rosen] Fix use-after-free bug in UnsafeExternalRowSorter.
8abcf82 [Josh Rosen] Properly decrement freeSpaceInCurrentPage in UnsafeExternalSorter
---
 .../unsafe/sort/UnsafeExternalSorter.java     |  7 ++++++-
 .../sort/UnsafeExternalSorterSuite.java       | 19 +++++++++++++++++++
 .../execution/UnsafeExternalRowSorter.java    |  9 ++++++---
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
index 4d6731ee60af3..80b03d7e99e2b 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
@@ -150,6 +150,11 @@ private long getMemoryUsage() {
     return sorter.getMemoryUsage() + (allocatedPages.size() * (long) PAGE_SIZE);
   }
 
+  @VisibleForTesting
+  public int getNumberOfAllocatedPages() {
+    return allocatedPages.size();
+  }
+
   public long freeMemory() {
     long memoryFreed = 0;
     for (MemoryBlock block : allocatedPages) {
@@ -257,7 +262,7 @@ public void insertRecord(
       currentPagePosition,
       lengthInBytes);
     currentPagePosition += lengthInBytes;
-
+    freeSpaceInCurrentPage -= totalSpaceRequired;
     sorter.insertRecord(recordAddress, prefix);
   }
 
diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java
index ea8755e21eb68..0e391b751226d 100644
--- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java
+++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java
@@ -199,4 +199,23 @@ public void testSortingEmptyArrays() throws Exception {
     }
   }
 
+  @Test
+  public void testFillingPage() throws Exception {
+    final UnsafeExternalSorter sorter = new UnsafeExternalSorter(
+      memoryManager,
+      shuffleMemoryManager,
+      blockManager,
+      taskContext,
+      recordComparator,
+      prefixComparator,
+      1024,
+      new SparkConf());
+
+    byte[] record = new byte[16];
+    while (sorter.getNumberOfAllocatedPages() < 2) {
+      sorter.insertRecord(record, PlatformDependent.BYTE_ARRAY_OFFSET, record.length, 0);
+    }
+    sorter.freeMemory();
+  }
+
 }
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
index be4ff400c4754..4c3f2c6557140 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
@@ -124,7 +124,7 @@ Iterator<InternalRow> sort() throws IOException {
       return new AbstractScalaRowIterator() {
 
         private final int numFields = schema.length();
-        private final UnsafeRow row = new UnsafeRow();
+        private UnsafeRow row = new UnsafeRow();
 
         @Override
         public boolean hasNext() {
@@ -141,10 +141,13 @@ public InternalRow next() {
               numFields,
               sortedIterator.getRecordLength());
             if (!hasNext()) {
-              row.copy(); // so that we don't have dangling pointers to freed page
+              UnsafeRow copy = row.copy(); // so that we don't have dangling pointers to freed page
+              row = null; // so that we don't keep references to the base object
               cleanupResources();
+              return copy;
+            } else {
+              return row;
             }
-            return row;
           } catch (IOException e) {
             cleanupResources();
             // Scala iterators don't declare any checked exceptions, so we need to use this hack

From c0b7df68f81c2a2a9c1065009fe75c278fa30499 Mon Sep 17 00:00:00 2001
From: Ryan Williams <ryan.blake.williams@gmail.com>
Date: Mon, 27 Jul 2015 12:54:08 -0500
Subject: [PATCH 394/549] [SPARK-9366] use task's stageAttemptId in TaskEnd
 event

Author: Ryan Williams <ryan.blake.williams@gmail.com>

Closes #7681 from ryan-williams/task-stage-attempt and squashes the following commits:

d6d5f0f [Ryan Williams] use task's stageAttemptId in TaskEnd event
---
 .../main/scala/org/apache/spark/scheduler/DAGScheduler.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 552dabcfa5139..b6a833bbb0833 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -927,7 +927,7 @@ class DAGScheduler(
     // The success case is dealt with separately below, since we need to compute accumulator
     // updates before posting.
     if (event.reason != Success) {
-      val attemptId = stageIdToStage.get(task.stageId).map(_.latestInfo.attemptId).getOrElse(-1)
+      val attemptId = task.stageAttemptId
       listenerBus.post(SparkListenerTaskEnd(stageId, attemptId, taskType, event.reason,
         event.taskInfo, event.taskMetrics))
     }

From e2f38167f8b5678ac45794eacb9c7bb9b951af82 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 27 Jul 2015 11:02:16 -0700
Subject: [PATCH 395/549] [SPARK-9376] [SQL] use a seed in
 RandomDataGeneratorSuite

Make this test deterministic, i.e. make sure this test can be passed no matter how many times we run it.

The origin implementation uses a random seed and gives a chance that we may break the null check assertion `assert(Iterator.fill(100)(generator()).contains(null))`.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7691 from cloud-fan/seed and squashes the following commits:

eae7281 [Wenchen Fan] use a seed in RandomDataGeneratorSuite
---
 .../scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
index 677ba0a18040c..cccac7efa09e9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala
@@ -32,7 +32,7 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
    */
   def testRandomDataGeneration(dataType: DataType, nullable: Boolean = true): Unit = {
     val toCatalyst = CatalystTypeConverters.createToCatalystConverter(dataType)
-    val generator = RandomDataGenerator.forType(dataType, nullable).getOrElse {
+    val generator = RandomDataGenerator.forType(dataType, nullable, Some(33)).getOrElse {
       fail(s"Random data generator was not defined for $dataType")
     }
     if (nullable) {

From 1f7b3d9dc7c2ed9d31f9083284cf900fd4c21e42 Mon Sep 17 00:00:00 2001
From: George Dittmar <georgedittmar@gmail.com>
Date: Mon, 27 Jul 2015 11:16:33 -0700
Subject: [PATCH 396/549] [SPARK-7423] [MLLIB] Modify ClassificationModel and
 Probabalistic model to use Vector.argmax

Use Vector.argmax call instead of converting to dense vector before calculating predictions.

Author: George Dittmar <georgedittmar@gmail.com>

Closes #7670 from GeorgeDittmar/sprk-7423 and squashes the following commits:

e796747 [George Dittmar] Changing ClassificationModel and ProbabilisticClassificationModel to use Vector.argmax instead of converting to DenseVector
---
 .../scala/org/apache/spark/ml/classification/Classifier.scala   | 2 +-
 .../spark/ml/classification/ProbabilisticClassifier.scala       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
index 85c097bc64a4f..581d8fa7749be 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
@@ -156,5 +156,5 @@ abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[Featur
    * This may be overridden to support thresholds which favor particular labels.
    * @return  predicted label
    */
-  protected def raw2prediction(rawPrediction: Vector): Double = rawPrediction.toDense.argmax
+  protected def raw2prediction(rawPrediction: Vector): Double = rawPrediction.argmax
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
index 38e832372698c..dad451108626d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
@@ -173,5 +173,5 @@ private[spark] abstract class ProbabilisticClassificationModel[
    * This may be overridden to support thresholds which favor particular labels.
    * @return  predicted label
    */
-  protected def probability2prediction(probability: Vector): Double = probability.toDense.argmax
+  protected def probability2prediction(probability: Vector): Double = probability.argmax
 }

From dd9ae7945ab65d353ed2b113e0c1a00a0533ffd6 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 27 Jul 2015 11:23:29 -0700
Subject: [PATCH 397/549] [SPARK-9351] [SQL] remove literals from grouping
 expressions in Aggregate

literals in grouping expressions have no effect at all, only make our grouping key bigger, so we should remove them in Optimizer.

I also make old and new aggregation code consistent about literals in grouping here. In old aggregation, actually literals in grouping are already removed but new aggregation is not. So I explicitly make it a rule in Optimizer.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7583 from cloud-fan/minor and squashes the following commits:

471adff [Wenchen Fan] add test
0839925 [Wenchen Fan] use transformDown when rewrite final result expressions
---
 .../sql/catalyst/optimizer/Optimizer.scala    | 17 +++++++++--
 .../sql/catalyst/planning/patterns.scala      |  4 +--
 ...ite.scala => AggregateOptimizeSuite.scala} | 19 ++++++++++--
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 29 +++++++++++++++----
 4 files changed, 57 insertions(+), 12 deletions(-)
 rename sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/{ReplaceDistinctWithAggregateSuite.scala => AggregateOptimizeSuite.scala} (72%)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index b59f800e7cc0f..813c62009666c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -36,8 +36,9 @@ object DefaultOptimizer extends Optimizer {
     // SubQueries are only needed for analysis and can be removed before execution.
     Batch("Remove SubQueries", FixedPoint(100),
       EliminateSubQueries) ::
-    Batch("Distinct", FixedPoint(100),
-      ReplaceDistinctWithAggregate) ::
+    Batch("Aggregate", FixedPoint(100),
+      ReplaceDistinctWithAggregate,
+      RemoveLiteralFromGroupExpressions) ::
     Batch("Operator Optimizations", FixedPoint(100),
       // Operator push down
       SetOperationPushDown,
@@ -799,3 +800,15 @@ object ReplaceDistinctWithAggregate extends Rule[LogicalPlan] {
     case Distinct(child) => Aggregate(child.output, child.output, child)
   }
 }
+
+/**
+ * Removes literals from group expressions in [[Aggregate]], as they have no effect to the result
+ * but only makes the grouping key bigger.
+ */
+object RemoveLiteralFromGroupExpressions extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case a @ Aggregate(grouping, _, _) =>
+      val newGrouping = grouping.filter(!_.foldable)
+      a.copy(groupingExpressions = newGrouping)
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
index 1e7b2a536ac12..b9ca712c1ee1c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
@@ -144,14 +144,14 @@ object PartialAggregation {
         // time. However some of them might be unnamed so we alias them allowing them to be
         // referenced in the second aggregation.
         val namedGroupingExpressions: Seq[(Expression, NamedExpression)] =
-          groupingExpressions.filter(!_.isInstanceOf[Literal]).map {
+          groupingExpressions.map {
             case n: NamedExpression => (n, n)
             case other => (other, Alias(other, "PartialGroup")())
           }
 
         // Replace aggregations with a new expression that computes the result from the already
         // computed partial evaluations and grouping values.
-        val rewrittenAggregateExpressions = aggregateExpressions.map(_.transformUp {
+        val rewrittenAggregateExpressions = aggregateExpressions.map(_.transformDown {
           case e: Expression if partialEvaluations.contains(new TreeNodeRef(e)) =>
             partialEvaluations(new TreeNodeRef(e)).finalEvaluation
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceDistinctWithAggregateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/AggregateOptimizeSuite.scala
similarity index 72%
rename from sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceDistinctWithAggregateSuite.scala
rename to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/AggregateOptimizeSuite.scala
index df29a62ff0e15..2d080b95b1292 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceDistinctWithAggregateSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/AggregateOptimizeSuite.scala
@@ -19,14 +19,17 @@ package org.apache.spark.sql.catalyst.optimizer
 
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions.Literal
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Distinct, LocalRelation, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
 
-class ReplaceDistinctWithAggregateSuite extends PlanTest {
+class AggregateOptimizeSuite extends PlanTest {
 
   object Optimize extends RuleExecutor[LogicalPlan] {
-    val batches = Batch("ProjectCollapsing", Once, ReplaceDistinctWithAggregate) :: Nil
+    val batches = Batch("Aggregate", FixedPoint(100),
+      ReplaceDistinctWithAggregate,
+      RemoveLiteralFromGroupExpressions) :: Nil
   }
 
   test("replace distinct with aggregate") {
@@ -39,4 +42,16 @@ class ReplaceDistinctWithAggregateSuite extends PlanTest {
 
     comparePlans(optimized, correctAnswer)
   }
+
+  test("remove literals in grouping expression") {
+    val input = LocalRelation('a.int, 'b.int)
+
+    val query =
+      input.groupBy('a, Literal(1), Literal(1) + Literal(2))(sum('b))
+    val optimized = Optimize.execute(query)
+
+    val correctAnswer = input.groupBy('a)(sum('b))
+
+    comparePlans(optimized, correctAnswer)
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 8cef0b39f87dc..358e319476e83 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -463,12 +463,29 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("literal in agg grouping expressions") {
-    checkAnswer(
-      sql("SELECT a, count(1) FROM testData2 GROUP BY a, 1"),
-      Seq(Row(1, 2), Row(2, 2), Row(3, 2)))
-    checkAnswer(
-      sql("SELECT a, count(2) FROM testData2 GROUP BY a, 2"),
-      Seq(Row(1, 2), Row(2, 2), Row(3, 2)))
+    def literalInAggTest(): Unit = {
+      checkAnswer(
+        sql("SELECT a, count(1) FROM testData2 GROUP BY a, 1"),
+        Seq(Row(1, 2), Row(2, 2), Row(3, 2)))
+      checkAnswer(
+        sql("SELECT a, count(2) FROM testData2 GROUP BY a, 2"),
+        Seq(Row(1, 2), Row(2, 2), Row(3, 2)))
+
+      checkAnswer(
+        sql("SELECT a, 1, sum(b) FROM testData2 GROUP BY a, 1"),
+        sql("SELECT a, 1, sum(b) FROM testData2 GROUP BY a"))
+      checkAnswer(
+        sql("SELECT a, 1, sum(b) FROM testData2 GROUP BY a, 1 + 2"),
+        sql("SELECT a, 1, sum(b) FROM testData2 GROUP BY a"))
+      checkAnswer(
+        sql("SELECT 1, 2, sum(b) FROM testData2 GROUP BY 1, 2"),
+        sql("SELECT 1, 2, sum(b) FROM testData2"))
+    }
+
+    literalInAggTest()
+    withSQLConf(SQLConf.USE_SQL_AGGREGATE2.key -> "false") {
+      literalInAggTest()
+    }
   }
 
   test("aggregates with nulls") {

From 75438422c2cd90dca53f84879cddecfc2ee0e957 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 27 Jul 2015 11:28:22 -0700
Subject: [PATCH 398/549] [SPARK-9369][SQL] Support IntervalType in UnsafeRow

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7688 from cloud-fan/interval and squashes the following commits:

5b36b17 [Wenchen Fan] fix codegen
a99ed50 [Wenchen Fan] address comment
9e6d319 [Wenchen Fan] Support IntervalType in UnsafeRow
---
 .../sql/catalyst/expressions/UnsafeRow.java   | 23 ++++++++++++++-----
 .../expressions/UnsafeRowWriters.java         | 19 ++++++++++++++-
 .../spark/sql/catalyst/InternalRow.scala      |  4 +++-
 .../catalyst/expressions/BoundAttribute.scala |  1 +
 .../spark/sql/catalyst/expressions/Cast.scala |  2 +-
 .../expressions/codegen/CodeGenerator.scala   |  7 +++---
 .../codegen/GenerateUnsafeProjection.scala    |  6 +++++
 .../expressions/ExpressionEvalHelper.scala    |  2 --
 8 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 0fb33dd5a15a0..fb084dd13b620 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -29,6 +29,7 @@
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.bitset.BitSetMethods;
 import org.apache.spark.unsafe.hash.Murmur3_x86_32;
+import org.apache.spark.unsafe.types.Interval;
 import org.apache.spark.unsafe.types.UTF8String;
 
 import static org.apache.spark.sql.types.DataTypes.*;
@@ -90,7 +91,8 @@ public static int calculateBitSetWidthInBytes(int numFields) {
     final Set<DataType> _readableFieldTypes = new HashSet<>(
       Arrays.asList(new DataType[]{
         StringType,
-        BinaryType
+        BinaryType,
+        IntervalType
       }));
     _readableFieldTypes.addAll(settableFieldTypes);
     readableFieldTypes = Collections.unmodifiableSet(_readableFieldTypes);
@@ -332,11 +334,6 @@ public UTF8String getUTF8String(int ordinal) {
     return isNullAt(ordinal) ? null : UTF8String.fromBytes(getBinary(ordinal));
   }
 
-  @Override
-  public String getString(int ordinal) {
-    return getUTF8String(ordinal).toString();
-  }
-
   @Override
   public byte[] getBinary(int ordinal) {
     if (isNullAt(ordinal)) {
@@ -358,6 +355,20 @@ public byte[] getBinary(int ordinal) {
     }
   }
 
+  @Override
+  public Interval getInterval(int ordinal) {
+    if (isNullAt(ordinal)) {
+      return null;
+    } else {
+      final long offsetAndSize = getLong(ordinal);
+      final int offset = (int) (offsetAndSize >> 32);
+      final int months = (int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset);
+      final long microseconds =
+        PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset + 8);
+      return new Interval(months, microseconds);
+    }
+  }
+
   @Override
   public UnsafeRow getStruct(int ordinal, int numFields) {
     if (isNullAt(ordinal)) {
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
index 87521d1f23c99..0ba31d3b9b743 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
@@ -20,6 +20,7 @@
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.types.ByteArray;
+import org.apache.spark.unsafe.types.Interval;
 import org.apache.spark.unsafe.types.UTF8String;
 
 /**
@@ -54,7 +55,7 @@ public static int write(UnsafeRow target, int ordinal, int cursor, UTF8String in
     }
   }
 
-  /** Writer for bianry (byte array) type. */
+  /** Writer for binary (byte array) type. */
   public static class BinaryWriter {
 
     public static int getSize(byte[] input) {
@@ -80,4 +81,20 @@ public static int write(UnsafeRow target, int ordinal, int cursor, byte[] input)
     }
   }
 
+  /** Writer for interval type. */
+  public static class IntervalWriter {
+
+    public static int write(UnsafeRow target, int ordinal, int cursor, Interval input) {
+      final long offset = target.getBaseOffset() + cursor;
+
+      // Write the months and microseconds fields of Interval to the variable length portion.
+      PlatformDependent.UNSAFE.putLong(target.getBaseObject(), offset, input.months);
+      PlatformDependent.UNSAFE.putLong(target.getBaseObject(), offset + 8, input.microseconds);
+
+      // Set the fixed length portion.
+      target.setLong(ordinal, ((long) cursor) << 32);
+      return 16;
+    }
+  }
+
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index ad3977281d1a9..9a11de3840ce2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.unsafe.types.{Interval, UTF8String}
 
 /**
  * An abstract class for row used internal in Spark SQL, which only contain the columns as
@@ -60,6 +60,8 @@ abstract class InternalRow extends Serializable {
 
   def getDecimal(ordinal: Int): Decimal = getAs[Decimal](ordinal, DecimalType.SYSTEM_DEFAULT)
 
+  def getInterval(ordinal: Int): Interval = getAs[Interval](ordinal, IntervalType)
+
   // This is only use for test and will throw a null pointer exception if the position is null.
   def getString(ordinal: Int): String = getUTF8String(ordinal).toString
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 6b5c450e3fb0a..41a877f214e55 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -48,6 +48,7 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
         case DoubleType => input.getDouble(ordinal)
         case StringType => input.getUTF8String(ordinal)
         case BinaryType => input.getBinary(ordinal)
+        case IntervalType => input.getInterval(ordinal)
         case t: StructType => input.getStruct(ordinal, t.size)
         case dataType => input.get(ordinal, dataType)
       }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index e208262da96dc..bd8b0177eb00e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -630,7 +630,7 @@ case class Cast(child: Expression, dataType: DataType)
   private[this] def castToIntervalCode(from: DataType): CastFunction = from match {
     case StringType =>
       (c, evPrim, evNull) =>
-        s"$evPrim = org.apache.spark.unsafe.types.Interval.fromString($c.toString());"
+        s"$evPrim = Interval.fromString($c.toString());"
   }
 
   private[this] def decimalToTimestampCode(d: String): String =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 2a1e288cb8377..2f02c90b1d5b3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -79,7 +79,6 @@ class CodeGenContext {
     mutableStates += ((javaType, variableName, initCode))
   }
 
-  final val intervalType: String = classOf[Interval].getName
   final val JAVA_BOOLEAN = "boolean"
   final val JAVA_BYTE = "byte"
   final val JAVA_SHORT = "short"
@@ -109,6 +108,7 @@ class CodeGenContext {
       case _ if isPrimitiveType(jt) => s"$row.get${primitiveTypeName(jt)}($ordinal)"
       case StringType => s"$row.getUTF8String($ordinal)"
       case BinaryType => s"$row.getBinary($ordinal)"
+      case IntervalType => s"$row.getInterval($ordinal)"
       case t: StructType => s"$row.getStruct($ordinal, ${t.size})"
       case _ => s"($jt)$row.get($ordinal)"
     }
@@ -150,7 +150,7 @@ class CodeGenContext {
     case dt: DecimalType => "Decimal"
     case BinaryType => "byte[]"
     case StringType => "UTF8String"
-    case IntervalType => intervalType
+    case IntervalType => "Interval"
     case _: StructType => "InternalRow"
     case _: ArrayType => s"scala.collection.Seq"
     case _: MapType => s"scala.collection.Map"
@@ -292,7 +292,8 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
       classOf[InternalRow].getName,
       classOf[UnsafeRow].getName,
       classOf[UTF8String].getName,
-      classOf[Decimal].getName
+      classOf[Decimal].getName,
+      classOf[Interval].getName
     ))
     evaluator.setExtendedClass(classOf[GeneratedClass])
     try {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index afd0d9cfa1ddd..9d2161947b351 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -33,10 +33,12 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
 
   private val StringWriter = classOf[UnsafeRowWriters.UTF8StringWriter].getName
   private val BinaryWriter = classOf[UnsafeRowWriters.BinaryWriter].getName
+  private val IntervalWriter = classOf[UnsafeRowWriters.IntervalWriter].getName
 
   /** Returns true iff we support this data type. */
   def canSupport(dataType: DataType): Boolean = dataType match {
     case t: AtomicType if !t.isInstanceOf[DecimalType] => true
+    case _: IntervalType => true
     case NullType => true
     case _ => false
   }
@@ -68,6 +70,8 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           s" + (${exprs(i).isNull} ? 0 : $StringWriter.getSize(${exprs(i).primitive}))"
         case BinaryType =>
           s" + (${exprs(i).isNull} ? 0 : $BinaryWriter.getSize(${exprs(i).primitive}))"
+        case IntervalType =>
+          s" + (${exprs(i).isNull} ? 0 : 16)"
         case _ => ""
       }
     }.mkString("")
@@ -80,6 +84,8 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           s"$cursorTerm += $StringWriter.write($ret, $i, $cursorTerm, ${exprs(i).primitive})"
         case BinaryType =>
           s"$cursorTerm += $BinaryWriter.write($ret, $i, $cursorTerm, ${exprs(i).primitive})"
+        case IntervalType =>
+          s"$cursorTerm += $IntervalWriter.write($ret, $i, $cursorTerm, ${exprs(i).primitive})"
         case NullType => ""
         case _ =>
           throw new UnsupportedOperationException(s"Not supported DataType: ${e.dataType}")
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 8b0f90cf3a623..ab0cdc857c80e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -78,8 +78,6 @@ trait ExpressionEvalHelper {
       generator
     } catch {
       case e: Throwable =>
-        val ctx = new CodeGenContext
-        val evaluated = expression.gen(ctx)
         fail(
           s"""
             |Code generation of $expression failed:

From 85a50a6352b72c4619d010e29e3a76774dbc0c71 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 27 Jul 2015 12:25:34 -0700
Subject: [PATCH 399/549] [HOTFIX] Disable pylint since it is failing master.

---
 dev/lint-python | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/dev/lint-python b/dev/lint-python
index e02dff220eb87..53bccc1fab535 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -96,19 +96,19 @@ fi
 
 rm "$PEP8_REPORT_PATH"
 
-for to_be_checked in "$PATHS_TO_CHECK"
-do
-    pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH"
-done
-
-if [ "${PIPESTATUS[0]}" -ne 0 ]; then
-    lint_status=1
-    echo "Pylint checks failed."
-    cat "$PYLINT_REPORT_PATH"
-else
-    echo "Pylint checks passed."
-fi
-
-rm "$PYLINT_REPORT_PATH"
+# for to_be_checked in "$PATHS_TO_CHECK"
+# do
+#     pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH"
+# done
+
+# if [ "${PIPESTATUS[0]}" -ne 0 ]; then
+#     lint_status=1
+#     echo "Pylint checks failed."
+#     cat "$PYLINT_REPORT_PATH"
+# else
+#     echo "Pylint checks passed."
+# fi
+
+# rm "$PYLINT_REPORT_PATH"
 
 exit "$lint_status"

From fa84e4a7ba6eab476487185178a556e4f04e4199 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 27 Jul 2015 13:21:04 -0700
Subject: [PATCH 400/549] Closes #7690 since it has been merged into
 branch-1.4.


From 55946e76fd136958081f073c0c5e3ff8563d505b Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Mon, 27 Jul 2015 13:26:57 -0700
Subject: [PATCH 401/549] [SPARK-9349] [SQL] UDAF cleanup

https://issues.apache.org/jira/browse/SPARK-9349

With this PR, we only expose `UserDefinedAggregateFunction` (an abstract class) and `MutableAggregationBuffer` (an interface). Other internal wrappers and helper classes are moved to `org.apache.spark.sql.execution.aggregate` and marked as `private[sql]`.

Author: Yin Huai <yhuai@databricks.com>

Closes #7687 from yhuai/UDAF-cleanup and squashes the following commits:

db36542 [Yin Huai] Add comments to UDAF examples.
ae17f66 [Yin Huai] Address comments.
9c9fa5f [Yin Huai] UDAF cleanup.
---
 .../apache/spark/sql/UDAFRegistration.scala   |   3 +-
 .../aggregate/udaf.scala                      | 122 +++++-------------
 .../apache/spark/sql/expressions/udaf.scala   | 101 +++++++++++++++
 .../spark/sql/hive/aggregate/MyDoubleAvg.java |  34 ++++-
 .../spark/sql/hive/aggregate/MyDoubleSum.java |  28 +++-
 5 files changed, 187 insertions(+), 101 deletions(-)
 rename sql/core/src/main/scala/org/apache/spark/sql/{expressions => execution}/aggregate/udaf.scala (67%)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala
index 5b872f5e3eecd..0d4e30f29255e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UDAFRegistration.scala
@@ -19,7 +19,8 @@ package org.apache.spark.sql
 
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions.{Expression}
-import org.apache.spark.sql.expressions.aggregate.{ScalaUDAF, UserDefinedAggregateFunction}
+import org.apache.spark.sql.execution.aggregate.ScalaUDAF
+import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
 
 class UDAFRegistration private[sql] (sqlContext: SQLContext) extends Logging {
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
similarity index 67%
rename from sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
index 4ada9eca7a035..073c45ae2f9f2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/aggregate/udaf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
@@ -15,87 +15,29 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.expressions.aggregate
+package org.apache.spark.sql.execution.aggregate
 
 import org.apache.spark.Logging
-import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
-import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
+import org.apache.spark.sql.catalyst.expressions.{MutableRow, InterpretedMutableProjection, AttributeReference, Expression}
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction2
-import org.apache.spark.sql.types._
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
+import org.apache.spark.sql.types.{Metadata, StructField, StructType, DataType}
 
 /**
- * The abstract class for implementing user-defined aggregate function.
+ * A Mutable [[Row]] representing an mutable aggregation buffer.
  */
-abstract class UserDefinedAggregateFunction extends Serializable {
-
-  /**
-   * A [[StructType]] represents data types of input arguments of this aggregate function.
-   * For example, if a [[UserDefinedAggregateFunction]] expects two input arguments
-   * with type of [[DoubleType]] and [[LongType]], the returned [[StructType]] will look like
-   *
-   * ```
-   *   StructType(Seq(StructField("doubleInput", DoubleType), StructField("longInput", LongType)))
-   * ```
-   *
-   * The name of a field of this [[StructType]] is only used to identify the corresponding
-   * input argument. Users can choose names to identify the input arguments.
-   */
-  def inputSchema: StructType
-
-  /**
-   * A [[StructType]] represents data types of values in the aggregation buffer.
-   * For example, if a [[UserDefinedAggregateFunction]]'s buffer has two values
-   * (i.e. two intermediate values) with type of [[DoubleType]] and [[LongType]],
-   * the returned [[StructType]] will look like
-   *
-   * ```
-   *   StructType(Seq(StructField("doubleInput", DoubleType), StructField("longInput", LongType)))
-   * ```
-   *
-   * The name of a field of this [[StructType]] is only used to identify the corresponding
-   * buffer value. Users can choose names to identify the input arguments.
-   */
-  def bufferSchema: StructType
-
-  /**
-   * The [[DataType]] of the returned value of this [[UserDefinedAggregateFunction]].
-   */
-  def returnDataType: DataType
-
-  /** Indicates if this function is deterministic. */
-  def deterministic: Boolean
-
-  /**
-   *  Initializes the given aggregation buffer. Initial values set by this method should satisfy
-   *  the condition that when merging two buffers with initial values, the new buffer should
-   *  still store initial values.
-   */
-  def initialize(buffer: MutableAggregationBuffer): Unit
-
-  /** Updates the given aggregation buffer `buffer` with new input data from `input`. */
-  def update(buffer: MutableAggregationBuffer, input: Row): Unit
-
-  /** Merges two aggregation buffers and stores the updated buffer values back in `buffer1`. */
-  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit
-
-  /**
-   * Calculates the final result of this [[UserDefinedAggregateFunction]] based on the given
-   * aggregation buffer.
-   */
-  def evaluate(buffer: Row): Any
-}
-
-private[sql] abstract class AggregationBuffer(
+private[sql] class MutableAggregationBufferImpl (
+    schema: StructType,
     toCatalystConverters: Array[Any => Any],
     toScalaConverters: Array[Any => Any],
-    bufferOffset: Int)
-  extends Row {
-
-  override def length: Int = toCatalystConverters.length
+    bufferOffset: Int,
+    var underlyingBuffer: MutableRow)
+  extends MutableAggregationBuffer {
 
-  protected val offsets: Array[Int] = {
+  private[this] val offsets: Array[Int] = {
     val newOffsets = new Array[Int](length)
     var i = 0
     while (i < newOffsets.length) {
@@ -104,18 +46,8 @@ private[sql] abstract class AggregationBuffer(
     }
     newOffsets
   }
-}
 
-/**
- * A Mutable [[Row]] representing an mutable aggregation buffer.
- */
-class MutableAggregationBuffer private[sql] (
-    schema: StructType,
-    toCatalystConverters: Array[Any => Any],
-    toScalaConverters: Array[Any => Any],
-    bufferOffset: Int,
-    var underlyingBuffer: MutableRow)
-  extends AggregationBuffer(toCatalystConverters, toScalaConverters, bufferOffset) {
+  override def length: Int = toCatalystConverters.length
 
   override def get(i: Int): Any = {
     if (i >= length || i < 0) {
@@ -133,8 +65,8 @@ class MutableAggregationBuffer private[sql] (
     underlyingBuffer.update(offsets(i), toCatalystConverters(i)(value))
   }
 
-  override def copy(): MutableAggregationBuffer = {
-    new MutableAggregationBuffer(
+  override def copy(): MutableAggregationBufferImpl = {
+    new MutableAggregationBufferImpl(
       schema,
       toCatalystConverters,
       toScalaConverters,
@@ -146,13 +78,25 @@ class MutableAggregationBuffer private[sql] (
 /**
  * A [[Row]] representing an immutable aggregation buffer.
  */
-class InputAggregationBuffer private[sql] (
+private[sql] class InputAggregationBuffer private[sql] (
     schema: StructType,
     toCatalystConverters: Array[Any => Any],
     toScalaConverters: Array[Any => Any],
     bufferOffset: Int,
     var underlyingInputBuffer: InternalRow)
-  extends AggregationBuffer(toCatalystConverters, toScalaConverters, bufferOffset) {
+  extends Row {
+
+  private[this] val offsets: Array[Int] = {
+    val newOffsets = new Array[Int](length)
+    var i = 0
+    while (i < newOffsets.length) {
+      newOffsets(i) = bufferOffset + i
+      i += 1
+    }
+    newOffsets
+  }
+
+  override def length: Int = toCatalystConverters.length
 
   override def get(i: Int): Any = {
     if (i >= length || i < 0) {
@@ -179,7 +123,7 @@ class InputAggregationBuffer private[sql] (
  * @param children
  * @param udaf
  */
-case class ScalaUDAF(
+private[sql] case class ScalaUDAF(
     children: Seq[Expression],
     udaf: UserDefinedAggregateFunction)
   extends AggregateFunction2 with Logging {
@@ -243,8 +187,8 @@ case class ScalaUDAF(
       bufferOffset,
       null)
 
-  lazy val mutableAggregateBuffer: MutableAggregationBuffer =
-    new MutableAggregationBuffer(
+  lazy val mutableAggregateBuffer: MutableAggregationBufferImpl =
+    new MutableAggregationBufferImpl(
       bufferSchema,
       bufferValuesToCatalystConverters,
       bufferValuesToScalaConverters,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala
new file mode 100644
index 0000000000000..278dd438fab4a
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.expressions
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types._
+import org.apache.spark.annotation.Experimental
+
+/**
+ * :: Experimental ::
+ * The abstract class for implementing user-defined aggregate functions.
+ */
+@Experimental
+abstract class UserDefinedAggregateFunction extends Serializable {
+
+  /**
+   * A [[StructType]] represents data types of input arguments of this aggregate function.
+   * For example, if a [[UserDefinedAggregateFunction]] expects two input arguments
+   * with type of [[DoubleType]] and [[LongType]], the returned [[StructType]] will look like
+   *
+   * ```
+   *   new StructType()
+   *    .add("doubleInput", DoubleType)
+   *    .add("longInput", LongType)
+   * ```
+   *
+   * The name of a field of this [[StructType]] is only used to identify the corresponding
+   * input argument. Users can choose names to identify the input arguments.
+   */
+  def inputSchema: StructType
+
+  /**
+   * A [[StructType]] represents data types of values in the aggregation buffer.
+   * For example, if a [[UserDefinedAggregateFunction]]'s buffer has two values
+   * (i.e. two intermediate values) with type of [[DoubleType]] and [[LongType]],
+   * the returned [[StructType]] will look like
+   *
+   * ```
+   *   new StructType()
+   *    .add("doubleInput", DoubleType)
+   *    .add("longInput", LongType)
+   * ```
+   *
+   * The name of a field of this [[StructType]] is only used to identify the corresponding
+   * buffer value. Users can choose names to identify the input arguments.
+   */
+  def bufferSchema: StructType
+
+  /**
+   * The [[DataType]] of the returned value of this [[UserDefinedAggregateFunction]].
+   */
+  def returnDataType: DataType
+
+  /** Indicates if this function is deterministic. */
+  def deterministic: Boolean
+
+  /**
+   *  Initializes the given aggregation buffer. Initial values set by this method should satisfy
+   *  the condition that when merging two buffers with initial values, the new buffer
+   *  still store initial values.
+   */
+  def initialize(buffer: MutableAggregationBuffer): Unit
+
+  /** Updates the given aggregation buffer `buffer` with new input data from `input`. */
+  def update(buffer: MutableAggregationBuffer, input: Row): Unit
+
+  /** Merges two aggregation buffers and stores the updated buffer values back to `buffer1`. */
+  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit
+
+  /**
+   * Calculates the final result of this [[UserDefinedAggregateFunction]] based on the given
+   * aggregation buffer.
+   */
+  def evaluate(buffer: Row): Any
+}
+
+/**
+ * :: Experimental ::
+ * A [[Row]] representing an mutable aggregation buffer.
+ */
+@Experimental
+trait MutableAggregationBuffer extends Row {
+
+  /** Update the ith value of this buffer. */
+  def update(i: Int, value: Any): Unit
+}
diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java
index 5c9d0e97a99c6..a2247e3da1554 100644
--- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java
+++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleAvg.java
@@ -21,13 +21,18 @@
 import java.util.List;
 
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.expressions.aggregate.MutableAggregationBuffer;
-import org.apache.spark.sql.expressions.aggregate.UserDefinedAggregateFunction;
+import org.apache.spark.sql.expressions.MutableAggregationBuffer;
+import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 
+/**
+ * An example {@link UserDefinedAggregateFunction} to calculate a special average value of a
+ * {@link org.apache.spark.sql.types.DoubleType} column. This special average value is the sum
+ * of the average value of input values and 100.0.
+ */
 public class MyDoubleAvg extends UserDefinedAggregateFunction {
 
   private StructType _inputDataType;
@@ -37,10 +42,13 @@ public class MyDoubleAvg extends UserDefinedAggregateFunction {
   private DataType _returnDataType;
 
   public MyDoubleAvg() {
-    List<StructField> inputfields = new ArrayList<StructField>();
-    inputfields.add(DataTypes.createStructField("inputDouble", DataTypes.DoubleType, true));
-    _inputDataType = DataTypes.createStructType(inputfields);
+    List<StructField> inputFields = new ArrayList<StructField>();
+    inputFields.add(DataTypes.createStructField("inputDouble", DataTypes.DoubleType, true));
+    _inputDataType = DataTypes.createStructType(inputFields);
 
+    // The buffer has two values, bufferSum for storing the current sum and
+    // bufferCount for storing the number of non-null input values that have been contribuetd
+    // to the current sum.
     List<StructField> bufferFields = new ArrayList<StructField>();
     bufferFields.add(DataTypes.createStructField("bufferSum", DataTypes.DoubleType, true));
     bufferFields.add(DataTypes.createStructField("bufferCount", DataTypes.LongType, true));
@@ -66,16 +74,23 @@ public MyDoubleAvg() {
   }
 
   @Override public void initialize(MutableAggregationBuffer buffer) {
+    // The initial value of the sum is null.
     buffer.update(0, null);
+    // The initial value of the count is 0.
     buffer.update(1, 0L);
   }
 
   @Override public void update(MutableAggregationBuffer buffer, Row input) {
+    // This input Row only has a single column storing the input value in Double.
+    // We only update the buffer when the input value is not null.
     if (!input.isNullAt(0)) {
+      // If the buffer value (the intermediate result of the sum) is still null,
+      // we set the input value to the buffer and set the bufferCount to 1.
       if (buffer.isNullAt(0)) {
         buffer.update(0, input.getDouble(0));
         buffer.update(1, 1L);
       } else {
+        // Otherwise, update the bufferSum and increment bufferCount.
         Double newValue = input.getDouble(0) + buffer.getDouble(0);
         buffer.update(0, newValue);
         buffer.update(1, buffer.getLong(1) + 1L);
@@ -84,11 +99,16 @@ public MyDoubleAvg() {
   }
 
   @Override public void merge(MutableAggregationBuffer buffer1, Row buffer2) {
+    // buffer1 and buffer2 have the same structure.
+    // We only update the buffer1 when the input buffer2's sum value is not null.
     if (!buffer2.isNullAt(0)) {
       if (buffer1.isNullAt(0)) {
+        // If the buffer value (intermediate result of the sum) is still null,
+        // we set the it as the input buffer's value.
         buffer1.update(0, buffer2.getDouble(0));
         buffer1.update(1, buffer2.getLong(1));
       } else {
+        // Otherwise, we update the bufferSum and bufferCount.
         Double newValue = buffer2.getDouble(0) + buffer1.getDouble(0);
         buffer1.update(0, newValue);
         buffer1.update(1, buffer1.getLong(1) + buffer2.getLong(1));
@@ -98,10 +118,12 @@ public MyDoubleAvg() {
 
   @Override public Object evaluate(Row buffer) {
     if (buffer.isNullAt(0)) {
+      // If the bufferSum is still null, we return null because this function has not got
+      // any input row.
       return null;
     } else {
+      // Otherwise, we calculate the special average value.
       return buffer.getDouble(0) / buffer.getLong(1) + 100.0;
     }
   }
 }
-
diff --git a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java
index 1d4587a27c787..da29e24d267dd 100644
--- a/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java
+++ b/sql/hive/src/test/java/test/org/apache/spark/sql/hive/aggregate/MyDoubleSum.java
@@ -20,14 +20,18 @@
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.spark.sql.expressions.aggregate.MutableAggregationBuffer;
-import org.apache.spark.sql.expressions.aggregate.UserDefinedAggregateFunction;
+import org.apache.spark.sql.expressions.MutableAggregationBuffer;
+import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.Row;
 
+/**
+ * An example {@link UserDefinedAggregateFunction} to calculate the sum of a
+ * {@link org.apache.spark.sql.types.DoubleType} column.
+ */
 public class MyDoubleSum extends UserDefinedAggregateFunction {
 
   private StructType _inputDataType;
@@ -37,9 +41,9 @@ public class MyDoubleSum extends UserDefinedAggregateFunction {
   private DataType _returnDataType;
 
   public MyDoubleSum() {
-    List<StructField> inputfields = new ArrayList<StructField>();
-    inputfields.add(DataTypes.createStructField("inputDouble", DataTypes.DoubleType, true));
-    _inputDataType = DataTypes.createStructType(inputfields);
+    List<StructField> inputFields = new ArrayList<StructField>();
+    inputFields.add(DataTypes.createStructField("inputDouble", DataTypes.DoubleType, true));
+    _inputDataType = DataTypes.createStructType(inputFields);
 
     List<StructField> bufferFields = new ArrayList<StructField>();
     bufferFields.add(DataTypes.createStructField("bufferDouble", DataTypes.DoubleType, true));
@@ -65,14 +69,20 @@ public MyDoubleSum() {
   }
 
   @Override public void initialize(MutableAggregationBuffer buffer) {
+    // The initial value of the sum is null.
     buffer.update(0, null);
   }
 
   @Override public void update(MutableAggregationBuffer buffer, Row input) {
+    // This input Row only has a single column storing the input value in Double.
+    // We only update the buffer when the input value is not null.
     if (!input.isNullAt(0)) {
       if (buffer.isNullAt(0)) {
+        // If the buffer value (the intermediate result of the sum) is still null,
+        // we set the input value to the buffer.
         buffer.update(0, input.getDouble(0));
       } else {
+        // Otherwise, we add the input value to the buffer value.
         Double newValue = input.getDouble(0) + buffer.getDouble(0);
         buffer.update(0, newValue);
       }
@@ -80,10 +90,16 @@ public MyDoubleSum() {
   }
 
   @Override public void merge(MutableAggregationBuffer buffer1, Row buffer2) {
+    // buffer1 and buffer2 have the same structure.
+    // We only update the buffer1 when the input buffer2's value is not null.
     if (!buffer2.isNullAt(0)) {
       if (buffer1.isNullAt(0)) {
+        // If the buffer value (intermediate result of the sum) is still null,
+        // we set the it as the input buffer's value.
         buffer1.update(0, buffer2.getDouble(0));
       } else {
+        // Otherwise, we add the input buffer's value (buffer1) to the mutable
+        // buffer's value (buffer2).
         Double newValue = buffer2.getDouble(0) + buffer1.getDouble(0);
         buffer1.update(0, newValue);
       }
@@ -92,8 +108,10 @@ public MyDoubleSum() {
 
   @Override public Object evaluate(Row buffer) {
     if (buffer.isNullAt(0)) {
+      // If the buffer value is still null, we return null.
       return null;
     } else {
+      // Otherwise, the intermediate sum is the final result.
       return buffer.getDouble(0);
     }
   }

From 8e7d2bee23dad1535846dae2dc31e35058db16cd Mon Sep 17 00:00:00 2001
From: Cheng Lian <lian@databricks.com>
Date: Mon, 27 Jul 2015 13:28:03 -0700
Subject: [PATCH 402/549] [SPARK-9378] [SQL] Fixes test case "CTAS with serde"

This is a proper version of PR #7693 authored by viirya

The reason why "CTAS with serde" fails is that the `MetastoreRelation` gets converted to a Parquet data source relation by default.

Author: Cheng Lian <lian@databricks.com>

Closes #7700 from liancheng/spark-9378-fix-ctas-test and squashes the following commits:

4413af0 [Cheng Lian] Fixes test case "CTAS with serde"
---
 .../spark/sql/hive/execution/SQLQuerySuite.scala | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 8371dd0716c06..c4923d83e48f3 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -406,13 +406,15 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils {
         |   FROM src
         |   ORDER BY key, value""".stripMargin).collect()
 
-    checkExistence(sql("DESC EXTENDED ctas5"), true,
-      "name:key", "type:string", "name:value", "ctas5",
-      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
-      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
-      "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
-      "MANAGED_TABLE"
-    )
+    withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "false") {
+      checkExistence(sql("DESC EXTENDED ctas5"), true,
+        "name:key", "type:string", "name:value", "ctas5",
+        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
+        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
+        "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
+        "MANAGED_TABLE"
+      )
+    }
 
     // use the Hive SerDe for parquet tables
     withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "false") {

From 3ab7525dceeb1c2f3c21efb1ee5a9c8bb0fd0c13 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Mon, 27 Jul 2015 13:40:50 -0700
Subject: [PATCH 403/549] [SPARK-9355][SQL] Remove InternalRow.get generic
 getter call in columnar cache code

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7673 from cloud-fan/row-generic-getter-columnar and squashes the following commits:

88b1170 [Wenchen Fan] fix style
eeae712 [Wenchen Fan] Remove Internal.get generic getter call in columnar cache code
---
 .../spark/sql/columnar/ColumnAccessor.scala   | 12 ++---
 .../spark/sql/columnar/ColumnBuilder.scala    | 18 +++----
 .../spark/sql/columnar/ColumnStats.scala      |  6 ++-
 .../spark/sql/columnar/ColumnType.scala       | 49 +++++++++++--------
 .../compression/CompressionScheme.scala       |  2 +-
 .../compression/compressionSchemes.scala      | 14 +++---
 .../spark/sql/columnar/ColumnStatsSuite.scala | 12 ++---
 .../spark/sql/columnar/ColumnTypeSuite.scala  | 30 ++++++------
 .../sql/columnar/ColumnarTestUtils.scala      | 18 +++----
 .../NullableColumnAccessorSuite.scala         | 18 +++----
 .../columnar/NullableColumnBuilderSuite.scala | 21 ++++----
 .../compression/BooleanBitSetSuite.scala      |  2 +-
 12 files changed, 107 insertions(+), 95 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnAccessor.scala
index 931469bed634a..4c29a093218a0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnAccessor.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnAccessor.scala
@@ -41,9 +41,9 @@ private[sql] trait ColumnAccessor {
   protected def underlyingBuffer: ByteBuffer
 }
 
-private[sql] abstract class BasicColumnAccessor[T <: DataType, JvmType](
+private[sql] abstract class BasicColumnAccessor[JvmType](
     protected val buffer: ByteBuffer,
-    protected val columnType: ColumnType[T, JvmType])
+    protected val columnType: ColumnType[JvmType])
   extends ColumnAccessor {
 
   protected def initialize() {}
@@ -93,14 +93,14 @@ private[sql] class StringColumnAccessor(buffer: ByteBuffer)
   extends NativeColumnAccessor(buffer, STRING)
 
 private[sql] class BinaryColumnAccessor(buffer: ByteBuffer)
-  extends BasicColumnAccessor[BinaryType.type, Array[Byte]](buffer, BINARY)
+  extends BasicColumnAccessor[Array[Byte]](buffer, BINARY)
   with NullableColumnAccessor
 
 private[sql] class FixedDecimalColumnAccessor(buffer: ByteBuffer, precision: Int, scale: Int)
   extends NativeColumnAccessor(buffer, FIXED_DECIMAL(precision, scale))
 
-private[sql] class GenericColumnAccessor(buffer: ByteBuffer)
-  extends BasicColumnAccessor[DataType, Array[Byte]](buffer, GENERIC)
+private[sql] class GenericColumnAccessor(buffer: ByteBuffer, dataType: DataType)
+  extends BasicColumnAccessor[Array[Byte]](buffer, GENERIC(dataType))
   with NullableColumnAccessor
 
 private[sql] class DateColumnAccessor(buffer: ByteBuffer)
@@ -131,7 +131,7 @@ private[sql] object ColumnAccessor {
       case BinaryType => new BinaryColumnAccessor(dup)
       case DecimalType.Fixed(precision, scale) if precision < 19 =>
         new FixedDecimalColumnAccessor(dup, precision, scale)
-      case _ => new GenericColumnAccessor(dup)
+      case other => new GenericColumnAccessor(dup, other)
     }
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
index 087c52239713d..454b7b91a63f5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
@@ -46,9 +46,9 @@ private[sql] trait ColumnBuilder {
   def build(): ByteBuffer
 }
 
-private[sql] class BasicColumnBuilder[T <: DataType, JvmType](
+private[sql] class BasicColumnBuilder[JvmType](
     val columnStats: ColumnStats,
-    val columnType: ColumnType[T, JvmType])
+    val columnType: ColumnType[JvmType])
   extends ColumnBuilder {
 
   protected var columnName: String = _
@@ -78,16 +78,16 @@ private[sql] class BasicColumnBuilder[T <: DataType, JvmType](
   }
 }
 
-private[sql] abstract class ComplexColumnBuilder[T <: DataType, JvmType](
+private[sql] abstract class ComplexColumnBuilder[JvmType](
     columnStats: ColumnStats,
-    columnType: ColumnType[T, JvmType])
-  extends BasicColumnBuilder[T, JvmType](columnStats, columnType)
+    columnType: ColumnType[JvmType])
+  extends BasicColumnBuilder[JvmType](columnStats, columnType)
   with NullableColumnBuilder
 
 private[sql] abstract class NativeColumnBuilder[T <: AtomicType](
     override val columnStats: ColumnStats,
     override val columnType: NativeColumnType[T])
-  extends BasicColumnBuilder[T, T#InternalType](columnStats, columnType)
+  extends BasicColumnBuilder[T#InternalType](columnStats, columnType)
   with NullableColumnBuilder
   with AllCompressionSchemes
   with CompressibleColumnBuilder[T]
@@ -118,8 +118,8 @@ private[sql] class FixedDecimalColumnBuilder(
     FIXED_DECIMAL(precision, scale))
 
 // TODO (lian) Add support for array, struct and map
-private[sql] class GenericColumnBuilder
-  extends ComplexColumnBuilder(new GenericColumnStats, GENERIC)
+private[sql] class GenericColumnBuilder(dataType: DataType)
+  extends ComplexColumnBuilder(new GenericColumnStats(dataType), GENERIC(dataType))
 
 private[sql] class DateColumnBuilder extends NativeColumnBuilder(new DateColumnStats, DATE)
 
@@ -164,7 +164,7 @@ private[sql] object ColumnBuilder {
       case BinaryType => new BinaryColumnBuilder
       case DecimalType.Fixed(precision, scale) if precision < 19 =>
         new FixedDecimalColumnBuilder(precision, scale)
-      case _ => new GenericColumnBuilder
+      case other => new GenericColumnBuilder(other)
     }
 
     builder.initialize(initialSize, columnName, useCompression)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
index 7c63179af6470..32a84b2676e07 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
@@ -252,11 +252,13 @@ private[sql] class FixedDecimalColumnStats extends ColumnStats {
     InternalRow(lower, upper, nullCount, count, sizeInBytes)
 }
 
-private[sql] class GenericColumnStats extends ColumnStats {
+private[sql] class GenericColumnStats(dataType: DataType) extends ColumnStats {
+  val columnType = GENERIC(dataType)
+
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
-      sizeInBytes += GENERIC.actualSize(row, ordinal)
+      sizeInBytes += columnType.actualSize(row, ordinal)
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
index c0ca52751b66c..2863f6c230a9d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
@@ -31,14 +31,18 @@ import org.apache.spark.unsafe.types.UTF8String
  * An abstract class that represents type of a column. Used to append/extract Java objects into/from
  * the underlying [[ByteBuffer]] of a column.
  *
- * @param typeId A unique ID representing the type.
- * @param defaultSize Default size in bytes for one element of type T (e.g. 4 for `Int`).
- * @tparam T Scala data type for the column.
  * @tparam JvmType Underlying Java type to represent the elements.
  */
-private[sql] sealed abstract class ColumnType[T <: DataType, JvmType](
-    val typeId: Int,
-    val defaultSize: Int) {
+private[sql] sealed abstract class ColumnType[JvmType] {
+
+  // The catalyst data type of this column.
+  def dataType: DataType
+
+  // A unique ID representing the type.
+  def typeId: Int
+
+  // Default size in bytes for one element of type T (e.g. 4 for `Int`).
+  def defaultSize: Int
 
   /**
    * Extracts a value out of the buffer at the buffer's current position.
@@ -90,7 +94,7 @@ private[sql] sealed abstract class ColumnType[T <: DataType, JvmType](
    * boxing/unboxing costs whenever possible.
    */
   def copyField(from: InternalRow, fromOrdinal: Int, to: MutableRow, toOrdinal: Int): Unit = {
-    to(toOrdinal) = from.get(fromOrdinal)
+    to.update(toOrdinal, from.get(fromOrdinal, dataType))
   }
 
   /**
@@ -103,9 +107,9 @@ private[sql] sealed abstract class ColumnType[T <: DataType, JvmType](
 
 private[sql] abstract class NativeColumnType[T <: AtomicType](
     val dataType: T,
-    typeId: Int,
-    defaultSize: Int)
-  extends ColumnType[T, T#InternalType](typeId, defaultSize) {
+    val typeId: Int,
+    val defaultSize: Int)
+  extends ColumnType[T#InternalType] {
 
   /**
    * Scala TypeTag. Can be used to create primitive arrays and hash tables.
@@ -400,10 +404,10 @@ private[sql] object FIXED_DECIMAL {
   val defaultSize = 8
 }
 
-private[sql] sealed abstract class ByteArrayColumnType[T <: DataType](
-    typeId: Int,
-    defaultSize: Int)
-  extends ColumnType[T, Array[Byte]](typeId, defaultSize) {
+private[sql] sealed abstract class ByteArrayColumnType(
+    val typeId: Int,
+    val defaultSize: Int)
+  extends ColumnType[Array[Byte]] {
 
   override def actualSize(row: InternalRow, ordinal: Int): Int = {
     getField(row, ordinal).length + 4
@@ -421,9 +425,12 @@ private[sql] sealed abstract class ByteArrayColumnType[T <: DataType](
   }
 }
 
-private[sql] object BINARY extends ByteArrayColumnType[BinaryType.type](11, 16) {
+private[sql] object BINARY extends ByteArrayColumnType(11, 16) {
+
+  def dataType: DataType = BooleanType
+
   override def setField(row: MutableRow, ordinal: Int, value: Array[Byte]): Unit = {
-    row(ordinal) = value
+    row.update(ordinal, value)
   }
 
   override def getField(row: InternalRow, ordinal: Int): Array[Byte] = {
@@ -434,18 +441,18 @@ private[sql] object BINARY extends ByteArrayColumnType[BinaryType.type](11, 16)
 // Used to process generic objects (all types other than those listed above). Objects should be
 // serialized first before appending to the column `ByteBuffer`, and is also extracted as serialized
 // byte array.
-private[sql] object GENERIC extends ByteArrayColumnType[DataType](12, 16) {
+private[sql] case class GENERIC(dataType: DataType) extends ByteArrayColumnType(12, 16) {
   override def setField(row: MutableRow, ordinal: Int, value: Array[Byte]): Unit = {
-    row(ordinal) = SparkSqlSerializer.deserialize[Any](value)
+    row.update(ordinal, SparkSqlSerializer.deserialize[Any](value))
   }
 
   override def getField(row: InternalRow, ordinal: Int): Array[Byte] = {
-    SparkSqlSerializer.serialize(row.get(ordinal))
+    SparkSqlSerializer.serialize(row.get(ordinal, dataType))
   }
 }
 
 private[sql] object ColumnType {
-  def apply(dataType: DataType): ColumnType[_, _] = {
+  def apply(dataType: DataType): ColumnType[_] = {
     dataType match {
       case BooleanType => BOOLEAN
       case ByteType => BYTE
@@ -460,7 +467,7 @@ private[sql] object ColumnType {
       case BinaryType => BINARY
       case DecimalType.Fixed(precision, scale) if precision < 19 =>
         FIXED_DECIMAL(precision, scale)
-      case _ => GENERIC
+      case other => GENERIC(other)
     }
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressionScheme.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressionScheme.scala
index 4eaec6d853d4d..b1ef9b2ef7849 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressionScheme.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/CompressionScheme.scala
@@ -46,7 +46,7 @@ private[sql] trait Decoder[T <: AtomicType] {
 private[sql] trait CompressionScheme {
   def typeId: Int
 
-  def supports(columnType: ColumnType[_, _]): Boolean
+  def supports(columnType: ColumnType[_]): Boolean
 
   def encoder[T <: AtomicType](columnType: NativeColumnType[T]): Encoder[T]
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala
index 6150df6930b32..c91d960a0932b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/compression/compressionSchemes.scala
@@ -32,7 +32,7 @@ import org.apache.spark.util.Utils
 private[sql] case object PassThrough extends CompressionScheme {
   override val typeId = 0
 
-  override def supports(columnType: ColumnType[_, _]): Boolean = true
+  override def supports(columnType: ColumnType[_]): Boolean = true
 
   override def encoder[T <: AtomicType](columnType: NativeColumnType[T]): Encoder[T] = {
     new this.Encoder[T](columnType)
@@ -78,7 +78,7 @@ private[sql] case object RunLengthEncoding extends CompressionScheme {
     new this.Decoder(buffer, columnType)
   }
 
-  override def supports(columnType: ColumnType[_, _]): Boolean = columnType match {
+  override def supports(columnType: ColumnType[_]): Boolean = columnType match {
     case INT | LONG | SHORT | BYTE | STRING | BOOLEAN => true
     case _ => false
   }
@@ -128,7 +128,7 @@ private[sql] case object RunLengthEncoding extends CompressionScheme {
         while (from.hasRemaining) {
           columnType.extract(from, value, 0)
 
-          if (value.get(0) == currentValue.get(0)) {
+          if (value.get(0, columnType.dataType) == currentValue.get(0, columnType.dataType)) {
             currentRun += 1
           } else {
             // Writes current run
@@ -189,7 +189,7 @@ private[sql] case object DictionaryEncoding extends CompressionScheme {
     new this.Encoder[T](columnType)
   }
 
-  override def supports(columnType: ColumnType[_, _]): Boolean = columnType match {
+  override def supports(columnType: ColumnType[_]): Boolean = columnType match {
     case INT | LONG | STRING => true
     case _ => false
   }
@@ -304,7 +304,7 @@ private[sql] case object BooleanBitSet extends CompressionScheme {
     (new this.Encoder).asInstanceOf[compression.Encoder[T]]
   }
 
-  override def supports(columnType: ColumnType[_, _]): Boolean = columnType == BOOLEAN
+  override def supports(columnType: ColumnType[_]): Boolean = columnType == BOOLEAN
 
   class Encoder extends compression.Encoder[BooleanType.type] {
     private var _uncompressedSize = 0
@@ -392,7 +392,7 @@ private[sql] case object IntDelta extends CompressionScheme {
     (new Encoder).asInstanceOf[compression.Encoder[T]]
   }
 
-  override def supports(columnType: ColumnType[_, _]): Boolean = columnType == INT
+  override def supports(columnType: ColumnType[_]): Boolean = columnType == INT
 
   class Encoder extends compression.Encoder[IntegerType.type] {
     protected var _compressedSize: Int = 0
@@ -472,7 +472,7 @@ private[sql] case object LongDelta extends CompressionScheme {
     (new Encoder).asInstanceOf[compression.Encoder[T]]
   }
 
-  override def supports(columnType: ColumnType[_, _]): Boolean = columnType == LONG
+  override def supports(columnType: ColumnType[_]): Boolean = columnType == LONG
 
   class Encoder extends compression.Encoder[LongType.type] {
     protected var _compressedSize: Int = 0
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
index 31e7b0e72e510..4499a7207031d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
@@ -58,15 +58,15 @@ class ColumnStatsSuite extends SparkFunSuite {
       val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
       rows.foreach(columnStats.gatherStats(_, 0))
 
-      val values = rows.take(10).map(_.get(0).asInstanceOf[T#InternalType])
+      val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType])
       val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]]
       val stats = columnStats.collectedStatistics
 
-      assertResult(values.min(ordering), "Wrong lower bound")(stats.get(0))
-      assertResult(values.max(ordering), "Wrong upper bound")(stats.get(1))
-      assertResult(10, "Wrong null count")(stats.get(2))
-      assertResult(20, "Wrong row count")(stats.get(3))
-      assertResult(stats.get(4), "Wrong size in bytes") {
+      assertResult(values.min(ordering), "Wrong lower bound")(stats.genericGet(0))
+      assertResult(values.max(ordering), "Wrong upper bound")(stats.genericGet(1))
+      assertResult(10, "Wrong null count")(stats.genericGet(2))
+      assertResult(20, "Wrong row count")(stats.genericGet(3))
+      assertResult(stats.genericGet(4), "Wrong size in bytes") {
         rows.map { row =>
           if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0)
         }.sum
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
index 4d46a657056e0..8f024690efd0d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala
@@ -32,13 +32,15 @@ import org.apache.spark.unsafe.types.UTF8String
 
 
 class ColumnTypeSuite extends SparkFunSuite with Logging {
-  val DEFAULT_BUFFER_SIZE = 512
+  private val DEFAULT_BUFFER_SIZE = 512
+  private val MAP_GENERIC = GENERIC(MapType(IntegerType, StringType))
 
   test("defaultSize") {
     val checks = Map(
       BOOLEAN -> 1, BYTE -> 1, SHORT -> 2, INT -> 4, DATE -> 4,
       LONG -> 8, TIMESTAMP -> 8, FLOAT -> 4, DOUBLE -> 8,
-      STRING -> 8, BINARY -> 16, FIXED_DECIMAL(15, 10) -> 8, GENERIC -> 16)
+      STRING -> 8, BINARY -> 16, FIXED_DECIMAL(15, 10) -> 8,
+      MAP_GENERIC -> 16)
 
     checks.foreach { case (columnType, expectedSize) =>
       assertResult(expectedSize, s"Wrong defaultSize for $columnType") {
@@ -48,8 +50,8 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
   }
 
   test("actualSize") {
-    def checkActualSize[T <: DataType, JvmType](
-        columnType: ColumnType[T, JvmType],
+    def checkActualSize[JvmType](
+        columnType: ColumnType[JvmType],
         value: JvmType,
         expected: Int): Unit = {
 
@@ -74,7 +76,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
     checkActualSize(FIXED_DECIMAL(15, 10), Decimal(0, 15, 10), 8)
 
     val generic = Map(1 -> "a")
-    checkActualSize(GENERIC, SparkSqlSerializer.serialize(generic), 4 + 8)
+    checkActualSize(MAP_GENERIC, SparkSqlSerializer.serialize(generic), 4 + 8)
   }
 
   testNativeColumnType(BOOLEAN)(
@@ -123,7 +125,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
       UTF8String.fromBytes(bytes)
     })
 
-  testColumnType[BinaryType.type, Array[Byte]](
+  testColumnType[Array[Byte]](
     BINARY,
     (buffer: ByteBuffer, bytes: Array[Byte]) => {
       buffer.putInt(bytes.length).put(bytes)
@@ -140,7 +142,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
     val obj = Map(1 -> "spark", 2 -> "sql")
     val serializedObj = SparkSqlSerializer.serialize(obj)
 
-    GENERIC.append(SparkSqlSerializer.serialize(obj), buffer)
+    MAP_GENERIC.append(SparkSqlSerializer.serialize(obj), buffer)
     buffer.rewind()
 
     val length = buffer.getInt()
@@ -157,7 +159,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
 
     assertResult(obj, "Deserialized object didn't equal to the original object") {
       buffer.rewind()
-      SparkSqlSerializer.deserialize(GENERIC.extract(buffer))
+      SparkSqlSerializer.deserialize(MAP_GENERIC.extract(buffer))
     }
   }
 
@@ -170,7 +172,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
     val obj = CustomClass(Int.MaxValue, Long.MaxValue)
     val serializedObj = serializer.serialize(obj).array()
 
-    GENERIC.append(serializer.serialize(obj).array(), buffer)
+    MAP_GENERIC.append(serializer.serialize(obj).array(), buffer)
     buffer.rewind()
 
     val length = buffer.getInt
@@ -192,7 +194,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
 
     assertResult(obj, "Custom deserialized object didn't equal the original object") {
       buffer.rewind()
-      serializer.deserialize(ByteBuffer.wrap(GENERIC.extract(buffer)))
+      serializer.deserialize(ByteBuffer.wrap(MAP_GENERIC.extract(buffer)))
     }
   }
 
@@ -201,11 +203,11 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
       (putter: (ByteBuffer, T#InternalType) => Unit,
       getter: (ByteBuffer) => T#InternalType): Unit = {
 
-    testColumnType[T, T#InternalType](columnType, putter, getter)
+    testColumnType[T#InternalType](columnType, putter, getter)
   }
 
-  def testColumnType[T <: DataType, JvmType](
-      columnType: ColumnType[T, JvmType],
+  def testColumnType[JvmType](
+      columnType: ColumnType[JvmType],
       putter: (ByteBuffer, JvmType) => Unit,
       getter: (ByteBuffer) => JvmType): Unit = {
 
@@ -262,7 +264,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
       }
     }
 
-    assertResult(GENERIC) {
+    assertResult(GENERIC(DecimalType(19, 0))) {
       ColumnType(DecimalType(19, 0))
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
index d9861339739c9..79bb7d072feb2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnarTestUtils.scala
@@ -31,7 +31,7 @@ object ColumnarTestUtils {
     row
   }
 
-  def makeRandomValue[T <: DataType, JvmType](columnType: ColumnType[T, JvmType]): JvmType = {
+  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
     def randomBytes(length: Int) = {
       val bytes = new Array[Byte](length)
       Random.nextBytes(bytes)
@@ -58,15 +58,15 @@ object ColumnarTestUtils {
   }
 
   def makeRandomValues(
-      head: ColumnType[_ <: DataType, _],
-      tail: ColumnType[_ <: DataType, _]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)
+      head: ColumnType[_],
+      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)
 
-  def makeRandomValues(columnTypes: Seq[ColumnType[_ <: DataType, _]]): Seq[Any] = {
+  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
     columnTypes.map(makeRandomValue(_))
   }
 
-  def makeUniqueRandomValues[T <: DataType, JvmType](
-      columnType: ColumnType[T, JvmType],
+  def makeUniqueRandomValues[JvmType](
+      columnType: ColumnType[JvmType],
       count: Int): Seq[JvmType] = {
 
     Iterator.iterate(HashSet.empty[JvmType]) { set =>
@@ -75,10 +75,10 @@ object ColumnarTestUtils {
   }
 
   def makeRandomRow(
-      head: ColumnType[_ <: DataType, _],
-      tail: ColumnType[_ <: DataType, _]*): InternalRow = makeRandomRow(Seq(head) ++ tail)
+      head: ColumnType[_],
+      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)
 
-  def makeRandomRow(columnTypes: Seq[ColumnType[_ <: DataType, _]]): InternalRow = {
+  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
     val row = new GenericMutableRow(columnTypes.length)
     makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
       row(index) = value
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala
index d421f4d8d091e..f4f6c7649bfa8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala
@@ -21,17 +21,17 @@ import java.nio.ByteBuffer
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types.{StringType, ArrayType, DataType}
 
-class TestNullableColumnAccessor[T <: DataType, JvmType](
+class TestNullableColumnAccessor[JvmType](
     buffer: ByteBuffer,
-    columnType: ColumnType[T, JvmType])
+    columnType: ColumnType[JvmType])
   extends BasicColumnAccessor(buffer, columnType)
   with NullableColumnAccessor
 
 object TestNullableColumnAccessor {
-  def apply[T <: DataType, JvmType](buffer: ByteBuffer, columnType: ColumnType[T, JvmType])
-    : TestNullableColumnAccessor[T, JvmType] = {
+  def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType])
+    : TestNullableColumnAccessor[JvmType] = {
     // Skips the column type ID
     buffer.getInt()
     new TestNullableColumnAccessor(buffer, columnType)
@@ -43,13 +43,13 @@ class NullableColumnAccessorSuite extends SparkFunSuite {
 
   Seq(
     BOOLEAN, BYTE, SHORT, INT, DATE, LONG, TIMESTAMP, FLOAT, DOUBLE,
-    STRING, BINARY, FIXED_DECIMAL(15, 10), GENERIC)
+    STRING, BINARY, FIXED_DECIMAL(15, 10), GENERIC(ArrayType(StringType)))
     .foreach {
     testNullableColumnAccessor(_)
   }
 
-  def testNullableColumnAccessor[T <: DataType, JvmType](
-      columnType: ColumnType[T, JvmType]): Unit = {
+  def testNullableColumnAccessor[JvmType](
+      columnType: ColumnType[JvmType]): Unit = {
 
     val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
     val nullRow = makeNullRow(1)
@@ -75,7 +75,7 @@ class NullableColumnAccessorSuite extends SparkFunSuite {
       (0 until 4).foreach { _ =>
         assert(accessor.hasNext)
         accessor.extractTo(row, 0)
-        assert(row.get(0) === randomRow.get(0))
+        assert(row.get(0, columnType.dataType) === randomRow.get(0, columnType.dataType))
 
         assert(accessor.hasNext)
         accessor.extractTo(row, 0)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala
index cd8bf75ff1752..241d09ea205e9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala
@@ -21,13 +21,13 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.execution.SparkSqlSerializer
 import org.apache.spark.sql.types._
 
-class TestNullableColumnBuilder[T <: DataType, JvmType](columnType: ColumnType[T, JvmType])
-  extends BasicColumnBuilder[T, JvmType](new NoopColumnStats, columnType)
+class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType])
+  extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType)
   with NullableColumnBuilder
 
 object TestNullableColumnBuilder {
-  def apply[T <: DataType, JvmType](columnType: ColumnType[T, JvmType], initialSize: Int = 0)
-    : TestNullableColumnBuilder[T, JvmType] = {
+  def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0)
+    : TestNullableColumnBuilder[JvmType] = {
     val builder = new TestNullableColumnBuilder(columnType)
     builder.initialize(initialSize)
     builder
@@ -39,13 +39,13 @@ class NullableColumnBuilderSuite extends SparkFunSuite {
 
   Seq(
     BOOLEAN, BYTE, SHORT, INT, DATE, LONG, TIMESTAMP, FLOAT, DOUBLE,
-    STRING, BINARY, FIXED_DECIMAL(15, 10), GENERIC)
+    STRING, BINARY, FIXED_DECIMAL(15, 10), GENERIC(ArrayType(StringType)))
     .foreach {
     testNullableColumnBuilder(_)
   }
 
-  def testNullableColumnBuilder[T <: DataType, JvmType](
-      columnType: ColumnType[T, JvmType]): Unit = {
+  def testNullableColumnBuilder[JvmType](
+      columnType: ColumnType[JvmType]): Unit = {
 
     val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
 
@@ -92,13 +92,14 @@ class NullableColumnBuilderSuite extends SparkFunSuite {
 
       // For non-null values
       (0 until 4).foreach { _ =>
-        val actual = if (columnType == GENERIC) {
-          SparkSqlSerializer.deserialize[Any](GENERIC.extract(buffer))
+        val actual = if (columnType.isInstanceOf[GENERIC]) {
+          SparkSqlSerializer.deserialize[Any](columnType.extract(buffer).asInstanceOf[Array[Byte]])
         } else {
           columnType.extract(buffer)
         }
 
-        assert(actual === randomRow.get(0), "Extracted value didn't equal to the original one")
+        assert(actual === randomRow.get(0, columnType.dataType),
+          "Extracted value didn't equal to the original one")
       }
 
       assert(!buffer.hasRemaining)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
index 33092c83a1a1c..9a2948c59ba42 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/compression/BooleanBitSetSuite.scala
@@ -33,7 +33,7 @@ class BooleanBitSetSuite extends SparkFunSuite {
 
     val builder = TestCompressibleColumnBuilder(new NoopColumnStats, BOOLEAN, BooleanBitSet)
     val rows = Seq.fill[InternalRow](count)(makeRandomRow(BOOLEAN))
-    val values = rows.map(_.get(0))
+    val values = rows.map(_.getBoolean(0))
 
     rows.foreach(builder.appendFrom(_, 0))
     val buffer = builder.build()

From c1be9f309acad4d1b1908fa7800e7ef4f3e872ce Mon Sep 17 00:00:00 2001
From: Hari Shreedharan <hshreedharan@apache.org>
Date: Mon, 27 Jul 2015 15:16:46 -0700
Subject: [PATCH 404/549] =?UTF-8?q?[SPARK-8988]=20[YARN]=20Make=20sure=20d?=
 =?UTF-8?q?river=20log=20links=20appear=20in=20secure=20cluste=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…r mode.

The NodeReports API currently used does not work in secure mode since we do not get RM tokens. Instead this patch just uses environment vars exported by YARN to create the log links.

Author: Hari Shreedharan <hshreedharan@apache.org>

Closes #7624 from harishreedharan/driver-logs-env and squashes the following commits:

7368c7e [Hari Shreedharan] [SPARK-8988][YARN] Make sure driver log links appear in secure cluster mode.
---
 .../cluster/YarnClusterSchedulerBackend.scala | 71 +++++--------------
 1 file changed, 17 insertions(+), 54 deletions(-)

diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
index 33f580aaebdc0..1aed5a1675075 100644
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
+++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
@@ -19,6 +19,8 @@ package org.apache.spark.scheduler.cluster
 
 import java.net.NetworkInterface
 
+import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
+
 import scala.collection.JavaConverters._
 
 import org.apache.hadoop.yarn.api.records.NodeState
@@ -64,68 +66,29 @@ private[spark] class YarnClusterSchedulerBackend(
     }
 
   override def getDriverLogUrls: Option[Map[String, String]] = {
-    var yarnClientOpt: Option[YarnClient] = None
     var driverLogs: Option[Map[String, String]] = None
     try {
       val yarnConf = new YarnConfiguration(sc.hadoopConfiguration)
       val containerId = YarnSparkHadoopUtil.get.getContainerId
-      yarnClientOpt = Some(YarnClient.createYarnClient())
-      yarnClientOpt.foreach { yarnClient =>
-        yarnClient.init(yarnConf)
-        yarnClient.start()
-
-        // For newer versions of YARN, we can find the HTTP address for a given node by getting a
-        // container report for a given container. But container reports came only in Hadoop 2.4,
-        // so we basically have to get the node reports for all nodes and find the one which runs
-        // this container. For that we have to compare the node's host against the current host.
-        // Since the host can have multiple addresses, we need to compare against all of them to
-        // find out if one matches.
-
-        // Get all the addresses of this node.
-        val addresses =
-          NetworkInterface.getNetworkInterfaces.asScala
-            .flatMap(_.getInetAddresses.asScala)
-            .toSeq
-
-        // Find a node report that matches one of the addresses
-        val nodeReport =
-          yarnClient.getNodeReports(NodeState.RUNNING).asScala.find { x =>
-            val host = x.getNodeId.getHost
-            addresses.exists { address =>
-              address.getHostAddress == host ||
-                address.getHostName == host ||
-                address.getCanonicalHostName == host
-            }
-          }
 
-        // Now that we have found the report for the Node Manager that the AM is running on, we
-        // can get the base HTTP address for the Node manager from the report.
-        // The format used for the logs for each container is well-known and can be constructed
-        // using the NM's HTTP address and the container ID.
-        // The NM may be running several containers, but we can build the URL for the AM using
-        // the AM's container ID, which we already know.
-        nodeReport.foreach { report =>
-          val httpAddress = report.getHttpAddress
-          // lookup appropriate http scheme for container log urls
-          val yarnHttpPolicy = yarnConf.get(
-            YarnConfiguration.YARN_HTTP_POLICY_KEY,
-            YarnConfiguration.YARN_HTTP_POLICY_DEFAULT
-          )
-          val user = Utils.getCurrentUserName()
-          val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://" else "http://"
-          val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user"
-          logDebug(s"Base URL for logs: $baseUrl")
-          driverLogs = Some(Map(
-            "stderr" -> s"$baseUrl/stderr?start=-4096",
-            "stdout" -> s"$baseUrl/stdout?start=-4096"))
-        }
-      }
+      val httpAddress = System.getenv(Environment.NM_HOST.name()) +
+        ":" + System.getenv(Environment.NM_HTTP_PORT.name())
+      // lookup appropriate http scheme for container log urls
+      val yarnHttpPolicy = yarnConf.get(
+        YarnConfiguration.YARN_HTTP_POLICY_KEY,
+        YarnConfiguration.YARN_HTTP_POLICY_DEFAULT
+      )
+      val user = Utils.getCurrentUserName()
+      val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://" else "http://"
+      val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user"
+      logDebug(s"Base URL for logs: $baseUrl")
+      driverLogs = Some(Map(
+        "stderr" -> s"$baseUrl/stderr?start=-4096",
+        "stdout" -> s"$baseUrl/stdout?start=-4096"))
     } catch {
       case e: Exception =>
-        logInfo("Node Report API is not available in the version of YARN being used, so AM" +
+        logInfo("Error while building AM log links, so AM" +
           " logs link will not appear in application UI", e)
-    } finally {
-      yarnClientOpt.foreach(_.close())
     }
     driverLogs
   }

From 2104931d7d726eda2c098e0f403c7f1533df8746 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Mon, 27 Jul 2015 15:18:48 -0700
Subject: [PATCH 405/549] [SPARK-9385] [HOT-FIX] [PYSPARK] Comment out Python
 style check

https://issues.apache.org/jira/browse/SPARK-9385

Comment out Python style check because of error shown in https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-SBT/3088/AMPLAB_JENKINS_BUILD_PROFILE=hadoop1.0,label=centos/console

Author: Yin Huai <yhuai@databricks.com>

Closes #7702 from yhuai/SPARK-9385 and squashes the following commits:

146e6ef [Yin Huai] Comment out Python style check because of error shown in https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-SBT/3088/AMPLAB_JENKINS_BUILD_PROFILE=hadoop1.0,label=centos/console
---
 dev/run-tests.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index 1f0d218514f92..d1cb66860b3f8 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -198,8 +198,9 @@ def run_scala_style_checks():
 
 
 def run_python_style_checks():
-    set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
-    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
+    # set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
+    # run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
+    pass
 
 
 def build_spark_documentation():

From ab625956616664c2b4861781a578311da75a9ae4 Mon Sep 17 00:00:00 2001
From: jerryshao <saisai.shao@intel.com>
Date: Mon, 27 Jul 2015 15:46:35 -0700
Subject: [PATCH 406/549] [SPARK-4352] [YARN] [WIP] Incorporate locality
 preferences in dynamic allocation requests

Currently there's no locality preference for container request in YARN mode, this will affect the performance if fetching data remotely, so here proposed to add locality in Yarn dynamic allocation mode.

Ping sryza, please help to review, thanks a lot.

Author: jerryshao <saisai.shao@intel.com>

Closes #6394 from jerryshao/SPARK-4352 and squashes the following commits:

d45fecb [jerryshao] Add documents
6c3fe5c [jerryshao] Fix bug
8db6c0e [jerryshao] Further address the comments
2e2b2cb [jerryshao] Fix rebase compiling problem
ce5f096 [jerryshao] Fix style issue
7f7df95 [jerryshao] Fix rebase issue
9ca9e07 [jerryshao] Code refactor according to comments
d3e4236 [jerryshao] Further address the comments
5e7a593 [jerryshao] Fix bug introduced code rebase
9ca7783 [jerryshao] Style changes
08317f9 [jerryshao] code and comment refines
65b2423 [jerryshao] Further address the comments
a27c587 [jerryshao] address the comment
27faabc [jerryshao] redundant code remove
9ce06a1 [jerryshao] refactor the code
f5ba27b [jerryshao] Style fix
2c6cc8a [jerryshao] Fix bug and add unit tests
0757335 [jerryshao] Consider the distribution of existed containers to recalculate the new container requests
0ad66ff [jerryshao] Fix compile bugs
1c20381 [jerryshao] Minor fix
5ef2dc8 [jerryshao] Add docs and improve the code
3359814 [jerryshao] Fix rebase and test bugs
0398539 [jerryshao] reinitialize the new implementation
67596d6 [jerryshao] Still fix the code
654e1d2 [jerryshao] Fix some bugs
45b1c89 [jerryshao] Further polish the algorithm
dea0152 [jerryshao] Enable node locality information in YarnAllocator
74bbcc6 [jerryshao] Support node locality for dynamic allocation initial commit
---
 .../spark/ExecutorAllocationClient.scala      |  18 +-
 .../spark/ExecutorAllocationManager.scala     |  62 +++++-
 .../scala/org/apache/spark/SparkContext.scala |  25 ++-
 .../apache/spark/scheduler/DAGScheduler.scala |  26 ++-
 .../org/apache/spark/scheduler/Stage.scala    |   7 +-
 .../apache/spark/scheduler/StageInfo.scala    |  13 +-
 .../cluster/CoarseGrainedClusterMessage.scala |   6 +-
 .../CoarseGrainedSchedulerBackend.scala       |  32 ++-
 .../cluster/YarnSchedulerBackend.scala        |   3 +-
 .../ExecutorAllocationManagerSuite.scala      |  55 +++++-
 .../apache/spark/HeartbeatReceiverSuite.scala |   7 +-
 .../spark/deploy/yarn/ApplicationMaster.scala |   5 +-
 ...yPreferredContainerPlacementStrategy.scala | 182 ++++++++++++++++++
 .../spark/deploy/yarn/YarnAllocator.scala     |  47 ++++-
 .../ContainerPlacementStrategySuite.scala     | 125 ++++++++++++
 .../deploy/yarn/YarnAllocatorSuite.scala      |  14 +-
 16 files changed, 578 insertions(+), 49 deletions(-)
 create mode 100644 yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
 create mode 100644 yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala

diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala
index 443830f8d03b6..842bfdbadc948 100644
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala
@@ -24,11 +24,23 @@ package org.apache.spark
 private[spark] trait ExecutorAllocationClient {
 
   /**
-   * Express a preference to the cluster manager for a given total number of executors.
-   * This can result in canceling pending requests or filing additional requests.
+   * Update the cluster manager on our scheduling needs. Three bits of information are included
+   * to help it make decisions.
+   * @param numExecutors The total number of executors we'd like to have. The cluster manager
+   *                     shouldn't kill any running executor to reach this number, but,
+   *                     if all existing executors were to die, this is the number of executors
+   *                     we'd want to be allocated.
+   * @param localityAwareTasks The number of tasks in all active stages that have a locality
+   *                           preferences. This includes running, pending, and completed tasks.
+   * @param hostToLocalTaskCount A map of hosts to the number of tasks from all active stages
+   *                             that would like to like to run on that host.
+   *                             This includes running, pending, and completed tasks.
    * @return whether the request is acknowledged by the cluster manager.
    */
-  private[spark] def requestTotalExecutors(numExecutors: Int): Boolean
+  private[spark] def requestTotalExecutors(
+      numExecutors: Int,
+      localityAwareTasks: Int,
+      hostToLocalTaskCount: Map[String, Int]): Boolean
 
   /**
    * Request an additional number of executors from the cluster manager.
diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
index 648bcfe28cad2..1877aaf2cac55 100644
--- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
+++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala
@@ -161,6 +161,12 @@ private[spark] class ExecutorAllocationManager(
   //   (2) an executor idle timeout has elapsed.
   @volatile private var initializing: Boolean = true
 
+  // Number of locality aware tasks, used for executor placement.
+  private var localityAwareTasks = 0
+
+  // Host to possible task running on it, used for executor placement.
+  private var hostToLocalTaskCount: Map[String, Int] = Map.empty
+
   /**
    * Verify that the settings specified through the config are valid.
    * If not, throw an appropriate exception.
@@ -295,7 +301,7 @@ private[spark] class ExecutorAllocationManager(
 
       // If the new target has not changed, avoid sending a message to the cluster manager
       if (numExecutorsTarget < oldNumExecutorsTarget) {
-        client.requestTotalExecutors(numExecutorsTarget)
+        client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount)
         logDebug(s"Lowering target number of executors to $numExecutorsTarget (previously " +
           s"$oldNumExecutorsTarget) because not all requested executors are actually needed")
       }
@@ -349,7 +355,8 @@ private[spark] class ExecutorAllocationManager(
       return 0
     }
 
-    val addRequestAcknowledged = testing || client.requestTotalExecutors(numExecutorsTarget)
+    val addRequestAcknowledged = testing ||
+      client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount)
     if (addRequestAcknowledged) {
       val executorsString = "executor" + { if (delta > 1) "s" else "" }
       logInfo(s"Requesting $delta new $executorsString because tasks are backlogged" +
@@ -519,6 +526,12 @@ private[spark] class ExecutorAllocationManager(
     // Number of tasks currently running on the cluster.  Should be 0 when no stages are active.
     private var numRunningTasks: Int = _
 
+    // stageId to tuple (the number of task with locality preferences, a map where each pair is a
+    // node and the number of tasks that would like to be scheduled on that node) map,
+    // maintain the executor placement hints for each stage Id used by resource framework to better
+    // place the executors.
+    private val stageIdToExecutorPlacementHints = new mutable.HashMap[Int, (Int, Map[String, Int])]
+
     override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {
       initializing = false
       val stageId = stageSubmitted.stageInfo.stageId
@@ -526,6 +539,24 @@ private[spark] class ExecutorAllocationManager(
       allocationManager.synchronized {
         stageIdToNumTasks(stageId) = numTasks
         allocationManager.onSchedulerBacklogged()
+
+        // Compute the number of tasks requested by the stage on each host
+        var numTasksPending = 0
+        val hostToLocalTaskCountPerStage = new mutable.HashMap[String, Int]()
+        stageSubmitted.stageInfo.taskLocalityPreferences.foreach { locality =>
+          if (!locality.isEmpty) {
+            numTasksPending += 1
+            locality.foreach { location =>
+              val count = hostToLocalTaskCountPerStage.getOrElse(location.host, 0) + 1
+              hostToLocalTaskCountPerStage(location.host) = count
+            }
+          }
+        }
+        stageIdToExecutorPlacementHints.put(stageId,
+          (numTasksPending, hostToLocalTaskCountPerStage.toMap))
+
+        // Update the executor placement hints
+        updateExecutorPlacementHints()
       }
     }
 
@@ -534,6 +565,10 @@ private[spark] class ExecutorAllocationManager(
       allocationManager.synchronized {
         stageIdToNumTasks -= stageId
         stageIdToTaskIndices -= stageId
+        stageIdToExecutorPlacementHints -= stageId
+
+        // Update the executor placement hints
+        updateExecutorPlacementHints()
 
         // If this is the last stage with pending tasks, mark the scheduler queue as empty
         // This is needed in case the stage is aborted for any reason
@@ -637,6 +672,29 @@ private[spark] class ExecutorAllocationManager(
     def isExecutorIdle(executorId: String): Boolean = {
       !executorIdToTaskIds.contains(executorId)
     }
+
+    /**
+     * Update the Executor placement hints (the number of tasks with locality preferences,
+     * a map where each pair is a node and the number of tasks that would like to be scheduled
+     * on that node).
+     *
+     * These hints are updated when stages arrive and complete, so are not up-to-date at task
+     * granularity within stages.
+     */
+    def updateExecutorPlacementHints(): Unit = {
+      var localityAwareTasks = 0
+      val localityToCount = new mutable.HashMap[String, Int]()
+      stageIdToExecutorPlacementHints.values.foreach { case (numTasksPending, localities) =>
+        localityAwareTasks += numTasksPending
+        localities.foreach { case (hostname, count) =>
+          val updatedCount = localityToCount.getOrElse(hostname, 0) + count
+          localityToCount(hostname) = updatedCount
+        }
+      }
+
+      allocationManager.localityAwareTasks = localityAwareTasks
+      allocationManager.hostToLocalTaskCount = localityToCount.toMap
+    }
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 6a6b94a271cfc..ac6ac6c216767 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1382,16 +1382,29 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   }
 
   /**
-   * Express a preference to the cluster manager for a given total number of executors.
-   * This can result in canceling pending requests or filing additional requests.
-   * This is currently only supported in YARN mode. Return whether the request is received.
-   */
-  private[spark] override def requestTotalExecutors(numExecutors: Int): Boolean = {
+   * Update the cluster manager on our scheduling needs. Three bits of information are included
+   * to help it make decisions.
+   * @param numExecutors The total number of executors we'd like to have. The cluster manager
+   *                     shouldn't kill any running executor to reach this number, but,
+   *                     if all existing executors were to die, this is the number of executors
+   *                     we'd want to be allocated.
+   * @param localityAwareTasks The number of tasks in all active stages that have a locality
+   *                           preferences. This includes running, pending, and completed tasks.
+   * @param hostToLocalTaskCount A map of hosts to the number of tasks from all active stages
+   *                             that would like to like to run on that host.
+   *                             This includes running, pending, and completed tasks.
+   * @return whether the request is acknowledged by the cluster manager.
+   */
+  private[spark] override def requestTotalExecutors(
+      numExecutors: Int,
+      localityAwareTasks: Int,
+      hostToLocalTaskCount: scala.collection.immutable.Map[String, Int]
+    ): Boolean = {
     assert(supportDynamicAllocation,
       "Requesting executors is currently only supported in YARN and Mesos modes")
     schedulerBackend match {
       case b: CoarseGrainedSchedulerBackend =>
-        b.requestTotalExecutors(numExecutors)
+        b.requestTotalExecutors(numExecutors, localityAwareTasks, hostToLocalTaskCount)
       case _ =>
         logWarning("Requesting executors is only supported in coarse-grained mode")
         false
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index b6a833bbb0833..cdf6078421123 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -790,8 +790,28 @@ class DAGScheduler(
     // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
     // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
     // event.
-    stage.makeNewStageAttempt(partitionsToCompute.size)
     outputCommitCoordinator.stageStart(stage.id)
+    val taskIdToLocations = try {
+      stage match {
+        case s: ShuffleMapStage =>
+          partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
+        case s: ResultStage =>
+          val job = s.resultOfJob.get
+          partitionsToCompute.map { id =>
+            val p = job.partitions(id)
+            (id, getPreferredLocs(stage.rdd, p))
+          }.toMap
+      }
+    } catch {
+      case NonFatal(e) =>
+        stage.makeNewStageAttempt(partitionsToCompute.size)
+        listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
+        abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}")
+        runningStages -= stage
+        return
+    }
+
+    stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
     listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
 
     // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
@@ -830,7 +850,7 @@ class DAGScheduler(
       stage match {
         case stage: ShuffleMapStage =>
           partitionsToCompute.map { id =>
-            val locs = getPreferredLocs(stage.rdd, id)
+            val locs = taskIdToLocations(id)
             val part = stage.rdd.partitions(id)
             new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, taskBinary, part, locs)
           }
@@ -840,7 +860,7 @@ class DAGScheduler(
           partitionsToCompute.map { id =>
             val p: Int = job.partitions(id)
             val part = stage.rdd.partitions(p)
-            val locs = getPreferredLocs(stage.rdd, p)
+            val locs = taskIdToLocations(id)
             new ResultTask(stage.id, stage.latestInfo.attemptId, taskBinary, part, locs, id)
           }
       }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala
index b86724de2cb73..40a333a3e06b2 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala
@@ -77,8 +77,11 @@ private[spark] abstract class Stage(
   private var _latestInfo: StageInfo = StageInfo.fromStage(this, nextAttemptId)
 
   /** Creates a new attempt for this stage by creating a new StageInfo with a new attempt ID. */
-  def makeNewStageAttempt(numPartitionsToCompute: Int): Unit = {
-    _latestInfo = StageInfo.fromStage(this, nextAttemptId, Some(numPartitionsToCompute))
+  def makeNewStageAttempt(
+      numPartitionsToCompute: Int,
+      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty): Unit = {
+    _latestInfo = StageInfo.fromStage(
+      this, nextAttemptId, Some(numPartitionsToCompute), taskLocalityPreferences)
     nextAttemptId += 1
   }
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala
index 5d2abbc67e9d9..24796c14300b1 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala
@@ -34,7 +34,8 @@ class StageInfo(
     val numTasks: Int,
     val rddInfos: Seq[RDDInfo],
     val parentIds: Seq[Int],
-    val details: String) {
+    val details: String,
+    private[spark] val taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty) {
   /** When this stage was submitted from the DAGScheduler to a TaskScheduler. */
   var submissionTime: Option[Long] = None
   /** Time when all tasks in the stage completed or when the stage was cancelled. */
@@ -70,7 +71,12 @@ private[spark] object StageInfo {
    * shuffle dependencies. Therefore, all ancestor RDDs related to this Stage's RDD through a
    * sequence of narrow dependencies should also be associated with this Stage.
    */
-  def fromStage(stage: Stage, attemptId: Int, numTasks: Option[Int] = None): StageInfo = {
+  def fromStage(
+      stage: Stage,
+      attemptId: Int,
+      numTasks: Option[Int] = None,
+      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty
+    ): StageInfo = {
     val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
     val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos
     new StageInfo(
@@ -80,6 +86,7 @@ private[spark] object StageInfo {
       numTasks.getOrElse(stage.numTasks),
       rddInfos,
       stage.parents.map(_.id),
-      stage.details)
+      stage.details,
+      taskLocalityPreferences)
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala
index 4be1eda2e9291..06f5438433b6e 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala
@@ -86,7 +86,11 @@ private[spark] object CoarseGrainedClusterMessages {
 
   // Request executors by specifying the new total number of executors desired
   // This includes executors already pending or running
-  case class RequestExecutors(requestedTotal: Int) extends CoarseGrainedClusterMessage
+  case class RequestExecutors(
+      requestedTotal: Int,
+      localityAwareTasks: Int,
+      hostToLocalTaskCount: Map[String, Int])
+    extends CoarseGrainedClusterMessage
 
   case class KillExecutors(executorIds: Seq[String]) extends CoarseGrainedClusterMessage
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index c65b3e517773e..660702f6e6fd0 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -66,6 +66,12 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
   // Executors we have requested the cluster manager to kill that have not died yet
   private val executorsPendingToRemove = new HashSet[String]
 
+  // A map to store hostname with its possible task number running on it
+  protected var hostToLocalTaskCount: Map[String, Int] = Map.empty
+
+  // The number of pending tasks which is locality required
+  protected var localityAwareTasks = 0
+
   class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
     extends ThreadSafeRpcEndpoint with Logging {
 
@@ -339,6 +345,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
     }
     logInfo(s"Requesting $numAdditionalExecutors additional executor(s) from the cluster manager")
     logDebug(s"Number of pending executors is now $numPendingExecutors")
+
     numPendingExecutors += numAdditionalExecutors
     // Account for executors pending to be added or removed
     val newTotal = numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size
@@ -346,16 +353,33 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
   }
 
   /**
-   * Express a preference to the cluster manager for a given total number of executors. This can
-   * result in canceling pending requests or filing additional requests.
-   * @return whether the request is acknowledged.
+   * Update the cluster manager on our scheduling needs. Three bits of information are included
+   * to help it make decisions.
+   * @param numExecutors The total number of executors we'd like to have. The cluster manager
+   *                     shouldn't kill any running executor to reach this number, but,
+   *                     if all existing executors were to die, this is the number of executors
+   *                     we'd want to be allocated.
+   * @param localityAwareTasks The number of tasks in all active stages that have a locality
+   *                           preferences. This includes running, pending, and completed tasks.
+   * @param hostToLocalTaskCount A map of hosts to the number of tasks from all active stages
+   *                             that would like to like to run on that host.
+   *                             This includes running, pending, and completed tasks.
+   * @return whether the request is acknowledged by the cluster manager.
    */
-  final override def requestTotalExecutors(numExecutors: Int): Boolean = synchronized {
+  final override def requestTotalExecutors(
+      numExecutors: Int,
+      localityAwareTasks: Int,
+      hostToLocalTaskCount: Map[String, Int]
+    ): Boolean = synchronized {
     if (numExecutors < 0) {
       throw new IllegalArgumentException(
         "Attempted to request a negative number of executor(s) " +
           s"$numExecutors from the cluster manager. Please specify a positive number!")
     }
+
+    this.localityAwareTasks = localityAwareTasks
+    this.hostToLocalTaskCount = hostToLocalTaskCount
+
     numPendingExecutors =
       math.max(numExecutors - numExistingExecutors + executorsPendingToRemove.size, 0)
     doRequestTotalExecutors(numExecutors)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
index bc67abb5df446..074282d1be37d 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
@@ -53,7 +53,8 @@ private[spark] abstract class YarnSchedulerBackend(
    * This includes executors already pending or running.
    */
   override def doRequestTotalExecutors(requestedTotal: Int): Boolean = {
-    yarnSchedulerEndpoint.askWithRetry[Boolean](RequestExecutors(requestedTotal))
+    yarnSchedulerEndpoint.askWithRetry[Boolean](
+      RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount))
   }
 
   /**
diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala
index 803e1831bb269..34caca892891c 100644
--- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala
@@ -751,6 +751,42 @@ class ExecutorAllocationManagerSuite
     assert(numExecutorsTarget(manager) === 2)
   }
 
+  test("get pending task number and related locality preference") {
+    sc = createSparkContext(2, 5, 3)
+    val manager = sc.executorAllocationManager.get
+
+    val localityPreferences1 = Seq(
+      Seq(TaskLocation("host1"), TaskLocation("host2"), TaskLocation("host3")),
+      Seq(TaskLocation("host1"), TaskLocation("host2"), TaskLocation("host4")),
+      Seq(TaskLocation("host2"), TaskLocation("host3"), TaskLocation("host4")),
+      Seq.empty,
+      Seq.empty
+    )
+    val stageInfo1 = createStageInfo(1, 5, localityPreferences1)
+    sc.listenerBus.postToAll(SparkListenerStageSubmitted(stageInfo1))
+
+    assert(localityAwareTasks(manager) === 3)
+    assert(hostToLocalTaskCount(manager) ===
+      Map("host1" -> 2, "host2" -> 3, "host3" -> 2, "host4" -> 2))
+
+    val localityPreferences2 = Seq(
+      Seq(TaskLocation("host2"), TaskLocation("host3"), TaskLocation("host5")),
+      Seq(TaskLocation("host3"), TaskLocation("host4"), TaskLocation("host5")),
+      Seq.empty
+    )
+    val stageInfo2 = createStageInfo(2, 3, localityPreferences2)
+    sc.listenerBus.postToAll(SparkListenerStageSubmitted(stageInfo2))
+
+    assert(localityAwareTasks(manager) === 5)
+    assert(hostToLocalTaskCount(manager) ===
+      Map("host1" -> 2, "host2" -> 4, "host3" -> 4, "host4" -> 3, "host5" -> 2))
+
+    sc.listenerBus.postToAll(SparkListenerStageCompleted(stageInfo1))
+    assert(localityAwareTasks(manager) === 2)
+    assert(hostToLocalTaskCount(manager) ===
+      Map("host2" -> 1, "host3" -> 2, "host4" -> 1, "host5" -> 2))
+  }
+
   private def createSparkContext(
       minExecutors: Int = 1,
       maxExecutors: Int = 5,
@@ -784,8 +820,13 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester {
   private val sustainedSchedulerBacklogTimeout = 2L
   private val executorIdleTimeout = 3L
 
-  private def createStageInfo(stageId: Int, numTasks: Int): StageInfo = {
-    new StageInfo(stageId, 0, "name", numTasks, Seq.empty, Seq.empty, "no details")
+  private def createStageInfo(
+      stageId: Int,
+      numTasks: Int,
+      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty
+    ): StageInfo = {
+    new StageInfo(
+      stageId, 0, "name", numTasks, Seq.empty, Seq.empty, "no details", taskLocalityPreferences)
   }
 
   private def createTaskInfo(taskId: Int, taskIndex: Int, executorId: String): TaskInfo = {
@@ -815,6 +856,8 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester {
   private val _onSchedulerQueueEmpty = PrivateMethod[Unit]('onSchedulerQueueEmpty)
   private val _onExecutorIdle = PrivateMethod[Unit]('onExecutorIdle)
   private val _onExecutorBusy = PrivateMethod[Unit]('onExecutorBusy)
+  private val _localityAwareTasks = PrivateMethod[Int]('localityAwareTasks)
+  private val _hostToLocalTaskCount = PrivateMethod[Map[String, Int]]('hostToLocalTaskCount)
 
   private def numExecutorsToAdd(manager: ExecutorAllocationManager): Int = {
     manager invokePrivate _numExecutorsToAdd()
@@ -885,4 +928,12 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester {
   private def onExecutorBusy(manager: ExecutorAllocationManager, id: String): Unit = {
     manager invokePrivate _onExecutorBusy(id)
   }
+
+  private def localityAwareTasks(manager: ExecutorAllocationManager): Int = {
+    manager invokePrivate _localityAwareTasks()
+  }
+
+  private def hostToLocalTaskCount(manager: ExecutorAllocationManager): Map[String, Int] = {
+    manager invokePrivate _hostToLocalTaskCount()
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
index 5a2670e4d1cf0..139b8dc25f4b4 100644
--- a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
+++ b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
@@ -182,7 +182,7 @@ class HeartbeatReceiverSuite
 
     // Adjust the target number of executors on the cluster manager side
     assert(fakeClusterManager.getTargetNumExecutors === 0)
-    sc.requestTotalExecutors(2)
+    sc.requestTotalExecutors(2, 0, Map.empty)
     assert(fakeClusterManager.getTargetNumExecutors === 2)
     assert(fakeClusterManager.getExecutorIdsToKill.isEmpty)
 
@@ -241,7 +241,8 @@ private class FakeSchedulerBackend(
   extends CoarseGrainedSchedulerBackend(scheduler, rpcEnv) {
 
   protected override def doRequestTotalExecutors(requestedTotal: Int): Boolean = {
-    clusterManagerEndpoint.askWithRetry[Boolean](RequestExecutors(requestedTotal))
+    clusterManagerEndpoint.askWithRetry[Boolean](
+      RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount))
   }
 
   protected override def doKillExecutors(executorIds: Seq[String]): Boolean = {
@@ -260,7 +261,7 @@ private class FakeClusterManager(override val rpcEnv: RpcEnv) extends RpcEndpoin
   def getExecutorIdsToKill: Set[String] = executorIdsToKill.toSet
 
   override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
-    case RequestExecutors(requestedTotal) =>
+    case RequestExecutors(requestedTotal, _, _) =>
       targetNumExecutors = requestedTotal
       context.reply(true)
     case KillExecutors(executorIds) =>
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 83dafa4a125d2..44acc7374d024 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -555,11 +555,12 @@ private[spark] class ApplicationMaster(
     }
 
     override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
-      case RequestExecutors(requestedTotal) =>
+      case RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount) =>
         Option(allocator) match {
           case Some(a) =>
             allocatorLock.synchronized {
-              if (a.requestTotalExecutors(requestedTotal)) {
+              if (a.requestTotalExecutorsWithPreferredLocalities(requestedTotal,
+                localityAwareTasks, hostToLocalTaskCount)) {
                 allocatorLock.notifyAll()
               }
             }
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
new file mode 100644
index 0000000000000..081780204e424
--- /dev/null
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import scala.collection.mutable.{ArrayBuffer, HashMap, Set}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.yarn.api.records.{ContainerId, Resource}
+import org.apache.hadoop.yarn.util.RackResolver
+
+import org.apache.spark.SparkConf
+
+private[yarn] case class ContainerLocalityPreferences(nodes: Array[String], racks: Array[String])
+
+/**
+ * This strategy is calculating the optimal locality preferences of YARN containers by considering
+ * the node ratio of pending tasks, number of required cores/containers and and locality of current
+ * existing containers. The target of this algorithm is to maximize the number of tasks that
+ * would run locally.
+ *
+ * Consider a situation in which we have 20 tasks that require (host1, host2, host3)
+ * and 10 tasks that require (host1, host2, host4), besides each container has 2 cores
+ * and cpus per task is 1, so the required container number is 15,
+ * and host ratio is (host1: 30, host2: 30, host3: 20, host4: 10).
+ *
+ * 1. If requested container number (18) is more than the required container number (15):
+ *
+ * requests for 5 containers with nodes: (host1, host2, host3, host4)
+ * requests for 5 containers with nodes: (host1, host2, host3)
+ * requests for 5 containers with nodes: (host1, host2)
+ * requests for 3 containers with no locality preferences.
+ *
+ * The placement ratio is 3 : 3 : 2 : 1, and set the additional containers with no locality
+ * preferences.
+ *
+ * 2. If requested container number (10) is less than or equal to the required container number
+ * (15):
+ *
+ * requests for 4 containers with nodes: (host1, host2, host3, host4)
+ * requests for 3 containers with nodes: (host1, host2, host3)
+ * requests for 3 containers with nodes: (host1, host2)
+ *
+ * The placement ratio is 10 : 10 : 7 : 4, close to expected ratio (3 : 3 : 2 : 1)
+ *
+ * 3. If containers exist but none of them can match the requested localities,
+ * follow the method of 1 and 2.
+ *
+ * 4. If containers exist and some of them can match the requested localities.
+ * For example if we have 1 containers on each node (host1: 1, host2: 1: host3: 1, host4: 1),
+ * and the expected containers on each node would be (host1: 5, host2: 5, host3: 4, host4: 2),
+ * so the newly requested containers on each node would be updated to (host1: 4, host2: 4,
+ * host3: 3, host4: 1), 12 containers by total.
+ *
+ *   4.1 If requested container number (18) is more than newly required containers (12). Follow
+ *   method 1 with updated ratio 4 : 4 : 3 : 1.
+ *
+ *   4.2 If request container number (10) is more than newly required containers (12). Follow
+ *   method 2 with updated ratio 4 : 4 : 3 : 1.
+ *
+ * 5. If containers exist and existing localities can fully cover the requested localities.
+ * For example if we have 5 containers on each node (host1: 5, host2: 5, host3: 5, host4: 5),
+ * which could cover the current requested localities. This algorithm will allocate all the
+ * requested containers with no localities.
+ */
+private[yarn] class LocalityPreferredContainerPlacementStrategy(
+    val sparkConf: SparkConf,
+    val yarnConf: Configuration,
+    val resource: Resource) {
+
+  // Number of CPUs per task
+  private val CPUS_PER_TASK = sparkConf.getInt("spark.task.cpus", 1)
+
+  /**
+   * Calculate each container's node locality and rack locality
+   * @param numContainer number of containers to calculate
+   * @param numLocalityAwareTasks number of locality required tasks
+   * @param hostToLocalTaskCount a map to store the preferred hostname and possible task
+   *                             numbers running on it, used as hints for container allocation
+   * @return node localities and rack localities, each locality is an array of string,
+   *         the length of localities is the same as number of containers
+   */
+  def localityOfRequestedContainers(
+      numContainer: Int,
+      numLocalityAwareTasks: Int,
+      hostToLocalTaskCount: Map[String, Int],
+      allocatedHostToContainersMap: HashMap[String, Set[ContainerId]]
+    ): Array[ContainerLocalityPreferences] = {
+    val updatedHostToContainerCount = expectedHostToContainerCount(
+      numLocalityAwareTasks, hostToLocalTaskCount, allocatedHostToContainersMap)
+    val updatedLocalityAwareContainerNum = updatedHostToContainerCount.values.sum
+
+    // The number of containers to allocate, divided into two groups, one with preferred locality,
+    // and the other without locality preference.
+    val requiredLocalityFreeContainerNum =
+      math.max(0, numContainer - updatedLocalityAwareContainerNum)
+    val requiredLocalityAwareContainerNum = numContainer - requiredLocalityFreeContainerNum
+
+    val containerLocalityPreferences = ArrayBuffer[ContainerLocalityPreferences]()
+    if (requiredLocalityFreeContainerNum > 0) {
+      for (i <- 0 until requiredLocalityFreeContainerNum) {
+        containerLocalityPreferences += ContainerLocalityPreferences(
+          null.asInstanceOf[Array[String]], null.asInstanceOf[Array[String]])
+      }
+    }
+
+    if (requiredLocalityAwareContainerNum > 0) {
+      val largestRatio = updatedHostToContainerCount.values.max
+      // Round the ratio of preferred locality to the number of locality required container
+      // number, which is used for locality preferred host calculating.
+      var preferredLocalityRatio = updatedHostToContainerCount.mapValues { ratio =>
+        val adjustedRatio = ratio.toDouble * requiredLocalityAwareContainerNum / largestRatio
+        adjustedRatio.ceil.toInt
+      }
+
+      for (i <- 0 until requiredLocalityAwareContainerNum) {
+        // Only filter out the ratio which is larger than 0, which means the current host can
+        // still be allocated with new container request.
+        val hosts = preferredLocalityRatio.filter(_._2 > 0).keys.toArray
+        val racks = hosts.map { h =>
+          RackResolver.resolve(yarnConf, h).getNetworkLocation
+        }.toSet
+        containerLocalityPreferences += ContainerLocalityPreferences(hosts, racks.toArray)
+
+        // Minus 1 each time when the host is used. When the current ratio is 0,
+        // which means all the required ratio is satisfied, this host will not be allocated again.
+        preferredLocalityRatio = preferredLocalityRatio.mapValues(_ - 1)
+      }
+    }
+
+    containerLocalityPreferences.toArray
+  }
+
+  /**
+   * Calculate the number of executors need to satisfy the given number of pending tasks.
+   */
+  private def numExecutorsPending(numTasksPending: Int): Int = {
+    val coresPerExecutor = resource.getVirtualCores
+    (numTasksPending * CPUS_PER_TASK + coresPerExecutor - 1) / coresPerExecutor
+  }
+
+  /**
+   * Calculate the expected host to number of containers by considering with allocated containers.
+   * @param localityAwareTasks number of locality aware tasks
+   * @param hostToLocalTaskCount a map to store the preferred hostname and possible task
+   *                             numbers running on it, used as hints for container allocation
+   * @return a map with hostname as key and required number of containers on this host as value
+   */
+  private def expectedHostToContainerCount(
+      localityAwareTasks: Int,
+      hostToLocalTaskCount: Map[String, Int],
+      allocatedHostToContainersMap: HashMap[String, Set[ContainerId]]
+    ): Map[String, Int] = {
+    val totalLocalTaskNum = hostToLocalTaskCount.values.sum
+    hostToLocalTaskCount.map { case (host, count) =>
+      val expectedCount =
+        count.toDouble * numExecutorsPending(localityAwareTasks) / totalLocalTaskNum
+      val existedCount = allocatedHostToContainersMap.get(host)
+        .map(_.size)
+        .getOrElse(0)
+
+      // If existing container can not fully satisfy the expected number of container,
+      // the required container number is expected count minus existed count. Otherwise the
+      // required container number is 0.
+      (host, math.max(0, (expectedCount - existedCount).ceil.toInt))
+    }
+  }
+}
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
index 940873fbd046c..6c103394af098 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
@@ -96,7 +96,7 @@ private[yarn] class YarnAllocator(
   // Number of cores per executor.
   protected val executorCores = args.executorCores
   // Resource capability requested for each executors
-  private val resource = Resource.newInstance(executorMemory + memoryOverhead, executorCores)
+  private[yarn] val resource = Resource.newInstance(executorMemory + memoryOverhead, executorCores)
 
   private val launcherPool = new ThreadPoolExecutor(
     // max pool size of Integer.MAX_VALUE is ignored because we use an unbounded queue
@@ -127,6 +127,16 @@ private[yarn] class YarnAllocator(
     }
   }
 
+  // A map to store preferred hostname and possible task numbers running on it.
+  private var hostToLocalTaskCounts: Map[String, Int] = Map.empty
+
+  // Number of tasks that have locality preferences in active stages
+  private var numLocalityAwareTasks: Int = 0
+
+  // A container placement strategy based on pending tasks' locality preference
+  private[yarn] val containerPlacementStrategy =
+    new LocalityPreferredContainerPlacementStrategy(sparkConf, conf, resource)
+
   def getNumExecutorsRunning: Int = numExecutorsRunning
 
   def getNumExecutorsFailed: Int = numExecutorsFailed
@@ -146,10 +156,19 @@ private[yarn] class YarnAllocator(
    * Request as many executors from the ResourceManager as needed to reach the desired total. If
    * the requested total is smaller than the current number of running executors, no executors will
    * be killed.
-   *
+   * @param requestedTotal total number of containers requested
+   * @param localityAwareTasks number of locality aware tasks to be used as container placement hint
+   * @param hostToLocalTaskCount a map of preferred hostname to possible task counts to be used as
+   *                             container placement hint.
    * @return Whether the new requested total is different than the old value.
    */
-  def requestTotalExecutors(requestedTotal: Int): Boolean = synchronized {
+  def requestTotalExecutorsWithPreferredLocalities(
+      requestedTotal: Int,
+      localityAwareTasks: Int,
+      hostToLocalTaskCount: Map[String, Int]): Boolean = synchronized {
+    this.numLocalityAwareTasks = localityAwareTasks
+    this.hostToLocalTaskCounts = hostToLocalTaskCount
+
     if (requestedTotal != targetNumExecutors) {
       logInfo(s"Driver requested a total number of $requestedTotal executor(s).")
       targetNumExecutors = requestedTotal
@@ -221,12 +240,20 @@ private[yarn] class YarnAllocator(
     val numPendingAllocate = getNumPendingAllocate
     val missing = targetNumExecutors - numPendingAllocate - numExecutorsRunning
 
+    // TODO. Consider locality preferences of pending container requests.
+    // Since the last time we made container requests, stages have completed and been submitted,
+    // and that the localities at which we requested our pending executors
+    // no longer apply to our current needs. We should consider to remove all outstanding
+    // container requests and add requests anew each time to avoid this.
     if (missing > 0) {
       logInfo(s"Will request $missing executor containers, each with ${resource.getVirtualCores} " +
         s"cores and ${resource.getMemory} MB memory including $memoryOverhead MB overhead")
 
-      for (i <- 0 until missing) {
-        val request = createContainerRequest(resource)
+      val containerLocalityPreferences = containerPlacementStrategy.localityOfRequestedContainers(
+        missing, numLocalityAwareTasks, hostToLocalTaskCounts, allocatedHostToContainersMap)
+
+      for (locality <- containerLocalityPreferences) {
+        val request = createContainerRequest(resource, locality.nodes, locality.racks)
         amClient.addContainerRequest(request)
         val nodes = request.getNodes
         val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.last
@@ -249,11 +276,14 @@ private[yarn] class YarnAllocator(
    * Creates a container request, handling the reflection required to use YARN features that were
    * added in recent versions.
    */
-  private def createContainerRequest(resource: Resource): ContainerRequest = {
+  protected def createContainerRequest(
+      resource: Resource,
+      nodes: Array[String],
+      racks: Array[String]): ContainerRequest = {
     nodeLabelConstructor.map { constructor =>
-      constructor.newInstance(resource, null, null, RM_REQUEST_PRIORITY, true: java.lang.Boolean,
+      constructor.newInstance(resource, nodes, racks, RM_REQUEST_PRIORITY, true: java.lang.Boolean,
         labelExpression.orNull)
-    }.getOrElse(new ContainerRequest(resource, null, null, RM_REQUEST_PRIORITY))
+    }.getOrElse(new ContainerRequest(resource, nodes, racks, RM_REQUEST_PRIORITY))
   }
 
   /**
@@ -437,7 +467,6 @@ private[yarn] class YarnAllocator(
     releasedContainers.add(container.getId())
     amClient.releaseAssignedContainer(container.getId())
   }
-
 }
 
 private object YarnAllocator {
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala
new file mode 100644
index 0000000000000..b7fe4ccc67a38
--- /dev/null
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import org.scalatest.{BeforeAndAfterEach, Matchers}
+
+import org.apache.spark.SparkFunSuite
+
+class ContainerPlacementStrategySuite extends SparkFunSuite with Matchers with BeforeAndAfterEach {
+
+  private val yarnAllocatorSuite = new YarnAllocatorSuite
+  import yarnAllocatorSuite._
+
+  override def beforeEach() {
+    yarnAllocatorSuite.beforeEach()
+  }
+
+  override def afterEach() {
+    yarnAllocatorSuite.afterEach()
+  }
+
+  test("allocate locality preferred containers with enough resource and no matched existed " +
+    "containers") {
+    // 1. All the locations of current containers cannot satisfy the new requirements
+    // 2. Current requested container number can fully satisfy the pending tasks.
+
+    val handler = createAllocator(2)
+    handler.updateResourceRequests()
+    handler.handleAllocatedContainers(Array(createContainer("host1"), createContainer("host2")))
+
+    val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+      3, 15, Map("host3" -> 15, "host4" -> 15, "host5" -> 10), handler.allocatedHostToContainersMap)
+
+    assert(localities.map(_.nodes) === Array(
+      Array("host3", "host4", "host5"),
+      Array("host3", "host4", "host5"),
+      Array("host3", "host4")))
+  }
+
+  test("allocate locality preferred containers with enough resource and partially matched " +
+    "containers") {
+    // 1. Parts of current containers' locations can satisfy the new requirements
+    // 2. Current requested container number can fully satisfy the pending tasks.
+
+    val handler = createAllocator(3)
+    handler.updateResourceRequests()
+    handler.handleAllocatedContainers(Array(
+      createContainer("host1"),
+      createContainer("host1"),
+      createContainer("host2")
+    ))
+
+    val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+      3, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10), handler.allocatedHostToContainersMap)
+
+    assert(localities.map(_.nodes) ===
+      Array(null, Array("host2", "host3"), Array("host2", "host3")))
+  }
+
+  test("allocate locality preferred containers with limited resource and partially matched " +
+    "containers") {
+    // 1. Parts of current containers' locations can satisfy the new requirements
+    // 2. Current requested container number cannot fully satisfy the pending tasks.
+
+    val handler = createAllocator(3)
+    handler.updateResourceRequests()
+    handler.handleAllocatedContainers(Array(
+      createContainer("host1"),
+      createContainer("host1"),
+      createContainer("host2")
+    ))
+
+    val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+      1, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10), handler.allocatedHostToContainersMap)
+
+    assert(localities.map(_.nodes) === Array(Array("host2", "host3")))
+  }
+
+  test("allocate locality preferred containers with fully matched containers") {
+    // Current containers' locations can fully satisfy the new requirements
+
+    val handler = createAllocator(5)
+    handler.updateResourceRequests()
+    handler.handleAllocatedContainers(Array(
+      createContainer("host1"),
+      createContainer("host1"),
+      createContainer("host2"),
+      createContainer("host2"),
+      createContainer("host3")
+    ))
+
+    val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+      3, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10), handler.allocatedHostToContainersMap)
+
+    assert(localities.map(_.nodes) === Array(null, null, null))
+  }
+
+  test("allocate containers with no locality preference") {
+    // Request new container without locality preference
+
+    val handler = createAllocator(2)
+    handler.updateResourceRequests()
+    handler.handleAllocatedContainers(Array(createContainer("host1"), createContainer("host2")))
+
+    val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+      1, 0, Map.empty, handler.allocatedHostToContainersMap)
+
+    assert(localities.map(_.nodes) === Array(null))
+  }
+}
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
index 7509000771d94..37a789fcd375b 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
@@ -25,6 +25,7 @@ import org.apache.hadoop.net.DNSToSwitchMapping
 import org.apache.hadoop.yarn.api.records._
 import org.apache.hadoop.yarn.client.api.AMRMClient
 import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
+import org.scalatest.{BeforeAndAfterEach, Matchers}
 
 import org.apache.spark.{SecurityManager, SparkFunSuite}
 import org.apache.spark.SparkConf
@@ -32,8 +33,6 @@ import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
 import org.apache.spark.deploy.yarn.YarnAllocator._
 import org.apache.spark.scheduler.SplitInfo
 
-import org.scalatest.{BeforeAndAfterEach, Matchers}
-
 class MockResolver extends DNSToSwitchMapping {
 
   override def resolve(names: JList[String]): JList[String] = {
@@ -171,7 +170,7 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter
     handler.getNumExecutorsRunning should be (0)
     handler.getNumPendingAllocate should be (4)
 
-    handler.requestTotalExecutors(3)
+    handler.requestTotalExecutorsWithPreferredLocalities(3, 0, Map.empty)
     handler.updateResourceRequests()
     handler.getNumPendingAllocate should be (3)
 
@@ -182,7 +181,7 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter
     handler.allocatedContainerToHostMap.get(container.getId).get should be ("host1")
     handler.allocatedHostToContainersMap.get("host1").get should contain (container.getId)
 
-    handler.requestTotalExecutors(2)
+    handler.requestTotalExecutorsWithPreferredLocalities(2, 0, Map.empty)
     handler.updateResourceRequests()
     handler.getNumPendingAllocate should be (1)
   }
@@ -193,7 +192,7 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter
     handler.getNumExecutorsRunning should be (0)
     handler.getNumPendingAllocate should be (4)
 
-    handler.requestTotalExecutors(3)
+    handler.requestTotalExecutorsWithPreferredLocalities(3, 0, Map.empty)
     handler.updateResourceRequests()
     handler.getNumPendingAllocate should be (3)
 
@@ -203,7 +202,7 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter
 
     handler.getNumExecutorsRunning should be (2)
 
-    handler.requestTotalExecutors(1)
+    handler.requestTotalExecutorsWithPreferredLocalities(1, 0, Map.empty)
     handler.updateResourceRequests()
     handler.getNumPendingAllocate should be (0)
     handler.getNumExecutorsRunning should be (2)
@@ -219,7 +218,7 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter
     val container2 = createContainer("host2")
     handler.handleAllocatedContainers(Array(container1, container2))
 
-    handler.requestTotalExecutors(1)
+    handler.requestTotalExecutorsWithPreferredLocalities(1, 0, Map.empty)
     handler.executorIdToContainer.keys.foreach { id => handler.killExecutor(id ) }
 
     val statuses = Seq(container1, container2).map { c =>
@@ -241,5 +240,4 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter
     assert(vmemMsg.contains("5.8 GB of 4.2 GB virtual memory used."))
     assert(pmemMsg.contains("2.1 MB of 2 GB physical memory used."))
   }
-
 }

From dafe8d857dff4c61981476282cbfe11f5c008078 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Mon, 27 Jul 2015 15:49:42 -0700
Subject: [PATCH 407/549] [SPARK-9385] [PYSPARK] Enable PEP8 but disable
 installing pylint.

Instead of disabling all python style check, we should enable PEP8. So, this PR just comments out the part installing pylint.

Author: Yin Huai <yhuai@databricks.com>

Closes #7704 from yhuai/SPARK-9385 and squashes the following commits:

0056359 [Yin Huai] Enable PEP8 but disable installing pylint.
---
 dev/lint-python  | 30 +++++++++++++++---------------
 dev/run-tests.py |  5 ++---
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/dev/lint-python b/dev/lint-python
index 53bccc1fab535..575dbb0ae321b 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -58,21 +58,21 @@ export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint"
 export "PYLINT_HOME=$PYTHONPATH"
 export "PATH=$PYTHONPATH:$PATH"
 
-if [ ! -d "$PYLINT_HOME" ]; then
-    mkdir "$PYLINT_HOME"
-    # Redirect the annoying pylint installation output.
-    easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO"
-    easy_install_status="$?"
-
-    if [ "$easy_install_status" -ne 0 ]; then
-        echo "Unable to install pylint locally in \"$PYTHONPATH\"."
-        cat "$PYLINT_INSTALL_INFO"
-        exit "$easy_install_status"
-    fi
-
-    rm "$PYLINT_INSTALL_INFO"
-
-fi
+# if [ ! -d "$PYLINT_HOME" ]; then
+#     mkdir "$PYLINT_HOME"
+#     # Redirect the annoying pylint installation output.
+#     easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO"
+#     easy_install_status="$?"
+#
+#     if [ "$easy_install_status" -ne 0 ]; then
+#         echo "Unable to install pylint locally in \"$PYTHONPATH\"."
+#         cat "$PYLINT_INSTALL_INFO"
+#         exit "$easy_install_status"
+#     fi
+#
+#     rm "$PYLINT_INSTALL_INFO"
+#
+# fi
 
 # There is no need to write this output to a file
 #+ first, but we do so so that the check status can
diff --git a/dev/run-tests.py b/dev/run-tests.py
index d1cb66860b3f8..1f0d218514f92 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -198,9 +198,8 @@ def run_scala_style_checks():
 
 
 def run_python_style_checks():
-    # set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
-    # run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
-    pass
+    set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
+    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
 
 
 def build_spark_documentation():

From 8ddfa52c208bf329c2b2c8909c6be04301e36083 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 27 Jul 2015 17:17:49 -0700
Subject: [PATCH 408/549] [SPARK-9230] [ML] Support StringType features in
 RFormula

This adds StringType feature support via OneHotEncoder. As part of this task it was necessary to change RFormula to an Estimator, so that factor levels could be determined from the training dataset.

Not sure if I am using uids correctly here, would be good to get reviewer help on that.
cc mengxr

Umbrella design doc: https://docs.google.com/document/d/10NZNSEurN2EdWM31uFYsgayIPfCFHiuIu3pCWrUmP_c/edit#

Author: Eric Liang <ekl@databricks.com>

Closes #7574 from ericl/string-features and squashes the following commits:

f99131a [Eric Liang] comments
0bf3c26 [Eric Liang] update docs
c302a2c [Eric Liang] fix tests
9d1ac82 [Eric Liang] Merge remote-tracking branch 'upstream/master' into string-features
e713da3 [Eric Liang] comments
4d79193 [Eric Liang] revert to seq + distinct
169a085 [Eric Liang] tweak functional test
a230a47 [Eric Liang] Merge branch 'master' into string-features
72bd6f3 [Eric Liang] fix merge
d841cec [Eric Liang] Merge branch 'master' into string-features
5b2c4a2 [Eric Liang] Mon Jul 20 18:45:33 PDT 2015
b01c7c5 [Eric Liang] add test
8a637db [Eric Liang] encoder wip
a1d03f4 [Eric Liang] refactor into estimator
---
 R/pkg/inst/tests/test_mllib.R                 |   6 +-
 .../apache/spark/ml/feature/RFormula.scala    | 133 ++++++++++++++----
 .../ml/feature/RFormulaParserSuite.scala      |   1 +
 .../spark/ml/feature/RFormulaSuite.scala      |  64 +++++----
 4 files changed, 142 insertions(+), 62 deletions(-)

diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R
index a492763344ae6..29152a11688a2 100644
--- a/R/pkg/inst/tests/test_mllib.R
+++ b/R/pkg/inst/tests/test_mllib.R
@@ -35,8 +35,8 @@ test_that("glm and predict", {
 
 test_that("predictions match with native glm", {
   training <- createDataFrame(sqlContext, iris)
-  model <- glm(Sepal_Width ~ Sepal_Length, data = training)
+  model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
   vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Sepal.Length, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-9), rVals - vals)
+  rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 })
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index f7b46efa10e90..0a95b1ee8de6e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -17,17 +17,34 @@
 
 package org.apache.spark.ml.feature
 
+import scala.collection.mutable.ArrayBuffer
 import scala.util.parsing.combinator.RegexParsers
 
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.{Estimator, Model, Transformer, Pipeline, PipelineModel, PipelineStage}
 import org.apache.spark.ml.param.{Param, ParamMap}
 import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol}
 import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.mllib.linalg.VectorUDT
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 
+/**
+ * Base trait for [[RFormula]] and [[RFormulaModel]].
+ */
+private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
+  /** @group getParam */
+  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
+
+  /** @group getParam */
+  def setLabelCol(value: String): this.type = set(labelCol, value)
+
+  protected def hasLabelCol(schema: StructType): Boolean = {
+    schema.map(_.name).contains($(labelCol))
+  }
+}
+
 /**
  * :: Experimental ::
  * Implements the transforms required for fitting a dataset against an R model formula. Currently
@@ -35,8 +52,7 @@ import org.apache.spark.sql.types._
  * docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
  */
 @Experimental
-class RFormula(override val uid: String)
-  extends Transformer with HasFeaturesCol with HasLabelCol {
+class RFormula(override val uid: String) extends Estimator[RFormulaModel] with RFormulaBase {
 
   def this() = this(Identifiable.randomUID("rFormula"))
 
@@ -62,19 +78,74 @@ class RFormula(override val uid: String)
   /** @group getParam */
   def getFormula: String = $(formula)
 
-  /** @group getParam */
-  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
+  override def fit(dataset: DataFrame): RFormulaModel = {
+    require(parsedFormula.isDefined, "Must call setFormula() first.")
+    // StringType terms and terms representing interactions need to be encoded before assembly.
+    // TODO(ekl) add support for feature interactions
+    var encoderStages = ArrayBuffer[PipelineStage]()
+    var tempColumns = ArrayBuffer[String]()
+    val encodedTerms = parsedFormula.get.terms.map { term =>
+      dataset.schema(term) match {
+        case column if column.dataType == StringType =>
+          val indexCol = term + "_idx_" + uid
+          val encodedCol = term + "_onehot_" + uid
+          encoderStages += new StringIndexer().setInputCol(term).setOutputCol(indexCol)
+          encoderStages += new OneHotEncoder().setInputCol(indexCol).setOutputCol(encodedCol)
+          tempColumns += indexCol
+          tempColumns += encodedCol
+          encodedCol
+        case _ =>
+          term
+      }
+    }
+    encoderStages += new VectorAssembler(uid)
+      .setInputCols(encodedTerms.toArray)
+      .setOutputCol($(featuresCol))
+    encoderStages += new ColumnPruner(tempColumns.toSet)
+    val pipelineModel = new Pipeline(uid).setStages(encoderStages.toArray).fit(dataset)
+    copyValues(new RFormulaModel(uid, parsedFormula.get, pipelineModel).setParent(this))
+  }
 
-  /** @group getParam */
-  def setLabelCol(value: String): this.type = set(labelCol, value)
+  // optimistic schema; does not contain any ML attributes
+  override def transformSchema(schema: StructType): StructType = {
+    if (hasLabelCol(schema)) {
+      StructType(schema.fields :+ StructField($(featuresCol), new VectorUDT, true))
+    } else {
+      StructType(schema.fields :+ StructField($(featuresCol), new VectorUDT, true) :+
+        StructField($(labelCol), DoubleType, true))
+    }
+  }
+
+  override def copy(extra: ParamMap): RFormula = defaultCopy(extra)
+
+  override def toString: String = s"RFormula(${get(formula)})"
+}
+
+/**
+ * :: Experimental ::
+ * A fitted RFormula. Fitting is required to determine the factor levels of formula terms.
+ * @param parsedFormula a pre-parsed R formula.
+ * @param pipelineModel the fitted feature model, including factor to index mappings.
+ */
+@Experimental
+class RFormulaModel private[feature](
+    override val uid: String,
+    parsedFormula: ParsedRFormula,
+    pipelineModel: PipelineModel)
+  extends Model[RFormulaModel] with RFormulaBase {
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    checkCanTransform(dataset.schema)
+    transformLabel(pipelineModel.transform(dataset))
+  }
 
   override def transformSchema(schema: StructType): StructType = {
     checkCanTransform(schema)
-    val withFeatures = transformFeatures.transformSchema(schema)
+    val withFeatures = pipelineModel.transformSchema(schema)
     if (hasLabelCol(schema)) {
       withFeatures
-    } else if (schema.exists(_.name == parsedFormula.get.label)) {
-      val nullable = schema(parsedFormula.get.label).dataType match {
+    } else if (schema.exists(_.name == parsedFormula.label)) {
+      val nullable = schema(parsedFormula.label).dataType match {
         case _: NumericType | BooleanType => false
         case _ => true
       }
@@ -86,24 +157,19 @@ class RFormula(override val uid: String)
     }
   }
 
-  override def transform(dataset: DataFrame): DataFrame = {
-    checkCanTransform(dataset.schema)
-    transformLabel(transformFeatures.transform(dataset))
-  }
-
-  override def copy(extra: ParamMap): RFormula = defaultCopy(extra)
+  override def copy(extra: ParamMap): RFormulaModel = copyValues(
+    new RFormulaModel(uid, parsedFormula, pipelineModel))
 
-  override def toString: String = s"RFormula(${get(formula)})"
+  override def toString: String = s"RFormulaModel(${parsedFormula})"
 
   private def transformLabel(dataset: DataFrame): DataFrame = {
-    val labelName = parsedFormula.get.label
+    val labelName = parsedFormula.label
     if (hasLabelCol(dataset.schema)) {
       dataset
     } else if (dataset.schema.exists(_.name == labelName)) {
       dataset.schema(labelName).dataType match {
         case _: NumericType | BooleanType =>
           dataset.withColumn($(labelCol), dataset(labelName).cast(DoubleType))
-        // TODO(ekl) add support for string-type labels
         case other =>
           throw new IllegalArgumentException("Unsupported type for label: " + other)
       }
@@ -114,25 +180,32 @@ class RFormula(override val uid: String)
     }
   }
 
-  private def transformFeatures: Transformer = {
-    // TODO(ekl) add support for non-numeric features and feature interactions
-    new VectorAssembler(uid)
-      .setInputCols(parsedFormula.get.terms.toArray)
-      .setOutputCol($(featuresCol))
-  }
-
   private def checkCanTransform(schema: StructType) {
-    require(parsedFormula.isDefined, "Must call setFormula() first.")
     val columnNames = schema.map(_.name)
     require(!columnNames.contains($(featuresCol)), "Features column already exists.")
     require(
       !columnNames.contains($(labelCol)) || schema($(labelCol)).dataType == DoubleType,
       "Label column already exists and is not of type DoubleType.")
   }
+}
 
-  private def hasLabelCol(schema: StructType): Boolean = {
-    schema.map(_.name).contains($(labelCol))
+/**
+ * Utility transformer for removing temporary columns from a DataFrame.
+ * TODO(ekl) make this a public transformer
+ */
+private class ColumnPruner(columnsToPrune: Set[String]) extends Transformer {
+  override val uid = Identifiable.randomUID("columnPruner")
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    val columnsToKeep = dataset.columns.filter(!columnsToPrune.contains(_))
+    dataset.select(columnsToKeep.map(dataset.col) : _*)
   }
+
+  override def transformSchema(schema: StructType): StructType = {
+    StructType(schema.fields.filter(col => !columnsToPrune.contains(col.name)))
+  }
+
+  override def copy(extra: ParamMap): ColumnPruner = defaultCopy(extra)
 }
 
 /**
@@ -149,7 +222,7 @@ private[ml] object RFormulaParser extends RegexParsers {
   def expr: Parser[List[String]] = term ~ rep("+" ~> term) ^^ { case a ~ list => a :: list }
 
   def formula: Parser[ParsedRFormula] =
-    (term ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t) }
+    (term ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t.distinct) }
 
   def parse(value: String): ParsedRFormula = parseAll(formula, value) match {
     case Success(result, _) => result
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
index c8d065f37a605..c4b45aee06384 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
@@ -28,6 +28,7 @@ class RFormulaParserSuite extends SparkFunSuite {
 
   test("parse simple formulas") {
     checkParse("y ~ x", "y", Seq("x"))
+    checkParse("y ~ x + x", "y", Seq("x"))
     checkParse("y ~   ._foo  ", "y", Seq("._foo"))
     checkParse("resp ~ A_VAR + B + c123", "resp", Seq("A_VAR", "B", "c123"))
   }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index 79c4ccf02d4e0..8148c553e9051 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -31,72 +31,78 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext {
     val formula = new RFormula().setFormula("id ~ v1 + v2")
     val original = sqlContext.createDataFrame(
       Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")
-    val result = formula.transform(original)
-    val resultSchema = formula.transformSchema(original.schema)
+    val model = formula.fit(original)
+    val result = model.transform(original)
+    val resultSchema = model.transformSchema(original.schema)
     val expected = sqlContext.createDataFrame(
       Seq(
-        (0, 1.0, 3.0, Vectors.dense(Array(1.0, 3.0)), 0.0),
-        (2, 2.0, 5.0, Vectors.dense(Array(2.0, 5.0)), 2.0))
+        (0, 1.0, 3.0, Vectors.dense(1.0, 3.0), 0.0),
+        (2, 2.0, 5.0, Vectors.dense(2.0, 5.0), 2.0))
       ).toDF("id", "v1", "v2", "features", "label")
     // TODO(ekl) make schema comparisons ignore metadata, to avoid .toString
     assert(result.schema.toString == resultSchema.toString)
     assert(resultSchema == expected.schema)
-    assert(result.collect().toSeq == expected.collect().toSeq)
+    assert(result.collect() === expected.collect())
   }
 
   test("features column already exists") {
     val formula = new RFormula().setFormula("y ~ x").setFeaturesCol("x")
     val original = sqlContext.createDataFrame(Seq((0, 1.0), (2, 2.0))).toDF("x", "y")
     intercept[IllegalArgumentException] {
-      formula.transformSchema(original.schema)
+      formula.fit(original)
     }
     intercept[IllegalArgumentException] {
-      formula.transform(original)
+      formula.fit(original)
     }
   }
 
   test("label column already exists") {
     val formula = new RFormula().setFormula("y ~ x").setLabelCol("y")
     val original = sqlContext.createDataFrame(Seq((0, 1.0), (2, 2.0))).toDF("x", "y")
-    val resultSchema = formula.transformSchema(original.schema)
+    val model = formula.fit(original)
+    val resultSchema = model.transformSchema(original.schema)
     assert(resultSchema.length == 3)
-    assert(resultSchema.toString == formula.transform(original).schema.toString)
+    assert(resultSchema.toString == model.transform(original).schema.toString)
   }
 
   test("label column already exists but is not double type") {
     val formula = new RFormula().setFormula("y ~ x").setLabelCol("y")
     val original = sqlContext.createDataFrame(Seq((0, 1), (2, 2))).toDF("x", "y")
+    val model = formula.fit(original)
     intercept[IllegalArgumentException] {
-      formula.transformSchema(original.schema)
+      model.transformSchema(original.schema)
     }
     intercept[IllegalArgumentException] {
-      formula.transform(original)
+      model.transform(original)
     }
   }
 
   test("allow missing label column for test datasets") {
     val formula = new RFormula().setFormula("y ~ x").setLabelCol("label")
     val original = sqlContext.createDataFrame(Seq((0, 1.0), (2, 2.0))).toDF("x", "_not_y")
-    val resultSchema = formula.transformSchema(original.schema)
+    val model = formula.fit(original)
+    val resultSchema = model.transformSchema(original.schema)
     assert(resultSchema.length == 3)
     assert(!resultSchema.exists(_.name == "label"))
-    assert(resultSchema.toString == formula.transform(original).schema.toString)
+    assert(resultSchema.toString == model.transform(original).schema.toString)
   }
 
-// TODO(ekl) enable after we implement string label support
-//  test("transform string label") {
-//    val formula = new RFormula().setFormula("name ~ id")
-//    val original = sqlContext.createDataFrame(
-//      Seq((1, "foo"), (2, "bar"), (3, "bar"))).toDF("id", "name")
-//    val result = formula.transform(original)
-//    val resultSchema = formula.transformSchema(original.schema)
-//    val expected = sqlContext.createDataFrame(
-//      Seq(
-//        (1, "foo", Vectors.dense(Array(1.0)), 1.0),
-//        (2, "bar", Vectors.dense(Array(2.0)), 0.0),
-//        (3, "bar", Vectors.dense(Array(3.0)), 0.0))
-//      ).toDF("id", "name", "features", "label")
-//    assert(result.schema.toString == resultSchema.toString)
-//    assert(result.collect().toSeq == expected.collect().toSeq)
-//  }
+  test("encodes string terms") {
+    val formula = new RFormula().setFormula("id ~ a + b")
+    val original = sqlContext.createDataFrame(
+      Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "baz", 5))
+    ).toDF("id", "a", "b")
+    val model = formula.fit(original)
+    val result = model.transform(original)
+    val resultSchema = model.transformSchema(original.schema)
+    val expected = sqlContext.createDataFrame(
+      Seq(
+        (1, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0),
+        (2, "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 2.0),
+        (3, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 3.0),
+        (4, "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0))
+      ).toDF("id", "a", "b", "features", "label")
+    assert(result.schema.toString == resultSchema.toString)
+    assert(result.collect() === expected.collect())
+  }
 }

From ce89ff477aea6def68265ed218f6105680755c9a Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Mon, 27 Jul 2015 17:32:34 -0700
Subject: [PATCH 409/549] [SPARK-9386] [SQL] Feature flag for metastore
 partition pruning

Since we have been seeing a lot of failures related to this new feature, lets put it behind a flag and turn it off by default.

Author: Michael Armbrust <michael@databricks.com>

Closes #7703 from marmbrus/optionalMetastorePruning and squashes the following commits:

6ad128c [Michael Armbrust] style
8447835 [Michael Armbrust] [SPARK-9386][SQL] Feature flag for metastore partition pruning
fd37b87 [Michael Armbrust] add config flag
---
 .../main/scala/org/apache/spark/sql/SQLConf.scala    |  7 +++++++
 .../apache/spark/sql/hive/HiveMetastoreCatalog.scala | 12 +++++++++++-
 .../spark/sql/hive/client/ClientInterface.scala      | 10 ++++------
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 9b2dbd7442f5c..40eba33f595ca 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -301,6 +301,11 @@ private[spark] object SQLConf {
     defaultValue = Some(true),
     doc = "<TODO>")
 
+  val HIVE_METASTORE_PARTITION_PRUNING = booleanConf("spark.sql.hive.metastorePartitionPruning",
+    defaultValue = Some(false),
+    doc = "When true, some predicates will be pushed down into the Hive metastore so that " +
+          "unmatching partitions can be eliminated earlier.")
+
   val COLUMN_NAME_OF_CORRUPT_RECORD = stringConf("spark.sql.columnNameOfCorruptRecord",
     defaultValue = Some("_corrupt_record"),
     doc = "<TODO>")
@@ -456,6 +461,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf {
 
   private[spark] def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH)
 
+  private[spark] def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)
+
   private[spark] def externalSortEnabled: Boolean = getConf(EXTERNAL_SORT)
 
   private[spark] def sortMergeJoinEnabled: Boolean = getConf(SORTMERGE_JOIN)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 9c707a7a2eca1..3180c05445c9f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -678,8 +678,18 @@ private[hive] case class MetastoreRelation
     }
   )
 
+  // When metastore partition pruning is turned off, we cache the list of all partitions to
+  // mimic the behavior of Spark < 1.5
+  lazy val allPartitions = table.getAllPartitions
+
   def getHiveQlPartitions(predicates: Seq[Expression] = Nil): Seq[Partition] = {
-    table.getPartitions(predicates).map { p =>
+    val rawPartitions = if (sqlContext.conf.metastorePartitionPruning) {
+      table.getPartitions(predicates)
+    } else {
+      allPartitions
+    }
+
+    rawPartitions.map { p =>
       val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
       tPartition.setDbName(databaseName)
       tPartition.setTableName(tableName)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
index 1656587d14835..d834b4e83e043 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala
@@ -72,12 +72,10 @@ private[hive] case class HiveTable(
 
   def isPartitioned: Boolean = partitionColumns.nonEmpty
 
-  def getPartitions(predicates: Seq[Expression]): Seq[HivePartition] = {
-    predicates match {
-      case Nil => client.getAllPartitions(this)
-      case _ => client.getPartitionsByFilter(this, predicates)
-    }
-  }
+  def getAllPartitions: Seq[HivePartition] = client.getAllPartitions(this)
+
+  def getPartitions(predicates: Seq[Expression]): Seq[HivePartition] =
+    client.getPartitionsByFilter(this, predicates)
 
   // Hive does not support backticks when passing names to the client.
   def qualifiedName: String = s"$database.$name"

From daa1964b6098f79100def78451bda181b5c92198 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Mon, 27 Jul 2015 17:59:43 -0700
Subject: [PATCH 410/549] [SPARK-8882] [STREAMING] Add a new Receiver
 scheduling mechanism

The design doc: https://docs.google.com/document/d/1ZsoRvHjpISPrDmSjsGzuSu8UjwgbtmoCTzmhgTurHJw/edit?usp=sharing

Author: zsxwing <zsxwing@gmail.com>

Closes #7276 from zsxwing/receiver-scheduling and squashes the following commits:

137b257 [zsxwing] Add preferredNumExecutors to rescheduleReceiver
61a6c3f [zsxwing] Set state to ReceiverState.INACTIVE in deregisterReceiver
5e1fa48 [zsxwing] Fix the code style
7451498 [zsxwing] Move DummyReceiver back to ReceiverTrackerSuite
715ef9c [zsxwing] Rename: scheduledLocations -> scheduledExecutors; locations -> executors
05daf9c [zsxwing] Use receiverTrackingInfo.toReceiverInfo
1d6d7c8 [zsxwing] Merge branch 'master' into receiver-scheduling
8f93c8d [zsxwing] Use hostPort as the receiver location rather than host; fix comments and unit tests
59f8887 [zsxwing] Schedule all receivers at the same time when launching them
075e0a3 [zsxwing] Add receiver RDD name; use '!isTrackerStarted' instead
276a4ac [zsxwing] Remove "ReceiverLauncher" and move codes to "launchReceivers"
fab9a01 [zsxwing] Move methods back to the outer class
4e639c4 [zsxwing] Fix unintentional changes
f60d021 [zsxwing] Reorganize ReceiverTracker to use an event loop for lock free
105037e [zsxwing] Merge branch 'master' into receiver-scheduling
5fee132 [zsxwing] Update tha scheduling algorithm to avoid to keep restarting Receiver
9e242c8 [zsxwing] Remove the ScheduleReceiver message because we can refuse it when receiving RegisterReceiver
a9acfbf [zsxwing] Merge branch 'squash-pr-6294' into receiver-scheduling
881edb9 [zsxwing] ReceiverScheduler -> ReceiverSchedulingPolicy
e530bcc [zsxwing] [SPARK-5681][Streaming] Use a lock to eliminate the race condition when stopping receivers and registering receivers happen at the same time #6294
3b87e4a [zsxwing] Revert SparkContext.scala
a86850c [zsxwing] Remove submitAsyncJob and revert JobWaiter
f549595 [zsxwing] Add comments for the scheduling approach
9ecc08e [zsxwing] Fix comments and code style
28d1bee [zsxwing] Make 'host' protected; rescheduleReceiver -> getAllowedLocations
2c86a9e [zsxwing] Use tryFailure to support calling jobFailed multiple times
ca6fe35 [zsxwing] Add a test for Receiver.restart
27acd45 [zsxwing] Add unit tests for LoadBalanceReceiverSchedulerImplSuite
cc76142 [zsxwing] Add JobWaiter.toFuture to avoid blocking threads
d9a3e72 [zsxwing] Add a new Receiver scheduling mechanism
---
 .../receiver/ReceiverSupervisor.scala         |   4 +-
 .../receiver/ReceiverSupervisorImpl.scala     |   6 +-
 .../streaming/scheduler/ReceiverInfo.scala    |   1 -
 .../scheduler/ReceiverSchedulingPolicy.scala  | 171 +++++++
 .../streaming/scheduler/ReceiverTracker.scala | 468 +++++++++++-------
 .../scheduler/ReceiverTrackingInfo.scala      |  55 ++
 .../ReceiverSchedulingPolicySuite.scala       | 130 +++++
 .../scheduler/ReceiverTrackerSuite.scala      |  66 +--
 .../StreamingJobProgressListenerSuite.scala   |   6 +-
 9 files changed, 674 insertions(+), 233 deletions(-)
 create mode 100644 streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala
 create mode 100644 streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTrackingInfo.scala
 create mode 100644 streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
index a7c220f426ecf..e98017a63756e 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala
@@ -24,9 +24,9 @@ import scala.collection.mutable.ArrayBuffer
 import scala.concurrent._
 import scala.util.control.NonFatal
 
-import org.apache.spark.{Logging, SparkConf}
+import org.apache.spark.{SparkEnv, Logging, SparkConf}
 import org.apache.spark.storage.StreamBlockId
-import org.apache.spark.util.ThreadUtils
+import org.apache.spark.util.{Utils, ThreadUtils}
 
 /**
  * Abstract class that is responsible for supervising a Receiver in the worker.
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
index 2f6841ee8879c..0d802f83549af 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala
@@ -30,7 +30,7 @@ import org.apache.spark.storage.StreamBlockId
 import org.apache.spark.streaming.Time
 import org.apache.spark.streaming.scheduler._
 import org.apache.spark.streaming.util.WriteAheadLogUtils
-import org.apache.spark.util.{RpcUtils, Utils}
+import org.apache.spark.util.RpcUtils
 import org.apache.spark.{Logging, SparkEnv, SparkException}
 
 /**
@@ -46,6 +46,8 @@ private[streaming] class ReceiverSupervisorImpl(
     checkpointDirOption: Option[String]
   ) extends ReceiverSupervisor(receiver, env.conf) with Logging {
 
+  private val hostPort = SparkEnv.get.blockManager.blockManagerId.hostPort
+
   private val receivedBlockHandler: ReceivedBlockHandler = {
     if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
       if (checkpointDirOption.isEmpty) {
@@ -170,7 +172,7 @@ private[streaming] class ReceiverSupervisorImpl(
 
   override protected def onReceiverStart(): Boolean = {
     val msg = RegisterReceiver(
-      streamId, receiver.getClass.getSimpleName, Utils.localHostName(), endpoint)
+      streamId, receiver.getClass.getSimpleName, hostPort, endpoint)
     trackerEndpoint.askWithRetry[Boolean](msg)
   }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverInfo.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverInfo.scala
index de85f24dd988d..59df892397fe0 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverInfo.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverInfo.scala
@@ -28,7 +28,6 @@ import org.apache.spark.rpc.RpcEndpointRef
 case class ReceiverInfo(
     streamId: Int,
     name: String,
-    private[streaming] val endpoint: RpcEndpointRef,
     active: Boolean,
     location: String,
     lastErrorMessage: String = "",
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala
new file mode 100644
index 0000000000000..ef5b687b5831a
--- /dev/null
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler
+
+import scala.collection.Map
+import scala.collection.mutable
+
+import org.apache.spark.streaming.receiver.Receiver
+
+private[streaming] class ReceiverSchedulingPolicy {
+
+  /**
+   * Try our best to schedule receivers with evenly distributed. However, if the
+   * `preferredLocation`s of receivers are not even, we may not be able to schedule them evenly
+   * because we have to respect them.
+   *
+   * Here is the approach to schedule executors:
+   * <ol>
+   *   <li>First, schedule all the receivers with preferred locations (hosts), evenly among the
+   *       executors running on those host.</li>
+   *   <li>Then, schedule all other receivers evenly among all the executors such that overall
+   *       distribution over all the receivers is even.</li>
+   * </ol>
+   *
+   * This method is called when we start to launch receivers at the first time.
+   */
+  def scheduleReceivers(
+      receivers: Seq[Receiver[_]], executors: Seq[String]): Map[Int, Seq[String]] = {
+    if (receivers.isEmpty) {
+      return Map.empty
+    }
+
+    if (executors.isEmpty) {
+      return receivers.map(_.streamId -> Seq.empty).toMap
+    }
+
+    val hostToExecutors = executors.groupBy(_.split(":")(0))
+    val scheduledExecutors = Array.fill(receivers.length)(new mutable.ArrayBuffer[String])
+    val numReceiversOnExecutor = mutable.HashMap[String, Int]()
+    // Set the initial value to 0
+    executors.foreach(e => numReceiversOnExecutor(e) = 0)
+
+    // Firstly, we need to respect "preferredLocation". So if a receiver has "preferredLocation",
+    // we need to make sure the "preferredLocation" is in the candidate scheduled executor list.
+    for (i <- 0 until receivers.length) {
+      // Note: preferredLocation is host but executors are host:port
+      receivers(i).preferredLocation.foreach { host =>
+        hostToExecutors.get(host) match {
+          case Some(executorsOnHost) =>
+            // preferredLocation is a known host. Select an executor that has the least receivers in
+            // this host
+            val leastScheduledExecutor =
+              executorsOnHost.minBy(executor => numReceiversOnExecutor(executor))
+            scheduledExecutors(i) += leastScheduledExecutor
+            numReceiversOnExecutor(leastScheduledExecutor) =
+              numReceiversOnExecutor(leastScheduledExecutor) + 1
+          case None =>
+            // preferredLocation is an unknown host.
+            // Note: There are two cases:
+            // 1. This executor is not up. But it may be up later.
+            // 2. This executor is dead, or it's not a host in the cluster.
+            // Currently, simply add host to the scheduled executors.
+            scheduledExecutors(i) += host
+        }
+      }
+    }
+
+    // For those receivers that don't have preferredLocation, make sure we assign at least one
+    // executor to them.
+    for (scheduledExecutorsForOneReceiver <- scheduledExecutors.filter(_.isEmpty)) {
+      // Select the executor that has the least receivers
+      val (leastScheduledExecutor, numReceivers) = numReceiversOnExecutor.minBy(_._2)
+      scheduledExecutorsForOneReceiver += leastScheduledExecutor
+      numReceiversOnExecutor(leastScheduledExecutor) = numReceivers + 1
+    }
+
+    // Assign idle executors to receivers that have less executors
+    val idleExecutors = numReceiversOnExecutor.filter(_._2 == 0).map(_._1)
+    for (executor <- idleExecutors) {
+      // Assign an idle executor to the receiver that has least candidate executors.
+      val leastScheduledExecutors = scheduledExecutors.minBy(_.size)
+      leastScheduledExecutors += executor
+    }
+
+    receivers.map(_.streamId).zip(scheduledExecutors).toMap
+  }
+
+  /**
+   * Return a list of candidate executors to run the receiver. If the list is empty, the caller can
+   * run this receiver in arbitrary executor. The caller can use `preferredNumExecutors` to require
+   * returning `preferredNumExecutors` executors if possible.
+   *
+   * This method tries to balance executors' load. Here is the approach to schedule executors
+   * for a receiver.
+   * <ol>
+   *   <li>
+   *     If preferredLocation is set, preferredLocation should be one of the candidate executors.
+   *   </li>
+   *   <li>
+   *     Every executor will be assigned to a weight according to the receivers running or
+   *     scheduling on it.
+   *     <ul>
+   *       <li>
+   *         If a receiver is running on an executor, it contributes 1.0 to the executor's weight.
+   *       </li>
+   *       <li>
+   *         If a receiver is scheduled to an executor but has not yet run, it contributes
+   *         `1.0 / #candidate_executors_of_this_receiver` to the executor's weight.</li>
+   *     </ul>
+   *     At last, if there are more than `preferredNumExecutors` idle executors (weight = 0),
+   *     returns all idle executors. Otherwise, we only return `preferredNumExecutors` best options
+   *     according to the weights.
+   *   </li>
+   * </ol>
+   *
+   * This method is called when a receiver is registering with ReceiverTracker or is restarting.
+   */
+  def rescheduleReceiver(
+      receiverId: Int,
+      preferredLocation: Option[String],
+      receiverTrackingInfoMap: Map[Int, ReceiverTrackingInfo],
+      executors: Seq[String],
+      preferredNumExecutors: Int = 3): Seq[String] = {
+    if (executors.isEmpty) {
+      return Seq.empty
+    }
+
+    // Always try to schedule to the preferred locations
+    val scheduledExecutors = mutable.Set[String]()
+    scheduledExecutors ++= preferredLocation
+
+    val executorWeights = receiverTrackingInfoMap.values.flatMap { receiverTrackingInfo =>
+      receiverTrackingInfo.state match {
+        case ReceiverState.INACTIVE => Nil
+        case ReceiverState.SCHEDULED =>
+          val scheduledExecutors = receiverTrackingInfo.scheduledExecutors.get
+          // The probability that a scheduled receiver will run in an executor is
+          // 1.0 / scheduledLocations.size
+          scheduledExecutors.map(location => location -> (1.0 / scheduledExecutors.size))
+        case ReceiverState.ACTIVE => Seq(receiverTrackingInfo.runningExecutor.get -> 1.0)
+      }
+    }.groupBy(_._1).mapValues(_.map(_._2).sum) // Sum weights for each executor
+
+    val idleExecutors = (executors.toSet -- executorWeights.keys).toSeq
+    if (idleExecutors.size >= preferredNumExecutors) {
+      // If there are more than `preferredNumExecutors` idle executors, return all of them
+      scheduledExecutors ++= idleExecutors
+    } else {
+      // If there are less than `preferredNumExecutors` idle executors, return 3 best options
+      scheduledExecutors ++= idleExecutors
+      val sortedExecutors = executorWeights.toSeq.sortBy(_._2).map(_._1)
+      scheduledExecutors ++= (idleExecutors ++ sortedExecutors).take(preferredNumExecutors)
+    }
+    scheduledExecutors.toSeq
+  }
+}
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
index 9cc6ffcd12f61..6270137951b5a 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
@@ -17,17 +17,27 @@
 
 package org.apache.spark.streaming.scheduler
 
-import scala.collection.mutable.{ArrayBuffer, HashMap, SynchronizedMap}
+import java.util.concurrent.{TimeUnit, CountDownLatch}
+
+import scala.collection.mutable.HashMap
+import scala.concurrent.ExecutionContext
 import scala.language.existentials
-import scala.math.max
+import scala.util.{Failure, Success}
 
 import org.apache.spark.streaming.util.WriteAheadLogUtils
-import org.apache.spark.{Logging, SparkEnv, SparkException}
+import org.apache.spark._
+import org.apache.spark.rdd.RDD
 import org.apache.spark.rpc._
 import org.apache.spark.streaming.{StreamingContext, Time}
-import org.apache.spark.streaming.receiver.{CleanupOldBlocks, Receiver, ReceiverSupervisorImpl,
-  StopReceiver, UpdateRateLimit}
-import org.apache.spark.util.SerializableConfiguration
+import org.apache.spark.streaming.receiver._
+import org.apache.spark.util.{ThreadUtils, SerializableConfiguration}
+
+
+/** Enumeration to identify current state of a Receiver */
+private[streaming] object ReceiverState extends Enumeration {
+  type ReceiverState = Value
+  val INACTIVE, SCHEDULED, ACTIVE = Value
+}
 
 /**
  * Messages used by the NetworkReceiver and the ReceiverTracker to communicate
@@ -37,7 +47,7 @@ private[streaming] sealed trait ReceiverTrackerMessage
 private[streaming] case class RegisterReceiver(
     streamId: Int,
     typ: String,
-    host: String,
+    hostPort: String,
     receiverEndpoint: RpcEndpointRef
   ) extends ReceiverTrackerMessage
 private[streaming] case class AddBlock(receivedBlockInfo: ReceivedBlockInfo)
@@ -46,7 +56,38 @@ private[streaming] case class ReportError(streamId: Int, message: String, error:
 private[streaming] case class DeregisterReceiver(streamId: Int, msg: String, error: String)
   extends ReceiverTrackerMessage
 
-private[streaming] case object StopAllReceivers extends ReceiverTrackerMessage
+/**
+ * Messages used by the driver and ReceiverTrackerEndpoint to communicate locally.
+ */
+private[streaming] sealed trait ReceiverTrackerLocalMessage
+
+/**
+ * This message will trigger ReceiverTrackerEndpoint to restart a Spark job for the receiver.
+ */
+private[streaming] case class RestartReceiver(receiver: Receiver[_])
+  extends ReceiverTrackerLocalMessage
+
+/**
+ * This message is sent to ReceiverTrackerEndpoint when we start to launch Spark jobs for receivers
+ * at the first time.
+ */
+private[streaming] case class StartAllReceivers(receiver: Seq[Receiver[_]])
+  extends ReceiverTrackerLocalMessage
+
+/**
+ * This message will trigger ReceiverTrackerEndpoint to send stop signals to all registered
+ * receivers.
+ */
+private[streaming] case object StopAllReceivers extends ReceiverTrackerLocalMessage
+
+/**
+ * A message used by ReceiverTracker to ask all receiver's ids still stored in
+ * ReceiverTrackerEndpoint.
+ */
+private[streaming] case object AllReceiverIds extends ReceiverTrackerLocalMessage
+
+private[streaming] case class UpdateReceiverRateLimit(streamUID: Int, newRate: Long)
+  extends ReceiverTrackerLocalMessage
 
 /**
  * This class manages the execution of the receivers of ReceiverInputDStreams. Instance of
@@ -60,8 +101,6 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
 
   private val receiverInputStreams = ssc.graph.getReceiverInputStreams()
   private val receiverInputStreamIds = receiverInputStreams.map { _.id }
-  private val receiverExecutor = new ReceiverLauncher()
-  private val receiverInfo = new HashMap[Int, ReceiverInfo] with SynchronizedMap[Int, ReceiverInfo]
   private val receivedBlockTracker = new ReceivedBlockTracker(
     ssc.sparkContext.conf,
     ssc.sparkContext.hadoopConfiguration,
@@ -86,6 +125,24 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
   // This not being null means the tracker has been started and not stopped
   private var endpoint: RpcEndpointRef = null
 
+  private val schedulingPolicy = new ReceiverSchedulingPolicy()
+
+  // Track the active receiver job number. When a receiver job exits ultimately, countDown will
+  // be called.
+  private val receiverJobExitLatch = new CountDownLatch(receiverInputStreams.size)
+
+  /**
+   * Track all receivers' information. The key is the receiver id, the value is the receiver info.
+   * It's only accessed in ReceiverTrackerEndpoint.
+   */
+  private val receiverTrackingInfos = new HashMap[Int, ReceiverTrackingInfo]
+
+  /**
+   * Store all preferred locations for all receivers. We need this information to schedule
+   * receivers. It's only accessed in ReceiverTrackerEndpoint.
+   */
+  private val receiverPreferredLocations = new HashMap[Int, Option[String]]
+
   /** Start the endpoint and receiver execution thread. */
   def start(): Unit = synchronized {
     if (isTrackerStarted) {
@@ -95,7 +152,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
     if (!receiverInputStreams.isEmpty) {
       endpoint = ssc.env.rpcEnv.setupEndpoint(
         "ReceiverTracker", new ReceiverTrackerEndpoint(ssc.env.rpcEnv))
-      if (!skipReceiverLaunch) receiverExecutor.start()
+      if (!skipReceiverLaunch) launchReceivers()
       logInfo("ReceiverTracker started")
       trackerState = Started
     }
@@ -112,20 +169,18 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
 
         // Wait for the Spark job that runs the receivers to be over
         // That is, for the receivers to quit gracefully.
-        receiverExecutor.awaitTermination(10000)
+        receiverJobExitLatch.await(10, TimeUnit.SECONDS)
 
         if (graceful) {
-          val pollTime = 100
           logInfo("Waiting for receiver job to terminate gracefully")
-          while (receiverInfo.nonEmpty || receiverExecutor.running) {
-            Thread.sleep(pollTime)
-          }
+          receiverJobExitLatch.await()
           logInfo("Waited for receiver job to terminate gracefully")
         }
 
         // Check if all the receivers have been deregistered or not
-        if (receiverInfo.nonEmpty) {
-          logWarning("Not all of the receivers have deregistered, " + receiverInfo)
+        val receivers = endpoint.askWithRetry[Seq[Int]](AllReceiverIds)
+        if (receivers.nonEmpty) {
+          logWarning("Not all of the receivers have deregistered, " + receivers)
         } else {
           logInfo("All of the receivers have deregistered successfully")
         }
@@ -154,9 +209,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
 
   /** Get the blocks allocated to the given batch and stream. */
   def getBlocksOfBatchAndStream(batchTime: Time, streamId: Int): Seq[ReceivedBlockInfo] = {
-    synchronized {
-      receivedBlockTracker.getBlocksOfBatchAndStream(batchTime, streamId)
-    }
+    receivedBlockTracker.getBlocksOfBatchAndStream(batchTime, streamId)
   }
 
   /**
@@ -170,8 +223,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
     // Signal the receivers to delete old block data
     if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) {
       logInfo(s"Cleanup old received batch data: $cleanupThreshTime")
-      receiverInfo.values.flatMap { info => Option(info.endpoint) }
-        .foreach { _.send(CleanupOldBlocks(cleanupThreshTime)) }
+      endpoint.send(CleanupOldBlocks(cleanupThreshTime))
     }
   }
 
@@ -179,7 +231,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
   private def registerReceiver(
       streamId: Int,
       typ: String,
-      host: String,
+      hostPort: String,
       receiverEndpoint: RpcEndpointRef,
       senderAddress: RpcAddress
     ): Boolean = {
@@ -189,13 +241,20 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
 
     if (isTrackerStopping || isTrackerStopped) {
       false
+    } else if (!scheduleReceiver(streamId).contains(hostPort)) {
+      // Refuse it since it's scheduled to a wrong executor
+      false
     } else {
-      // "stopReceivers" won't happen at the same time because both "registerReceiver" and are
-      // called in the event loop. So here we can assume "stopReceivers" has not yet been called. If
-      // "stopReceivers" is called later, it should be able to see this receiver.
-      receiverInfo(streamId) = ReceiverInfo(
-        streamId, s"${typ}-${streamId}", receiverEndpoint, true, host)
-      listenerBus.post(StreamingListenerReceiverStarted(receiverInfo(streamId)))
+      val name = s"${typ}-${streamId}"
+      val receiverTrackingInfo = ReceiverTrackingInfo(
+        streamId,
+        ReceiverState.ACTIVE,
+        scheduledExecutors = None,
+        runningExecutor = Some(hostPort),
+        name = Some(name),
+        endpoint = Some(receiverEndpoint))
+      receiverTrackingInfos.put(streamId, receiverTrackingInfo)
+      listenerBus.post(StreamingListenerReceiverStarted(receiverTrackingInfo.toReceiverInfo))
       logInfo("Registered receiver for stream " + streamId + " from " + senderAddress)
       true
     }
@@ -203,21 +262,20 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
 
   /** Deregister a receiver */
   private def deregisterReceiver(streamId: Int, message: String, error: String) {
-    val newReceiverInfo = receiverInfo.get(streamId) match {
+    val lastErrorTime =
+      if (error == null || error == "") -1 else ssc.scheduler.clock.getTimeMillis()
+    val errorInfo = ReceiverErrorInfo(
+      lastErrorMessage = message, lastError = error, lastErrorTime = lastErrorTime)
+    val newReceiverTrackingInfo = receiverTrackingInfos.get(streamId) match {
       case Some(oldInfo) =>
-        val lastErrorTime =
-          if (error == null || error == "") -1 else ssc.scheduler.clock.getTimeMillis()
-        oldInfo.copy(endpoint = null, active = false, lastErrorMessage = message,
-          lastError = error, lastErrorTime = lastErrorTime)
+        oldInfo.copy(state = ReceiverState.INACTIVE, errorInfo = Some(errorInfo))
       case None =>
         logWarning("No prior receiver info")
-        val lastErrorTime =
-          if (error == null || error == "") -1 else ssc.scheduler.clock.getTimeMillis()
-        ReceiverInfo(streamId, "", null, false, "", lastErrorMessage = message,
-          lastError = error, lastErrorTime = lastErrorTime)
+        ReceiverTrackingInfo(
+          streamId, ReceiverState.INACTIVE, None, None, None, None, Some(errorInfo))
     }
-    receiverInfo -= streamId
-    listenerBus.post(StreamingListenerReceiverStopped(newReceiverInfo))
+    receiverTrackingInfos -= streamId
+    listenerBus.post(StreamingListenerReceiverStopped(newReceiverTrackingInfo.toReceiverInfo))
     val messageWithError = if (error != null && !error.isEmpty) {
       s"$message - $error"
     } else {
@@ -228,9 +286,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
 
   /** Update a receiver's maximum ingestion rate */
   def sendRateUpdate(streamUID: Int, newRate: Long): Unit = {
-    for (info <- receiverInfo.get(streamUID); eP <- Option(info.endpoint)) {
-      eP.send(UpdateRateLimit(newRate))
-    }
+    endpoint.send(UpdateReceiverRateLimit(streamUID, newRate))
   }
 
   /** Add new blocks for the given stream */
@@ -240,16 +296,21 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
 
   /** Report error sent by a receiver */
   private def reportError(streamId: Int, message: String, error: String) {
-    val newReceiverInfo = receiverInfo.get(streamId) match {
+    val newReceiverTrackingInfo = receiverTrackingInfos.get(streamId) match {
       case Some(oldInfo) =>
-        oldInfo.copy(lastErrorMessage = message, lastError = error)
+        val errorInfo = ReceiverErrorInfo(lastErrorMessage = message, lastError = error,
+          lastErrorTime = oldInfo.errorInfo.map(_.lastErrorTime).getOrElse(-1L))
+        oldInfo.copy(errorInfo = Some(errorInfo))
       case None =>
         logWarning("No prior receiver info")
-        ReceiverInfo(streamId, "", null, false, "", lastErrorMessage = message,
-          lastError = error, lastErrorTime = ssc.scheduler.clock.getTimeMillis())
+        val errorInfo = ReceiverErrorInfo(lastErrorMessage = message, lastError = error,
+          lastErrorTime = ssc.scheduler.clock.getTimeMillis())
+        ReceiverTrackingInfo(
+          streamId, ReceiverState.INACTIVE, None, None, None, None, Some(errorInfo))
     }
-    receiverInfo(streamId) = newReceiverInfo
-    listenerBus.post(StreamingListenerReceiverError(receiverInfo(streamId)))
+
+    receiverTrackingInfos(streamId) = newReceiverTrackingInfo
+    listenerBus.post(StreamingListenerReceiverError(newReceiverTrackingInfo.toReceiverInfo))
     val messageWithError = if (error != null && !error.isEmpty) {
       s"$message - $error"
     } else {
@@ -258,171 +319,242 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
     logWarning(s"Error reported by receiver for stream $streamId: $messageWithError")
   }
 
+  private def scheduleReceiver(receiverId: Int): Seq[String] = {
+    val preferredLocation = receiverPreferredLocations.getOrElse(receiverId, None)
+    val scheduledExecutors = schedulingPolicy.rescheduleReceiver(
+      receiverId, preferredLocation, receiverTrackingInfos, getExecutors)
+    updateReceiverScheduledExecutors(receiverId, scheduledExecutors)
+    scheduledExecutors
+  }
+
+  private def updateReceiverScheduledExecutors(
+      receiverId: Int, scheduledExecutors: Seq[String]): Unit = {
+    val newReceiverTrackingInfo = receiverTrackingInfos.get(receiverId) match {
+      case Some(oldInfo) =>
+        oldInfo.copy(state = ReceiverState.SCHEDULED,
+          scheduledExecutors = Some(scheduledExecutors))
+      case None =>
+        ReceiverTrackingInfo(
+          receiverId,
+          ReceiverState.SCHEDULED,
+          Some(scheduledExecutors),
+          runningExecutor = None)
+    }
+    receiverTrackingInfos.put(receiverId, newReceiverTrackingInfo)
+  }
+
   /** Check if any blocks are left to be processed */
   def hasUnallocatedBlocks: Boolean = {
     receivedBlockTracker.hasUnallocatedReceivedBlocks
   }
 
+  /**
+   * Get the list of executors excluding driver
+   */
+  private def getExecutors: Seq[String] = {
+    if (ssc.sc.isLocal) {
+      Seq(ssc.sparkContext.env.blockManager.blockManagerId.hostPort)
+    } else {
+      ssc.sparkContext.env.blockManager.master.getMemoryStatus.filter { case (blockManagerId, _) =>
+        blockManagerId.executorId != SparkContext.DRIVER_IDENTIFIER // Ignore the driver location
+      }.map { case (blockManagerId, _) => blockManagerId.hostPort }.toSeq
+    }
+  }
+
+  /**
+   * Run the dummy Spark job to ensure that all slaves have registered. This avoids all the
+   * receivers to be scheduled on the same node.
+   *
+   * TODO Should poll the executor number and wait for executors according to
+   * "spark.scheduler.minRegisteredResourcesRatio" and
+   * "spark.scheduler.maxRegisteredResourcesWaitingTime" rather than running a dummy job.
+   */
+  private def runDummySparkJob(): Unit = {
+    if (!ssc.sparkContext.isLocal) {
+      ssc.sparkContext.makeRDD(1 to 50, 50).map(x => (x, 1)).reduceByKey(_ + _, 20).collect()
+    }
+    assert(getExecutors.nonEmpty)
+  }
+
+  /**
+   * Get the receivers from the ReceiverInputDStreams, distributes them to the
+   * worker nodes as a parallel collection, and runs them.
+   */
+  private def launchReceivers(): Unit = {
+    val receivers = receiverInputStreams.map(nis => {
+      val rcvr = nis.getReceiver()
+      rcvr.setReceiverId(nis.id)
+      rcvr
+    })
+
+    runDummySparkJob()
+
+    logInfo("Starting " + receivers.length + " receivers")
+    endpoint.send(StartAllReceivers(receivers))
+  }
+
+  /** Check if tracker has been marked for starting */
+  private def isTrackerStarted: Boolean = trackerState == Started
+
+  /** Check if tracker has been marked for stopping */
+  private def isTrackerStopping: Boolean = trackerState == Stopping
+
+  /** Check if tracker has been marked for stopped */
+  private def isTrackerStopped: Boolean = trackerState == Stopped
+
   /** RpcEndpoint to receive messages from the receivers. */
   private class ReceiverTrackerEndpoint(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint {
 
+    // TODO Remove this thread pool after https://github.com/apache/spark/issues/7385 is merged
+    private val submitJobThreadPool = ExecutionContext.fromExecutorService(
+      ThreadUtils.newDaemonCachedThreadPool("submit-job-thead-pool"))
+
     override def receive: PartialFunction[Any, Unit] = {
+      // Local messages
+      case StartAllReceivers(receivers) =>
+        val scheduledExecutors = schedulingPolicy.scheduleReceivers(receivers, getExecutors)
+        for (receiver <- receivers) {
+          val executors = scheduledExecutors(receiver.streamId)
+          updateReceiverScheduledExecutors(receiver.streamId, executors)
+          receiverPreferredLocations(receiver.streamId) = receiver.preferredLocation
+          startReceiver(receiver, executors)
+        }
+      case RestartReceiver(receiver) =>
+        val scheduledExecutors = schedulingPolicy.rescheduleReceiver(
+          receiver.streamId,
+          receiver.preferredLocation,
+          receiverTrackingInfos,
+          getExecutors)
+        updateReceiverScheduledExecutors(receiver.streamId, scheduledExecutors)
+        startReceiver(receiver, scheduledExecutors)
+      case c: CleanupOldBlocks =>
+        receiverTrackingInfos.values.flatMap(_.endpoint).foreach(_.send(c))
+      case UpdateReceiverRateLimit(streamUID, newRate) =>
+        for (info <- receiverTrackingInfos.get(streamUID); eP <- info.endpoint) {
+          eP.send(UpdateRateLimit(newRate))
+        }
+      // Remote messages
       case ReportError(streamId, message, error) =>
         reportError(streamId, message, error)
     }
 
     override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
-      case RegisterReceiver(streamId, typ, host, receiverEndpoint) =>
+      // Remote messages
+      case RegisterReceiver(streamId, typ, hostPort, receiverEndpoint) =>
         val successful =
-          registerReceiver(streamId, typ, host, receiverEndpoint, context.sender.address)
+          registerReceiver(streamId, typ, hostPort, receiverEndpoint, context.sender.address)
         context.reply(successful)
       case AddBlock(receivedBlockInfo) =>
         context.reply(addBlock(receivedBlockInfo))
       case DeregisterReceiver(streamId, message, error) =>
         deregisterReceiver(streamId, message, error)
         context.reply(true)
+      // Local messages
+      case AllReceiverIds =>
+        context.reply(receiverTrackingInfos.keys.toSeq)
       case StopAllReceivers =>
         assert(isTrackerStopping || isTrackerStopped)
         stopReceivers()
         context.reply(true)
     }
 
-    /** Send stop signal to the receivers. */
-    private def stopReceivers() {
-      // Signal the receivers to stop
-      receiverInfo.values.flatMap { info => Option(info.endpoint)}
-        .foreach { _.send(StopReceiver) }
-      logInfo("Sent stop signal to all " + receiverInfo.size + " receivers")
-    }
-  }
-
-  /** This thread class runs all the receivers on the cluster.  */
-  class ReceiverLauncher {
-    @transient val env = ssc.env
-    @volatile @transient var running = false
-    @transient val thread = new Thread() {
-      override def run() {
-        try {
-          SparkEnv.set(env)
-          startReceivers()
-        } catch {
-          case ie: InterruptedException => logInfo("ReceiverLauncher interrupted")
-        }
-      }
-    }
-
-    def start() {
-      thread.start()
-    }
-
     /**
-     * Get the list of executors excluding driver
-     */
-    private def getExecutors(ssc: StreamingContext): List[String] = {
-      val executors = ssc.sparkContext.getExecutorMemoryStatus.map(_._1.split(":")(0)).toList
-      val driver = ssc.sparkContext.getConf.get("spark.driver.host")
-      executors.diff(List(driver))
-    }
-
-    /** Set host location(s) for each receiver so as to distribute them over
-     * executors in a round-robin fashion taking into account preferredLocation if set
+     * Start a receiver along with its scheduled executors
      */
-    private[streaming] def scheduleReceivers(receivers: Seq[Receiver[_]],
-      executors: List[String]): Array[ArrayBuffer[String]] = {
-      val locations = new Array[ArrayBuffer[String]](receivers.length)
-      var i = 0
-      for (i <- 0 until receivers.length) {
-        locations(i) = new ArrayBuffer[String]()
-        if (receivers(i).preferredLocation.isDefined) {
-          locations(i) += receivers(i).preferredLocation.get
-        }
+    private def startReceiver(receiver: Receiver[_], scheduledExecutors: Seq[String]): Unit = {
+      val receiverId = receiver.streamId
+      if (!isTrackerStarted) {
+        onReceiverJobFinish(receiverId)
+        return
       }
-      var count = 0
-      for (i <- 0 until max(receivers.length, executors.length)) {
-        if (!receivers(i % receivers.length).preferredLocation.isDefined) {
-          locations(i % receivers.length) += executors(count)
-          count += 1
-          if (count == executors.length) {
-            count = 0
-          }
-        }
-      }
-      locations
-    }
-
-    /**
-     * Get the receivers from the ReceiverInputDStreams, distributes them to the
-     * worker nodes as a parallel collection, and runs them.
-     */
-    private def startReceivers() {
-      val receivers = receiverInputStreams.map(nis => {
-        val rcvr = nis.getReceiver()
-        rcvr.setReceiverId(nis.id)
-        rcvr
-      })
 
       val checkpointDirOption = Option(ssc.checkpointDir)
       val serializableHadoopConf =
         new SerializableConfiguration(ssc.sparkContext.hadoopConfiguration)
 
       // Function to start the receiver on the worker node
-      val startReceiver = (iterator: Iterator[Receiver[_]]) => {
-        if (!iterator.hasNext) {
-          throw new SparkException(
-            "Could not start receiver as object not found.")
-        }
-        val receiver = iterator.next()
-        val supervisor = new ReceiverSupervisorImpl(
-          receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption)
-        supervisor.start()
-        supervisor.awaitTermination()
-      }
-
-      // Run the dummy Spark job to ensure that all slaves have registered.
-      // This avoids all the receivers to be scheduled on the same node.
-      if (!ssc.sparkContext.isLocal) {
-        ssc.sparkContext.makeRDD(1 to 50, 50).map(x => (x, 1)).reduceByKey(_ + _, 20).collect()
-      }
+      val startReceiverFunc = new StartReceiverFunc(checkpointDirOption, serializableHadoopConf)
 
-      // Get the list of executors and schedule receivers
-      val executors = getExecutors(ssc)
-      val tempRDD =
-        if (!executors.isEmpty) {
-          val locations = scheduleReceivers(receivers, executors)
-          val roundRobinReceivers = (0 until receivers.length).map(i =>
-            (receivers(i), locations(i)))
-          ssc.sc.makeRDD[Receiver[_]](roundRobinReceivers)
+      // Create the RDD using the scheduledExecutors to run the receiver in a Spark job
+      val receiverRDD: RDD[Receiver[_]] =
+        if (scheduledExecutors.isEmpty) {
+          ssc.sc.makeRDD(Seq(receiver), 1)
         } else {
-          ssc.sc.makeRDD(receivers, receivers.size)
+          ssc.sc.makeRDD(Seq(receiver -> scheduledExecutors))
         }
+      receiverRDD.setName(s"Receiver $receiverId")
+      val future = ssc.sparkContext.submitJob[Receiver[_], Unit, Unit](
+        receiverRDD, startReceiverFunc, Seq(0), (_, _) => Unit, ())
+      // We will keep restarting the receiver job until ReceiverTracker is stopped
+      future.onComplete {
+        case Success(_) =>
+          if (!isTrackerStarted) {
+            onReceiverJobFinish(receiverId)
+          } else {
+            logInfo(s"Restarting Receiver $receiverId")
+            self.send(RestartReceiver(receiver))
+          }
+        case Failure(e) =>
+          if (!isTrackerStarted) {
+            onReceiverJobFinish(receiverId)
+          } else {
+            logError("Receiver has been stopped. Try to restart it.", e)
+            logInfo(s"Restarting Receiver $receiverId")
+            self.send(RestartReceiver(receiver))
+          }
+      }(submitJobThreadPool)
+      logInfo(s"Receiver ${receiver.streamId} started")
+    }
 
-      // Distribute the receivers and start them
-      logInfo("Starting " + receivers.length + " receivers")
-      running = true
-      try {
-        ssc.sparkContext.runJob(tempRDD, ssc.sparkContext.clean(startReceiver))
-        logInfo("All of the receivers have been terminated")
-      } finally {
-        running = false
-      }
+    override def onStop(): Unit = {
+      submitJobThreadPool.shutdownNow()
     }
 
     /**
-     * Wait until the Spark job that runs the receivers is terminated, or return when
-     * `milliseconds` elapses
+     * Call when a receiver is terminated. It means we won't restart its Spark job.
      */
-    def awaitTermination(milliseconds: Long): Unit = {
-      thread.join(milliseconds)
+    private def onReceiverJobFinish(receiverId: Int): Unit = {
+      receiverJobExitLatch.countDown()
+      receiverTrackingInfos.remove(receiverId).foreach { receiverTrackingInfo =>
+        if (receiverTrackingInfo.state == ReceiverState.ACTIVE) {
+          logWarning(s"Receiver $receiverId exited but didn't deregister")
+        }
+      }
     }
-  }
 
-  /** Check if tracker has been marked for starting */
-  private def isTrackerStarted(): Boolean = trackerState == Started
+    /** Send stop signal to the receivers. */
+    private def stopReceivers() {
+      receiverTrackingInfos.values.flatMap(_.endpoint).foreach { _.send(StopReceiver) }
+      logInfo("Sent stop signal to all " + receiverTrackingInfos.size + " receivers")
+    }
+  }
 
-  /** Check if tracker has been marked for stopping */
-  private def isTrackerStopping(): Boolean = trackerState == Stopping
+}
 
-  /** Check if tracker has been marked for stopped */
-  private def isTrackerStopped(): Boolean = trackerState == Stopped
+/**
+ * Function to start the receiver on the worker node. Use a class instead of closure to avoid
+ * the serialization issue.
+ */
+private class StartReceiverFunc(
+    checkpointDirOption: Option[String],
+    serializableHadoopConf: SerializableConfiguration)
+  extends (Iterator[Receiver[_]] => Unit) with Serializable {
+
+  override def apply(iterator: Iterator[Receiver[_]]): Unit = {
+    if (!iterator.hasNext) {
+      throw new SparkException(
+        "Could not start receiver as object not found.")
+    }
+    if (TaskContext.get().attemptNumber() == 0) {
+      val receiver = iterator.next()
+      assert(iterator.hasNext == false)
+      val supervisor = new ReceiverSupervisorImpl(
+        receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption)
+      supervisor.start()
+      supervisor.awaitTermination()
+    } else {
+      // It's restarted by TaskScheduler, but we want to reschedule it again. So exit it.
+    }
+  }
 
 }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTrackingInfo.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTrackingInfo.scala
new file mode 100644
index 0000000000000..043ff4d0ff054
--- /dev/null
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTrackingInfo.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler
+
+import org.apache.spark.rpc.RpcEndpointRef
+import org.apache.spark.streaming.scheduler.ReceiverState._
+
+private[streaming] case class ReceiverErrorInfo(
+    lastErrorMessage: String = "", lastError: String = "", lastErrorTime: Long = -1L)
+
+/**
+ * Class having information about a receiver.
+ *
+ * @param receiverId the unique receiver id
+ * @param state the current Receiver state
+ * @param scheduledExecutors the scheduled executors provided by ReceiverSchedulingPolicy
+ * @param runningExecutor the running executor if the receiver is active
+ * @param name the receiver name
+ * @param endpoint the receiver endpoint. It can be used to send messages to the receiver
+ * @param errorInfo the receiver error information if it fails
+ */
+private[streaming] case class ReceiverTrackingInfo(
+    receiverId: Int,
+    state: ReceiverState,
+    scheduledExecutors: Option[Seq[String]],
+    runningExecutor: Option[String],
+    name: Option[String] = None,
+    endpoint: Option[RpcEndpointRef] = None,
+    errorInfo: Option[ReceiverErrorInfo] = None) {
+
+  def toReceiverInfo: ReceiverInfo = ReceiverInfo(
+    receiverId,
+    name.getOrElse(""),
+    state == ReceiverState.ACTIVE,
+    location = runningExecutor.getOrElse(""),
+    lastErrorMessage = errorInfo.map(_.lastErrorMessage).getOrElse(""),
+    lastError = errorInfo.map(_.lastError).getOrElse(""),
+    lastErrorTime = errorInfo.map(_.lastErrorTime).getOrElse(-1L)
+  )
+}
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala
new file mode 100644
index 0000000000000..93f920fdc71f1
--- /dev/null
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler
+
+import scala.collection.mutable
+
+import org.apache.spark.SparkFunSuite
+
+class ReceiverSchedulingPolicySuite extends SparkFunSuite {
+
+  val receiverSchedulingPolicy = new ReceiverSchedulingPolicy
+
+  test("rescheduleReceiver: empty executors") {
+    val scheduledExecutors =
+      receiverSchedulingPolicy.rescheduleReceiver(0, None, Map.empty, executors = Seq.empty)
+    assert(scheduledExecutors === Seq.empty)
+  }
+
+  test("rescheduleReceiver: receiver preferredLocation") {
+    val receiverTrackingInfoMap = Map(
+      0 -> ReceiverTrackingInfo(0, ReceiverState.INACTIVE, None, None))
+    val scheduledExecutors = receiverSchedulingPolicy.rescheduleReceiver(
+      0, Some("host1"), receiverTrackingInfoMap, executors = Seq("host2"))
+    assert(scheduledExecutors.toSet === Set("host1", "host2"))
+  }
+
+  test("rescheduleReceiver: return all idle executors if more than 3 idle executors") {
+    val executors = Seq("host1", "host2", "host3", "host4", "host5")
+    // host3 is idle
+    val receiverTrackingInfoMap = Map(
+      0 -> ReceiverTrackingInfo(0, ReceiverState.ACTIVE, None, Some("host1")))
+    val scheduledExecutors = receiverSchedulingPolicy.rescheduleReceiver(
+      1, None, receiverTrackingInfoMap, executors)
+    assert(scheduledExecutors.toSet === Set("host2", "host3", "host4", "host5"))
+  }
+
+  test("rescheduleReceiver: return 3 best options if less than 3 idle executors") {
+    val executors = Seq("host1", "host2", "host3", "host4", "host5")
+    // Weights: host1 = 1.5, host2 = 0.5, host3 = 1.0
+    // host4 and host5 are idle
+    val receiverTrackingInfoMap = Map(
+      0 -> ReceiverTrackingInfo(0, ReceiverState.ACTIVE, None, Some("host1")),
+      1 -> ReceiverTrackingInfo(1, ReceiverState.SCHEDULED, Some(Seq("host2", "host3")), None),
+      2 -> ReceiverTrackingInfo(1, ReceiverState.SCHEDULED, Some(Seq("host1", "host3")), None))
+    val scheduledExecutors = receiverSchedulingPolicy.rescheduleReceiver(
+      3, None, receiverTrackingInfoMap, executors)
+    assert(scheduledExecutors.toSet === Set("host2", "host4", "host5"))
+  }
+
+  test("scheduleReceivers: " +
+    "schedule receivers evenly when there are more receivers than executors") {
+    val receivers = (0 until 6).map(new DummyReceiver(_))
+    val executors = (10000 until 10003).map(port => s"localhost:${port}")
+    val scheduledExecutors = receiverSchedulingPolicy.scheduleReceivers(receivers, executors)
+    val numReceiversOnExecutor = mutable.HashMap[String, Int]()
+    // There should be 2 receivers running on each executor and each receiver has one executor
+    scheduledExecutors.foreach { case (receiverId, executors) =>
+      assert(executors.size == 1)
+      numReceiversOnExecutor(executors(0)) = numReceiversOnExecutor.getOrElse(executors(0), 0) + 1
+    }
+    assert(numReceiversOnExecutor === executors.map(_ -> 2).toMap)
+  }
+
+
+  test("scheduleReceivers: " +
+    "schedule receivers evenly when there are more executors than receivers") {
+    val receivers = (0 until 3).map(new DummyReceiver(_))
+    val executors = (10000 until 10006).map(port => s"localhost:${port}")
+    val scheduledExecutors = receiverSchedulingPolicy.scheduleReceivers(receivers, executors)
+    val numReceiversOnExecutor = mutable.HashMap[String, Int]()
+    // There should be 1 receiver running on each executor and each receiver has two executors
+    scheduledExecutors.foreach { case (receiverId, executors) =>
+      assert(executors.size == 2)
+      executors.foreach { l =>
+        numReceiversOnExecutor(l) = numReceiversOnExecutor.getOrElse(l, 0) + 1
+      }
+    }
+    assert(numReceiversOnExecutor === executors.map(_ -> 1).toMap)
+  }
+
+  test("scheduleReceivers: schedule receivers evenly when the preferredLocations are even") {
+    val receivers = (0 until 3).map(new DummyReceiver(_)) ++
+      (3 until 6).map(new DummyReceiver(_, Some("localhost")))
+    val executors = (10000 until 10003).map(port => s"localhost:${port}") ++
+      (10003 until 10006).map(port => s"localhost2:${port}")
+    val scheduledExecutors = receiverSchedulingPolicy.scheduleReceivers(receivers, executors)
+    val numReceiversOnExecutor = mutable.HashMap[String, Int]()
+    // There should be 1 receiver running on each executor and each receiver has 1 executor
+    scheduledExecutors.foreach { case (receiverId, executors) =>
+      assert(executors.size == 1)
+      executors.foreach { l =>
+        numReceiversOnExecutor(l) = numReceiversOnExecutor.getOrElse(l, 0) + 1
+      }
+    }
+    assert(numReceiversOnExecutor === executors.map(_ -> 1).toMap)
+    // Make sure we schedule the receivers to their preferredLocations
+    val executorsForReceiversWithPreferredLocation =
+      scheduledExecutors.filter { case (receiverId, executors) => receiverId >= 3 }.flatMap(_._2)
+    // We can simply check the executor set because we only know each receiver only has 1 executor
+    assert(executorsForReceiversWithPreferredLocation.toSet ===
+      (10000 until 10003).map(port => s"localhost:${port}").toSet)
+  }
+
+  test("scheduleReceivers: return empty if no receiver") {
+    assert(receiverSchedulingPolicy.scheduleReceivers(Seq.empty, Seq("localhost:10000")).isEmpty)
+  }
+
+  test("scheduleReceivers: return empty scheduled executors if no executors") {
+    val receivers = (0 until 3).map(new DummyReceiver(_))
+    val scheduledExecutors = receiverSchedulingPolicy.scheduleReceivers(receivers, Seq.empty)
+    scheduledExecutors.foreach { case (receiverId, executors) =>
+      assert(executors.isEmpty)
+    }
+  }
+}
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
index aadb7231757b8..e2159bd4f225d 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
@@ -18,66 +18,18 @@
 package org.apache.spark.streaming.scheduler
 
 import org.scalatest.concurrent.Eventually._
-import org.scalatest.concurrent.Timeouts
 import org.scalatest.time.SpanSugar._
-import org.apache.spark.streaming._
+
 import org.apache.spark.SparkConf
-import org.apache.spark.storage.StorageLevel
+import org.apache.spark.streaming._
 import org.apache.spark.streaming.receiver._
-import org.apache.spark.util.Utils
-import org.apache.spark.streaming.dstream.InputDStream
-import scala.reflect.ClassTag
 import org.apache.spark.streaming.dstream.ReceiverInputDStream
+import org.apache.spark.storage.StorageLevel
 
 /** Testsuite for receiver scheduling */
 class ReceiverTrackerSuite extends TestSuiteBase {
   val sparkConf = new SparkConf().setMaster("local[8]").setAppName("test")
   val ssc = new StreamingContext(sparkConf, Milliseconds(100))
-  val tracker = new ReceiverTracker(ssc)
-  val launcher = new tracker.ReceiverLauncher()
-  val executors: List[String] = List("0", "1", "2", "3")
-
-  test("receiver scheduling - all or none have preferred location") {
-
-    def parse(s: String): Array[Array[String]] = {
-      val outerSplit = s.split("\\|")
-      val loc = new Array[Array[String]](outerSplit.length)
-      var i = 0
-      for (i <- 0 until outerSplit.length) {
-        loc(i) = outerSplit(i).split("\\,")
-      }
-      loc
-    }
-
-    def testScheduler(numReceivers: Int, preferredLocation: Boolean, allocation: String) {
-      val receivers =
-        if (preferredLocation) {
-          Array.tabulate(numReceivers)(i => new DummyReceiver(host =
-            Some(((i + 1) % executors.length).toString)))
-        } else {
-          Array.tabulate(numReceivers)(_ => new DummyReceiver)
-        }
-      val locations = launcher.scheduleReceivers(receivers, executors)
-      val expectedLocations = parse(allocation)
-      assert(locations.deep === expectedLocations.deep)
-    }
-
-    testScheduler(numReceivers = 5, preferredLocation = false, allocation = "0|1|2|3|0")
-    testScheduler(numReceivers = 3, preferredLocation = false, allocation = "0,3|1|2")
-    testScheduler(numReceivers = 4, preferredLocation = true, allocation = "1|2|3|0")
-  }
-
-  test("receiver scheduling - some have preferred location") {
-    val numReceivers = 4;
-    val receivers: Seq[Receiver[_]] = Seq(new DummyReceiver(host = Some("1")),
-      new DummyReceiver, new DummyReceiver, new DummyReceiver)
-    val locations = launcher.scheduleReceivers(receivers, executors)
-    assert(locations(0)(0) === "1")
-    assert(locations(1)(0) === "0")
-    assert(locations(2)(0) === "1")
-    assert(locations(0).length === 1)
-    assert(locations(3).length === 1)
-  }
 
   test("Receiver tracker - propagates rate limit") {
     object ReceiverStartedWaiter extends StreamingListener {
@@ -134,19 +86,19 @@ private class RateLimitInputDStream(@transient ssc_ : StreamingContext)
  * @note It's necessary to be a top-level object, or else serialization would create another
  *       one on the executor side and we won't be able to read its rate limit.
  */
-private object SingletonDummyReceiver extends DummyReceiver
+private object SingletonDummyReceiver extends DummyReceiver(0)
 
 /**
  * Dummy receiver implementation
  */
-private class DummyReceiver(host: Option[String] = None)
+private class DummyReceiver(receiverId: Int, host: Option[String] = None)
   extends Receiver[Int](StorageLevel.MEMORY_ONLY) {
 
-  def onStart() {
-  }
+  setReceiverId(receiverId)
 
-  def onStop() {
-  }
+  override def onStart(): Unit = {}
+
+  override def onStop(): Unit = {}
 
   override def preferredLocation: Option[String] = host
 }
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
index 40dc1fb601bd0..0891309f956d2 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
@@ -119,20 +119,20 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
     listener.numTotalReceivedRecords should be (600)
 
     // onReceiverStarted
-    val receiverInfoStarted = ReceiverInfo(0, "test", null, true, "localhost")
+    val receiverInfoStarted = ReceiverInfo(0, "test", true, "localhost")
     listener.onReceiverStarted(StreamingListenerReceiverStarted(receiverInfoStarted))
     listener.receiverInfo(0) should be (Some(receiverInfoStarted))
     listener.receiverInfo(1) should be (None)
 
     // onReceiverError
-    val receiverInfoError = ReceiverInfo(1, "test", null, true, "localhost")
+    val receiverInfoError = ReceiverInfo(1, "test", true, "localhost")
     listener.onReceiverError(StreamingListenerReceiverError(receiverInfoError))
     listener.receiverInfo(0) should be (Some(receiverInfoStarted))
     listener.receiverInfo(1) should be (Some(receiverInfoError))
     listener.receiverInfo(2) should be (None)
 
     // onReceiverStopped
-    val receiverInfoStopped = ReceiverInfo(2, "test", null, true, "localhost")
+    val receiverInfoStopped = ReceiverInfo(2, "test", true, "localhost")
     listener.onReceiverStopped(StreamingListenerReceiverStopped(receiverInfoStopped))
     listener.receiverInfo(0) should be (Some(receiverInfoStarted))
     listener.receiverInfo(1) should be (Some(receiverInfoError))

From 2e7f99a004f08a42e86f6f603e4ba35cb52561c4 Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Mon, 27 Jul 2015 21:08:56 -0700
Subject: [PATCH 411/549] [SPARK-8195] [SPARK-8196] [SQL] udf next_day last_day

next_day, returns next certain dayofweek.
last_day, returns the last day of the month which given date belongs to.

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #6986 from adrian-wang/udfnlday and squashes the following commits:

ef7e3da [Daoyuan Wang] fix
02b3426 [Daoyuan Wang] address 2 comments
dc69630 [Daoyuan Wang] address comments from rxin
8846086 [Daoyuan Wang] address comments from rxin
d09bcce [Daoyuan Wang] multi fix
1a9de3d [Daoyuan Wang] function next_day and last_day
---
 .../catalyst/analysis/FunctionRegistry.scala  |  4 +-
 .../expressions/datetimeFunctions.scala       | 72 +++++++++++++++++++
 .../sql/catalyst/util/DateTimeUtils.scala     | 46 ++++++++++++
 .../expressions/DateExpressionsSuite.scala    | 28 ++++++++
 .../org/apache/spark/sql/functions.scala      | 17 +++++
 .../apache/spark/sql/DateFunctionsSuite.scala | 22 ++++++
 6 files changed, 188 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index aa05f448d12bc..61ee6f6f71631 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -219,8 +219,10 @@ object FunctionRegistry {
     expression[DayOfYear]("dayofyear"),
     expression[DayOfMonth]("dayofmonth"),
     expression[Hour]("hour"),
-    expression[Month]("month"),
+    expression[LastDay]("last_day"),
     expression[Minute]("minute"),
+    expression[Month]("month"),
+    expression[NextDay]("next_day"),
     expression[Quarter]("quarter"),
     expression[Second]("second"),
     expression[WeekOfYear]("weekofyear"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index 9e55f0546e123..b00a1b26fa285 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -265,3 +265,75 @@ case class DateFormatClass(left: Expression, right: Expression) extends BinaryEx
     })
   }
 }
+
+/**
+ * Returns the last day of the month which the date belongs to.
+ */
+case class LastDay(startDate: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+  override def child: Expression = startDate
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
+
+  override def dataType: DataType = DateType
+
+  override def prettyName: String = "last_day"
+
+  override def nullSafeEval(date: Any): Any = {
+    val days = date.asInstanceOf[Int]
+    DateTimeUtils.getLastDayOfMonth(days)
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (sd) => {
+      s"$dtu.getLastDayOfMonth($sd)"
+    })
+  }
+}
+
+/**
+ * Returns the first date which is later than startDate and named as dayOfWeek.
+ * For example, NextDay(2015-07-27, Sunday) would return 2015-08-02, which is the first
+ * sunday later than 2015-07-27.
+ */
+case class NextDay(startDate: Expression, dayOfWeek: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+
+  override def left: Expression = startDate
+  override def right: Expression = dayOfWeek
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType, StringType)
+
+  override def dataType: DataType = DateType
+
+  override def nullSafeEval(start: Any, dayOfW: Any): Any = {
+    val dow = DateTimeUtils.getDayOfWeekFromString(dayOfW.asInstanceOf[UTF8String])
+    if (dow == -1) {
+      null
+    } else {
+      val sd = start.asInstanceOf[Int]
+      DateTimeUtils.getNextDateForDayOfWeek(sd, dow)
+    }
+  }
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (sd, dowS) => {
+      val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+      val dow = ctx.freshName("dow")
+      val genDow = if (right.foldable) {
+        val dowVal = DateTimeUtils.getDayOfWeekFromString(
+          dayOfWeek.eval(InternalRow.empty).asInstanceOf[UTF8String])
+        s"int $dow = $dowVal;"
+      } else {
+        s"int $dow = $dtu.getDayOfWeekFromString($dowS);"
+      }
+      genDow + s"""
+        if ($dow == -1) {
+          ${ev.isNull} = true;
+        } else {
+          ${ev.primitive} = $dtu.getNextDateForDayOfWeek($sd, $dow);
+        }
+       """
+    })
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 07412e73b6a5b..2e28fb9af9b65 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -573,4 +573,50 @@ object DateTimeUtils {
       dayInYear - 334
     }
   }
+
+  /**
+   * Returns Day of week from String. Starting from Thursday, marked as 0.
+   * (Because 1970-01-01 is Thursday).
+   */
+  def getDayOfWeekFromString(string: UTF8String): Int = {
+    val dowString = string.toString.toUpperCase
+    dowString match {
+      case "SU" | "SUN" | "SUNDAY" => 3
+      case "MO" | "MON" | "MONDAY" => 4
+      case "TU" | "TUE" | "TUESDAY" => 5
+      case "WE" | "WED" | "WEDNESDAY" => 6
+      case "TH" | "THU" | "THURSDAY" => 0
+      case "FR" | "FRI" | "FRIDAY" => 1
+      case "SA" | "SAT" | "SATURDAY" => 2
+      case _ => -1
+    }
+  }
+
+  /**
+   * Returns the first date which is later than startDate and is of the given dayOfWeek.
+   * dayOfWeek is an integer ranges in [0, 6], and 0 is Thu, 1 is Fri, etc,.
+   */
+  def getNextDateForDayOfWeek(startDate: Int, dayOfWeek: Int): Int = {
+    startDate + 1 + ((dayOfWeek - 1 - startDate) % 7 + 7) % 7
+  }
+
+  /**
+   * number of days in a non-leap year.
+   */
+  private[this] val daysInNormalYear = Array(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
+
+  /**
+   * Returns last day of the month for the given date. The date is expressed in days
+   * since 1.1.1970.
+   */
+  def getLastDayOfMonth(date: Int): Int = {
+    val dayOfMonth = getDayOfMonth(date)
+    val month = getMonth(date)
+    if (month == 2 && isLeapYear(getYear(date))) {
+      date + daysInNormalYear(month - 1) + 1 - dayOfMonth
+    } else {
+      date + daysInNormalYear(month - 1) - dayOfMonth
+    }
+  }
+
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index bdba6ce891386..4d2d33765a269 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -22,6 +22,7 @@ import java.text.SimpleDateFormat
 import java.util.Calendar
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types.{StringType, TimestampType, DateType}
 
 class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -246,4 +247,31 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     }
   }
 
+  test("last_day") {
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-02-28"))), Date.valueOf("2015-02-28"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-03-27"))), Date.valueOf("2015-03-31"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-04-26"))), Date.valueOf("2015-04-30"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-05-25"))), Date.valueOf("2015-05-31"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-06-24"))), Date.valueOf("2015-06-30"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-07-23"))), Date.valueOf("2015-07-31"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-08-01"))), Date.valueOf("2015-08-31"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-09-02"))), Date.valueOf("2015-09-30"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-10-03"))), Date.valueOf("2015-10-31"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-11-04"))), Date.valueOf("2015-11-30"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2015-12-05"))), Date.valueOf("2015-12-31"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2016-01-06"))), Date.valueOf("2016-01-31"))
+    checkEvaluation(LastDay(Literal(Date.valueOf("2016-02-07"))), Date.valueOf("2016-02-29"))
+  }
+
+  test("next_day") {
+    checkEvaluation(
+      NextDay(Literal(Date.valueOf("2015-07-23")), Literal("Thu")),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-30")))
+    checkEvaluation(
+      NextDay(Literal(Date.valueOf("2015-07-23")), Literal("THURSDAY")),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-30")))
+    checkEvaluation(
+      NextDay(Literal(Date.valueOf("2015-07-23")), Literal("th")),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-30")))
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index cab3db609dd4b..d18558b510f0b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2032,6 +2032,13 @@ object functions {
    */
   def hour(columnName: String): Column = hour(Column(columnName))
 
+  /**
+   * Returns the last day of the month which the given date belongs to.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def last_day(e: Column): Column = LastDay(e.expr)
+
   /**
    * Extracts the minutes as an integer from a given date/timestamp/string.
    * @group datetime_funcs
@@ -2046,6 +2053,16 @@ object functions {
    */
   def minute(columnName: String): Column = minute(Column(columnName))
 
+  /**
+   * Returns the first date which is later than given date sd and named as dow.
+   * For example, `next_day('2015-07-27', "Sunday")` would return 2015-08-02, which is the
+   * first Sunday later than 2015-07-27. The parameter dayOfWeek could be 2-letter, 3-letter,
+   * or full name of the day of the week (e.g. Mo, tue, FRIDAY).
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def next_day(sd: Column, dayOfWeek: String): Column = NextDay(sd.expr, lit(dayOfWeek).expr)
+
   /**
    * Extracts the seconds as an integer from a given date/timestamp/string.
    * @group datetime_funcs
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index 9e80ae86920d9..ff1c7562dc4a6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -184,4 +184,26 @@ class DateFunctionsSuite extends QueryTest {
       Row(15, 15, 15))
   }
 
+  test("function last_day") {
+    val df1 = Seq((1, "2015-07-23"), (2, "2015-07-24")).toDF("i", "d")
+    val df2 = Seq((1, "2015-07-23 00:11:22"), (2, "2015-07-24 11:22:33")).toDF("i", "t")
+    checkAnswer(
+      df1.select(last_day(col("d"))),
+      Seq(Row(Date.valueOf("2015-07-31")), Row(Date.valueOf("2015-07-31"))))
+    checkAnswer(
+      df2.select(last_day(col("t"))),
+      Seq(Row(Date.valueOf("2015-07-31")), Row(Date.valueOf("2015-07-31"))))
+  }
+
+  test("function next_day") {
+    val df1 = Seq(("mon", "2015-07-23"), ("tuesday", "2015-07-20")).toDF("dow", "d")
+    val df2 = Seq(("th", "2015-07-23 00:11:22"), ("xx", "2015-07-24 11:22:33")).toDF("dow", "t")
+    checkAnswer(
+      df1.select(next_day(col("d"), "MONDAY")),
+      Seq(Row(Date.valueOf("2015-07-27")), Row(Date.valueOf("2015-07-27"))))
+    checkAnswer(
+      df2.select(next_day(col("t"), "th")),
+      Seq(Row(Date.valueOf("2015-07-30")), Row(null)))
+  }
+
 }

From 84da8792e2a99736edb6c94df7eda87915a8a476 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 27 Jul 2015 21:41:15 -0700
Subject: [PATCH 412/549] [SPARK-9395][SQL] Create a SpecializedGetters
 interface to track all the specialized getters.

As we are adding more and more specialized getters to more classes (coming soon ArrayData), this interface can help us prevent missing a method in some interfaces.

Author: Reynold Xin <rxin@databricks.com>

Closes #7713 from rxin/SpecializedGetters and squashes the following commits:

3b39be1 [Reynold Xin] Added override modifier.
567ba9c [Reynold Xin] [SPARK-9395][SQL] Create a SpecializedGetters interface to track all the specialized getters.
---
 .../expressions/SpecializedGetters.java       | 53 +++++++++++++++++++
 .../spark/sql/catalyst/InternalRow.scala      | 30 ++++++-----
 2 files changed, 69 insertions(+), 14 deletions(-)
 create mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
new file mode 100644
index 0000000000000..5f28d52a94bd7
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions;
+
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.unsafe.types.Interval;
+import org.apache.spark.unsafe.types.UTF8String;
+
+public interface SpecializedGetters {
+
+  boolean isNullAt(int ordinal);
+
+  boolean getBoolean(int ordinal);
+
+  byte getByte(int ordinal);
+
+  short getShort(int ordinal);
+
+  int getInt(int ordinal);
+
+  long getLong(int ordinal);
+
+  float getFloat(int ordinal);
+
+  double getDouble(int ordinal);
+
+  Decimal getDecimal(int ordinal);
+
+  UTF8String getUTF8String(int ordinal);
+
+  byte[] getBinary(int ordinal);
+
+  Interval getInterval(int ordinal);
+
+  InternalRow getStruct(int ordinal, int numFields);
+
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index 9a11de3840ce2..e395a67434fa7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -26,7 +26,7 @@ import org.apache.spark.unsafe.types.{Interval, UTF8String}
  * An abstract class for row used internal in Spark SQL, which only contain the columns as
  * internal types.
  */
-abstract class InternalRow extends Serializable {
+abstract class InternalRow extends Serializable with SpecializedGetters {
 
   def numFields: Int
 
@@ -38,29 +38,30 @@ abstract class InternalRow extends Serializable {
 
   def getAs[T](ordinal: Int, dataType: DataType): T = get(ordinal, dataType).asInstanceOf[T]
 
-  def isNullAt(ordinal: Int): Boolean = get(ordinal) == null
+  override def isNullAt(ordinal: Int): Boolean = get(ordinal) == null
 
-  def getBoolean(ordinal: Int): Boolean = getAs[Boolean](ordinal, BooleanType)
+  override def getBoolean(ordinal: Int): Boolean = getAs[Boolean](ordinal, BooleanType)
 
-  def getByte(ordinal: Int): Byte = getAs[Byte](ordinal, ByteType)
+  override def getByte(ordinal: Int): Byte = getAs[Byte](ordinal, ByteType)
 
-  def getShort(ordinal: Int): Short = getAs[Short](ordinal, ShortType)
+  override def getShort(ordinal: Int): Short = getAs[Short](ordinal, ShortType)
 
-  def getInt(ordinal: Int): Int = getAs[Int](ordinal, IntegerType)
+  override def getInt(ordinal: Int): Int = getAs[Int](ordinal, IntegerType)
 
-  def getLong(ordinal: Int): Long = getAs[Long](ordinal, LongType)
+  override def getLong(ordinal: Int): Long = getAs[Long](ordinal, LongType)
 
-  def getFloat(ordinal: Int): Float = getAs[Float](ordinal, FloatType)
+  override def getFloat(ordinal: Int): Float = getAs[Float](ordinal, FloatType)
 
-  def getDouble(ordinal: Int): Double = getAs[Double](ordinal, DoubleType)
+  override def getDouble(ordinal: Int): Double = getAs[Double](ordinal, DoubleType)
 
-  def getUTF8String(ordinal: Int): UTF8String = getAs[UTF8String](ordinal, StringType)
+  override def getUTF8String(ordinal: Int): UTF8String = getAs[UTF8String](ordinal, StringType)
 
-  def getBinary(ordinal: Int): Array[Byte] = getAs[Array[Byte]](ordinal, BinaryType)
+  override def getBinary(ordinal: Int): Array[Byte] = getAs[Array[Byte]](ordinal, BinaryType)
 
-  def getDecimal(ordinal: Int): Decimal = getAs[Decimal](ordinal, DecimalType.SYSTEM_DEFAULT)
+  override def getDecimal(ordinal: Int): Decimal =
+    getAs[Decimal](ordinal, DecimalType.SYSTEM_DEFAULT)
 
-  def getInterval(ordinal: Int): Interval = getAs[Interval](ordinal, IntervalType)
+  override def getInterval(ordinal: Int): Interval = getAs[Interval](ordinal, IntervalType)
 
   // This is only use for test and will throw a null pointer exception if the position is null.
   def getString(ordinal: Int): String = getUTF8String(ordinal).toString
@@ -71,7 +72,8 @@ abstract class InternalRow extends Serializable {
    * @param ordinal position to get the struct from.
    * @param numFields number of fields the struct type has
    */
-  def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs[InternalRow](ordinal, null)
+  override def getStruct(ordinal: Int, numFields: Int): InternalRow =
+    getAs[InternalRow](ordinal, null)
 
   override def toString: String = s"[${this.mkString(",")}]"
 

From 3bc7055e265ee5c75af8726579663cea0590f6c0 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 27 Jul 2015 22:04:54 -0700
Subject: [PATCH 413/549] Fixed a test failure.

---
 .../test/scala/org/apache/spark/sql/DateFunctionsSuite.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index ff1c7562dc4a6..001fcd035c82a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -203,7 +203,7 @@ class DateFunctionsSuite extends QueryTest {
       Seq(Row(Date.valueOf("2015-07-27")), Row(Date.valueOf("2015-07-27"))))
     checkAnswer(
       df2.select(next_day(col("t"), "th")),
-      Seq(Row(Date.valueOf("2015-07-30")), Row(null)))
+      Seq(Row(Date.valueOf("2015-07-30")), Row(Date.valueOf("2015-07-30"))))
   }
 
 }

From 63a492b931765b1edd66624421d503f1927825ec Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Mon, 27 Jul 2015 22:47:31 -0700
Subject: [PATCH 414/549] [SPARK-8828] [SQL] Revert SPARK-5680

JIRA: https://issues.apache.org/jira/browse/SPARK-8828

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7667 from yjshen/revert_combinesum_2 and squashes the following commits:

c37ccb1 [Yijie Shen] add test case
8377214 [Yijie Shen] revert spark.sql.useAggregate2 to its default value
e2305ac [Yijie Shen] fix bug - avg on decimal column
7cb0e95 [Yijie Shen] [wip] resolving bugs
1fadb5a [Yijie Shen] remove occurance
17c6248 [Yijie Shen] revert SPARK-5680
---
 .../sql/catalyst/expressions/aggregates.scala | 70 ++-----------------
 .../sql/execution/GeneratedAggregate.scala    | 41 +----------
 .../spark/sql/execution/SparkStrategies.scala |  2 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 31 ++++++++
 .../execution/HiveCompatibilitySuite.scala    |  1 -
 ..._format-0-eff4ef3c207d14d5121368f294697964 |  0
 ..._format-1-4a03c4328565c60ca99689239f07fb16 |  1 -
 7 files changed, 37 insertions(+), 109 deletions(-)
 delete mode 100644 sql/hive/src/test/resources/golden/udaf_number_format-0-eff4ef3c207d14d5121368f294697964
 delete mode 100644 sql/hive/src/test/resources/golden/udaf_number_format-1-4a03c4328565c60ca99689239f07fb16

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
index 42343d4d8d79c..5d4b349b1597a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala
@@ -404,7 +404,7 @@ case class Average(child: Expression) extends UnaryExpression with PartialAggreg
 
         // partialSum already increase the precision by 10
         val castedSum = Cast(Sum(partialSum.toAttribute), partialSum.dataType)
-        val castedCount = Sum(partialCount.toAttribute)
+        val castedCount = Cast(Sum(partialCount.toAttribute), partialSum.dataType)
         SplitEvaluation(
           Cast(Divide(castedSum, castedCount), dataType),
           partialCount :: partialSum :: Nil)
@@ -490,13 +490,13 @@ case class Sum(child: Expression) extends UnaryExpression with PartialAggregate1
       case DecimalType.Fixed(_, _) =>
         val partialSum = Alias(Sum(child), "PartialSum")()
         SplitEvaluation(
-          Cast(CombineSum(partialSum.toAttribute), dataType),
+          Cast(Sum(partialSum.toAttribute), dataType),
           partialSum :: Nil)
 
       case _ =>
         val partialSum = Alias(Sum(child), "PartialSum")()
         SplitEvaluation(
-          CombineSum(partialSum.toAttribute),
+          Sum(partialSum.toAttribute),
           partialSum :: Nil)
     }
   }
@@ -522,8 +522,7 @@ case class SumFunction(expr: Expression, base: AggregateExpression1) extends Agg
 
   private val sum = MutableLiteral(null, calcType)
 
-  private val addFunction =
-    Coalesce(Seq(Add(Coalesce(Seq(sum, zero)), Cast(expr, calcType)), sum, zero))
+  private val addFunction = Coalesce(Seq(Add(Coalesce(Seq(sum, zero)), Cast(expr, calcType)), sum))
 
   override def update(input: InternalRow): Unit = {
     sum.update(addFunction, input)
@@ -538,67 +537,6 @@ case class SumFunction(expr: Expression, base: AggregateExpression1) extends Agg
   }
 }
 
-/**
- * Sum should satisfy 3 cases:
- * 1) sum of all null values = zero
- * 2) sum for table column with no data = null
- * 3) sum of column with null and not null values = sum of not null values
- * Require separate CombineSum Expression and function as it has to distinguish "No data" case
- * versus "data equals null" case, while aggregating results and at each partial expression.i.e.,
- * Combining    PartitionLevel   InputData
- *                           <-- null
- * Zero     <-- Zero         <-- null
- *
- *          <-- null         <-- no data
- * null     <-- null         <-- no data
- */
-case class CombineSum(child: Expression) extends AggregateExpression1 {
-  def this() = this(null)
-
-  override def children: Seq[Expression] = child :: Nil
-  override def nullable: Boolean = true
-  override def dataType: DataType = child.dataType
-  override def toString: String = s"CombineSum($child)"
-  override def newInstance(): CombineSumFunction = new CombineSumFunction(child, this)
-}
-
-case class CombineSumFunction(expr: Expression, base: AggregateExpression1)
-  extends AggregateFunction1 {
-
-  def this() = this(null, null) // Required for serialization.
-
-  private val calcType =
-    expr.dataType match {
-      case DecimalType.Fixed(precision, scale) =>
-        DecimalType.bounded(precision + 10, scale)
-      case _ =>
-        expr.dataType
-    }
-
-  private val zero = Cast(Literal(0), calcType)
-
-  private val sum = MutableLiteral(null, calcType)
-
-  private val addFunction =
-    Coalesce(Seq(Add(Coalesce(Seq(sum, zero)), Cast(expr, calcType)), sum, zero))
-
-  override def update(input: InternalRow): Unit = {
-    val result = expr.eval(input)
-    // partial sum result can be null only when no input rows present
-    if(result != null) {
-      sum.update(addFunction, input)
-    }
-  }
-
-  override def eval(input: InternalRow): Any = {
-    expr.dataType match {
-      case DecimalType.Fixed(_, _) =>
-        Cast(sum, dataType).eval(null)
-      case _ => sum.eval(null)
-    }
-  }
-}
-
 case class SumDistinct(child: Expression) extends UnaryExpression with PartialAggregate1 {
 
   def this() = this(null)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 5ad4691a5ca07..1cd1420480f03 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -108,7 +108,7 @@ case class GeneratedAggregate(
           Add(
             Coalesce(currentSum :: zero :: Nil),
             Cast(expr, calcType)
-          ) :: currentSum :: zero :: Nil)
+          ) :: currentSum :: Nil)
         val result =
           expr.dataType match {
             case DecimalType.Fixed(_, _) =>
@@ -118,45 +118,6 @@ case class GeneratedAggregate(
 
         AggregateEvaluation(currentSum :: Nil, initialValue :: Nil, updateFunction :: Nil, result)
 
-      case cs @ CombineSum(expr) =>
-        val calcType =
-          expr.dataType match {
-            case DecimalType.Fixed(p, s) =>
-              DecimalType.bounded(p + 10, s)
-            case _ =>
-              expr.dataType
-          }
-
-        val currentSum = AttributeReference("currentSum", calcType, nullable = true)()
-        val initialValue = Literal.create(null, calcType)
-
-        // Coalesce avoids double calculation...
-        // but really, common sub expression elimination would be better....
-        val zero = Cast(Literal(0), calcType)
-        // If we're evaluating UnscaledValue(x), we can do Count on x directly, since its
-        // UnscaledValue will be null if and only if x is null; helps with Average on decimals
-        val actualExpr = expr match {
-          case UnscaledValue(e) => e
-          case _ => expr
-        }
-        // partial sum result can be null only when no input rows present
-        val updateFunction = If(
-          IsNotNull(actualExpr),
-          Coalesce(
-            Add(
-              Coalesce(currentSum :: zero :: Nil),
-              Cast(expr, calcType)) :: currentSum :: zero :: Nil),
-          currentSum)
-
-        val result =
-          expr.dataType match {
-            case DecimalType.Fixed(_, _) =>
-              Cast(currentSum, cs.dataType)
-            case _ => currentSum
-          }
-
-        AggregateEvaluation(currentSum :: Nil, initialValue :: Nil, updateFunction :: Nil, result)
-
       case m @ Max(expr) =>
         val currentMax = AttributeReference("currentMax", expr.dataType, nullable = true)()
         val initialValue = Literal.create(null, expr.dataType)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 306bbfec624c0..d88a02298c00d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -201,7 +201,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
     }
 
     def canBeCodeGened(aggs: Seq[AggregateExpression1]): Boolean = aggs.forall {
-      case _: CombineSum | _: Sum | _: Count | _: Max | _: Min |  _: CombineSetsAndCount => true
+      case _: Sum | _: Count | _: Max | _: Min |  _: CombineSetsAndCount => true
       // The generated set implementation is pretty limited ATM.
       case CollectHashSet(exprs) if exprs.size == 1  &&
            Seq(IntegerType, LongType).contains(exprs.head.dataType) => true
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 358e319476e83..42724ed766af5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -227,6 +227,37 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       Seq(Row("1"), Row("2")))
   }
 
+  test("SPARK-8828 sum should return null if all input values are null") {
+    withSQLConf(SQLConf.USE_SQL_AGGREGATE2.key -> "true") {
+      withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "true") {
+        checkAnswer(
+          sql("select sum(a), avg(a) from allNulls"),
+          Seq(Row(null, null))
+        )
+      }
+      withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "false") {
+        checkAnswer(
+          sql("select sum(a), avg(a) from allNulls"),
+          Seq(Row(null, null))
+        )
+      }
+    }
+    withSQLConf(SQLConf.USE_SQL_AGGREGATE2.key -> "false") {
+      withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "true") {
+        checkAnswer(
+          sql("select sum(a), avg(a) from allNulls"),
+          Seq(Row(null, null))
+        )
+      }
+      withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "false") {
+        checkAnswer(
+          sql("select sum(a), avg(a) from allNulls"),
+          Seq(Row(null, null))
+        )
+      }
+    }
+  }
+
   test("aggregation with codegen") {
     val originalValue = sqlContext.conf.codegenEnabled
     sqlContext.setConf(SQLConf.CODEGEN_ENABLED, true)
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index b12b3838e615c..ec959cb2194b0 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -822,7 +822,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udaf_covar_pop",
     "udaf_covar_samp",
     "udaf_histogram_numeric",
-    "udaf_number_format",
     "udf2",
     "udf5",
     "udf6",
diff --git a/sql/hive/src/test/resources/golden/udaf_number_format-0-eff4ef3c207d14d5121368f294697964 b/sql/hive/src/test/resources/golden/udaf_number_format-0-eff4ef3c207d14d5121368f294697964
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/sql/hive/src/test/resources/golden/udaf_number_format-1-4a03c4328565c60ca99689239f07fb16 b/sql/hive/src/test/resources/golden/udaf_number_format-1-4a03c4328565c60ca99689239f07fb16
deleted file mode 100644
index c6f275a0db131..0000000000000
--- a/sql/hive/src/test/resources/golden/udaf_number_format-1-4a03c4328565c60ca99689239f07fb16
+++ /dev/null
@@ -1 +0,0 @@
-0.0	NULL	NULL	NULL

From 60f08c7c8775c0462b74bc65b41397be6eb24b6d Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 27 Jul 2015 22:51:15 -0700
Subject: [PATCH 415/549] [SPARK-9373][SQL] Support StructType in Tungsten
 projection

This pull request updates GenerateUnsafeProjection to support StructType. If an input struct type is backed already by an UnsafeRow, GenerateUnsafeProjection copies the bytes directly into its buffer space without any conversion. However, if the input is not an UnsafeRow, GenerateUnsafeProjection runs the code generated recursively to convert the input into an UnsafeRow and then copies it into the buffer space.

Also create a TungstenProject operator that projects data directly into UnsafeRow. Note that I'm not sure if this is the way we want to structure Unsafe+codegen operators, but we can defer that decision to follow-up pull requests.

Author: Reynold Xin <rxin@databricks.com>

Closes #7689 from rxin/tungsten-struct-type and squashes the following commits:

9162f42 [Reynold Xin] Support IntervalType in UnsafeRow's getter.
be9f377 [Reynold Xin] Fixed tests.
10c4b7c [Reynold Xin] Format generated code.
77e8d0e [Reynold Xin] Fixed NondeterministicSuite.
ac4951d [Reynold Xin] Yay.
ac203bf [Reynold Xin] More comments.
9f36216 [Reynold Xin] Updated comment.
6b781fe [Reynold Xin] Reset the change in DataFrameSuite.
525b95b [Reynold Xin] Merged with master, more documentation & test cases.
321859a [Reynold Xin] [SPARK-9373][SQL] Support StructType in Tungsten projection [WIP]
---
 .../sql/catalyst/expressions/UnsafeRow.java   |   2 +
 .../expressions/UnsafeRowWriters.java         |  48 +++++-
 .../catalyst/expressions/BoundAttribute.scala |   9 +-
 .../codegen/GenerateUnsafeProjection.scala    | 162 ++++++++++++++++--
 .../expressions/complexTypeCreator.scala      |  94 ++++++++--
 .../ArithmeticExpressionSuite.scala           |   2 +-
 .../expressions/BitwiseFunctionsSuite.scala   |  20 ++-
 .../expressions/ExpressionEvalHelper.scala    |  26 ++-
 .../spark/sql/execution/SparkStrategies.scala |   9 +-
 .../spark/sql/execution/basicOperators.scala  |  25 +++
 .../spark/sql/DataFrameTungstenSuite.scala    |  84 +++++++++
 .../expression/NondeterministicSuite.scala    |   2 +-
 12 files changed, 430 insertions(+), 53 deletions(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index fb084dd13b620..955fb4226fc0e 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -265,6 +265,8 @@ public Object get(int ordinal, DataType dataType) {
       return getBinary(ordinal);
     } else if (dataType instanceof StringType) {
       return getUTF8String(ordinal);
+    } else if (dataType instanceof IntervalType) {
+      return getInterval(ordinal);
     } else if (dataType instanceof StructType) {
       return getStruct(ordinal, ((StructType) dataType).size());
     } else {
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
index 0ba31d3b9b743..8fdd7399602d2 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions;
 
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.types.ByteArray;
@@ -81,6 +82,52 @@ public static int write(UnsafeRow target, int ordinal, int cursor, byte[] input)
     }
   }
 
+  /**
+   * Writer for struct type where the struct field is backed by an {@link UnsafeRow}.
+   *
+   * We throw UnsupportedOperationException for inputs that are not backed by {@link UnsafeRow}.
+   * Non-UnsafeRow struct fields are handled directly in
+   * {@link org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection}
+   * by generating the Java code needed to convert them into UnsafeRow.
+   */
+  public static class StructWriter {
+    public static int getSize(InternalRow input) {
+      int numBytes = 0;
+      if (input instanceof UnsafeRow) {
+        numBytes = ((UnsafeRow) input).getSizeInBytes();
+      } else {
+        // This is handled directly in GenerateUnsafeProjection.
+        throw new UnsupportedOperationException();
+      }
+      return ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes);
+    }
+
+    public static int write(UnsafeRow target, int ordinal, int cursor, InternalRow input) {
+      int numBytes = 0;
+      final long offset = target.getBaseOffset() + cursor;
+      if (input instanceof UnsafeRow) {
+        final UnsafeRow row = (UnsafeRow) input;
+        numBytes = row.getSizeInBytes();
+
+        // zero-out the padding bytes
+        if ((numBytes & 0x07) > 0) {
+          PlatformDependent.UNSAFE.putLong(
+            target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L);
+        }
+
+        // Write the string to the variable length portion.
+        row.writeToMemory(target.getBaseObject(), offset);
+
+        // Set the fixed length portion.
+        target.setLong(ordinal, (((long) cursor) << 32) | ((long) numBytes));
+      } else {
+        // This is handled directly in GenerateUnsafeProjection.
+        throw new UnsupportedOperationException();
+      }
+      return ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes);
+    }
+  }
+
   /** Writer for interval type. */
   public static class IntervalWriter {
 
@@ -96,5 +143,4 @@ public static int write(UnsafeRow target, int ordinal, int cursor, Interval inpu
       return 16;
     }
   }
-
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 41a877f214e55..8304d4ccd47f7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -50,7 +50,7 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
         case BinaryType => input.getBinary(ordinal)
         case IntervalType => input.getInterval(ordinal)
         case t: StructType => input.getStruct(ordinal, t.size)
-        case dataType => input.get(ordinal, dataType)
+        case _ => input.get(ordinal, dataType)
       }
     }
   }
@@ -64,10 +64,11 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
   override def exprId: ExprId = throw new UnsupportedOperationException
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val javaType = ctx.javaType(dataType)
+    val value = ctx.getColumn("i", dataType, ordinal)
     s"""
-        boolean ${ev.isNull} = i.isNullAt($ordinal);
-        ${ctx.javaType(dataType)} ${ev.primitive} = ${ev.isNull} ?
-            ${ctx.defaultValue(dataType)} : (${ctx.getColumn("i", dataType, ordinal)});
+      boolean ${ev.isNull} = i.isNullAt($ordinal);
+      $javaType ${ev.primitive} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value);
     """
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index 9d2161947b351..3e87f7285847c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -34,11 +34,13 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
   private val StringWriter = classOf[UnsafeRowWriters.UTF8StringWriter].getName
   private val BinaryWriter = classOf[UnsafeRowWriters.BinaryWriter].getName
   private val IntervalWriter = classOf[UnsafeRowWriters.IntervalWriter].getName
+  private val StructWriter = classOf[UnsafeRowWriters.StructWriter].getName
 
   /** Returns true iff we support this data type. */
   def canSupport(dataType: DataType): Boolean = dataType match {
     case t: AtomicType if !t.isInstanceOf[DecimalType] => true
     case _: IntervalType => true
+    case t: StructType => t.toSeq.forall(field => canSupport(field.dataType))
     case NullType => true
     case _ => false
   }
@@ -55,15 +57,22 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
 
     val ret = ev.primitive
     ctx.addMutableState("UnsafeRow", ret, s"$ret = new UnsafeRow();")
-    val bufferTerm = ctx.freshName("buffer")
-    ctx.addMutableState("byte[]", bufferTerm, s"$bufferTerm = new byte[64];")
-    val cursorTerm = ctx.freshName("cursor")
-    val numBytesTerm = ctx.freshName("numBytes")
+    val buffer = ctx.freshName("buffer")
+    ctx.addMutableState("byte[]", buffer, s"$buffer = new byte[64];")
+    val cursor = ctx.freshName("cursor")
+    val numBytes = ctx.freshName("numBytes")
 
-    val exprs = expressions.map(_.gen(ctx))
+    val exprs = expressions.zipWithIndex.map { case (e, i) =>
+      e.dataType match {
+        case st: StructType =>
+          createCodeForStruct(ctx, e.gen(ctx), st)
+        case _ =>
+          e.gen(ctx)
+      }
+    }
     val allExprs = exprs.map(_.code).mkString("\n")
-    val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length)
 
+    val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length)
     val additionalSize = expressions.zipWithIndex.map { case (e, i) =>
       e.dataType match {
         case StringType =>
@@ -72,6 +81,8 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           s" + (${exprs(i).isNull} ? 0 : $BinaryWriter.getSize(${exprs(i).primitive}))"
         case IntervalType =>
           s" + (${exprs(i).isNull} ? 0 : 16)"
+        case _: StructType =>
+          s" + (${exprs(i).isNull} ? 0 : $StructWriter.getSize(${exprs(i).primitive}))"
         case _ => ""
       }
     }.mkString("")
@@ -81,11 +92,13 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
         case dt if ctx.isPrimitiveType(dt) =>
           s"${ctx.setColumn(ret, dt, i, exprs(i).primitive)}"
         case StringType =>
-          s"$cursorTerm += $StringWriter.write($ret, $i, $cursorTerm, ${exprs(i).primitive})"
+          s"$cursor += $StringWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
         case BinaryType =>
-          s"$cursorTerm += $BinaryWriter.write($ret, $i, $cursorTerm, ${exprs(i).primitive})"
+          s"$cursor += $BinaryWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
         case IntervalType =>
-          s"$cursorTerm += $IntervalWriter.write($ret, $i, $cursorTerm, ${exprs(i).primitive})"
+          s"$cursor += $IntervalWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
+        case t: StructType =>
+          s"$cursor += $StructWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
         case NullType => ""
         case _ =>
           throw new UnsupportedOperationException(s"Not supported DataType: ${e.dataType}")
@@ -99,24 +112,139 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
 
     s"""
       $allExprs
-      int $numBytesTerm = $fixedSize $additionalSize;
-      if ($numBytesTerm > $bufferTerm.length) {
-        $bufferTerm = new byte[$numBytesTerm];
+      int $numBytes = $fixedSize $additionalSize;
+      if ($numBytes > $buffer.length) {
+        $buffer = new byte[$numBytes];
       }
 
       $ret.pointTo(
-        $bufferTerm,
+        $buffer,
         org.apache.spark.unsafe.PlatformDependent.BYTE_ARRAY_OFFSET,
         ${expressions.size},
-        $numBytesTerm);
-      int $cursorTerm = $fixedSize;
-
+        $numBytes);
+      int $cursor = $fixedSize;
 
       $writers
       boolean ${ev.isNull} = false;
      """
   }
 
+  /**
+   * Generates the Java code to convert a struct (backed by InternalRow) to UnsafeRow.
+   *
+   * This function also handles nested structs by recursively generating the code to do conversion.
+   *
+   * @param ctx code generation context
+   * @param input the input struct, identified by a [[GeneratedExpressionCode]]
+   * @param schema schema of the struct field
+   */
+  // TODO: refactor createCode and this function to reduce code duplication.
+  private def createCodeForStruct(
+      ctx: CodeGenContext,
+      input: GeneratedExpressionCode,
+      schema: StructType): GeneratedExpressionCode = {
+
+    val isNull = input.isNull
+    val primitive = ctx.freshName("structConvert")
+    ctx.addMutableState("UnsafeRow", primitive, s"$primitive = new UnsafeRow();")
+    val buffer = ctx.freshName("buffer")
+    ctx.addMutableState("byte[]", buffer, s"$buffer = new byte[64];")
+    val cursor = ctx.freshName("cursor")
+
+    val exprs: Seq[GeneratedExpressionCode] = schema.map(_.dataType).zipWithIndex.map {
+      case (dt, i) => dt match {
+      case st: StructType =>
+        val nestedStructEv = GeneratedExpressionCode(
+          code = "",
+          isNull = s"${input.primitive}.isNullAt($i)",
+          primitive = s"${ctx.getColumn(input.primitive, dt, i)}"
+        )
+        createCodeForStruct(ctx, nestedStructEv, st)
+      case _ =>
+        GeneratedExpressionCode(
+          code = "",
+          isNull = s"${input.primitive}.isNullAt($i)",
+          primitive = s"${ctx.getColumn(input.primitive, dt, i)}"
+        )
+      }
+    }
+    val allExprs = exprs.map(_.code).mkString("\n")
+
+    val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length)
+    val additionalSize = schema.toSeq.map(_.dataType).zip(exprs).map { case (dt, ev) =>
+      dt match {
+        case StringType =>
+          s" + (${ev.isNull} ? 0 : $StringWriter.getSize(${ev.primitive}))"
+        case BinaryType =>
+          s" + (${ev.isNull} ? 0 : $BinaryWriter.getSize(${ev.primitive}))"
+        case IntervalType =>
+          s" + (${ev.isNull} ? 0 : 16)"
+        case _: StructType =>
+          s" + (${ev.isNull} ? 0 : $StructWriter.getSize(${ev.primitive}))"
+        case _ => ""
+      }
+    }.mkString("")
+
+    val writers = schema.toSeq.map(_.dataType).zip(exprs).zipWithIndex.map { case ((dt, ev), i) =>
+      val update = dt match {
+        case _ if ctx.isPrimitiveType(dt) =>
+          s"${ctx.setColumn(primitive, dt, i, exprs(i).primitive)}"
+        case StringType =>
+          s"$cursor += $StringWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
+        case BinaryType =>
+          s"$cursor += $BinaryWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
+        case IntervalType =>
+          s"$cursor += $IntervalWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
+        case t: StructType =>
+          s"$cursor += $StructWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
+        case NullType => ""
+        case _ =>
+          throw new UnsupportedOperationException(s"Not supported DataType: $dt")
+      }
+      s"""
+          if (${exprs(i).isNull}) {
+            $primitive.setNullAt($i);
+          } else {
+            $update;
+          }
+        """
+    }.mkString("\n          ")
+
+    // Note that we add a shortcut here for performance: if the input is already an UnsafeRow,
+    // just copy the bytes directly into our buffer space without running any conversion.
+    // We also had to use a hack to introduce a "tmp" variable, to avoid the Java compiler from
+    // complaining that a GenericMutableRow (generated by expressions) cannot be cast to UnsafeRow.
+    val tmp = ctx.freshName("tmp")
+    val numBytes = ctx.freshName("numBytes")
+    val code = s"""
+       |${input.code}
+       |if (!${input.isNull}) {
+       |  Object $tmp = (Object) ${input.primitive};
+       |  if ($tmp instanceof UnsafeRow) {
+       |    $primitive = (UnsafeRow) $tmp;
+       |  } else {
+       |    $allExprs
+       |
+       |    int $numBytes = $fixedSize $additionalSize;
+       |    if ($numBytes > $buffer.length) {
+       |      $buffer = new byte[$numBytes];
+       |    }
+       |
+       |    $primitive.pointTo(
+       |      $buffer,
+       |      org.apache.spark.unsafe.PlatformDependent.BYTE_ARRAY_OFFSET,
+       |      ${exprs.size},
+       |      $numBytes);
+       |    int $cursor = $fixedSize;
+       |
+       |    $writers
+       |  }
+       |}
+     """.stripMargin
+
+    GeneratedExpressionCode(code, isNull, primitive)
+  }
+
   protected def canonicalize(in: Seq[Expression]): Seq[Expression] =
     in.map(ExpressionCanonicalizer.execute)
 
@@ -159,7 +287,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
       }
       """
 
-    logDebug(s"code for ${expressions.mkString(",")}:\n$code")
+    logDebug(s"code for ${expressions.mkString(",")}:\n${CodeFormatter.format(code)}")
 
     val c = compile(code)
     c.generate(ctx.references.toArray).asInstanceOf[UnsafeProjection]
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 119168fa59f15..d8c9087ff5380 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -104,18 +104,19 @@ case class CreateStruct(children: Seq[Expression]) extends Expression {
       children.zipWithIndex.map { case (e, i) =>
         val eval = e.gen(ctx)
         eval.code + s"""
-        if (${eval.isNull}) {
-          ${ev.primitive}.update($i, null);
-        } else {
-          ${ev.primitive}.update($i, ${eval.primitive});
-        }
-       """
+          if (${eval.isNull}) {
+            ${ev.primitive}.update($i, null);
+          } else {
+            ${ev.primitive}.update($i, ${eval.primitive});
+          }
+         """
       }.mkString("\n")
   }
 
   override def prettyName: String = "struct"
 }
 
+
 /**
  * Creates a struct with the given field names and values
  *
@@ -168,14 +169,83 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression {
       valExprs.zipWithIndex.map { case (e, i) =>
         val eval = e.gen(ctx)
         eval.code + s"""
-        if (${eval.isNull}) {
-          ${ev.primitive}.update($i, null);
-        } else {
-          ${ev.primitive}.update($i, ${eval.primitive});
-        }
-       """
+          if (${eval.isNull}) {
+            ${ev.primitive}.update($i, null);
+          } else {
+            ${ev.primitive}.update($i, ${eval.primitive});
+          }
+         """
       }.mkString("\n")
   }
 
   override def prettyName: String = "named_struct"
 }
+
+/**
+ * Returns a Row containing the evaluation of all children expressions. This is a variant that
+ * returns UnsafeRow directly. The unsafe projection operator replaces [[CreateStruct]] with
+ * this expression automatically at runtime.
+ */
+case class CreateStructUnsafe(children: Seq[Expression]) extends Expression {
+
+  override def foldable: Boolean = children.forall(_.foldable)
+
+  override lazy val resolved: Boolean = childrenResolved
+
+  override lazy val dataType: StructType = {
+    val fields = children.zipWithIndex.map { case (child, idx) =>
+      child match {
+        case ne: NamedExpression =>
+          StructField(ne.name, ne.dataType, ne.nullable, ne.metadata)
+        case _ =>
+          StructField(s"col${idx + 1}", child.dataType, child.nullable, Metadata.empty)
+      }
+    }
+    StructType(fields)
+  }
+
+  override def nullable: Boolean = false
+
+  override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    GenerateUnsafeProjection.createCode(ctx, ev, children)
+  }
+
+  override def prettyName: String = "struct_unsafe"
+}
+
+
+/**
+ * Creates a struct with the given field names and values. This is a variant that returns
+ * UnsafeRow directly. The unsafe projection operator replaces [[CreateStruct]] with
+ * this expression automatically at runtime.
+ *
+ * @param children Seq(name1, val1, name2, val2, ...)
+ */
+case class CreateNamedStructUnsafe(children: Seq[Expression]) extends Expression {
+
+  private lazy val (nameExprs, valExprs) =
+    children.grouped(2).map { case Seq(name, value) => (name, value) }.toList.unzip
+
+  private lazy val names = nameExprs.map(_.eval(EmptyRow).toString)
+
+  override lazy val dataType: StructType = {
+    val fields = names.zip(valExprs).map { case (name, valExpr) =>
+      StructField(name, valExpr.dataType, valExpr.nullable, Metadata.empty)
+    }
+    StructType(fields)
+  }
+
+  override def foldable: Boolean = valExprs.forall(_.foldable)
+
+  override def nullable: Boolean = false
+
+  override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    GenerateUnsafeProjection.createCode(ctx, ev, valExprs)
+  }
+
+  override def prettyName: String = "named_struct_unsafe"
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
index e7e5231d32c9e..7773e098e0caa 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
@@ -170,6 +170,6 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     checkEvaluation(Pmod(-7, 3), 2)
     checkEvaluation(Pmod(7.2D, 4.1D), 3.1000000000000005)
     checkEvaluation(Pmod(Decimal(0.7), Decimal(0.2)), Decimal(0.1))
-    checkEvaluation(Pmod(2L, Long.MaxValue), 2)
+    checkEvaluation(Pmod(2L, Long.MaxValue), 2L)
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala
index 648fbf5a4c30b..fa30fbe528479 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/BitwiseFunctionsSuite.scala
@@ -30,8 +30,9 @@ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       checkEvaluation(expr, expected)
     }
 
-    check(1.toByte, ~1.toByte)
-    check(1000.toShort, ~1000.toShort)
+    // Need the extra toByte even though IntelliJ thought it's not needed.
+    check(1.toByte, (~1.toByte).toByte)
+    check(1000.toShort, (~1000.toShort).toShort)
     check(1000000, ~1000000)
     check(123456789123L, ~123456789123L)
 
@@ -45,8 +46,9 @@ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       checkEvaluation(expr, expected)
     }
 
-    check(1.toByte, 2.toByte, 1.toByte & 2.toByte)
-    check(1000.toShort, 2.toShort, 1000.toShort & 2.toShort)
+    // Need the extra toByte even though IntelliJ thought it's not needed.
+    check(1.toByte, 2.toByte, (1.toByte & 2.toByte).toByte)
+    check(1000.toShort, 2.toShort, (1000.toShort & 2.toShort).toShort)
     check(1000000, 4, 1000000 & 4)
     check(123456789123L, 5L, 123456789123L & 5L)
 
@@ -63,8 +65,9 @@ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       checkEvaluation(expr, expected)
     }
 
-    check(1.toByte, 2.toByte, 1.toByte | 2.toByte)
-    check(1000.toShort, 2.toShort, 1000.toShort | 2.toShort)
+    // Need the extra toByte even though IntelliJ thought it's not needed.
+    check(1.toByte, 2.toByte, (1.toByte | 2.toByte).toByte)
+    check(1000.toShort, 2.toShort, (1000.toShort | 2.toShort).toShort)
     check(1000000, 4, 1000000 | 4)
     check(123456789123L, 5L, 123456789123L | 5L)
 
@@ -81,8 +84,9 @@ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       checkEvaluation(expr, expected)
     }
 
-    check(1.toByte, 2.toByte, 1.toByte ^ 2.toByte)
-    check(1000.toShort, 2.toShort, 1000.toShort ^ 2.toShort)
+    // Need the extra toByte even though IntelliJ thought it's not needed.
+    check(1.toByte, 2.toByte, (1.toByte ^ 2.toByte).toByte)
+    check(1000.toShort, 2.toShort, (1000.toShort ^ 2.toShort).toShort)
     check(1000000, 4, 1000000 ^ 4)
     check(123456789123L, 5L, 123456789123L ^ 5L)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index ab0cdc857c80e..136368bf5b368 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -114,7 +114,7 @@ trait ExpressionEvalHelper {
     val actual = plan(inputRow).get(0, expression.dataType)
     if (!checkResult(actual, expected)) {
       val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+      fail(s"Incorrect evaluation: $expression, actual: $actual, expected: $expected$input")
     }
   }
 
@@ -146,7 +146,8 @@ trait ExpressionEvalHelper {
 
     if (actual != expectedRow) {
       val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expectedRow$input")
+      fail("Incorrect Evaluation in codegen mode: " +
+        s"$expression, actual: $actual, expected: $expectedRow$input")
     }
     if (actual.copy() != expectedRow) {
       fail(s"Copy of generated Row is wrong: actual: ${actual.copy()}, expected: $expectedRow")
@@ -163,12 +164,21 @@ trait ExpressionEvalHelper {
       expression)
 
     val unsafeRow = plan(inputRow)
-    // UnsafeRow cannot be compared with GenericInternalRow directly
-    val actual = FromUnsafeProjection(expression.dataType :: Nil)(unsafeRow)
-    val expectedRow = InternalRow(expected)
-    if (actual != expectedRow) {
-      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
-      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expectedRow$input")
+    val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
+
+    if (expected == null) {
+      if (!unsafeRow.isNullAt(0)) {
+        val expectedRow = InternalRow(expected)
+        fail("Incorrect evaluation in unsafe mode: " +
+          s"$expression, actual: $unsafeRow, expected: $expectedRow$input")
+      }
+    } else {
+      val lit = InternalRow(expected)
+      val expectedRow = UnsafeProjection.create(Array(expression.dataType)).apply(lit)
+      if (unsafeRow != expectedRow) {
+        fail("Incorrect evaluation in unsafe mode: " +
+          s"$expression, actual: $unsafeRow, expected: $expectedRow$input")
+      }
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index d88a02298c00d..314b85f126dd2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -363,7 +363,14 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case logical.Sort(sortExprs, global, child) =>
         getSortOperator(sortExprs, global, planLater(child)):: Nil
       case logical.Project(projectList, child) =>
-        execution.Project(projectList, planLater(child)) :: Nil
+        // If unsafe mode is enabled and we support these data types in Unsafe, use the
+        // Tungsten project. Otherwise, use the normal project.
+        if (sqlContext.conf.unsafeEnabled &&
+          UnsafeProjection.canSupport(projectList) && UnsafeProjection.canSupport(child.schema)) {
+          execution.TungstenProject(projectList, planLater(child)) :: Nil
+        } else {
+          execution.Project(projectList, planLater(child)) :: Nil
+        }
       case logical.Filter(condition, child) =>
         execution.Filter(condition, planLater(child)) :: Nil
       case e @ logical.Expand(_, _, _, child) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index fe429d862a0a3..b02e60dc85cdd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -49,6 +49,31 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends
   override def outputOrdering: Seq[SortOrder] = child.outputOrdering
 }
 
+
+/**
+ * A variant of [[Project]] that returns [[UnsafeRow]]s.
+ */
+case class TungstenProject(projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryNode {
+
+  override def outputsUnsafeRows: Boolean = true
+  override def canProcessUnsafeRows: Boolean = true
+  override def canProcessSafeRows: Boolean = true
+
+  override def output: Seq[Attribute] = projectList.map(_.toAttribute)
+
+  protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter =>
+    this.transformAllExpressions {
+      case CreateStruct(children) => CreateStructUnsafe(children)
+      case CreateNamedStruct(children) => CreateNamedStructUnsafe(children)
+    }
+    val project = UnsafeProjection.create(projectList, child.output)
+    iter.map(project)
+  }
+
+  override def outputOrdering: Seq[SortOrder] = child.outputOrdering
+}
+
+
 /**
  * :: DeveloperApi ::
  */
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala
new file mode 100644
index 0000000000000..bf8ef9a97bc60
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTungstenSuite.scala
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql.types._
+
+/**
+ * An end-to-end test suite specifically for testing Tungsten (Unsafe/CodeGen) mode.
+ *
+ * This is here for now so I can make sure Tungsten project is tested without refactoring existing
+ * end-to-end test infra. In the long run this should just go away.
+ */
+class DataFrameTungstenSuite extends QueryTest with SQLTestUtils {
+
+  override lazy val sqlContext: SQLContext = org.apache.spark.sql.test.TestSQLContext
+  import sqlContext.implicits._
+
+  test("test simple types") {
+    withSQLConf(SQLConf.UNSAFE_ENABLED.key -> "true") {
+      val df = sqlContext.sparkContext.parallelize(Seq((1, 2))).toDF("a", "b")
+      assert(df.select(struct("a", "b")).first().getStruct(0) === Row(1, 2))
+    }
+  }
+
+  test("test struct type") {
+    withSQLConf(SQLConf.UNSAFE_ENABLED.key -> "true") {
+      val struct = Row(1, 2L, 3.0F, 3.0)
+      val data = sqlContext.sparkContext.parallelize(Seq(Row(1, struct)))
+
+      val schema = new StructType()
+        .add("a", IntegerType)
+        .add("b",
+          new StructType()
+            .add("b1", IntegerType)
+            .add("b2", LongType)
+            .add("b3", FloatType)
+            .add("b4", DoubleType))
+
+      val df = sqlContext.createDataFrame(data, schema)
+      assert(df.select("b").first() === Row(struct))
+    }
+  }
+
+  test("test nested struct type") {
+    withSQLConf(SQLConf.UNSAFE_ENABLED.key -> "true") {
+      val innerStruct = Row(1, "abcd")
+      val outerStruct = Row(1, 2L, 3.0F, 3.0, innerStruct, "efg")
+      val data = sqlContext.sparkContext.parallelize(Seq(Row(1, outerStruct)))
+
+      val schema = new StructType()
+        .add("a", IntegerType)
+        .add("b",
+          new StructType()
+            .add("b1", IntegerType)
+            .add("b2", LongType)
+            .add("b3", FloatType)
+            .add("b4", DoubleType)
+            .add("b5", new StructType()
+            .add("b5a", IntegerType)
+            .add("b5b", StringType))
+            .add("b6", StringType))
+
+      val df = sqlContext.createDataFrame(data, schema)
+      assert(df.select("b").first() === Row(outerStruct))
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
index 99e11fd64b2b9..1c5a2ed2c0a53 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.execution.expressions.{SparkPartitionID, Monotonical
 
 class NondeterministicSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("MonotonicallyIncreasingID") {
-    checkEvaluation(MonotonicallyIncreasingID(), 0)
+    checkEvaluation(MonotonicallyIncreasingID(), 0L)
   }
 
   test("SparkPartitionID") {

From 9c5612f4e197dec82a5eac9542896d6216a866b7 Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Mon, 27 Jul 2015 23:02:23 -0700
Subject: [PATCH 416/549] [MINOR] [SQL] Support mutable expression unit test
 with codegen projection

This is actually contains 3 minor issues:
1) Enable the unit test(codegen) for mutable expressions (FormatNumber, Regexp_Replace/Regexp_Extract)
2) Use the `PlatformDependent.copyMemory` instead of the `System.arrayCopy`

Author: Cheng Hao <hao.cheng@intel.com>

Closes #7566 from chenghao-intel/codegen_ut and squashes the following commits:

24f43ea [Cheng Hao] enable codegen for mutable expression & UTF8String performance
---
 .../expressions/stringOperations.scala        |  1 -
 .../spark/sql/StringFunctionsSuite.scala      | 34 ++++++++++++++-----
 .../apache/spark/unsafe/types/UTF8String.java | 32 ++++++++---------
 3 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 38b0fb37dee3b..edfffbc01c7b0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -777,7 +777,6 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
 
   override def dataType: DataType = IntegerType
-
   protected override def nullSafeEval(leftValue: Any, rightValue: Any): Any =
     leftValue.asInstanceOf[UTF8String].levenshteinDistance(rightValue.asInstanceOf[UTF8String])
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 0f9c986f649a1..8e0ea76d15881 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -57,19 +57,27 @@ class StringFunctionsSuite extends QueryTest {
   }
 
   test("string regex_replace / regex_extract") {
-    val df = Seq(("100-200", "")).toDF("a", "b")
+    val df = Seq(
+      ("100-200", "(\\d+)-(\\d+)", "300"),
+      ("100-200", "(\\d+)-(\\d+)", "400"),
+      ("100-200", "(\\d+)", "400")).toDF("a", "b", "c")
 
     checkAnswer(
       df.select(
         regexp_replace($"a", "(\\d+)", "num"),
         regexp_extract($"a", "(\\d+)-(\\d+)", 1)),
-      Row("num-num", "100"))
-
-    checkAnswer(
-      df.selectExpr(
-        "regexp_replace(a, '(\\d+)', 'num')",
-        "regexp_extract(a, '(\\d+)-(\\d+)', 2)"),
-      Row("num-num", "200"))
+      Row("num-num", "100") :: Row("num-num", "100") :: Row("num-num", "100") :: Nil)
+
+    // for testing the mutable state of the expression in code gen.
+    // This is a hack way to enable the codegen, thus the codegen is enable by default,
+    // it will still use the interpretProjection if projection followed by a LocalRelation,
+    // hence we add a filter operator.
+    // See the optimizer rule `ConvertToLocalRelation`
+    checkAnswer(
+      df.filter("isnotnull(a)").selectExpr(
+        "regexp_replace(a, b, c)",
+        "regexp_extract(a, b, 1)"),
+      Row("300", "100") :: Row("400", "100") :: Row("400-400", "100") :: Nil)
   }
 
   test("string ascii function") {
@@ -290,5 +298,15 @@ class StringFunctionsSuite extends QueryTest {
         df.selectExpr("format_number(e, g)"), // decimal type of the 2nd argument is unacceptable
         Row("5.0000"))
     }
+
+    // for testing the mutable state of the expression in code gen.
+    // This is a hack way to enable the codegen, thus the codegen is enable by default,
+    // it will still use the interpretProjection if projection follows by a LocalRelation,
+    // hence we add a filter operator.
+    // See the optimizer rule `ConvertToLocalRelation`
+    val df2 = Seq((5L, 4), (4L, 3), (3L, 2)).toDF("a", "b")
+    checkAnswer(
+      df2.filter("b>0").selectExpr("format_number(a, b)"),
+      Row("5.0000") :: Row("4.000") :: Row("3.00") :: Nil)
   }
 }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 85381cf0ef425..3e1cc67dbf337 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -300,13 +300,13 @@ public UTF8String trimRight() {
   }
 
   public UTF8String reverse() {
-    byte[] bytes = getBytes();
-    byte[] result = new byte[bytes.length];
+    byte[] result = new byte[this.numBytes];
 
     int i = 0; // position in byte
     while (i < numBytes) {
       int len = numBytesForFirstByte(getByte(i));
-      System.arraycopy(bytes, i, result, result.length - i - len, len);
+      copyMemory(this.base, this.offset + i, result,
+              BYTE_ARRAY_OFFSET + result.length - i - len, len);
 
       i += len;
     }
@@ -316,11 +316,11 @@ public UTF8String reverse() {
 
   public UTF8String repeat(int times) {
     if (times <=0) {
-      return fromBytes(new byte[0]);
+      return EMPTY_UTF8;
     }
 
     byte[] newBytes = new byte[numBytes * times];
-    System.arraycopy(getBytes(), 0, newBytes, 0, numBytes);
+    copyMemory(this.base, this.offset, newBytes, BYTE_ARRAY_OFFSET, numBytes);
 
     int copied = 1;
     while (copied < times) {
@@ -385,16 +385,15 @@ public UTF8String rpad(int len, UTF8String pad) {
       UTF8String remain = pad.substring(0, spaces - padChars * count);
 
       byte[] data = new byte[this.numBytes + pad.numBytes * count + remain.numBytes];
-      System.arraycopy(getBytes(), 0, data, 0, this.numBytes);
+      copyMemory(this.base, this.offset, data, BYTE_ARRAY_OFFSET, this.numBytes);
       int offset = this.numBytes;
       int idx = 0;
-      byte[] padBytes = pad.getBytes();
       while (idx < count) {
-        System.arraycopy(padBytes, 0, data, offset, pad.numBytes);
+        copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes);
         ++idx;
         offset += pad.numBytes;
       }
-      System.arraycopy(remain.getBytes(), 0, data, offset, remain.numBytes);
+      copyMemory(remain.base, remain.offset, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes);
 
       return UTF8String.fromBytes(data);
     }
@@ -421,15 +420,14 @@ public UTF8String lpad(int len, UTF8String pad) {
 
       int offset = 0;
       int idx = 0;
-      byte[] padBytes = pad.getBytes();
       while (idx < count) {
-        System.arraycopy(padBytes, 0, data, offset, pad.numBytes);
+        copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes);
         ++idx;
         offset += pad.numBytes;
       }
-      System.arraycopy(remain.getBytes(), 0, data, offset, remain.numBytes);
+      copyMemory(remain.base, remain.offset, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes);
       offset += remain.numBytes;
-      System.arraycopy(getBytes(), 0, data, offset, numBytes());
+      copyMemory(this.base, this.offset, data, BYTE_ARRAY_OFFSET + offset, numBytes());
 
       return UTF8String.fromBytes(data);
     }
@@ -454,9 +452,9 @@ public static UTF8String concat(UTF8String... inputs) {
     int offset = 0;
     for (int i = 0; i < inputs.length; i++) {
       int len = inputs[i].numBytes;
-      PlatformDependent.copyMemory(
+      copyMemory(
         inputs[i].base, inputs[i].offset,
-        result, PlatformDependent.BYTE_ARRAY_OFFSET + offset,
+        result, BYTE_ARRAY_OFFSET + offset,
         len);
       offset += len;
     }
@@ -494,7 +492,7 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
     for (int i = 0, j = 0; i < inputs.length; i++) {
       if (inputs[i] != null) {
         int len = inputs[i].numBytes;
-        PlatformDependent.copyMemory(
+        copyMemory(
           inputs[i].base, inputs[i].offset,
           result, PlatformDependent.BYTE_ARRAY_OFFSET + offset,
           len);
@@ -503,7 +501,7 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
         j++;
         // Add separator if this is not the last input.
         if (j < numInputs) {
-          PlatformDependent.copyMemory(
+          copyMemory(
             separator.base, separator.offset,
             result, PlatformDependent.BYTE_ARRAY_OFFSET + offset,
             separator.numBytes);

From d93ab93d673c5007a1edb90a424b451c91c8a285 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Mon, 27 Jul 2015 23:34:29 -0700
Subject: [PATCH 417/549] [SPARK-9335] [STREAMING] [TESTS] Make sure the test
 stream is deleted in KinesisBackedBlockRDDSuite

KinesisBackedBlockRDDSuite should make sure delete the stream.

Author: zsxwing <zsxwing@gmail.com>

Closes #7663 from zsxwing/fix-SPARK-9335 and squashes the following commits:

f0e9154 [zsxwing] Revert "[HOTFIX] - Disable Kinesis tests due to rate limits"
71a4552 [zsxwing] Make sure the test stream is deleted
---
 .../streaming/kinesis/KinesisBackedBlockRDDSuite.scala     | 7 +++++--
 .../spark/streaming/kinesis/KinesisStreamSuite.scala       | 4 ++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
index b2e2a4246dbd5..e81fb11e5959f 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.streaming.kinesis
 
-import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
+import org.scalatest.BeforeAndAfterAll
 
 import org.apache.spark.storage.{BlockId, BlockManager, StorageLevel, StreamBlockId}
-import org.apache.spark.{SparkConf, SparkContext, SparkException, SparkFunSuite}
+import org.apache.spark.{SparkConf, SparkContext, SparkException}
 
 class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll {
 
@@ -65,6 +65,9 @@ class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll
   }
 
   override def afterAll(): Unit = {
+    if (testUtils != null) {
+      testUtils.deleteStream()
+    }
     if (sc != null) {
       sc.stop()
     }
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
index 4992b041765e9..f9c952b9468bb 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
@@ -59,7 +59,7 @@ class KinesisStreamSuite extends KinesisFunSuite
     }
   }
 
-  ignore("KinesisUtils API") {
+  test("KinesisUtils API") {
     ssc = new StreamingContext(sc, Seconds(1))
     // Tests the API, does not actually test data receiving
     val kinesisStream1 = KinesisUtils.createStream(ssc, "mySparkStream",
@@ -83,7 +83,7 @@ class KinesisStreamSuite extends KinesisFunSuite
    * you must have AWS credentials available through the default AWS provider chain,
    * and you have to set the system environment variable RUN_KINESIS_TESTS=1 .
    */
-  ignore("basic operation") {
+  testIfEnabled("basic operation") {
     val kinesisTestUtils = new KinesisTestUtils()
     try {
       kinesisTestUtils.createStream()

From fc3bd96bc3e4a1a2a1eb9b982b3468abd137e395 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Mon, 27 Jul 2015 23:56:16 -0700
Subject: [PATCH 418/549] Closes #6836 since Round has already been
 implemented.


From 15724fac569258d2a149507d8c767d0de0ae8306 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 28 Jul 2015 00:52:26 -0700
Subject: [PATCH 419/549] [SPARK-9394][SQL] Handle parentheses in
 CodeFormatter.

Our CodeFormatter currently does not handle parentheses, and as a result in code dump, we see code formatted this way:

```
foo(
a,
b,
c)
```

With this patch, it is formatted this way:
```
foo(
  a,
  b,
  c)
```

Author: Reynold Xin <rxin@databricks.com>

Closes #7712 from rxin/codeformat-parentheses and squashes the following commits:

c2b1c5f [Reynold Xin] Took square bracket out
3cfb174 [Reynold Xin] Code review feedback.
91f5bb1 [Reynold Xin] [SPARK-9394][SQL] Handle parentheses in CodeFormatter.
---
 .../expressions/codegen/CodeFormatter.scala   |  8 ++---
 .../codegen/CodeFormatterSuite.scala          | 30 +++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala
index 2087cc7f109bc..c98182c96b165 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatter.scala
@@ -18,8 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions.codegen
 
 /**
- * An utility class that indents a block of code based on the curly braces.
- *
+ * An utility class that indents a block of code based on the curly braces and parentheses.
  * This is used to prettify generated code when in debug mode (or exceptions).
  *
  * Written by Matei Zaharia.
@@ -35,11 +34,12 @@ private class CodeFormatter {
   private var indentString = ""
 
   private def addLine(line: String): Unit = {
-    val indentChange = line.count(_ == '{') - line.count(_ == '}')
+    val indentChange =
+      line.count(c => "({".indexOf(c) >= 0) - line.count(c => ")}".indexOf(c) >= 0)
     val newIndentLevel = math.max(0, indentLevel + indentChange)
     // Lines starting with '}' should be de-indented even if they contain '{' after;
     // in addition, lines ending with ':' are typically labels
-    val thisLineIndent = if (line.startsWith("}") || line.endsWith(":")) {
+    val thisLineIndent = if (line.startsWith("}") || line.startsWith(")") || line.endsWith(":")) {
       " " * (indentSize * (indentLevel - 1))
     } else {
       indentString
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala
index 478702fea6146..46daa3eb8bf80 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeFormatterSuite.scala
@@ -73,4 +73,34 @@ class CodeFormatterSuite extends SparkFunSuite {
       |}
     """.stripMargin
   }
+
+  testCase("if else on the same line") {
+    """
+      |class A {
+      | if (c) {duh;} else {boo;}
+      |}
+    """.stripMargin
+  }{
+    """
+      |class A {
+      |  if (c) {duh;} else {boo;}
+      |}
+    """.stripMargin
+  }
+
+  testCase("function calls") {
+    """
+      |foo(
+      |a,
+      |b,
+      |c)
+    """.stripMargin
+  }{
+    """
+      |foo(
+      |  a,
+      |  b,
+      |  c)
+    """.stripMargin
+  }
 }

From ac8c549e2fa9ff3451deb4c3e49d151eeac18acc Mon Sep 17 00:00:00 2001
From: Kenichi Maehashi <webmaster@kenichimaehashi.com>
Date: Tue, 28 Jul 2015 15:57:21 +0100
Subject: [PATCH 420/549] [EC2] Cosmetic fix for usage of spark-ec2
 --ebs-vol-num option

The last line of the usage seems ugly.

```
$ spark-ec2 --help
<snip>
  --ebs-vol-num=EBS_VOL_NUM
                        Number of EBS volumes to attach to each node as
                        /vol[x]. The volumes will be deleted when the
                        instances terminate. Only possible on EBS-backed AMIs.
                        EBS volumes are only attached if --ebs-vol-size >
                        0.Only support up to 8 EBS volumes.
```

After applying this patch:

```
$ spark-ec2 --help
<snip>
  --ebs-vol-num=EBS_VOL_NUM
                        Number of EBS volumes to attach to each node as
                        /vol[x]. The volumes will be deleted when the
                        instances terminate. Only possible on EBS-backed AMIs.
                        EBS volumes are only attached if --ebs-vol-size > 0.
                        Only support up to 8 EBS volumes.
```

As this is a trivial thing I didn't create JIRA for this.

Author: Kenichi Maehashi <webmaster@kenichimaehashi.com>

Closes #7632 from kmaehashi/spark-ec2-cosmetic-fix and squashes the following commits:

526c118 [Kenichi Maehashi] cosmetic fix for spark-ec2 --ebs-vol-num option usage
---
 ec2/spark_ec2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 7c83d68e7993e..ccf922d9371fb 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -242,7 +242,7 @@ def parse_args():
         help="Number of EBS volumes to attach to each node as /vol[x]. " +
              "The volumes will be deleted when the instances terminate. " +
              "Only possible on EBS-backed AMIs. " +
-             "EBS volumes are only attached if --ebs-vol-size > 0." +
+             "EBS volumes are only attached if --ebs-vol-size > 0. " +
              "Only support up to 8 EBS volumes.")
     parser.add_option(
         "--placement-group", type="string", default=None,

From 4af622c855a32b1846242a6dd38b252ca30c8b82 Mon Sep 17 00:00:00 2001
From: vinodkc <vinod.kc.in@gmail.com>
Date: Tue, 28 Jul 2015 08:48:57 -0700
Subject: [PATCH 421/549] [SPARK-8919] [DOCUMENTATION, MLLIB] Added @since tags
 to mllib.recommendation

Author: vinodkc <vinod.kc.in@gmail.com>

Closes #7325 from vinodkc/add_since_mllib.recommendation and squashes the following commits:

93156f2 [vinodkc] Changed 0.8.0 to 0.9.1
c413350 [vinodkc] Added @since
---
 .../spark/mllib/recommendation/ALS.scala      | 10 +++++
 .../MatrixFactorizationModel.scala            | 38 ++++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
index 93290e6508529..56c549ef99cb7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
@@ -26,6 +26,7 @@ import org.apache.spark.storage.StorageLevel
 
 /**
  * A more compact class to represent a rating than Tuple3[Int, Int, Double].
+ * @since 0.8.0
  */
 case class Rating(user: Int, product: Int, rating: Double)
 
@@ -254,6 +255,7 @@ class ALS private (
 
 /**
  * Top-level methods for calling Alternating Least Squares (ALS) matrix factorization.
+ * @since 0.8.0
  */
 object ALS {
   /**
@@ -269,6 +271,7 @@ object ALS {
    * @param lambda     regularization factor (recommended: 0.01)
    * @param blocks     level of parallelism to split computation into
    * @param seed       random seed
+   * @since 0.9.1
    */
   def train(
       ratings: RDD[Rating],
@@ -293,6 +296,7 @@ object ALS {
    * @param iterations number of iterations of ALS (recommended: 10-20)
    * @param lambda     regularization factor (recommended: 0.01)
    * @param blocks     level of parallelism to split computation into
+   * @since 0.8.0
    */
   def train(
       ratings: RDD[Rating],
@@ -315,6 +319,7 @@ object ALS {
    * @param rank       number of features to use
    * @param iterations number of iterations of ALS (recommended: 10-20)
    * @param lambda     regularization factor (recommended: 0.01)
+   * @since 0.8.0
    */
   def train(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double)
     : MatrixFactorizationModel = {
@@ -331,6 +336,7 @@ object ALS {
    * @param ratings    RDD of (userID, productID, rating) pairs
    * @param rank       number of features to use
    * @param iterations number of iterations of ALS (recommended: 10-20)
+   * @since 0.8.0
    */
   def train(ratings: RDD[Rating], rank: Int, iterations: Int)
     : MatrixFactorizationModel = {
@@ -351,6 +357,7 @@ object ALS {
    * @param blocks     level of parallelism to split computation into
    * @param alpha      confidence parameter
    * @param seed       random seed
+   * @since 0.8.1
    */
   def trainImplicit(
       ratings: RDD[Rating],
@@ -377,6 +384,7 @@ object ALS {
    * @param lambda     regularization factor (recommended: 0.01)
    * @param blocks     level of parallelism to split computation into
    * @param alpha      confidence parameter
+   * @since 0.8.1
    */
   def trainImplicit(
       ratings: RDD[Rating],
@@ -401,6 +409,7 @@ object ALS {
    * @param iterations number of iterations of ALS (recommended: 10-20)
    * @param lambda     regularization factor (recommended: 0.01)
    * @param alpha      confidence parameter
+   * @since 0.8.1
    */
   def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double, alpha: Double)
     : MatrixFactorizationModel = {
@@ -418,6 +427,7 @@ object ALS {
    * @param ratings    RDD of (userID, productID, rating) pairs
    * @param rank       number of features to use
    * @param iterations number of iterations of ALS (recommended: 10-20)
+   * @since 0.8.1
    */
   def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int)
     : MatrixFactorizationModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
index 43d219a49cf4e..261ca9cef0c5b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -49,6 +49,7 @@ import org.apache.spark.storage.StorageLevel
  *                     the features computed for this user.
  * @param productFeatures RDD of tuples where each tuple represents the productId
  *                        and the features computed for this product.
+ * @since 0.8.0
  */
 class MatrixFactorizationModel(
     val rank: Int,
@@ -73,7 +74,9 @@ class MatrixFactorizationModel(
     }
   }
 
-  /** Predict the rating of one user for one product. */
+  /** Predict the rating of one user for one product.
+   * @since 0.8.0
+   */
   def predict(user: Int, product: Int): Double = {
     val userVector = userFeatures.lookup(user).head
     val productVector = productFeatures.lookup(product).head
@@ -111,6 +114,7 @@ class MatrixFactorizationModel(
    *
    * @param usersProducts  RDD of (user, product) pairs.
    * @return RDD of Ratings.
+   * @since 0.9.0
    */
   def predict(usersProducts: RDD[(Int, Int)]): RDD[Rating] = {
     // Previously the partitions of ratings are only based on the given products.
@@ -142,6 +146,7 @@ class MatrixFactorizationModel(
 
   /**
    * Java-friendly version of [[MatrixFactorizationModel.predict]].
+   * @since 1.2.0
    */
   def predict(usersProducts: JavaPairRDD[JavaInteger, JavaInteger]): JavaRDD[Rating] = {
     predict(usersProducts.rdd.asInstanceOf[RDD[(Int, Int)]]).toJavaRDD()
@@ -157,6 +162,7 @@ class MatrixFactorizationModel(
    *  by score, decreasing. The first returned is the one predicted to be most strongly
    *  recommended to the user. The score is an opaque value that indicates how strongly
    *  recommended the product is.
+   *  @since 1.1.0
    */
   def recommendProducts(user: Int, num: Int): Array[Rating] =
     MatrixFactorizationModel.recommend(userFeatures.lookup(user).head, productFeatures, num)
@@ -173,6 +179,7 @@ class MatrixFactorizationModel(
    *  by score, decreasing. The first returned is the one predicted to be most strongly
    *  recommended to the product. The score is an opaque value that indicates how strongly
    *  recommended the user is.
+   *  @since 1.1.0
    */
   def recommendUsers(product: Int, num: Int): Array[Rating] =
     MatrixFactorizationModel.recommend(productFeatures.lookup(product).head, userFeatures, num)
@@ -180,6 +187,20 @@ class MatrixFactorizationModel(
 
   protected override val formatVersion: String = "1.0"
 
+  /**
+   * Save this model to the given path.
+   *
+   * This saves:
+   *  - human-readable (JSON) model metadata to path/metadata/
+   *  - Parquet formatted data to path/data/
+   *
+   * The model may be loaded using [[Loader.load]].
+   *
+   * @param sc  Spark context used to save model data.
+   * @param path  Path specifying the directory in which to save this model.
+   *              If the directory already exists, this method throws an exception.
+   * @since 1.3.0
+   */
   override def save(sc: SparkContext, path: String): Unit = {
     MatrixFactorizationModel.SaveLoadV1_0.save(this, path)
   }
@@ -191,6 +212,7 @@ class MatrixFactorizationModel(
    * @return [(Int, Array[Rating])] objects, where every tuple contains a userID and an array of
    * rating objects which contains the same userId, recommended productID and a "score" in the
    * rating field. Semantics of score is same as recommendProducts API
+   * @since 1.4.0
    */
   def recommendProductsForUsers(num: Int): RDD[(Int, Array[Rating])] = {
     MatrixFactorizationModel.recommendForAll(rank, userFeatures, productFeatures, num).map {
@@ -208,6 +230,7 @@ class MatrixFactorizationModel(
    * @return [(Int, Array[Rating])] objects, where every tuple contains a productID and an array
    * of rating objects which contains the recommended userId, same productID and a "score" in the
    * rating field. Semantics of score is same as recommendUsers API
+   * @since 1.4.0
    */
   def recommendUsersForProducts(num: Int): RDD[(Int, Array[Rating])] = {
     MatrixFactorizationModel.recommendForAll(rank, productFeatures, userFeatures, num).map {
@@ -218,6 +241,9 @@ class MatrixFactorizationModel(
   }
 }
 
+/**
+ * @since 1.3.0
+ */
 object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] {
 
   import org.apache.spark.mllib.util.Loader._
@@ -292,6 +318,16 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] {
     }
   }
 
+  /**
+   * Load a model from the given path.
+   *
+   * The model should have been saved by [[Saveable.save]].
+   *
+   * @param sc  Spark context used for loading model files.
+   * @param path  Path specifying the directory to which the model was saved.
+   * @return  Model instance
+   * @since 1.3.0
+   */
   override def load(sc: SparkContext, path: String): MatrixFactorizationModel = {
     val (loadedClassName, formatVersion, _) = loadMetadata(sc, path)
     val classNameV1_0 = SaveLoadV1_0.thisClassName

From 5a2330e546074013ef706ac09028626912ec5475 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 28 Jul 2015 09:42:35 -0700
Subject: [PATCH 422/549] [SPARK-9402][SQL] Remove CodegenFallback from Abs /
 FormatNumber.

Both expressions already implement code generation.

Author: Reynold Xin <rxin@databricks.com>

Closes #7723 from rxin/abs-formatnum and squashes the following commits:

31ed765 [Reynold Xin] [SPARK-9402][SQL] Remove CodegenFallback from Abs / FormatNumber.
---
 .../org/apache/spark/sql/catalyst/expressions/arithmetic.scala | 3 +--
 .../spark/sql/catalyst/expressions/stringOperations.scala      | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index b37f530ec6814..4ec866475f8b0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -68,8 +68,7 @@ case class UnaryPositive(child: Expression) extends UnaryExpression with Expects
 @ExpressionDescription(
   usage = "_FUNC_(expr) - Returns the absolute value of the numeric value",
   extended = "> SELECT _FUNC_('-1');\n1")
-case class Abs(child: Expression)
-  extends UnaryExpression with ExpectsInputTypes with CodegenFallback {
+case class Abs(child: Expression) extends UnaryExpression with ExpectsInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index edfffbc01c7b0..6db4e19c24ed5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -1139,7 +1139,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
  * fractional part.
  */
 case class FormatNumber(x: Expression, d: Expression)
-  extends BinaryExpression with ExpectsInputTypes with CodegenFallback {
+  extends BinaryExpression with ExpectsInputTypes {
 
   override def left: Expression = x
   override def right: Expression = d

From c740bed17215a9608c9eb9d80ffdf0fcf72c3911 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 28 Jul 2015 09:43:12 -0700
Subject: [PATCH 423/549] [SPARK-9373][SQL] follow up for StructType support in
 Tungsten projection.

Author: Reynold Xin <rxin@databricks.com>

Closes #7720 from rxin/struct-followup and squashes the following commits:

d9757f5 [Reynold Xin] [SPARK-9373][SQL] follow up for StructType support in Tungsten projection.
---
 .../expressions/UnsafeRowWriters.java         |  6 +--
 .../codegen/GenerateUnsafeProjection.scala    | 40 +++++++++----------
 .../spark/sql/execution/SparkStrategies.scala |  3 +-
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
index 8fdd7399602d2..32faad374015c 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
@@ -47,7 +47,7 @@ public static int write(UnsafeRow target, int ordinal, int cursor, UTF8String in
           target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L);
       }
 
-      // Write the string to the variable length portion.
+      // Write the bytes to the variable length portion.
       input.writeToMemory(target.getBaseObject(), offset);
 
       // Set the fixed length portion.
@@ -73,7 +73,7 @@ public static int write(UnsafeRow target, int ordinal, int cursor, byte[] input)
           target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L);
       }
 
-      // Write the string to the variable length portion.
+      // Write the bytes to the variable length portion.
       ByteArray.writeToMemory(input, target.getBaseObject(), offset);
 
       // Set the fixed length portion.
@@ -115,7 +115,7 @@ public static int write(UnsafeRow target, int ordinal, int cursor, InternalRow i
             target.getBaseObject(), offset + ((numBytes >> 3) << 3), 0L);
         }
 
-        // Write the string to the variable length portion.
+        // Write the bytes to the variable length portion.
         row.writeToMemory(target.getBaseObject(), offset);
 
         // Set the fixed length portion.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index 3e87f7285847c..9a4c00e86a3ec 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -62,14 +62,10 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
     val cursor = ctx.freshName("cursor")
     val numBytes = ctx.freshName("numBytes")
 
-    val exprs = expressions.zipWithIndex.map { case (e, i) =>
-      e.dataType match {
-        case st: StructType =>
-          createCodeForStruct(ctx, e.gen(ctx), st)
-        case _ =>
-          e.gen(ctx)
-      }
-    }
+    val exprs = expressions.map { e => e.dataType match {
+      case st: StructType => createCodeForStruct(ctx, e.gen(ctx), st)
+      case _ => e.gen(ctx)
+    }}
     val allExprs = exprs.map(_.code).mkString("\n")
 
     val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length)
@@ -153,20 +149,20 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
 
     val exprs: Seq[GeneratedExpressionCode] = schema.map(_.dataType).zipWithIndex.map {
       case (dt, i) => dt match {
-      case st: StructType =>
-        val nestedStructEv = GeneratedExpressionCode(
-          code = "",
-          isNull = s"${input.primitive}.isNullAt($i)",
-          primitive = s"${ctx.getColumn(input.primitive, dt, i)}"
-        )
-        createCodeForStruct(ctx, nestedStructEv, st)
-      case _ =>
-        GeneratedExpressionCode(
-          code = "",
-          isNull = s"${input.primitive}.isNullAt($i)",
-          primitive = s"${ctx.getColumn(input.primitive, dt, i)}"
-        )
-      }
+        case st: StructType =>
+          val nestedStructEv = GeneratedExpressionCode(
+            code = "",
+            isNull = s"${input.primitive}.isNullAt($i)",
+            primitive = s"${ctx.getColumn(input.primitive, dt, i)}"
+          )
+          createCodeForStruct(ctx, nestedStructEv, st)
+        case _ =>
+          GeneratedExpressionCode(
+            code = "",
+            isNull = s"${input.primitive}.isNullAt($i)",
+            primitive = s"${ctx.getColumn(input.primitive, dt, i)}"
+          )
+        }
     }
     val allExprs = exprs.map(_.code).mkString("\n")
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 314b85f126dd2..f3ef066528ff8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -339,7 +339,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
      *               if necessary.
      */
     def getSortOperator(sortExprs: Seq[SortOrder], global: Boolean, child: SparkPlan): SparkPlan = {
-      if (sqlContext.conf.unsafeEnabled && UnsafeExternalSort.supportsSchema(child.schema)) {
+      if (sqlContext.conf.unsafeEnabled && sqlContext.conf.codegenEnabled &&
+        UnsafeExternalSort.supportsSchema(child.schema)) {
         execution.UnsafeExternalSort(sortExprs, global, child)
       } else if (sqlContext.conf.externalSortEnabled) {
         execution.ExternalSort(sortExprs, global, child)

From 9bbe0171cb434edb160fad30ea2d4221f525c919 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 28 Jul 2015 09:43:39 -0700
Subject: [PATCH 424/549] [SPARK-8196][SQL] Fix null handling & documentation
 for next_day.

The original patch didn't handle nulls correctly for next_day.

Author: Reynold Xin <rxin@databricks.com>

Closes #7718 from rxin/next_day and squashes the following commits:

616a425 [Reynold Xin] Merged DatetimeExpressionsSuite into DateFunctionsSuite.
faa78cf [Reynold Xin] Merged DatetimeFunctionsSuite into DateExpressionsSuite.
6c4fb6a [Reynold Xin] [SPARK-8196][SQL] Fix null handling & documentation for next_day.
---
 .../sql/catalyst/expressions/Expression.scala | 12 ++---
 .../expressions/datetimeFunctions.scala       | 46 ++++++++++-------
 .../sql/catalyst/expressions/literals.scala   |  2 +-
 .../sql/catalyst/util/DateTimeUtils.scala     |  2 +-
 .../expressions/DateExpressionsSuite.scala    | 43 +++++++++++++---
 .../expressions/DatetimeFunctionsSuite.scala  | 37 --------------
 .../expressions/ExpressionEvalHelper.scala    |  1 +
 .../expressions/NonFoldableLiteral.scala      | 50 +++++++++++++++++++
 .../org/apache/spark/sql/functions.scala      | 20 +++++---
 .../apache/spark/sql/DateFunctionsSuite.scala | 21 ++++++++
 .../spark/sql/DatetimeExpressionsSuite.scala  | 48 ------------------
 11 files changed, 158 insertions(+), 124 deletions(-)
 delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DatetimeFunctionsSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NonFoldableLiteral.scala
 delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/DatetimeExpressionsSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index cb4c3f24b2721..03e36c7871bcf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -355,9 +355,9 @@ abstract class BinaryExpression extends Expression {
    * @param f accepts two variable names and returns Java code to compute the output.
    */
   protected def defineCodeGen(
-    ctx: CodeGenContext,
-    ev: GeneratedExpressionCode,
-    f: (String, String) => String): String = {
+      ctx: CodeGenContext,
+      ev: GeneratedExpressionCode,
+      f: (String, String) => String): String = {
     nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
       s"${ev.primitive} = ${f(eval1, eval2)};"
     })
@@ -372,9 +372,9 @@ abstract class BinaryExpression extends Expression {
    *          and returns Java code to compute the output.
    */
   protected def nullSafeCodeGen(
-    ctx: CodeGenContext,
-    ev: GeneratedExpressionCode,
-    f: (String, String) => String): String = {
+      ctx: CodeGenContext,
+      ev: GeneratedExpressionCode,
+      f: (String, String) => String): String = {
     val eval1 = left.gen(ctx)
     val eval2 = right.gen(ctx)
     val resultCode = f(eval1.primitive, eval2.primitive)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index b00a1b26fa285..c37afc13f2d17 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -276,8 +276,6 @@ case class LastDay(startDate: Expression) extends UnaryExpression with ImplicitC
 
   override def dataType: DataType = DateType
 
-  override def prettyName: String = "last_day"
-
   override def nullSafeEval(date: Any): Any = {
     val days = date.asInstanceOf[Int]
     DateTimeUtils.getLastDayOfMonth(days)
@@ -289,12 +287,16 @@ case class LastDay(startDate: Expression) extends UnaryExpression with ImplicitC
       s"$dtu.getLastDayOfMonth($sd)"
     })
   }
+
+  override def prettyName: String = "last_day"
 }
 
 /**
  * Returns the first date which is later than startDate and named as dayOfWeek.
  * For example, NextDay(2015-07-27, Sunday) would return 2015-08-02, which is the first
- * sunday later than 2015-07-27.
+ * Sunday later than 2015-07-27.
+ *
+ * Allowed "dayOfWeek" is defined in [[DateTimeUtils.getDayOfWeekFromString]].
  */
 case class NextDay(startDate: Expression, dayOfWeek: Expression)
   extends BinaryExpression with ImplicitCastInputTypes {
@@ -318,22 +320,32 @@ case class NextDay(startDate: Expression, dayOfWeek: Expression)
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     nullSafeCodeGen(ctx, ev, (sd, dowS) => {
-      val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-      val dow = ctx.freshName("dow")
-      val genDow = if (right.foldable) {
-        val dowVal = DateTimeUtils.getDayOfWeekFromString(
-          dayOfWeek.eval(InternalRow.empty).asInstanceOf[UTF8String])
-        s"int $dow = $dowVal;"
-      } else {
-        s"int $dow = $dtu.getDayOfWeekFromString($dowS);"
-      }
-      genDow + s"""
-        if ($dow == -1) {
-          ${ev.isNull} = true;
+      val dateTimeUtilClass = DateTimeUtils.getClass.getName.stripSuffix("$")
+      val dayOfWeekTerm = ctx.freshName("dayOfWeek")
+      if (dayOfWeek.foldable) {
+        val input = dayOfWeek.eval().asInstanceOf[UTF8String]
+        if ((input eq null) || DateTimeUtils.getDayOfWeekFromString(input) == -1) {
+          s"""
+             |${ev.isNull} = true;
+           """.stripMargin
         } else {
-          ${ev.primitive} = $dtu.getNextDateForDayOfWeek($sd, $dow);
+          val dayOfWeekValue = DateTimeUtils.getDayOfWeekFromString(input)
+          s"""
+             |${ev.primitive} = $dateTimeUtilClass.getNextDateForDayOfWeek($sd, $dayOfWeekValue);
+           """.stripMargin
         }
-       """
+      } else {
+        s"""
+           |int $dayOfWeekTerm = $dateTimeUtilClass.getDayOfWeekFromString($dowS);
+           |if ($dayOfWeekTerm == -1) {
+           |  ${ev.isNull} = true;
+           |} else {
+           |  ${ev.primitive} = $dateTimeUtilClass.getNextDateForDayOfWeek($sd, $dayOfWeekTerm);
+           |}
+         """.stripMargin
+      }
     })
   }
+
+  override def prettyName: String = "next_day"
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 85060b7893556..064a1720c36e8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -118,7 +118,7 @@ case class Literal protected (value: Any, dataType: DataType)
             super.genCode(ctx, ev)
           } else {
             ev.isNull = "false"
-            ev.primitive = s"${value}"
+            ev.primitive = s"${value}D"
             ""
           }
         case ByteType | ShortType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 2e28fb9af9b65..8b0b80c26db17 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -575,7 +575,7 @@ object DateTimeUtils {
   }
 
   /**
-   * Returns Day of week from String. Starting from Thursday, marked as 0.
+   * Returns day of week from String. Starting from Thursday, marked as 0.
    * (Because 1970-01-01 is Thursday).
    */
   def getDayOfWeekFromString(string: UTF8String): Int = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index 4d2d33765a269..30c5769424bd7 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -32,6 +32,19 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   val d = new Date(sdf.parse("2015-04-08 13:10:15").getTime)
   val ts = new Timestamp(sdf.parse("2013-11-08 13:10:15").getTime)
 
+  test("datetime function current_date") {
+    val d0 = DateTimeUtils.millisToDays(System.currentTimeMillis())
+    val cd = CurrentDate().eval(EmptyRow).asInstanceOf[Int]
+    val d1 = DateTimeUtils.millisToDays(System.currentTimeMillis())
+    assert(d0 <= cd && cd <= d1 && d1 - d0 <= 1)
+  }
+
+  test("datetime function current_timestamp") {
+    val ct = DateTimeUtils.toJavaTimestamp(CurrentTimestamp().eval(EmptyRow).asInstanceOf[Long])
+    val t1 = System.currentTimeMillis()
+    assert(math.abs(t1 - ct.getTime) < 5000)
+  }
+
   test("DayOfYear") {
     val sdfDay = new SimpleDateFormat("D")
     (2002 to 2004).foreach { y =>
@@ -264,14 +277,28 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("next_day") {
+    def testNextDay(input: String, dayOfWeek: String, output: String): Unit = {
+      checkEvaluation(
+        NextDay(Literal(Date.valueOf(input)), NonFoldableLiteral(dayOfWeek)),
+        DateTimeUtils.fromJavaDate(Date.valueOf(output)))
+      checkEvaluation(
+        NextDay(Literal(Date.valueOf(input)), Literal(dayOfWeek)),
+        DateTimeUtils.fromJavaDate(Date.valueOf(output)))
+    }
+    testNextDay("2015-07-23", "Mon", "2015-07-27")
+    testNextDay("2015-07-23", "mo", "2015-07-27")
+    testNextDay("2015-07-23", "Tue", "2015-07-28")
+    testNextDay("2015-07-23", "tu", "2015-07-28")
+    testNextDay("2015-07-23", "we", "2015-07-29")
+    testNextDay("2015-07-23", "wed", "2015-07-29")
+    testNextDay("2015-07-23", "Thu", "2015-07-30")
+    testNextDay("2015-07-23", "TH", "2015-07-30")
+    testNextDay("2015-07-23", "Fri", "2015-07-24")
+    testNextDay("2015-07-23", "fr", "2015-07-24")
+
+    checkEvaluation(NextDay(Literal(Date.valueOf("2015-07-23")), Literal("xx")), null)
+    checkEvaluation(NextDay(Literal.create(null, DateType), Literal("xx")), null)
     checkEvaluation(
-      NextDay(Literal(Date.valueOf("2015-07-23")), Literal("Thu")),
-      DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-30")))
-    checkEvaluation(
-      NextDay(Literal(Date.valueOf("2015-07-23")), Literal("THURSDAY")),
-      DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-30")))
-    checkEvaluation(
-      NextDay(Literal(Date.valueOf("2015-07-23")), Literal("th")),
-      DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-30")))
+      NextDay(Literal(Date.valueOf("2015-07-23")), Literal.create(null, StringType)), null)
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DatetimeFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DatetimeFunctionsSuite.scala
deleted file mode 100644
index 1618c24871c60..0000000000000
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DatetimeFunctionsSuite.scala
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.expressions
-
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.util.DateTimeUtils
-
-class DatetimeFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
-  test("datetime function current_date") {
-    val d0 = DateTimeUtils.millisToDays(System.currentTimeMillis())
-    val cd = CurrentDate().eval(EmptyRow).asInstanceOf[Int]
-    val d1 = DateTimeUtils.millisToDays(System.currentTimeMillis())
-    assert(d0 <= cd && cd <= d1 && d1 - d0 <= 1)
-  }
-
-  test("datetime function current_timestamp") {
-    val ct = DateTimeUtils.toJavaTimestamp(CurrentTimestamp().eval(EmptyRow).asInstanceOf[Long])
-    val t1 = System.currentTimeMillis()
-    assert(math.abs(t1 - ct.getTime) < 5000)
-  }
-
-}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 136368bf5b368..0c8611d5ddefa 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -82,6 +82,7 @@ trait ExpressionEvalHelper {
           s"""
             |Code generation of $expression failed:
             |$e
+            |${e.getStackTraceString}
           """.stripMargin)
     }
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NonFoldableLiteral.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NonFoldableLiteral.scala
new file mode 100644
index 0000000000000..0559fb80e7fce
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NonFoldableLiteral.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types._
+
+
+/**
+ * A literal value that is not foldable. Used in expression codegen testing to test code path
+ * that behave differently based on foldable values.
+ */
+case class NonFoldableLiteral(value: Any, dataType: DataType)
+  extends LeafExpression with CodegenFallback {
+
+  override def foldable: Boolean = false
+  override def nullable: Boolean = true
+
+  override def toString: String = if (value != null) value.toString else "null"
+
+  override def eval(input: InternalRow): Any = value
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    Literal.create(value, dataType).genCode(ctx, ev)
+  }
+}
+
+
+object NonFoldableLiteral {
+  def apply(value: Any): NonFoldableLiteral = {
+    val lit = Literal(value)
+    NonFoldableLiteral(lit.value, lit.dataType)
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index d18558b510f0b..cec61b66b157c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2033,7 +2033,10 @@ object functions {
   def hour(columnName: String): Column = hour(Column(columnName))
 
   /**
-   * Returns the last day of the month which the given date belongs to.
+   * Given a date column, returns the last day of the month which the given date belongs to.
+   * For example, input "2015-07-27" returns "2015-07-31" since July 31 is the last day of the
+   * month in July 2015.
+   *
    * @group datetime_funcs
    * @since 1.5.0
    */
@@ -2054,14 +2057,19 @@ object functions {
   def minute(columnName: String): Column = minute(Column(columnName))
 
   /**
-   * Returns the first date which is later than given date sd and named as dow.
-   * For example, `next_day('2015-07-27', "Sunday")` would return 2015-08-02, which is the
-   * first Sunday later than 2015-07-27. The parameter dayOfWeek could be 2-letter, 3-letter,
-   * or full name of the day of the week (e.g. Mo, tue, FRIDAY).
+   * Given a date column, returns the first date which is later than the value of the date column
+   * that is on the specified day of the week.
+   *
+   * For example, `next_day('2015-07-27', "Sunday")` returns 2015-08-02 because that is the first
+   * Sunday after 2015-07-27.
+   *
+   * Day of the week parameter is case insensitive, and accepts:
+   * "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun".
+   *
    * @group datetime_funcs
    * @since 1.5.0
    */
-  def next_day(sd: Column, dayOfWeek: String): Column = NextDay(sd.expr, lit(dayOfWeek).expr)
+  def next_day(date: Column, dayOfWeek: String): Column = NextDay(date.expr, lit(dayOfWeek).expr)
 
   /**
    * Extracts the seconds as an integer from a given date/timestamp/string.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index 001fcd035c82a..36820cbbc7e5e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql
 import java.sql.{Timestamp, Date}
 import java.text.SimpleDateFormat
 
+import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.functions._
 
 class DateFunctionsSuite extends QueryTest {
@@ -27,6 +28,26 @@ class DateFunctionsSuite extends QueryTest {
 
   import ctx.implicits._
 
+  test("function current_date") {
+    val df1 = Seq((1, 2), (3, 1)).toDF("a", "b")
+    val d0 = DateTimeUtils.millisToDays(System.currentTimeMillis())
+    val d1 = DateTimeUtils.fromJavaDate(df1.select(current_date()).collect().head.getDate(0))
+    val d2 = DateTimeUtils.fromJavaDate(
+      ctx.sql("""SELECT CURRENT_DATE()""").collect().head.getDate(0))
+    val d3 = DateTimeUtils.millisToDays(System.currentTimeMillis())
+    assert(d0 <= d1 && d1 <= d2 && d2 <= d3 && d3 - d0 <= 1)
+  }
+
+  test("function current_timestamp") {
+    val df1 = Seq((1, 2), (3, 1)).toDF("a", "b")
+    checkAnswer(df1.select(countDistinct(current_timestamp())), Row(1))
+    // Execution in one query should return the same value
+    checkAnswer(ctx.sql("""SELECT CURRENT_TIMESTAMP() = CURRENT_TIMESTAMP()"""),
+      Row(true))
+    assert(math.abs(ctx.sql("""SELECT CURRENT_TIMESTAMP()""").collect().head.getTimestamp(
+      0).getTime - System.currentTimeMillis()) < 5000)
+  }
+
   val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
   val sdfDate = new SimpleDateFormat("yyyy-MM-dd")
   val d = new Date(sdf.parse("2015-04-08 13:10:15").getTime)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatetimeExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatetimeExpressionsSuite.scala
deleted file mode 100644
index 44b915304533c..0000000000000
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatetimeExpressionsSuite.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql
-
-import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.functions._
-
-class DatetimeExpressionsSuite extends QueryTest {
-  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
-
-  import ctx.implicits._
-
-  lazy val df1 = Seq((1, 2), (3, 1)).toDF("a", "b")
-
-  test("function current_date") {
-    val d0 = DateTimeUtils.millisToDays(System.currentTimeMillis())
-    val d1 = DateTimeUtils.fromJavaDate(df1.select(current_date()).collect().head.getDate(0))
-    val d2 = DateTimeUtils.fromJavaDate(
-      ctx.sql("""SELECT CURRENT_DATE()""").collect().head.getDate(0))
-    val d3 = DateTimeUtils.millisToDays(System.currentTimeMillis())
-    assert(d0 <= d1 && d1 <= d2 && d2 <= d3 && d3 - d0 <= 1)
-  }
-
-  test("function current_timestamp") {
-    checkAnswer(df1.select(countDistinct(current_timestamp())), Row(1))
-    // Execution in one query should return the same value
-    checkAnswer(ctx.sql("""SELECT CURRENT_TIMESTAMP() = CURRENT_TIMESTAMP()"""),
-      Row(true))
-    assert(math.abs(ctx.sql("""SELECT CURRENT_TIMESTAMP()""").collect().head.getTimestamp(
-      0).getTime - System.currentTimeMillis()) < 5000)
-  }
-
-}

From 35ef853b3f9d955949c464e4a0d445147e0e9a07 Mon Sep 17 00:00:00 2001
From: Aaron Davidson <aaron@databricks.com>
Date: Tue, 28 Jul 2015 10:12:09 -0700
Subject: [PATCH 425/549] [SPARK-9397] DataFrame should provide an API to find
 source data files if applicable

Certain applications would benefit from being able to inspect DataFrames that are straightforwardly produced by data sources that stem from files, and find out their source data. For example, one might want to display to a user the size of the data underlying a table, or to copy or mutate it.

This PR exposes an `inputFiles` method on DataFrame which attempts to discover the source data in a best-effort manner, by inspecting HadoopFsRelations and JSONRelations.

Author: Aaron Davidson <aaron@databricks.com>

Closes #7717 from aarondav/paths and squashes the following commits:

ff67430 [Aaron Davidson] inputFiles
0acd3ad [Aaron Davidson] [SPARK-9397] DataFrame should provide an API to find source data files if applicable
---
 .../org/apache/spark/sql/DataFrame.scala      | 20 +++++++++++++++++--
 .../org/apache/spark/sql/DataFrameSuite.scala | 20 +++++++++++++++++++
 .../spark/sql/hive/HiveMetastoreCatalog.scala |  6 +++---
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 114ab91d10aa0..3ea0f9ed3bddd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -40,8 +40,9 @@ import org.apache.spark.sql.catalyst.plans.logical.{Filter, _}
 import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser}
 import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD}
-import org.apache.spark.sql.execution.datasources.CreateTableUsingAsSelect
-import org.apache.spark.sql.json.JacksonGenerator
+import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation}
+import org.apache.spark.sql.json.{JacksonGenerator, JSONRelation}
+import org.apache.spark.sql.sources.HadoopFsRelation
 import org.apache.spark.sql.types._
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
@@ -1546,6 +1547,21 @@ class DataFrame private[sql](
     }
   }
 
+  /**
+   * Returns a best-effort snapshot of the files that compose this DataFrame. This method simply
+   * asks each constituent BaseRelation for its respective files and takes the union of all results.
+   * Depending on the source relations, this may not find all input files. Duplicates are removed.
+   */
+  def inputFiles: Array[String] = {
+    val files: Seq[String] = logicalPlan.collect {
+      case LogicalRelation(fsBasedRelation: HadoopFsRelation) =>
+        fsBasedRelation.paths.toSeq
+      case LogicalRelation(jsonRelation: JSONRelation) =>
+        jsonRelation.path.toSeq
+    }.flatten
+    files.toSet.toArray
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   // for Python API
   ////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index f67f2c60c0e16..3151e071b19ea 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -23,7 +23,10 @@ import scala.language.postfixOps
 import scala.util.Random
 
 import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.json.JSONRelation
+import org.apache.spark.sql.parquet.ParquetRelation
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.test.{ExamplePointUDT, ExamplePoint, SQLTestUtils}
 
@@ -491,6 +494,23 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
     checkAnswer(df.select(df("key")), testData.select('key).collect().toSeq)
   }
 
+  test("inputFiles") {
+    val fakeRelation1 = new ParquetRelation(Array("/my/path", "/my/other/path"),
+      Some(testData.schema), None, Map.empty)(sqlContext)
+    val df1 = DataFrame(sqlContext, LogicalRelation(fakeRelation1))
+    assert(df1.inputFiles.toSet == fakeRelation1.paths.toSet)
+
+    val fakeRelation2 = new JSONRelation("/json/path", 1, Some(testData.schema), sqlContext)
+    val df2 = DataFrame(sqlContext, LogicalRelation(fakeRelation2))
+    assert(df2.inputFiles.toSet == fakeRelation2.path.toSet)
+
+    val unionDF = df1.unionAll(df2)
+    assert(unionDF.inputFiles.toSet == fakeRelation1.paths.toSet ++ fakeRelation2.path)
+
+    val filtered = df1.filter("false").unionAll(df2.intersect(df2))
+    assert(filtered.inputFiles.toSet == fakeRelation1.paths.toSet ++ fakeRelation2.path)
+  }
+
   ignore("show") {
     // This test case is intended ignored, but to make sure it compiles correctly
     testData.select($"*").show()
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 3180c05445c9f..a8c9b4fa71b99 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -274,9 +274,9 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     val metastoreSchema = StructType.fromAttributes(metastoreRelation.output)
     val mergeSchema = hive.convertMetastoreParquetWithSchemaMerging
 
-    // NOTE: Instead of passing Metastore schema directly to `ParquetRelation2`, we have to
+    // NOTE: Instead of passing Metastore schema directly to `ParquetRelation`, we have to
     // serialize the Metastore schema to JSON and pass it as a data source option because of the
-    // evil case insensitivity issue, which is reconciled within `ParquetRelation2`.
+    // evil case insensitivity issue, which is reconciled within `ParquetRelation`.
     val parquetOptions = Map(
       ParquetRelation.METASTORE_SCHEMA -> metastoreSchema.json,
       ParquetRelation.MERGE_SCHEMA -> mergeSchema.toString)
@@ -290,7 +290,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
         partitionSpecInMetastore: Option[PartitionSpec]): Option[LogicalRelation] = {
       cachedDataSourceTables.getIfPresent(tableIdentifier) match {
         case null => None // Cache miss
-        case logical@LogicalRelation(parquetRelation: ParquetRelation) =>
+        case logical @ LogicalRelation(parquetRelation: ParquetRelation) =>
           // If we have the same paths, same schema, and same partition spec,
           // we will use the cached Parquet Relation.
           val useCached =

From 614323406225a3522ee601935ce3052449614145 Mon Sep 17 00:00:00 2001
From: trestletech <jeff.allen@trestletechnology.net>
Date: Tue, 28 Jul 2015 10:45:19 -0700
Subject: [PATCH 426/549] Use vector-friendly comparison for packages argument.

Otherwise, `sparkR.init()` with multiple `sparkPackages` results in this warning:

```
Warning message:
In if (packages != "") { :
  the condition has length > 1 and only the first element will be used
```

Author: trestletech <jeff.allen@trestletechnology.net>

Closes #7701 from trestletech/compare-packages and squashes the following commits:

72c8b36 [trestletech] Correct function name.
c52db0e [trestletech] Added test for multiple packages.
3aab1a7 [trestletech] Use vector-friendly comparison for packages argument.
---
 R/pkg/R/client.R               | 2 +-
 R/pkg/inst/tests/test_client.R | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R
index 6f772158ddfe8..c811d1dac3bd5 100644
--- a/R/pkg/R/client.R
+++ b/R/pkg/R/client.R
@@ -48,7 +48,7 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack
     jars <- paste("--jars", jars)
   }
 
-  if (packages != "") {
+  if (!identical(packages, "")) {
     packages <- paste("--packages", packages)
   }
 
diff --git a/R/pkg/inst/tests/test_client.R b/R/pkg/inst/tests/test_client.R
index 30b05c1a2afcd..8a20991f89af8 100644
--- a/R/pkg/inst/tests/test_client.R
+++ b/R/pkg/inst/tests/test_client.R
@@ -30,3 +30,7 @@ test_that("no package specified doesn't add packages flag", {
   expect_equal(gsub("[[:space:]]", "", args),
                "")
 })
+
+test_that("multiple packages don't produce a warning", {
+  expect_that(generateSparkSubmitArgs("", "", "", "", c("A", "B")), not(gives_warning()))
+})

From 31ec6a871eebd2377961c5195f9c2bff3a899fba Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Tue, 28 Jul 2015 11:48:56 -0700
Subject: [PATCH 427/549] [SPARK-9327] [DOCS] Fix documentation about classpath
 config options.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #7651 from vanzin/SPARK-9327 and squashes the following commits:

2923e23 [Marcelo Vanzin] [SPARK-9327] [docs] Fix documentation about classpath config options.
---
 docs/configuration.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 200f3cd212e46..fd236137cb96e 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -203,7 +203,7 @@ Apart from these, the following properties are also available, and may be useful
   <td><code>spark.driver.extraClassPath</code></td>
   <td>(none)</td>
   <td>
-    Extra classpath entries to append to the classpath of the driver.
+    Extra classpath entries to prepend to the classpath of the driver.
 
     <br /><em>Note:</em> In client mode, this config must not be set through the <code>SparkConf</code>
     directly in your application, because the driver JVM has already started at that point.
@@ -250,7 +250,7 @@ Apart from these, the following properties are also available, and may be useful
   <td><code>spark.executor.extraClassPath</code></td>
   <td>(none)</td>
   <td>
-    Extra classpath entries to append to the classpath of executors. This exists primarily for 
+    Extra classpath entries to prepend to the classpath of executors. This exists primarily for 
     backwards-compatibility with older versions of Spark. Users typically should not need to set 
     this option.
   </td>

From 6cdcc21fe654ac0a2d0d72783eb10005fc513af6 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Tue, 28 Jul 2015 13:16:48 -0700
Subject: [PATCH 428/549] [SPARK-9196] [SQL] Ignore test
 DatetimeExpressionsSuite: function current_timestamp.

This test is flaky. https://issues.apache.org/jira/browse/SPARK-9196 will track the fix of it. For now, let's disable this test.

Author: Yin Huai <yhuai@databricks.com>

Closes #7727 from yhuai/SPARK-9196-ignore and squashes the following commits:

f92bded [Yin Huai] Ignore current_timestamp.
---
 .../test/scala/org/apache/spark/sql/DateFunctionsSuite.scala   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index 36820cbbc7e5e..07eb6e4a8d8cd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -38,7 +38,8 @@ class DateFunctionsSuite extends QueryTest {
     assert(d0 <= d1 && d1 <= d2 && d2 <= d3 && d3 - d0 <= 1)
   }
 
-  test("function current_timestamp") {
+  // This is a bad test. SPARK-9196 will fix it and re-enable it.
+  ignore("function current_timestamp") {
     val df1 = Seq((1, 2), (3, 1)).toDF("a", "b")
     checkAnswer(df1.select(countDistinct(current_timestamp())), Row(1))
     // Execution in one query should return the same value

From 8d5bb5283c3cc9180ef34b05be4a715d83073b1e Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 28 Jul 2015 14:16:57 -0700
Subject: [PATCH 429/549] [SPARK-9391] [ML] Support minus, dot, and intercept
 operators in SparkR RFormula

Adds '.', '-', and intercept parsing to RFormula. Also splits RFormulaParser into a separate file.

Umbrella design doc here: https://docs.google.com/document/d/10NZNSEurN2EdWM31uFYsgayIPfCFHiuIu3pCWrUmP_c/edit?usp=sharing

mengxr

Author: Eric Liang <ekl@databricks.com>

Closes #7707 from ericl/string-features-2 and squashes the following commits:

8588625 [Eric Liang] exclude complex types for .
8106ffe [Eric Liang] comments
a9350bb [Eric Liang] s/var/val
9c50d4d [Eric Liang] Merge branch 'string-features' into string-features-2
581afb2 [Eric Liang] Merge branch 'master' into string-features
08ae539 [Eric Liang] Merge branch 'string-features' into string-features-2
f99131a [Eric Liang] comments
cecec43 [Eric Liang] Merge branch 'string-features' into string-features-2
0bf3c26 [Eric Liang] update docs
4592df2 [Eric Liang] intercept supports
7412a2e [Eric Liang] Fri Jul 24 14:56:51 PDT 2015
3cf848e [Eric Liang] fix the parser
0556c2b [Eric Liang] Merge branch 'string-features' into string-features-2
c302a2c [Eric Liang] fix tests
9d1ac82 [Eric Liang] Merge remote-tracking branch 'upstream/master' into string-features
e713da3 [Eric Liang] comments
cd231a9 [Eric Liang] Wed Jul 22 17:18:44 PDT 2015
4d79193 [Eric Liang] revert to seq + distinct
169a085 [Eric Liang] tweak functional test
a230a47 [Eric Liang] Merge branch 'master' into string-features
72bd6f3 [Eric Liang] fix merge
d841cec [Eric Liang] Merge branch 'master' into string-features
5b2c4a2 [Eric Liang] Mon Jul 20 18:45:33 PDT 2015
b01c7c5 [Eric Liang] add test
8a637db [Eric Liang] encoder wip
a1d03f4 [Eric Liang] refactor into estimator
---
 R/pkg/R/mllib.R                               |   2 +-
 R/pkg/inst/tests/test_mllib.R                 |   8 ++
 .../apache/spark/ml/feature/RFormula.scala    |  52 +++----
 .../spark/ml/feature/RFormulaParser.scala     | 129 ++++++++++++++++++
 .../apache/spark/ml/r/SparkRWrappers.scala    |  10 +-
 .../ml/feature/RFormulaParserSuite.scala      |  55 +++++++-
 6 files changed, 215 insertions(+), 41 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala

diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 258e354081fc1..6a8bacaa552c6 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -27,7 +27,7 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
 #'
 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
-#'                operators are supported, including '~' and '+'.
+#'                operators are supported, including '~', '+', '-', and '.'.
 #' @param data DataFrame for training
 #' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
 #' @param lambda Regularization parameter
diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R
index 29152a11688a2..3bef69324770a 100644
--- a/R/pkg/inst/tests/test_mllib.R
+++ b/R/pkg/inst/tests/test_mllib.R
@@ -40,3 +40,11 @@ test_that("predictions match with native glm", {
   rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 })
+
+test_that("dot minus and intercept vs native glm", {
+  training <- createDataFrame(sqlContext, iris)
+  model <- glm(Sepal_Width ~ . - Species + 0, data = training)
+  vals <- collect(select(predict(model, training), "prediction"))
+  rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+})
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 0a95b1ee8de6e..0b428d278d908 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -78,13 +78,20 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
   /** @group getParam */
   def getFormula: String = $(formula)
 
+  /** Whether the formula specifies fitting an intercept. */
+  private[ml] def hasIntercept: Boolean = {
+    require(parsedFormula.isDefined, "Must call setFormula() first.")
+    parsedFormula.get.hasIntercept
+  }
+
   override def fit(dataset: DataFrame): RFormulaModel = {
     require(parsedFormula.isDefined, "Must call setFormula() first.")
+    val resolvedFormula = parsedFormula.get.resolve(dataset.schema)
     // StringType terms and terms representing interactions need to be encoded before assembly.
     // TODO(ekl) add support for feature interactions
-    var encoderStages = ArrayBuffer[PipelineStage]()
-    var tempColumns = ArrayBuffer[String]()
-    val encodedTerms = parsedFormula.get.terms.map { term =>
+    val encoderStages = ArrayBuffer[PipelineStage]()
+    val tempColumns = ArrayBuffer[String]()
+    val encodedTerms = resolvedFormula.terms.map { term =>
       dataset.schema(term) match {
         case column if column.dataType == StringType =>
           val indexCol = term + "_idx_" + uid
@@ -103,7 +110,7 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
       .setOutputCol($(featuresCol))
     encoderStages += new ColumnPruner(tempColumns.toSet)
     val pipelineModel = new Pipeline(uid).setStages(encoderStages.toArray).fit(dataset)
-    copyValues(new RFormulaModel(uid, parsedFormula.get, pipelineModel).setParent(this))
+    copyValues(new RFormulaModel(uid, resolvedFormula, pipelineModel).setParent(this))
   }
 
   // optimistic schema; does not contain any ML attributes
@@ -124,13 +131,13 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
 /**
  * :: Experimental ::
  * A fitted RFormula. Fitting is required to determine the factor levels of formula terms.
- * @param parsedFormula a pre-parsed R formula.
+ * @param resolvedFormula the fitted R formula.
  * @param pipelineModel the fitted feature model, including factor to index mappings.
  */
 @Experimental
 class RFormulaModel private[feature](
     override val uid: String,
-    parsedFormula: ParsedRFormula,
+    resolvedFormula: ResolvedRFormula,
     pipelineModel: PipelineModel)
   extends Model[RFormulaModel] with RFormulaBase {
 
@@ -144,8 +151,8 @@ class RFormulaModel private[feature](
     val withFeatures = pipelineModel.transformSchema(schema)
     if (hasLabelCol(schema)) {
       withFeatures
-    } else if (schema.exists(_.name == parsedFormula.label)) {
-      val nullable = schema(parsedFormula.label).dataType match {
+    } else if (schema.exists(_.name == resolvedFormula.label)) {
+      val nullable = schema(resolvedFormula.label).dataType match {
         case _: NumericType | BooleanType => false
         case _ => true
       }
@@ -158,12 +165,12 @@ class RFormulaModel private[feature](
   }
 
   override def copy(extra: ParamMap): RFormulaModel = copyValues(
-    new RFormulaModel(uid, parsedFormula, pipelineModel))
+    new RFormulaModel(uid, resolvedFormula, pipelineModel))
 
-  override def toString: String = s"RFormulaModel(${parsedFormula})"
+  override def toString: String = s"RFormulaModel(${resolvedFormula})"
 
   private def transformLabel(dataset: DataFrame): DataFrame = {
-    val labelName = parsedFormula.label
+    val labelName = resolvedFormula.label
     if (hasLabelCol(dataset.schema)) {
       dataset
     } else if (dataset.schema.exists(_.name == labelName)) {
@@ -207,26 +214,3 @@ private class ColumnPruner(columnsToPrune: Set[String]) extends Transformer {
 
   override def copy(extra: ParamMap): ColumnPruner = defaultCopy(extra)
 }
-
-/**
- * Represents a parsed R formula.
- */
-private[ml] case class ParsedRFormula(label: String, terms: Seq[String])
-
-/**
- * Limited implementation of R formula parsing. Currently supports: '~', '+'.
- */
-private[ml] object RFormulaParser extends RegexParsers {
-  def term: Parser[String] = "([a-zA-Z]|\\.[a-zA-Z_])[a-zA-Z0-9._]*".r
-
-  def expr: Parser[List[String]] = term ~ rep("+" ~> term) ^^ { case a ~ list => a :: list }
-
-  def formula: Parser[ParsedRFormula] =
-    (term ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t.distinct) }
-
-  def parse(value: String): ParsedRFormula = parseAll(formula, value) match {
-    case Success(result, _) => result
-    case failure: NoSuccess => throw new IllegalArgumentException(
-      "Could not parse formula: " + value)
-  }
-}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala
new file mode 100644
index 0000000000000..1ca3b92a7d92a
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import scala.util.parsing.combinator.RegexParsers
+
+import org.apache.spark.mllib.linalg.VectorUDT
+import org.apache.spark.sql.types._
+
+/**
+ * Represents a parsed R formula.
+ */
+private[ml] case class ParsedRFormula(label: ColumnRef, terms: Seq[Term]) {
+  /**
+   * Resolves formula terms into column names. A schema is necessary for inferring the meaning
+   * of the special '.' term. Duplicate terms will be removed during resolution.
+   */
+  def resolve(schema: StructType): ResolvedRFormula = {
+    var includedTerms = Seq[String]()
+    terms.foreach {
+      case Dot =>
+        includedTerms ++= simpleTypes(schema).filter(_ != label.value)
+      case ColumnRef(value) =>
+        includedTerms :+= value
+      case Deletion(term: Term) =>
+        term match {
+          case ColumnRef(value) =>
+            includedTerms = includedTerms.filter(_ != value)
+          case Dot =>
+            // e.g. "- .", which removes all first-order terms
+            val fromSchema = simpleTypes(schema)
+            includedTerms = includedTerms.filter(fromSchema.contains(_))
+          case _: Deletion =>
+            assert(false, "Deletion terms cannot be nested")
+          case _: Intercept =>
+        }
+      case _: Intercept =>
+    }
+    ResolvedRFormula(label.value, includedTerms.distinct)
+  }
+
+  /** Whether this formula specifies fitting with an intercept term. */
+  def hasIntercept: Boolean = {
+    var intercept = true
+    terms.foreach {
+      case Intercept(enabled) =>
+        intercept = enabled
+      case Deletion(Intercept(enabled)) =>
+        intercept = !enabled
+      case _ =>
+    }
+    intercept
+  }
+
+  // the dot operator excludes complex column types
+  private def simpleTypes(schema: StructType): Seq[String] = {
+    schema.fields.filter(_.dataType match {
+      case _: NumericType | StringType | BooleanType | _: VectorUDT => true
+      case _ => false
+    }).map(_.name)
+  }
+}
+
+/**
+ * Represents a fully evaluated and simplified R formula.
+ */
+private[ml] case class ResolvedRFormula(label: String, terms: Seq[String])
+
+/**
+ * R formula terms. See the R formula docs here for more information:
+ * http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
+ */
+private[ml] sealed trait Term
+
+/* R formula reference to all available columns, e.g. "." in a formula */
+private[ml] case object Dot extends Term
+
+/* R formula reference to a column, e.g. "+ Species" in a formula */
+private[ml] case class ColumnRef(value: String) extends Term
+
+/* R formula intercept toggle, e.g. "+ 0" in a formula */
+private[ml] case class Intercept(enabled: Boolean) extends Term
+
+/* R formula deletion of a variable, e.g. "- Species" in a formula */
+private[ml] case class Deletion(term: Term) extends Term
+
+/**
+ * Limited implementation of R formula parsing. Currently supports: '~', '+', '-', '.'.
+ */
+private[ml] object RFormulaParser extends RegexParsers {
+  def intercept: Parser[Intercept] =
+    "([01])".r ^^ { case a => Intercept(a == "1") }
+
+  def columnRef: Parser[ColumnRef] =
+    "([a-zA-Z]|\\.[a-zA-Z_])[a-zA-Z0-9._]*".r ^^ { case a => ColumnRef(a) }
+
+  def term: Parser[Term] = intercept | columnRef | "\\.".r ^^ { case _ => Dot }
+
+  def terms: Parser[List[Term]] = (term ~ rep("+" ~ term | "-" ~ term)) ^^ {
+    case op ~ list => list.foldLeft(List(op)) {
+      case (left, "+" ~ right) => left ++ Seq(right)
+      case (left, "-" ~ right) => left ++ Seq(Deletion(right))
+    }
+  }
+
+  def formula: Parser[ParsedRFormula] =
+    (columnRef ~ "~" ~ terms) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t) }
+
+  def parse(value: String): ParsedRFormula = parseAll(formula, value) match {
+    case Success(result, _) => result
+    case failure: NoSuccess => throw new IllegalArgumentException(
+      "Could not parse formula: " + value)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala
index 1ee080641e3e3..9f70592ccad7e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala
@@ -32,8 +32,14 @@ private[r] object SparkRWrappers {
       alpha: Double): PipelineModel = {
     val formula = new RFormula().setFormula(value)
     val estimator = family match {
-      case "gaussian" => new LinearRegression().setRegParam(lambda).setElasticNetParam(alpha)
-      case "binomial" => new LogisticRegression().setRegParam(lambda).setElasticNetParam(alpha)
+      case "gaussian" => new LinearRegression()
+        .setRegParam(lambda)
+        .setElasticNetParam(alpha)
+        .setFitIntercept(formula.hasIntercept)
+      case "binomial" => new LogisticRegression()
+        .setRegParam(lambda)
+        .setElasticNetParam(alpha)
+        .setFitIntercept(formula.hasIntercept)
     }
     val pipeline = new Pipeline().setStages(Array(formula, estimator))
     pipeline.fit(df)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
index c4b45aee06384..436e66bab09b0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
@@ -18,12 +18,17 @@
 package org.apache.spark.ml.feature
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types._
 
 class RFormulaParserSuite extends SparkFunSuite {
-  private def checkParse(formula: String, label: String, terms: Seq[String]) {
-    val parsed = RFormulaParser.parse(formula)
-    assert(parsed.label == label)
-    assert(parsed.terms == terms)
+  private def checkParse(
+      formula: String,
+      label: String,
+      terms: Seq[String],
+      schema: StructType = null) {
+    val resolved = RFormulaParser.parse(formula).resolve(schema)
+    assert(resolved.label == label)
+    assert(resolved.terms == terms)
   }
 
   test("parse simple formulas") {
@@ -32,4 +37,46 @@ class RFormulaParserSuite extends SparkFunSuite {
     checkParse("y ~   ._foo  ", "y", Seq("._foo"))
     checkParse("resp ~ A_VAR + B + c123", "resp", Seq("A_VAR", "B", "c123"))
   }
+
+  test("parse dot") {
+    val schema = (new StructType)
+      .add("a", "int", true)
+      .add("b", "long", false)
+      .add("c", "string", true)
+    checkParse("a ~ .", "a", Seq("b", "c"), schema)
+  }
+
+  test("parse deletion") {
+    val schema = (new StructType)
+      .add("a", "int", true)
+      .add("b", "long", false)
+      .add("c", "string", true)
+    checkParse("a ~ c - b", "a", Seq("c"), schema)
+  }
+
+  test("parse additions and deletions in order") {
+    val schema = (new StructType)
+      .add("a", "int", true)
+      .add("b", "long", false)
+      .add("c", "string", true)
+    checkParse("a ~ . - b + . - c", "a", Seq("b"), schema)
+  }
+
+  test("dot ignores complex column types") {
+    val schema = (new StructType)
+      .add("a", "int", true)
+      .add("b", "tinyint", false)
+      .add("c", "map<string, string>", true)
+    checkParse("a ~ .", "a", Seq("b"), schema)
+  }
+
+  test("parse intercept") {
+    assert(RFormulaParser.parse("a ~ b").hasIntercept)
+    assert(RFormulaParser.parse("a ~ b + 1").hasIntercept)
+    assert(RFormulaParser.parse("a ~ b - 0").hasIntercept)
+    assert(RFormulaParser.parse("a ~ b - 1 + 1").hasIntercept)
+    assert(!RFormulaParser.parse("a ~ b + 0").hasIntercept)
+    assert(!RFormulaParser.parse("a ~ b - 1").hasIntercept)
+    assert(!RFormulaParser.parse("a ~ b + 1 - 1").hasIntercept)
+  }
 }

From b88b868eb378bdb7459978842b5572a0b498f412 Mon Sep 17 00:00:00 2001
From: Joseph Batchik <josephbatchik@gmail.com>
Date: Tue, 28 Jul 2015 14:39:25 -0700
Subject: [PATCH 430/549] [SPARK-8003][SQL] Added virtual column support to
 Spark

Added virtual column support by adding a new resolution role to the query analyzer. Additional virtual columns can be added by adding case expressions to [the new rule](https://github.com/JDrit/spark/blob/virt_columns/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala#L1026) and my modifying the [logical plan](https://github.com/JDrit/spark/blob/virt_columns/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala#L216) to resolve them.

This also solves [SPARK-8003](https://issues.apache.org/jira/browse/SPARK-8003)

This allows you to perform queries such as:
```sql
select spark__partition__id, count(*) as c from table group by spark__partition__id;
```

Author: Joseph Batchik <josephbatchik@gmail.com>
Author: JD <jd@csh.rit.edu>

Closes #7478 from JDrit/virt_columns and squashes the following commits:

7932bf0 [Joseph Batchik] adding spark__partition__id to hive as well
f8a9c6c [Joseph Batchik] merging in master
e49da48 [JD] fixes for @rxin's suggestions
60e120b [JD] fixing test in merge
4bf8554 [JD] merging in master
c68bc0f [Joseph Batchik] Adding function register ability to SQLContext and adding a function for spark__partition__id()
---
 .../sql/catalyst/analysis/FunctionRegistry.scala    |  2 +-
 .../scala/org/apache/spark/sql/SQLContext.scala     | 11 ++++++++++-
 .../execution/expressions/SparkPartitionID.scala    |  2 +-
 .../main/scala/org/apache/spark/sql/functions.scala |  2 +-
 .../test/scala/org/apache/spark/sql/UDFSuite.scala  |  7 +++++++
 .../expression/NondeterministicSuite.scala          |  2 +-
 .../org/apache/spark/sql/hive/HiveContext.scala     | 13 +++++++++++--
 .../scala/org/apache/spark/sql/hive/UDFSuite.scala  |  9 ++++++++-
 8 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 61ee6f6f71631..9b60943a1e147 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -239,7 +239,7 @@ object FunctionRegistry {
   }
 
   /** See usage above. */
-  private def expression[T <: Expression](name: String)
+  def expression[T <: Expression](name: String)
       (implicit tag: ClassTag[T]): (String, (ExpressionInfo, FunctionBuilder)) = {
 
     // See if we can find a constructor that accepts Seq[Expression]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index dbb2a09846548..56cd8f22e7cf4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -31,6 +31,8 @@ import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.{expression => FunctionExpression, FunctionBuilder}
+import org.apache.spark.sql.execution.expressions.SparkPartitionID
 import org.apache.spark.sql.SQLConf.SQLConfEntry
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.errors.DialectException
@@ -140,7 +142,14 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
   // TODO how to handle the temp function per user session?
   @transient
-  protected[sql] lazy val functionRegistry: FunctionRegistry = FunctionRegistry.builtin
+  protected[sql] lazy val functionRegistry: FunctionRegistry = {
+    val reg = FunctionRegistry.builtin
+    val extendedFunctions = List[(String, (ExpressionInfo, FunctionBuilder))](
+      FunctionExpression[SparkPartitionID]("spark__partition__id")
+    )
+    extendedFunctions.foreach { case(name, (info, fun)) => reg.registerFunction(name, info, fun) }
+    reg
+  }
 
   @transient
   protected[sql] lazy val analyzer: Analyzer =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
index 61ef079d89af5..98c8eab8372aa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
@@ -27,7 +27,7 @@ import org.apache.spark.sql.types.{IntegerType, DataType}
 /**
  * Expression that returns the current partition id of the Spark task.
  */
-private[sql] case object SparkPartitionID extends LeafExpression with Nondeterministic {
+private[sql] case class SparkPartitionID() extends LeafExpression with Nondeterministic {
 
   override def nullable: Boolean = false
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index cec61b66b157c..0148991512213 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -741,7 +741,7 @@ object functions {
    * @group normal_funcs
    * @since 1.4.0
    */
-  def sparkPartitionId(): Column = execution.expressions.SparkPartitionID
+  def sparkPartitionId(): Column = execution.expressions.SparkPartitionID()
 
   /**
    * Computes the square root of the specified float value.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index c1516b450cbd4..9b326c16350c8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -51,6 +51,13 @@ class UDFSuite extends QueryTest {
     df.selectExpr("count(distinct a)")
   }
 
+  test("SPARK-8003 spark__partition__id") {
+    val df = Seq((1, "Tearing down the walls that divide us")).toDF("id", "saying")
+    df.registerTempTable("tmp_table")
+    checkAnswer(ctx.sql("select spark__partition__id() from tmp_table").toDF(), Row(0))
+    ctx.dropTempTable("tmp_table")
+  }
+
   test("error reporting for incorrect number of arguments") {
     val df = ctx.emptyDataFrame
     val e = intercept[AnalysisException] {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
index 1c5a2ed2c0a53..b6e79ff9cc95d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
@@ -27,6 +27,6 @@ class NondeterministicSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("SparkPartitionID") {
-    checkEvaluation(SparkPartitionID, 0)
+    checkEvaluation(SparkPartitionID(), 0)
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 110f51a305861..8b35c1275f388 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -38,6 +38,9 @@ import org.apache.spark.Logging
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.{expression => FunctionExpression, FunctionBuilder}
+import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
+import org.apache.spark.sql.execution.expressions.SparkPartitionID
 import org.apache.spark.sql.SQLConf.SQLConfEntry
 import org.apache.spark.sql.SQLConf.SQLConfEntry._
 import org.apache.spark.sql.catalyst.{TableIdentifier, ParserDialect}
@@ -372,8 +375,14 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging {
 
   // Note that HiveUDFs will be overridden by functions registered in this context.
   @transient
-  override protected[sql] lazy val functionRegistry: FunctionRegistry =
-    new HiveFunctionRegistry(FunctionRegistry.builtin)
+  override protected[sql] lazy val functionRegistry: FunctionRegistry = {
+    val reg = new HiveFunctionRegistry(FunctionRegistry.builtin)
+    val extendedFunctions = List[(String, (ExpressionInfo, FunctionBuilder))](
+      FunctionExpression[SparkPartitionID]("spark__partition__id")
+    )
+    extendedFunctions.foreach { case(name, (info, fun)) => reg.registerFunction(name, info, fun) }
+    reg
+  }
 
   /* An analyzer that uses the Hive metastore. */
   @transient
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
index 4056dee777574..9cea5d413c817 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
@@ -17,13 +17,14 @@
 
 package org.apache.spark.sql.hive
 
-import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.{Row, QueryTest}
 
 case class FunctionResult(f1: String, f2: String)
 
 class UDFSuite extends QueryTest {
 
   private lazy val ctx = org.apache.spark.sql.hive.test.TestHive
+  import ctx.implicits._
 
   test("UDF case insensitive") {
     ctx.udf.register("random0", () => { Math.random() })
@@ -33,4 +34,10 @@ class UDFSuite extends QueryTest {
     assert(ctx.sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0) >= 0.0)
     assert(ctx.sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0) === 5)
   }
+
+  test("SPARK-8003 spark__partition__id") {
+    val df = Seq((1, "Two Fiiiiive")).toDF("id", "saying")
+    ctx.registerDataFrameAsTable(df, "test_table")
+    checkAnswer(ctx.sql("select spark__partition__id() from test_table LIMIT 1").toDF(), Row(0))
+  }
 }

From 198d181dfb2c04102afe40680a4637d951e92c0b Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 28 Jul 2015 15:00:25 -0700
Subject: [PATCH 431/549] [SPARK-7105] [PYSPARK] [MLLIB] Support model
 save/load in GMM

This PR introduces save / load for GMM's in python API.

Also I refactored `GaussianMixtureModel` and inherited it from `JavaModelWrapper` with model being `GaussianMixtureModelWrapper`, a wrapper which provides convenience methods to `GaussianMixtureModel` (due to serialization and deserialization issues) and I moved the creation of gaussians to the scala backend.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7617 from MechCoder/python_gmm_save_load and squashes the following commits:

9c305aa [MechCoder] [SPARK-7105] [PySpark] [MLlib] Support model save/load in GMM
---
 .../python/GaussianMixtureModelWrapper.scala  | 53 +++++++++++++
 .../mllib/api/python/PythonMLLibAPI.scala     | 13 +---
 python/pyspark/mllib/clustering.py            | 75 +++++++++++++------
 python/pyspark/mllib/util.py                  |  6 ++
 4 files changed, 114 insertions(+), 33 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/api/python/GaussianMixtureModelWrapper.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/GaussianMixtureModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/GaussianMixtureModelWrapper.scala
new file mode 100644
index 0000000000000..0ec88ef77d695
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/GaussianMixtureModelWrapper.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.api.python
+
+import java.util.{List => JList}
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg.{Vector, Vectors, Matrix}
+import org.apache.spark.mllib.clustering.GaussianMixtureModel
+
+/**
+  * Wrapper around GaussianMixtureModel to provide helper methods in Python
+  */
+private[python] class GaussianMixtureModelWrapper(model: GaussianMixtureModel) {
+  val weights: Vector = Vectors.dense(model.weights)
+  val k: Int = weights.size
+
+  /**
+    * Returns gaussians as a List of Vectors and Matrices corresponding each MultivariateGaussian
+    */
+  val gaussians: JList[Object] = {
+    val modelGaussians = model.gaussians
+    var i = 0
+    var mu = ArrayBuffer.empty[Vector]
+    var sigma = ArrayBuffer.empty[Matrix]
+    while (i < k) {
+      mu += modelGaussians(i).mu
+      sigma += modelGaussians(i).sigma
+      i += 1
+    }
+    List(mu.toArray, sigma.toArray).map(_.asInstanceOf[Object]).asJava
+  }
+
+  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index fda8d5a0b048f..6f080d32bbf4d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -364,7 +364,7 @@ private[python] class PythonMLLibAPI extends Serializable {
       seed: java.lang.Long,
       initialModelWeights: java.util.ArrayList[Double],
       initialModelMu: java.util.ArrayList[Vector],
-      initialModelSigma: java.util.ArrayList[Matrix]): JList[Object] = {
+      initialModelSigma: java.util.ArrayList[Matrix]): GaussianMixtureModelWrapper = {
     val gmmAlg = new GaussianMixture()
       .setK(k)
       .setConvergenceTol(convergenceTol)
@@ -382,16 +382,7 @@ private[python] class PythonMLLibAPI extends Serializable {
     if (seed != null) gmmAlg.setSeed(seed)
 
     try {
-      val model = gmmAlg.run(data.rdd.persist(StorageLevel.MEMORY_AND_DISK))
-      var wt = ArrayBuffer.empty[Double]
-      var mu = ArrayBuffer.empty[Vector]
-      var sigma = ArrayBuffer.empty[Matrix]
-      for (i <- 0 until model.k) {
-          wt += model.weights(i)
-          mu += model.gaussians(i).mu
-          sigma += model.gaussians(i).sigma
-      }
-      List(Vectors.dense(wt.toArray), mu.toArray, sigma.toArray).map(_.asInstanceOf[Object]).asJava
+      new GaussianMixtureModelWrapper(gmmAlg.run(data.rdd.persist(StorageLevel.MEMORY_AND_DISK)))
     } finally {
       data.rdd.unpersist(blocking = false)
     }
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 58ad99d46e23b..900ade248c386 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -152,11 +152,19 @@ def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"
         return KMeansModel([c.toArray() for c in centers])
 
 
-class GaussianMixtureModel(object):
+@inherit_doc
+class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader):
+
+    """
+    .. note:: Experimental
 
-    """A clustering model derived from the Gaussian Mixture Model method.
+    A clustering model derived from the Gaussian Mixture Model method.
 
     >>> from pyspark.mllib.linalg import Vectors, DenseMatrix
+    >>> from numpy.testing import assert_equal
+    >>> from shutil import rmtree
+    >>> import os, tempfile
+
     >>> clusterdata_1 =  sc.parallelize(array([-0.1,-0.05,-0.01,-0.1,
     ...                                         0.9,0.8,0.75,0.935,
     ...                                        -0.83,-0.68,-0.91,-0.76 ]).reshape(6, 2))
@@ -169,6 +177,25 @@ class GaussianMixtureModel(object):
     True
     >>> labels[4]==labels[5]
     True
+
+    >>> path = tempfile.mkdtemp()
+    >>> model.save(sc, path)
+    >>> sameModel = GaussianMixtureModel.load(sc, path)
+    >>> assert_equal(model.weights, sameModel.weights)
+    >>> mus, sigmas = list(
+    ...     zip(*[(g.mu, g.sigma) for g in model.gaussians]))
+    >>> sameMus, sameSigmas = list(
+    ...     zip(*[(g.mu, g.sigma) for g in sameModel.gaussians]))
+    >>> mus == sameMus
+    True
+    >>> sigmas == sameSigmas
+    True
+    >>> from shutil import rmtree
+    >>> try:
+    ...     rmtree(path)
+    ... except OSError:
+    ...     pass
+
     >>> data =  array([-5.1971, -2.5359, -3.8220,
     ...                -5.2211, -5.0602,  4.7118,
     ...                 6.8989, 3.4592,  4.6322,
@@ -182,25 +209,15 @@ class GaussianMixtureModel(object):
     True
     >>> labels[3]==labels[4]
     True
-    >>> clusterdata_3 = sc.parallelize(data.reshape(15, 1))
-    >>> im = GaussianMixtureModel([0.5, 0.5],
-    ...      [MultivariateGaussian(Vectors.dense([-1.0]), DenseMatrix(1, 1, [1.0])),
-    ...      MultivariateGaussian(Vectors.dense([1.0]), DenseMatrix(1, 1, [1.0]))])
-    >>> model = GaussianMixture.train(clusterdata_3, 2, initialModel=im)
     """
 
-    def __init__(self, weights, gaussians):
-        self._weights = weights
-        self._gaussians = gaussians
-        self._k = len(self._weights)
-
     @property
     def weights(self):
         """
         Weights for each Gaussian distribution in the mixture, where weights[i] is
         the weight for Gaussian i, and weights.sum == 1.
         """
-        return self._weights
+        return array(self.call("weights"))
 
     @property
     def gaussians(self):
@@ -208,12 +225,14 @@ def gaussians(self):
         Array of MultivariateGaussian where gaussians[i] represents
         the Multivariate Gaussian (Normal) Distribution for Gaussian i.
         """
-        return self._gaussians
+        return [
+            MultivariateGaussian(gaussian[0], gaussian[1])
+            for gaussian in zip(*self.call("gaussians"))]
 
     @property
     def k(self):
         """Number of gaussians in mixture."""
-        return self._k
+        return len(self.weights)
 
     def predict(self, x):
         """
@@ -238,17 +257,30 @@ def predictSoft(self, x):
         :return:     membership_matrix. RDD of array of double values.
         """
         if isinstance(x, RDD):
-            means, sigmas = zip(*[(g.mu, g.sigma) for g in self._gaussians])
+            means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
             membership_matrix = callMLlibFunc("predictSoftGMM", x.map(_convert_to_vector),
-                                              _convert_to_vector(self._weights), means, sigmas)
+                                              _convert_to_vector(self.weights), means, sigmas)
             return membership_matrix.map(lambda x: pyarray.array('d', x))
         else:
             raise TypeError("x should be represented by an RDD, "
                             "but got %s." % type(x))
 
+    @classmethod
+    def load(cls, sc, path):
+        """Load the GaussianMixtureModel from disk.
+
+        :param sc: SparkContext
+        :param path: str, path to where the model is stored.
+        """
+        model = cls._load_java(sc, path)
+        wrapper = sc._jvm.GaussianMixtureModelWrapper(model)
+        return cls(wrapper)
+
 
 class GaussianMixture(object):
     """
+    .. note:: Experimental
+
     Learning algorithm for Gaussian Mixtures using the expectation-maximization algorithm.
 
     :param data:            RDD of data points
@@ -271,11 +303,10 @@ def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initia
             initialModelWeights = initialModel.weights
             initialModelMu = [initialModel.gaussians[i].mu for i in range(initialModel.k)]
             initialModelSigma = [initialModel.gaussians[i].sigma for i in range(initialModel.k)]
-        weight, mu, sigma = callMLlibFunc("trainGaussianMixtureModel", rdd.map(_convert_to_vector),
-                                          k, convergenceTol, maxIterations, seed,
-                                          initialModelWeights, initialModelMu, initialModelSigma)
-        mvg_obj = [MultivariateGaussian(mu[i], sigma[i]) for i in range(k)]
-        return GaussianMixtureModel(weight, mvg_obj)
+        java_model = callMLlibFunc("trainGaussianMixtureModel", rdd.map(_convert_to_vector),
+                                   k, convergenceTol, maxIterations, seed,
+                                   initialModelWeights, initialModelMu, initialModelSigma)
+        return GaussianMixtureModel(java_model)
 
 
 class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader):
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index 875d3b2d642c6..916de2d6fcdbd 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -21,7 +21,9 @@
 
 if sys.version > '3':
     xrange = range
+    basestring = str
 
+from pyspark import SparkContext
 from pyspark.mllib.common import callMLlibFunc, inherit_doc
 from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector
 
@@ -223,6 +225,10 @@ class JavaSaveable(Saveable):
     """
 
     def save(self, sc, path):
+        if not isinstance(sc, SparkContext):
+            raise TypeError("sc should be a SparkContext, got type %s" % type(sc))
+        if not isinstance(path, basestring):
+            raise TypeError("path should be a basestring, got type %s" % type(path))
         self._java_model.save(sc._jsc.sc(), path)
 
 
From 21825529eae66293ec5d8638911303fa54944dd5 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Tue, 28 Jul 2015 15:56:19 -0700
Subject: [PATCH 432/549] [SPARK-9247] [SQL] Use BytesToBytesMap for broadcast
 join

This PR introduce BytesToBytesMap to UnsafeHashedRelation, use it in executor for better performance.

It serialize all the key and values from java HashMap, put them into a BytesToBytesMap while deserializing. All the values for a same key are stored continuous to have better memory locality.

This PR also address the comments for #7480 , do some clean up.

Author: Davies Liu <davies@databricks.com>

Closes #7592 from davies/unsafe_map2 and squashes the following commits:

42c578a [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_map2
fd09528 [Davies Liu] remove thread local cache and update docs
1c5ad8d [Davies Liu] fix test
5eb1b5a [Davies Liu] address comments in #7480
46f1f22 [Davies Liu] fix style
fc221e0 [Davies Liu] use BytesToBytesMap for broadcast join
---
 .../execution/joins/BroadcastHashJoin.scala   |   2 +-
 .../joins/BroadcastHashOuterJoin.scala        |   2 +-
 .../joins/BroadcastLeftSemiJoinHash.scala     |   6 +-
 .../joins/BroadcastNestedLoopJoin.scala       |  36 ++--
 .../spark/sql/execution/joins/HashJoin.scala  |  35 ++--
 .../sql/execution/joins/HashOuterJoin.scala   |  34 ++--
 .../sql/execution/joins/HashSemiJoin.scala    |  14 +-
 .../sql/execution/joins/HashedRelation.scala  | 166 ++++++++++++++----
 .../execution/joins/LeftSemiJoinHash.scala    |   2 +-
 .../execution/joins/ShuffledHashJoin.scala    |   2 +-
 .../joins/ShuffledHashOuterJoin.scala         |   8 +-
 .../execution/joins/HashedRelationSuite.scala |  28 +--
 12 files changed, 214 insertions(+), 121 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
index abaa4a6ce86a2..624efc1b1d734 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala
@@ -62,7 +62,7 @@ case class BroadcastHashJoin(
   private val broadcastFuture = future {
     // Note that we use .execute().collect() because we don't want to convert data to Scala types
     val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect()
-    val hashed = buildHashRelation(input.iterator)
+    val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.size)
     sparkContext.broadcast(hashed)
   }(BroadcastHashJoin.broadcastHashJoinExecutionContext)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
index c9d1a880f4ef4..77e7fe71009b7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala
@@ -61,7 +61,7 @@ case class BroadcastHashOuterJoin(
   private val broadcastFuture = future {
     // Note that we use .execute().collect() because we don't want to convert data to Scala types
     val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect()
-    val hashed = buildHashRelation(input.iterator)
+    val hashed = HashedRelation(input.iterator, buildKeyGenerator, input.size)
     sparkContext.broadcast(hashed)
   }(BroadcastHashOuterJoin.broadcastHashOuterJoinExecutionContext)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
index f71c0ce352904..a60593911f94f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala
@@ -37,17 +37,17 @@ case class BroadcastLeftSemiJoinHash(
     condition: Option[Expression]) extends BinaryNode with HashSemiJoin {
 
   protected override def doExecute(): RDD[InternalRow] = {
-    val buildIter = right.execute().map(_.copy()).collect().toIterator
+    val input = right.execute().map(_.copy()).collect()
 
     if (condition.isEmpty) {
-      val hashSet = buildKeyHashSet(buildIter)
+      val hashSet = buildKeyHashSet(input.toIterator)
       val broadcastedRelation = sparkContext.broadcast(hashSet)
 
       left.execute().mapPartitions { streamIter =>
         hashSemiJoin(streamIter, broadcastedRelation.value)
       }
     } else {
-      val hashRelation = buildHashRelation(buildIter)
+      val hashRelation = HashedRelation(input.toIterator, rightKeyGenerator, input.size)
       val broadcastedRelation = sparkContext.broadcast(hashRelation)
 
       left.execute().mapPartitions { streamIter =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
index 700636966f8be..83b726a8e2897 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala
@@ -47,13 +47,11 @@ case class BroadcastNestedLoopJoin(
   override def outputsUnsafeRows: Boolean = left.outputsUnsafeRows || right.outputsUnsafeRows
   override def canProcessUnsafeRows: Boolean = true
 
-  @transient private[this] lazy val resultProjection: Projection = {
+  @transient private[this] lazy val resultProjection: InternalRow => InternalRow = {
     if (outputsUnsafeRows) {
       UnsafeProjection.create(schema)
     } else {
-      new Projection {
-        override def apply(r: InternalRow): InternalRow = r
-      }
+      identity[InternalRow]
     }
   }
 
@@ -96,7 +94,6 @@ case class BroadcastNestedLoopJoin(
         var streamRowMatched = false
 
         while (i < broadcastedRelation.value.size) {
-          // TODO: One bitset per partition instead of per row.
           val broadcastedRow = broadcastedRelation.value(i)
           buildSide match {
             case BuildRight if boundCondition(joinedRow(streamedRow, broadcastedRow)) =>
@@ -135,17 +132,26 @@ case class BroadcastNestedLoopJoin(
       val buf: CompactBuffer[InternalRow] = new CompactBuffer()
       var i = 0
       val rel = broadcastedRelation.value
-      while (i < rel.length) {
-        if (!allIncludedBroadcastTuples.contains(i)) {
-          (joinType, buildSide) match {
-            case (RightOuter | FullOuter, BuildRight) =>
-              buf += resultProjection(new JoinedRow(leftNulls, rel(i)))
-            case (LeftOuter | FullOuter, BuildLeft) =>
-              buf += resultProjection(new JoinedRow(rel(i), rightNulls))
-            case _ =>
+      (joinType, buildSide) match {
+        case (RightOuter | FullOuter, BuildRight) =>
+          val joinedRow = new JoinedRow
+          joinedRow.withLeft(leftNulls)
+          while (i < rel.length) {
+            if (!allIncludedBroadcastTuples.contains(i)) {
+              buf += resultProjection(joinedRow.withRight(rel(i))).copy()
+            }
+            i += 1
           }
-        }
-        i += 1
+        case (LeftOuter | FullOuter, BuildLeft) =>
+          val joinedRow = new JoinedRow
+          joinedRow.withRight(rightNulls)
+          while (i < rel.length) {
+            if (!allIncludedBroadcastTuples.contains(i)) {
+              buf += resultProjection(joinedRow.withLeft(rel(i))).copy()
+            }
+            i += 1
+          }
+        case _ =>
       }
       buf.toSeq
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
index 46ab5b0d1cc6d..6b3d1652923fd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala
@@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.joins
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.util.collection.CompactBuffer
 
 
 trait HashJoin {
@@ -44,16 +43,24 @@ trait HashJoin {
 
   override def output: Seq[Attribute] = left.output ++ right.output
 
-  protected[this] def supportUnsafe: Boolean = {
+  protected[this] def isUnsafeMode: Boolean = {
     (self.codegenEnabled && UnsafeProjection.canSupport(buildKeys)
       && UnsafeProjection.canSupport(self.schema))
   }
 
-  override def outputsUnsafeRows: Boolean = supportUnsafe
-  override def canProcessUnsafeRows: Boolean = supportUnsafe
+  override def outputsUnsafeRows: Boolean = isUnsafeMode
+  override def canProcessUnsafeRows: Boolean = isUnsafeMode
+  override def canProcessSafeRows: Boolean = !isUnsafeMode
+
+  @transient protected lazy val buildSideKeyGenerator: Projection =
+    if (isUnsafeMode) {
+      UnsafeProjection.create(buildKeys, buildPlan.output)
+    } else {
+      newMutableProjection(buildKeys, buildPlan.output)()
+    }
 
   @transient protected lazy val streamSideKeyGenerator: Projection =
-    if (supportUnsafe) {
+    if (isUnsafeMode) {
       UnsafeProjection.create(streamedKeys, streamedPlan.output)
     } else {
       newMutableProjection(streamedKeys, streamedPlan.output)()
@@ -65,18 +72,16 @@ trait HashJoin {
   {
     new Iterator[InternalRow] {
       private[this] var currentStreamedRow: InternalRow = _
-      private[this] var currentHashMatches: CompactBuffer[InternalRow] = _
+      private[this] var currentHashMatches: Seq[InternalRow] = _
       private[this] var currentMatchPosition: Int = -1
 
       // Mutable per row objects.
       private[this] val joinRow = new JoinedRow
-      private[this] val resultProjection: Projection = {
-        if (supportUnsafe) {
+      private[this] val resultProjection: (InternalRow) => InternalRow = {
+        if (isUnsafeMode) {
           UnsafeProjection.create(self.schema)
         } else {
-          new Projection {
-            override def apply(r: InternalRow): InternalRow = r
-          }
+          identity[InternalRow]
         }
       }
 
@@ -122,12 +127,4 @@ trait HashJoin {
       }
     }
   }
-
-  protected[this] def buildHashRelation(buildIter: Iterator[InternalRow]): HashedRelation = {
-    if (supportUnsafe) {
-      UnsafeHashedRelation(buildIter, buildKeys, buildPlan)
-    } else {
-      HashedRelation(buildIter, newProjection(buildKeys, buildPlan.output))
-    }
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
index 6bf2f82954046..7e671e7914f1a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala
@@ -75,30 +75,36 @@ trait HashOuterJoin {
         s"HashOuterJoin should not take $x as the JoinType")
   }
 
-  protected[this] def supportUnsafe: Boolean = {
+  protected[this] def isUnsafeMode: Boolean = {
     (self.codegenEnabled && joinType != FullOuter
       && UnsafeProjection.canSupport(buildKeys)
       && UnsafeProjection.canSupport(self.schema))
   }
 
-  override def outputsUnsafeRows: Boolean = supportUnsafe
-  override def canProcessUnsafeRows: Boolean = supportUnsafe
+  override def outputsUnsafeRows: Boolean = isUnsafeMode
+  override def canProcessUnsafeRows: Boolean = isUnsafeMode
+  override def canProcessSafeRows: Boolean = !isUnsafeMode
 
-  protected[this] def streamedKeyGenerator(): Projection = {
-    if (supportUnsafe) {
+  @transient protected lazy val buildKeyGenerator: Projection =
+    if (isUnsafeMode) {
+      UnsafeProjection.create(buildKeys, buildPlan.output)
+    } else {
+      newMutableProjection(buildKeys, buildPlan.output)()
+    }
+
+  @transient protected[this] lazy val streamedKeyGenerator: Projection = {
+    if (isUnsafeMode) {
       UnsafeProjection.create(streamedKeys, streamedPlan.output)
     } else {
       newProjection(streamedKeys, streamedPlan.output)
     }
   }
 
-  @transient private[this] lazy val resultProjection: Projection = {
-    if (supportUnsafe) {
+  @transient private[this] lazy val resultProjection: InternalRow => InternalRow = {
+    if (isUnsafeMode) {
       UnsafeProjection.create(self.schema)
     } else {
-      new Projection {
-        override def apply(r: InternalRow): InternalRow = r
-      }
+      identity[InternalRow]
     }
   }
 
@@ -230,12 +236,4 @@ trait HashOuterJoin {
 
     hashTable
   }
-
-  protected[this] def buildHashRelation(buildIter: Iterator[InternalRow]): HashedRelation = {
-    if (supportUnsafe) {
-      UnsafeHashedRelation(buildIter, buildKeys, buildPlan)
-    } else {
-      HashedRelation(buildIter, newProjection(buildKeys, buildPlan.output))
-    }
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala
index 7f49264d40354..97fde8f975bfd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashSemiJoin.scala
@@ -35,11 +35,13 @@ trait HashSemiJoin {
   protected[this] def supportUnsafe: Boolean = {
     (self.codegenEnabled && UnsafeProjection.canSupport(leftKeys)
       && UnsafeProjection.canSupport(rightKeys)
-      && UnsafeProjection.canSupport(left.schema))
+      && UnsafeProjection.canSupport(left.schema)
+      && UnsafeProjection.canSupport(right.schema))
   }
 
-  override def outputsUnsafeRows: Boolean = right.outputsUnsafeRows
+  override def outputsUnsafeRows: Boolean = supportUnsafe
   override def canProcessUnsafeRows: Boolean = supportUnsafe
+  override def canProcessSafeRows: Boolean = !supportUnsafe
 
   @transient protected lazy val leftKeyGenerator: Projection =
     if (supportUnsafe) {
@@ -87,14 +89,6 @@ trait HashSemiJoin {
     })
   }
 
-  protected def buildHashRelation(buildIter: Iterator[InternalRow]): HashedRelation = {
-    if (supportUnsafe) {
-      UnsafeHashedRelation(buildIter, rightKeys, right)
-    } else {
-      HashedRelation(buildIter, newProjection(rightKeys, right.output))
-    }
-  }
-
   protected def hashSemiJoin(
       streamIter: Iterator[InternalRow],
       hashedRelation: HashedRelation): Iterator[InternalRow] = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index 8d5731afd59b8..9c058f1f72fe4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -18,12 +18,15 @@
 package org.apache.spark.sql.execution.joins
 
 import java.io.{Externalizable, ObjectInput, ObjectOutput}
+import java.nio.ByteOrder
 import java.util.{HashMap => JavaHashMap}
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.execution.{SparkPlan, SparkSqlSerializer}
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.execution.SparkSqlSerializer
+import org.apache.spark.unsafe.PlatformDependent
+import org.apache.spark.unsafe.map.BytesToBytesMap
+import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager}
 import org.apache.spark.util.collection.CompactBuffer
 
 
@@ -32,7 +35,7 @@ import org.apache.spark.util.collection.CompactBuffer
  * object.
  */
 private[joins] sealed trait HashedRelation {
-  def get(key: InternalRow): CompactBuffer[InternalRow]
+  def get(key: InternalRow): Seq[InternalRow]
 
   // This is a helper method to implement Externalizable, and is used by
   // GeneralHashedRelation and UniqueKeyHashedRelation
@@ -59,9 +62,9 @@ private[joins] final class GeneralHashedRelation(
     private var hashTable: JavaHashMap[InternalRow, CompactBuffer[InternalRow]])
   extends HashedRelation with Externalizable {
 
-  def this() = this(null) // Needed for serialization
+  private def this() = this(null) // Needed for serialization
 
-  override def get(key: InternalRow): CompactBuffer[InternalRow] = hashTable.get(key)
+  override def get(key: InternalRow): Seq[InternalRow] = hashTable.get(key)
 
   override def writeExternal(out: ObjectOutput): Unit = {
     writeBytes(out, SparkSqlSerializer.serialize(hashTable))
@@ -81,9 +84,9 @@ private[joins]
 final class UniqueKeyHashedRelation(private var hashTable: JavaHashMap[InternalRow, InternalRow])
   extends HashedRelation with Externalizable {
 
-  def this() = this(null) // Needed for serialization
+  private def this() = this(null) // Needed for serialization
 
-  override def get(key: InternalRow): CompactBuffer[InternalRow] = {
+  override def get(key: InternalRow): Seq[InternalRow] = {
     val v = hashTable.get(key)
     if (v eq null) null else CompactBuffer(v)
   }
@@ -109,6 +112,10 @@ private[joins] object HashedRelation {
       keyGenerator: Projection,
       sizeEstimate: Int = 64): HashedRelation = {
 
+    if (keyGenerator.isInstanceOf[UnsafeProjection]) {
+      return UnsafeHashedRelation(input, keyGenerator.asInstanceOf[UnsafeProjection], sizeEstimate)
+    }
+
     // TODO: Use Spark's HashMap implementation.
     val hashTable = new JavaHashMap[InternalRow, CompactBuffer[InternalRow]](sizeEstimate)
     var currentRow: InternalRow = null
@@ -149,31 +156,133 @@ private[joins] object HashedRelation {
   }
 }
 
-
 /**
- * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a
- * sequence of values.
+ * A HashedRelation for UnsafeRow, which is backed by HashMap or BytesToBytesMap that maps the key
+ * into a sequence of values.
+ *
+ * When it's created, it uses HashMap. After it's serialized and deserialized, it switch to use
+ * BytesToBytesMap for better memory performance (multiple values for the same are stored as a
+ * continuous byte array.
  *
- * TODO(davies): use BytesToBytesMap
+ * It's serialized in the following format:
+ *  [number of keys]
+ *  [size of key] [size of all values in bytes] [key bytes] [bytes for all values]
+ *  ...
+ *
+ * All the values are serialized as following:
+ *   [number of fields] [number of bytes] [underlying bytes of UnsafeRow]
+ *   ...
  */
 private[joins] final class UnsafeHashedRelation(
     private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]])
   extends HashedRelation with Externalizable {
 
-  def this() = this(null)  // Needed for serialization
+  private[joins] def this() = this(null)  // Needed for serialization
+
+  // Use BytesToBytesMap in executor for better performance (it's created when deserialization)
+  @transient private[this] var binaryMap: BytesToBytesMap = _
 
-  override def get(key: InternalRow): CompactBuffer[InternalRow] = {
+  override def get(key: InternalRow): Seq[InternalRow] = {
     val unsafeKey = key.asInstanceOf[UnsafeRow]
-    // Thanks to type eraser
-    hashTable.get(unsafeKey).asInstanceOf[CompactBuffer[InternalRow]]
+
+    if (binaryMap != null) {
+      // Used in Broadcast join
+      val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset,
+        unsafeKey.getSizeInBytes)
+      if (loc.isDefined) {
+        val buffer = CompactBuffer[UnsafeRow]()
+
+        val base = loc.getValueAddress.getBaseObject
+        var offset = loc.getValueAddress.getBaseOffset
+        val last = loc.getValueAddress.getBaseOffset + loc.getValueLength
+        while (offset < last) {
+          val numFields = PlatformDependent.UNSAFE.getInt(base, offset)
+          val sizeInBytes = PlatformDependent.UNSAFE.getInt(base, offset + 4)
+          offset += 8
+
+          val row = new UnsafeRow
+          row.pointTo(base, offset, numFields, sizeInBytes)
+          buffer += row
+          offset += sizeInBytes
+        }
+        buffer
+      } else {
+        null
+      }
+
+    } else {
+      // Use the JavaHashMap in Local mode or ShuffleHashJoin
+      hashTable.get(unsafeKey)
+    }
   }
 
   override def writeExternal(out: ObjectOutput): Unit = {
-    writeBytes(out, SparkSqlSerializer.serialize(hashTable))
+    out.writeInt(hashTable.size())
+
+    val iter = hashTable.entrySet().iterator()
+    while (iter.hasNext) {
+      val entry = iter.next()
+      val key = entry.getKey
+      val values = entry.getValue
+
+      // write all the values as single byte array
+      var totalSize = 0L
+      var i = 0
+      while (i < values.size) {
+        totalSize += values(i).getSizeInBytes + 4 + 4
+        i += 1
+      }
+      assert(totalSize < Integer.MAX_VALUE, "values are too big")
+
+      // [key size] [values size] [key bytes] [values bytes]
+      out.writeInt(key.getSizeInBytes)
+      out.writeInt(totalSize.toInt)
+      out.write(key.getBytes)
+      i = 0
+      while (i < values.size) {
+        // [num of fields] [num of bytes] [row bytes]
+        // write the integer in native order, so they can be read by UNSAFE.getInt()
+        if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
+          out.writeInt(values(i).numFields())
+          out.writeInt(values(i).getSizeInBytes)
+        } else {
+          out.writeInt(Integer.reverseBytes(values(i).numFields()))
+          out.writeInt(Integer.reverseBytes(values(i).getSizeInBytes))
+        }
+        out.write(values(i).getBytes)
+        i += 1
+      }
+    }
   }
 
   override def readExternal(in: ObjectInput): Unit = {
-    hashTable = SparkSqlSerializer.deserialize(readBytes(in))
+    val nKeys = in.readInt()
+    // This is used in Broadcast, shared by multiple tasks, so we use on-heap memory
+    val memoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP))
+    binaryMap = new BytesToBytesMap(memoryManager, nKeys * 2) // reduce hash collision
+
+    var i = 0
+    var keyBuffer = new Array[Byte](1024)
+    var valuesBuffer = new Array[Byte](1024)
+    while (i < nKeys) {
+      val keySize = in.readInt()
+      val valuesSize = in.readInt()
+      if (keySize > keyBuffer.size) {
+        keyBuffer = new Array[Byte](keySize)
+      }
+      in.readFully(keyBuffer, 0, keySize)
+      if (valuesSize > valuesBuffer.size) {
+        valuesBuffer = new Array[Byte](valuesSize)
+      }
+      in.readFully(valuesBuffer, 0, valuesSize)
+
+      // put it into binary map
+      val loc = binaryMap.lookup(keyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, keySize)
+      assert(!loc.isDefined, "Duplicated key found!")
+      loc.putNewKey(keyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, keySize,
+        valuesBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, valuesSize)
+      i += 1
+    }
   }
 }
 
@@ -181,33 +290,14 @@ private[joins] object UnsafeHashedRelation {
 
   def apply(
       input: Iterator[InternalRow],
-      buildKeys: Seq[Expression],
-      buildPlan: SparkPlan,
-      sizeEstimate: Int = 64): HashedRelation = {
-    val boundedKeys = buildKeys.map(BindReferences.bindReference(_, buildPlan.output))
-    apply(input, boundedKeys, buildPlan.schema, sizeEstimate)
-  }
-
-  // Used for tests
-  def apply(
-      input: Iterator[InternalRow],
-      buildKeys: Seq[Expression],
-      rowSchema: StructType,
+      keyGenerator: UnsafeProjection,
       sizeEstimate: Int): HashedRelation = {
 
-    // TODO: Use BytesToBytesMap.
     val hashTable = new JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]](sizeEstimate)
-    val toUnsafe = UnsafeProjection.create(rowSchema)
-    val keyGenerator = UnsafeProjection.create(buildKeys)
 
     // Create a mapping of buildKeys -> rows
     while (input.hasNext) {
-      val currentRow = input.next()
-      val unsafeRow = if (currentRow.isInstanceOf[UnsafeRow]) {
-        currentRow.asInstanceOf[UnsafeRow]
-      } else {
-        toUnsafe(currentRow)
-      }
+      val unsafeRow = input.next().asInstanceOf[UnsafeRow]
       val rowKey = keyGenerator(unsafeRow)
       if (!rowKey.anyNull) {
         val existingMatchList = hashTable.get(rowKey)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
index 874712a4e739f..26a664104d6fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala
@@ -46,7 +46,7 @@ case class LeftSemiJoinHash(
         val hashSet = buildKeyHashSet(buildIter)
         hashSemiJoin(streamIter, hashSet)
       } else {
-        val hashRelation = buildHashRelation(buildIter)
+        val hashRelation = HashedRelation(buildIter, rightKeyGenerator)
         hashSemiJoin(streamIter, hashRelation)
       }
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala
index 948d0ccebceb0..5439e10a60b2a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala
@@ -45,7 +45,7 @@ case class ShuffledHashJoin(
 
   protected override def doExecute(): RDD[InternalRow] = {
     buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) =>
-      val hashed = buildHashRelation(buildIter)
+      val hashed = HashedRelation(buildIter, buildSideKeyGenerator)
       hashJoin(streamIter, hashed)
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
index f54f1edd38ec8..d29b593207c4d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala
@@ -50,8 +50,8 @@ case class ShuffledHashOuterJoin(
       // TODO this probably can be replaced by external sort (sort merged join?)
       joinType match {
         case LeftOuter =>
-          val hashed = buildHashRelation(rightIter)
-          val keyGenerator = streamedKeyGenerator()
+          val hashed = HashedRelation(rightIter, buildKeyGenerator)
+          val keyGenerator = streamedKeyGenerator
           leftIter.flatMap( currentRow => {
             val rowKey = keyGenerator(currentRow)
             joinedRow.withLeft(currentRow)
@@ -59,8 +59,8 @@ case class ShuffledHashOuterJoin(
           })
 
         case RightOuter =>
-          val hashed = buildHashRelation(leftIter)
-          val keyGenerator = streamedKeyGenerator()
+          val hashed = HashedRelation(leftIter, buildKeyGenerator)
+          val keyGenerator = streamedKeyGenerator
           rightIter.flatMap ( currentRow => {
             val rowKey = keyGenerator(currentRow)
             joinedRow.withRight(currentRow)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
index 9dd2220f0967e..8b1a9b21a96b9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
@@ -17,11 +17,12 @@
 
 package org.apache.spark.sql.execution.joins
 
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
+
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.execution.SparkSqlSerializer
-import org.apache.spark.sql.types.{StructField, StructType, IntegerType}
+import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
 import org.apache.spark.util.collection.CompactBuffer
 
 
@@ -64,27 +65,34 @@ class HashedRelationSuite extends SparkFunSuite {
   }
 
   test("UnsafeHashedRelation") {
+    val schema = StructType(StructField("a", IntegerType, true) :: Nil)
     val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2))
+    val toUnsafe = UnsafeProjection.create(schema)
+    val unsafeData = data.map(toUnsafe(_).copy()).toArray
+
     val buildKey = Seq(BoundReference(0, IntegerType, false))
-    val schema = StructType(StructField("a", IntegerType, true) :: Nil)
-    val hashed = UnsafeHashedRelation(data.iterator, buildKey, schema, 1)
+    val keyGenerator = UnsafeProjection.create(buildKey)
+    val hashed = UnsafeHashedRelation(unsafeData.iterator, keyGenerator, 1)
     assert(hashed.isInstanceOf[UnsafeHashedRelation])
 
-    val toUnsafeKey = UnsafeProjection.create(schema)
-    val unsafeData = data.map(toUnsafeKey(_).copy()).toArray
     assert(hashed.get(unsafeData(0)) === CompactBuffer[InternalRow](unsafeData(0)))
     assert(hashed.get(unsafeData(1)) === CompactBuffer[InternalRow](unsafeData(1)))
-    assert(hashed.get(toUnsafeKey(InternalRow(10))) === null)
+    assert(hashed.get(toUnsafe(InternalRow(10))) === null)
 
     val data2 = CompactBuffer[InternalRow](unsafeData(2).copy())
     data2 += unsafeData(2).copy()
     assert(hashed.get(unsafeData(2)) === data2)
 
-    val hashed2 = SparkSqlSerializer.deserialize(SparkSqlSerializer.serialize(hashed))
-      .asInstanceOf[UnsafeHashedRelation]
+    val os = new ByteArrayOutputStream()
+    val out = new ObjectOutputStream(os)
+    hashed.asInstanceOf[UnsafeHashedRelation].writeExternal(out)
+    out.flush()
+    val in = new ObjectInputStream(new ByteArrayInputStream(os.toByteArray))
+    val hashed2 = new UnsafeHashedRelation()
+    hashed2.readExternal(in)
     assert(hashed2.get(unsafeData(0)) === CompactBuffer[InternalRow](unsafeData(0)))
     assert(hashed2.get(unsafeData(1)) === CompactBuffer[InternalRow](unsafeData(1)))
-    assert(hashed2.get(toUnsafeKey(InternalRow(10))) === null)
+    assert(hashed2.get(toUnsafe(InternalRow(10))) === null)
     assert(hashed2.get(unsafeData(2)) === data2)
   }
 }

From 59b92add7cc9cca1eaf0c558edb7c4add66c284f Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Tue, 28 Jul 2015 16:04:48 -0700
Subject: [PATCH 433/549] [SPARK-9393] [SQL] Fix several error-handling bugs in
 ScriptTransform operator

SparkSQL's ScriptTransform operator has several serious bugs which make debugging fairly difficult:

- If exceptions are thrown in the writing thread then the child process will not be killed, leading to a deadlock because the reader thread will block while waiting for input that will never arrive.
- TaskContext is not propagated to the writer thread, which may cause errors in upstream pipelined operators.
- Exceptions which occur in the writer thread are not propagated to the main reader thread, which may cause upstream errors to be silently ignored instead of killing the job.  This can lead to silently incorrect query results.
- The writer thread is not a daemon thread, but it should be.

In addition, the code in this file is extremely messy:

- Lots of fields are nullable but the nullability isn't clearly explained.
- Many confusing variable names: for instance, there are variables named `ite` and `iterator` that are defined in the same scope.
- Some code was misindented.
- The `*serdeClass` variables are actually expected to be single-quoted strings, which is really confusing: I feel that this parsing / extraction should be performed in the analyzer, not in the operator itself.
- There were no unit tests for the operator itself, only end-to-end tests.

This pull request addresses these issues, borrowing some error-handling techniques from PySpark's PythonRDD.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7710 from JoshRosen/script-transform and squashes the following commits:

16c44e2 [Josh Rosen] Update some comments
983f200 [Josh Rosen] Use unescapeSQLString instead of stripQuotes
6a06a8c [Josh Rosen] Clean up handling of quotes in serde class name
494cde0 [Josh Rosen] Propagate TaskContext to writer thread
323bb2b [Josh Rosen] Fix error-swallowing bug
b31258d [Josh Rosen] Rename iterator variables to disambiguate.
88278de [Josh Rosen] Split ScriptTransformation writer thread into own class.
8b162b6 [Josh Rosen] Add failing test which demonstrates exception masking issue
4ee36a2 [Josh Rosen] Kill script transform subprocess when error occurs in input writer.
bd4c948 [Josh Rosen] Skip launching of external command for empty partitions.
b43e4ec [Josh Rosen] Clean up nullability in ScriptTransformation
fa18d26 [Josh Rosen] Add basic unit test for script transform with 'cat' command.
---
 .../spark/sql/execution/SparkPlanTest.scala   |  27 +-
 .../org/apache/spark/sql/hive/HiveQl.scala    |  10 +-
 .../hive/execution/ScriptTransformation.scala | 280 +++++++++++-------
 .../execution/ScriptTransformationSuite.scala | 123 ++++++++
 4 files changed, 317 insertions(+), 123 deletions(-)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
index 6a8f394545816..f46855edfe0de 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala
@@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.{DataFrame, DataFrameHolder, Row}
+import org.apache.spark.sql.{SQLContext, DataFrame, DataFrameHolder, Row}
 
 import scala.language.implicitConversions
 import scala.reflect.runtime.universe.TypeTag
@@ -33,11 +33,13 @@ import scala.util.control.NonFatal
  */
 class SparkPlanTest extends SparkFunSuite {
 
+  protected def sqlContext: SQLContext = TestSQLContext
+
   /**
    * Creates a DataFrame from a local Seq of Product.
    */
   implicit def localSeqToDataFrameHolder[A <: Product : TypeTag](data: Seq[A]): DataFrameHolder = {
-    TestSQLContext.implicits.localSeqToDataFrameHolder(data)
+    sqlContext.implicits.localSeqToDataFrameHolder(data)
   }
 
   /**
@@ -98,7 +100,7 @@ class SparkPlanTest extends SparkFunSuite {
       planFunction: Seq[SparkPlan] => SparkPlan,
       expectedAnswer: Seq[Row],
       sortAnswers: Boolean = true): Unit = {
-    SparkPlanTest.checkAnswer(input, planFunction, expectedAnswer, sortAnswers) match {
+    SparkPlanTest.checkAnswer(input, planFunction, expectedAnswer, sortAnswers, sqlContext) match {
       case Some(errorMessage) => fail(errorMessage)
       case None =>
     }
@@ -121,7 +123,8 @@ class SparkPlanTest extends SparkFunSuite {
       planFunction: SparkPlan => SparkPlan,
       expectedPlanFunction: SparkPlan => SparkPlan,
       sortAnswers: Boolean = true): Unit = {
-    SparkPlanTest.checkAnswer(input, planFunction, expectedPlanFunction, sortAnswers) match {
+    SparkPlanTest.checkAnswer(
+        input, planFunction, expectedPlanFunction, sortAnswers, sqlContext) match {
       case Some(errorMessage) => fail(errorMessage)
       case None =>
     }
@@ -147,13 +150,14 @@ object SparkPlanTest {
       input: DataFrame,
       planFunction: SparkPlan => SparkPlan,
       expectedPlanFunction: SparkPlan => SparkPlan,
-      sortAnswers: Boolean): Option[String] = {
+      sortAnswers: Boolean,
+      sqlContext: SQLContext): Option[String] = {
 
     val outputPlan = planFunction(input.queryExecution.sparkPlan)
     val expectedOutputPlan = expectedPlanFunction(input.queryExecution.sparkPlan)
 
     val expectedAnswer: Seq[Row] = try {
-      executePlan(expectedOutputPlan)
+      executePlan(expectedOutputPlan, sqlContext)
     } catch {
       case NonFatal(e) =>
         val errorMessage =
@@ -168,7 +172,7 @@ object SparkPlanTest {
     }
 
     val actualAnswer: Seq[Row] = try {
-      executePlan(outputPlan)
+      executePlan(outputPlan, sqlContext)
     } catch {
       case NonFatal(e) =>
         val errorMessage =
@@ -207,12 +211,13 @@ object SparkPlanTest {
       input: Seq[DataFrame],
       planFunction: Seq[SparkPlan] => SparkPlan,
       expectedAnswer: Seq[Row],
-      sortAnswers: Boolean): Option[String] = {
+      sortAnswers: Boolean,
+      sqlContext: SQLContext): Option[String] = {
 
     val outputPlan = planFunction(input.map(_.queryExecution.sparkPlan))
 
     val sparkAnswer: Seq[Row] = try {
-      executePlan(outputPlan)
+      executePlan(outputPlan, sqlContext)
     } catch {
       case NonFatal(e) =>
         val errorMessage =
@@ -275,10 +280,10 @@ object SparkPlanTest {
     }
   }
 
-  private def executePlan(outputPlan: SparkPlan): Seq[Row] = {
+  private def executePlan(outputPlan: SparkPlan, sqlContext: SQLContext): Seq[Row] = {
     // A very simple resolver to make writing tests easier. In contrast to the real resolver
     // this is always case sensitive and does not try to handle scoping or complex type resolution.
-    val resolvedPlan = TestSQLContext.prepareForExecution.execute(
+    val resolvedPlan = sqlContext.prepareForExecution.execute(
       outputPlan transform {
         case plan: SparkPlan =>
           val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 2f79b0aad045c..e6df64d2642bc 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -874,15 +874,15 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
             }
 
             def matchSerDe(clause: Seq[ASTNode])
-              : (Seq[(String, String)], String, Seq[(String, String)]) = clause match {
+              : (Seq[(String, String)], Option[String], Seq[(String, String)]) = clause match {
               case Token("TOK_SERDEPROPS", propsClause) :: Nil =>
                 val rowFormat = propsClause.map {
                   case Token(name, Token(value, Nil) :: Nil) => (name, value)
                 }
-                (rowFormat, "", Nil)
+                (rowFormat, None, Nil)
 
               case Token("TOK_SERDENAME", Token(serdeClass, Nil) :: Nil) :: Nil =>
-                (Nil, serdeClass, Nil)
+                (Nil, Some(BaseSemanticAnalyzer.unescapeSQLString(serdeClass)), Nil)
 
               case Token("TOK_SERDENAME", Token(serdeClass, Nil) ::
                 Token("TOK_TABLEPROPERTIES",
@@ -891,9 +891,9 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
                   case Token("TOK_TABLEPROPERTY", Token(name, Nil) :: Token(value, Nil) :: Nil) =>
                     (name, value)
                 }
-                (Nil, serdeClass, serdeProps)
+                (Nil, Some(BaseSemanticAnalyzer.unescapeSQLString(serdeClass)), serdeProps)
 
-              case Nil => (Nil, "", Nil)
+              case Nil => (Nil, None, Nil)
             }
 
             val (inRowFormat, inSerdeClass, inSerdeProps) = matchSerDe(inputSerdeClause)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
index 205e622195f09..741c705e2a253 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -17,15 +17,18 @@
 
 package org.apache.spark.sql.hive.execution
 
-import java.io.{BufferedReader, DataInputStream, DataOutputStream, EOFException, InputStreamReader}
+import java.io._
 import java.util.Properties
+import javax.annotation.Nullable
 
 import scala.collection.JavaConversions._
+import scala.util.control.NonFatal
 
 import org.apache.hadoop.hive.serde.serdeConstants
 import org.apache.hadoop.hive.serde2.AbstractSerDe
 import org.apache.hadoop.hive.serde2.objectinspector._
 
+import org.apache.spark.{TaskContext, Logging}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.CatalystTypeConverters
@@ -56,21 +59,53 @@ case class ScriptTransformation(
   override def otherCopyArgs: Seq[HiveContext] = sc :: Nil
 
   protected override def doExecute(): RDD[InternalRow] = {
-    child.execute().mapPartitions { iter =>
+    def processIterator(inputIterator: Iterator[InternalRow]): Iterator[InternalRow] = {
       val cmd = List("/bin/bash", "-c", script)
       val builder = new ProcessBuilder(cmd)
-      // We need to start threads connected to the process pipeline:
-      // 1) The error msg generated by the script process would be hidden.
-      // 2) If the error msg is too big to chock up the buffer, the input logic would be hung
+
       val proc = builder.start()
       val inputStream = proc.getInputStream
       val outputStream = proc.getOutputStream
       val errorStream = proc.getErrorStream
-      val reader = new BufferedReader(new InputStreamReader(inputStream))
 
-      val (outputSerde, outputSoi) = ioschema.initOutputSerDe(output)
+      // In order to avoid deadlocks, we need to consume the error output of the child process.
+      // To avoid issues caused by large error output, we use a circular buffer to limit the amount
+      // of error output that we retain. See SPARK-7862 for more discussion of the deadlock / hang
+      // that motivates this.
+      val stderrBuffer = new CircularBuffer(2048)
+      new RedirectThread(
+        errorStream,
+        stderrBuffer,
+        "Thread-ScriptTransformation-STDERR-Consumer").start()
+
+      val outputProjection = new InterpretedProjection(input, child.output)
+
+      // This nullability is a performance optimization in order to avoid an Option.foreach() call
+      // inside of a loop
+      @Nullable val (inputSerde, inputSoi) = ioschema.initInputSerDe(input).getOrElse((null, null))
+
+      // This new thread will consume the ScriptTransformation's input rows and write them to the
+      // external process. That process's output will be read by this current thread.
+      val writerThread = new ScriptTransformationWriterThread(
+        inputIterator,
+        outputProjection,
+        inputSerde,
+        inputSoi,
+        ioschema,
+        outputStream,
+        proc,
+        stderrBuffer,
+        TaskContext.get()
+      )
+
+      // This nullability is a performance optimization in order to avoid an Option.foreach() call
+      // inside of a loop
+      @Nullable val (outputSerde, outputSoi) = {
+        ioschema.initOutputSerDe(output).getOrElse((null, null))
+      }
 
-      val iterator: Iterator[InternalRow] = new Iterator[InternalRow] with HiveInspectors {
+      val reader = new BufferedReader(new InputStreamReader(inputStream))
+      val outputIterator: Iterator[InternalRow] = new Iterator[InternalRow] with HiveInspectors {
         var cacheRow: InternalRow = null
         var curLine: String = null
         var eof: Boolean = false
@@ -79,12 +114,26 @@ case class ScriptTransformation(
           if (outputSerde == null) {
             if (curLine == null) {
               curLine = reader.readLine()
-              curLine != null
+              if (curLine == null) {
+                if (writerThread.exception.isDefined) {
+                  throw writerThread.exception.get
+                }
+                false
+              } else {
+                true
+              }
             } else {
               true
             }
           } else {
-            !eof
+            if (eof) {
+              if (writerThread.exception.isDefined) {
+                throw writerThread.exception.get
+              }
+              false
+            } else {
+              true
+            }
           }
         }
 
@@ -110,11 +159,11 @@ case class ScriptTransformation(
               }
               i += 1
             })
-            return mutableRow
+            mutableRow
           } catch {
             case e: EOFException =>
               eof = true
-              return null
+              null
           }
         }
 
@@ -146,49 +195,83 @@ case class ScriptTransformation(
         }
       }
 
-      val (inputSerde, inputSoi) = ioschema.initInputSerDe(input)
-      val dataOutputStream = new DataOutputStream(outputStream)
-      val outputProjection = new InterpretedProjection(input, child.output)
+      writerThread.start()
 
-      // TODO make the 2048 configurable?
-      val stderrBuffer = new CircularBuffer(2048)
-      // Consume the error stream from the pipeline, otherwise it will be blocked if
-      // the pipeline is full.
-      new RedirectThread(errorStream, // input stream from the pipeline
-        stderrBuffer,                 // output to a circular buffer
-        "Thread-ScriptTransformation-STDERR-Consumer").start()
+      outputIterator
+    }
 
-      // Put the write(output to the pipeline) into a single thread
-      // and keep the collector as remain in the main thread.
-      // otherwise it will causes deadlock if the data size greater than
-      // the pipeline / buffer capacity.
-      new Thread(new Runnable() {
-        override def run(): Unit = {
-          Utils.tryWithSafeFinally {
-            iter
-              .map(outputProjection)
-              .foreach { row =>
-              if (inputSerde == null) {
-                val data = row.mkString("", ioschema.inputRowFormatMap("TOK_TABLEROWFORMATFIELD"),
-                  ioschema.inputRowFormatMap("TOK_TABLEROWFORMATLINES")).getBytes("utf-8")
-
-                outputStream.write(data)
-              } else {
-                val writable = inputSerde.serialize(
-                  row.asInstanceOf[GenericInternalRow].values, inputSoi)
-                prepareWritable(writable).write(dataOutputStream)
-              }
-            }
-            outputStream.close()
-          } {
-            if (proc.waitFor() != 0) {
-              logError(stderrBuffer.toString) // log the stderr circular buffer
-            }
-          }
-        }
-      }, "Thread-ScriptTransformation-Feed").start()
+    child.execute().mapPartitions { iter =>
+      if (iter.hasNext) {
+        processIterator(iter)
+      } else {
+        // If the input iterator has no rows then do not launch the external script.
+        Iterator.empty
+      }
+    }
+  }
+}
 
-      iterator
+private class ScriptTransformationWriterThread(
+    iter: Iterator[InternalRow],
+    outputProjection: Projection,
+    @Nullable inputSerde: AbstractSerDe,
+    @Nullable inputSoi: ObjectInspector,
+    ioschema: HiveScriptIOSchema,
+    outputStream: OutputStream,
+    proc: Process,
+    stderrBuffer: CircularBuffer,
+    taskContext: TaskContext
+  ) extends Thread("Thread-ScriptTransformation-Feed") with Logging {
+
+  setDaemon(true)
+
+  @volatile private var _exception: Throwable = null
+
+  /** Contains the exception thrown while writing the parent iterator to the external process. */
+  def exception: Option[Throwable] = Option(_exception)
+
+  override def run(): Unit = Utils.logUncaughtExceptions {
+    TaskContext.setTaskContext(taskContext)
+
+    val dataOutputStream = new DataOutputStream(outputStream)
+
+    // We can't use Utils.tryWithSafeFinally here because we also need a `catch` block, so
+    // let's use a variable to record whether the `finally` block was hit due to an exception
+    var threwException: Boolean = true
+    try {
+      iter.map(outputProjection).foreach { row =>
+        if (inputSerde == null) {
+          val data = row.mkString("", ioschema.inputRowFormatMap("TOK_TABLEROWFORMATFIELD"),
+            ioschema.inputRowFormatMap("TOK_TABLEROWFORMATLINES")).getBytes("utf-8")
+          outputStream.write(data)
+        } else {
+          val writable = inputSerde.serialize(
+            row.asInstanceOf[GenericInternalRow].values, inputSoi)
+          prepareWritable(writable).write(dataOutputStream)
+        }
+      }
+      outputStream.close()
+      threwException = false
+    } catch {
+      case NonFatal(e) =>
+        // An error occurred while writing input, so kill the child process. According to the
+        // Javadoc this call will not throw an exception:
+        _exception = e
+        proc.destroy()
+        throw e
+    } finally {
+      try {
+        if (proc.waitFor() != 0) {
+          logError(stderrBuffer.toString) // log the stderr circular buffer
+        }
+      } catch {
+        case NonFatal(exceptionFromFinallyBlock) =>
+          if (!threwException) {
+            throw exceptionFromFinallyBlock
+          } else {
+            log.error("Exception in finally block", exceptionFromFinallyBlock)
+          }
+      }
     }
   }
 }
@@ -200,33 +283,43 @@ private[hive]
 case class HiveScriptIOSchema (
     inputRowFormat: Seq[(String, String)],
     outputRowFormat: Seq[(String, String)],
-    inputSerdeClass: String,
-    outputSerdeClass: String,
+    inputSerdeClass: Option[String],
+    outputSerdeClass: Option[String],
     inputSerdeProps: Seq[(String, String)],
     outputSerdeProps: Seq[(String, String)],
     schemaLess: Boolean) extends ScriptInputOutputSchema with HiveInspectors {
 
-  val defaultFormat = Map(("TOK_TABLEROWFORMATFIELD", "\t"),
-                          ("TOK_TABLEROWFORMATLINES", "\n"))
+  private val defaultFormat = Map(
+    ("TOK_TABLEROWFORMATFIELD", "\t"),
+    ("TOK_TABLEROWFORMATLINES", "\n")
+  )
 
   val inputRowFormatMap = inputRowFormat.toMap.withDefault((k) => defaultFormat(k))
   val outputRowFormatMap = outputRowFormat.toMap.withDefault((k) => defaultFormat(k))
 
 
-  def initInputSerDe(input: Seq[Expression]): (AbstractSerDe, ObjectInspector) = {
-    val (columns, columnTypes) = parseAttrs(input)
-    val serde = initSerDe(inputSerdeClass, columns, columnTypes, inputSerdeProps)
-    (serde, initInputSoi(serde, columns, columnTypes))
+  def initInputSerDe(input: Seq[Expression]): Option[(AbstractSerDe, ObjectInspector)] = {
+    inputSerdeClass.map { serdeClass =>
+      val (columns, columnTypes) = parseAttrs(input)
+      val serde = initSerDe(serdeClass, columns, columnTypes, inputSerdeProps)
+      val fieldObjectInspectors = columnTypes.map(toInspector)
+      val objectInspector = ObjectInspectorFactory
+        .getStandardStructObjectInspector(columns, fieldObjectInspectors)
+        .asInstanceOf[ObjectInspector]
+      (serde, objectInspector)
+    }
   }
 
-  def initOutputSerDe(output: Seq[Attribute]): (AbstractSerDe, StructObjectInspector) = {
-    val (columns, columnTypes) = parseAttrs(output)
-    val serde = initSerDe(outputSerdeClass, columns, columnTypes, outputSerdeProps)
-    (serde, initOutputputSoi(serde))
+  def initOutputSerDe(output: Seq[Attribute]): Option[(AbstractSerDe, StructObjectInspector)] = {
+    outputSerdeClass.map { serdeClass =>
+      val (columns, columnTypes) = parseAttrs(output)
+      val serde = initSerDe(serdeClass, columns, columnTypes, outputSerdeProps)
+      val structObjectInspector = serde.getObjectInspector().asInstanceOf[StructObjectInspector]
+      (serde, structObjectInspector)
+    }
   }
 
-  def parseAttrs(attrs: Seq[Expression]): (Seq[String], Seq[DataType]) = {
-
+  private def parseAttrs(attrs: Seq[Expression]): (Seq[String], Seq[DataType]) = {
     val columns = attrs.map {
       case aref: AttributeReference => aref.name
       case e: NamedExpression => e.name
@@ -242,52 +335,25 @@ case class HiveScriptIOSchema (
     (columns, columnTypes)
   }
 
-  def initSerDe(serdeClassName: String, columns: Seq[String],
-    columnTypes: Seq[DataType], serdeProps: Seq[(String, String)]): AbstractSerDe = {
+  private def initSerDe(
+      serdeClassName: String,
+      columns: Seq[String],
+      columnTypes: Seq[DataType],
+      serdeProps: Seq[(String, String)]): AbstractSerDe = {
 
-    val serde: AbstractSerDe = if (serdeClassName != "") {
-      val trimed_class = serdeClassName.split("'")(1)
-      Utils.classForName(trimed_class)
-        .newInstance.asInstanceOf[AbstractSerDe]
-    } else {
-      null
-    }
+    val serde = Utils.classForName(serdeClassName).newInstance.asInstanceOf[AbstractSerDe]
 
-    if (serde != null) {
-      val columnTypesNames = columnTypes.map(_.toTypeInfo.getTypeName()).mkString(",")
+    val columnTypesNames = columnTypes.map(_.toTypeInfo.getTypeName()).mkString(",")
 
-      var propsMap = serdeProps.map(kv => {
-        (kv._1.split("'")(1), kv._2.split("'")(1))
-      }).toMap + (serdeConstants.LIST_COLUMNS -> columns.mkString(","))
-      propsMap = propsMap + (serdeConstants.LIST_COLUMN_TYPES -> columnTypesNames)
+    var propsMap = serdeProps.map(kv => {
+      (kv._1.split("'")(1), kv._2.split("'")(1))
+    }).toMap + (serdeConstants.LIST_COLUMNS -> columns.mkString(","))
+    propsMap = propsMap + (serdeConstants.LIST_COLUMN_TYPES -> columnTypesNames)
 
-      val properties = new Properties()
-      properties.putAll(propsMap)
-      serde.initialize(null, properties)
-    }
+    val properties = new Properties()
+    properties.putAll(propsMap)
+    serde.initialize(null, properties)
 
     serde
   }
-
-  def initInputSoi(inputSerde: AbstractSerDe, columns: Seq[String], columnTypes: Seq[DataType])
-    : ObjectInspector = {
-
-    if (inputSerde != null) {
-      val fieldObjectInspectors = columnTypes.map(toInspector(_))
-      ObjectInspectorFactory
-        .getStandardStructObjectInspector(columns, fieldObjectInspectors)
-        .asInstanceOf[ObjectInspector]
-    } else {
-      null
-    }
-  }
-
-  def initOutputputSoi(outputSerde: AbstractSerDe): StructObjectInspector = {
-    if (outputSerde != null) {
-      outputSerde.getObjectInspector().asInstanceOf[StructObjectInspector]
-    } else {
-      null
-    }
-  }
 }
-
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala
new file mode 100644
index 0000000000000..0875232aede3e
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution
+
+import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
+import org.scalatest.exceptions.TestFailedException
+
+import org.apache.spark.TaskContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
+import org.apache.spark.sql.execution.{UnaryNode, SparkPlan, SparkPlanTest}
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.types.StringType
+
+class ScriptTransformationSuite extends SparkPlanTest {
+
+  override def sqlContext: SQLContext = TestHive
+
+  private val noSerdeIOSchema = HiveScriptIOSchema(
+    inputRowFormat = Seq.empty,
+    outputRowFormat = Seq.empty,
+    inputSerdeClass = None,
+    outputSerdeClass = None,
+    inputSerdeProps = Seq.empty,
+    outputSerdeProps = Seq.empty,
+    schemaLess = false
+  )
+
+  private val serdeIOSchema = noSerdeIOSchema.copy(
+    inputSerdeClass = Some(classOf[LazySimpleSerDe].getCanonicalName),
+    outputSerdeClass = Some(classOf[LazySimpleSerDe].getCanonicalName)
+  )
+
+  test("cat without SerDe") {
+    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
+    checkAnswer(
+      rowsDf,
+      (child: SparkPlan) => new ScriptTransformation(
+        input = Seq(rowsDf.col("a").expr),
+        script = "cat",
+        output = Seq(AttributeReference("a", StringType)()),
+        child = child,
+        ioschema = noSerdeIOSchema
+      )(TestHive),
+      rowsDf.collect())
+  }
+
+  test("cat with LazySimpleSerDe") {
+    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
+    checkAnswer(
+      rowsDf,
+      (child: SparkPlan) => new ScriptTransformation(
+        input = Seq(rowsDf.col("a").expr),
+        script = "cat",
+        output = Seq(AttributeReference("a", StringType)()),
+        child = child,
+        ioschema = serdeIOSchema
+      )(TestHive),
+      rowsDf.collect())
+  }
+
+  test("script transformation should not swallow errors from upstream operators (no serde)") {
+    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
+    val e = intercept[TestFailedException] {
+      checkAnswer(
+        rowsDf,
+        (child: SparkPlan) => new ScriptTransformation(
+          input = Seq(rowsDf.col("a").expr),
+          script = "cat",
+          output = Seq(AttributeReference("a", StringType)()),
+          child = ExceptionInjectingOperator(child),
+          ioschema = noSerdeIOSchema
+        )(TestHive),
+        rowsDf.collect())
+    }
+    assert(e.getMessage().contains("intentional exception"))
+  }
+
+  test("script transformation should not swallow errors from upstream operators (with serde)") {
+    val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a")
+    val e = intercept[TestFailedException] {
+      checkAnswer(
+        rowsDf,
+        (child: SparkPlan) => new ScriptTransformation(
+          input = Seq(rowsDf.col("a").expr),
+          script = "cat",
+          output = Seq(AttributeReference("a", StringType)()),
+          child = ExceptionInjectingOperator(child),
+          ioschema = serdeIOSchema
+        )(TestHive),
+        rowsDf.collect())
+    }
+    assert(e.getMessage().contains("intentional exception"))
+  }
+}
+
+private case class ExceptionInjectingOperator(child: SparkPlan) extends UnaryNode {
+  override protected def doExecute(): RDD[InternalRow] = {
+    child.execute().map { x =>
+      assert(TaskContext.get() != null) // Make sure that TaskContext is defined.
+      Thread.sleep(1000) // This sleep gives the external process time to start.
+      throw new IllegalArgumentException("intentional exception")
+    }
+  }
+  override def output: Seq[Attribute] = child.output
+}

From c5ed36953f840018f603dfde94fcb4651e5246ac Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Tue, 28 Jul 2015 16:41:56 -0700
Subject: [PATCH 434/549] [STREAMING] [HOTFIX] Ignore ReceiverTrackerSuite
 flaky test

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #7738 from tdas/ReceiverTrackerSuite-hotfix and squashes the following commits:

00f0ee1 [Tathagata Das] ignore flaky test
---
 .../apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
index e2159bd4f225d..b039233f36316 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
@@ -31,7 +31,7 @@ class ReceiverTrackerSuite extends TestSuiteBase {
   val sparkConf = new SparkConf().setMaster("local[8]").setAppName("test")
   val ssc = new StreamingContext(sparkConf, Milliseconds(100))
 
-  test("Receiver tracker - propagates rate limit") {
+  ignore("Receiver tracker - propagates rate limit") {
     object ReceiverStartedWaiter extends StreamingListener {
       @volatile
       var started = false

From b7f54119f86f916481aeccc67f07e77dc2a924c7 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 28 Jul 2015 17:03:59 -0700
Subject: [PATCH 435/549] [SPARK-9420][SQL] Move expressions in sql/core
 package to catalyst.

Since catalyst package already depends on Spark core, we can move those expressions
into catalyst, and simplify function registry.

This is a followup of #7478.

Author: Reynold Xin <rxin@databricks.com>

Closes #7735 from rxin/SPARK-8003 and squashes the following commits:

2ffbdc3 [Reynold Xin] [SPARK-8003][SQL] Move expressions in sql/core package to catalyst.
---
 .../sql/catalyst/analysis/Analyzer.scala      |  3 ++-
 .../catalyst/analysis/FunctionRegistry.scala  | 17 +++++++-------
 .../MonotonicallyIncreasingID.scala           |  3 +--
 .../expressions/SparkPartitionID.scala        |  3 +--
 .../expressions}/NondeterministicSuite.scala  |  4 +---
 .../org/apache/spark/sql/SQLContext.scala     | 11 +--------
 .../sql/execution/expressions/package.scala   | 23 -------------------
 .../org/apache/spark/sql/functions.scala      |  4 ++--
 .../scala/org/apache/spark/sql/UDFSuite.scala |  4 ++--
 .../apache/spark/sql/hive/HiveContext.scala   | 13 ++---------
 .../org/apache/spark/sql/hive/UDFSuite.scala  |  4 ++--
 11 files changed, 23 insertions(+), 66 deletions(-)
 rename sql/{core/src/main/scala/org/apache/spark/sql/execution => catalyst/src/main/scala/org/apache/spark/sql/catalyst}/expressions/MonotonicallyIncreasingID.scala (95%)
 rename sql/{core/src/main/scala/org/apache/spark/sql/execution => catalyst/src/main/scala/org/apache/spark/sql/catalyst}/expressions/SparkPartitionID.scala (93%)
 rename sql/{core/src/test/scala/org/apache/spark/sql/execution/expression => catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions}/NondeterministicSuite.scala (83%)
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/package.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index a723e92114b32..a309ee35ee582 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, AggregateExpression2, AggregateFunction2}
 import org.apache.spark.sql.catalyst.expressions._
@@ -25,7 +27,6 @@ import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.catalyst.trees.TreeNodeRef
 import org.apache.spark.sql.catalyst.{SimpleCatalystConf, CatalystConf}
 import org.apache.spark.sql.types._
-import scala.collection.mutable.ArrayBuffer
 
 /**
  * A trivial [[Analyzer]] with an [[EmptyCatalog]] and [[EmptyFunctionRegistry]]. Used for testing
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 9b60943a1e147..372f80d4a8b16 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -161,13 +161,6 @@ object FunctionRegistry {
     expression[ToDegrees]("degrees"),
     expression[ToRadians]("radians"),
 
-    // misc functions
-    expression[Md5]("md5"),
-    expression[Sha2]("sha2"),
-    expression[Sha1]("sha1"),
-    expression[Sha1]("sha"),
-    expression[Crc32]("crc32"),
-
     // aggregate functions
     expression[Average]("avg"),
     expression[Count]("count"),
@@ -229,7 +222,15 @@ object FunctionRegistry {
     expression[Year]("year"),
 
     // collection functions
-    expression[Size]("size")
+    expression[Size]("size"),
+
+    // misc functions
+    expression[Crc32]("crc32"),
+    expression[Md5]("md5"),
+    expression[Sha1]("sha"),
+    expression[Sha1]("sha1"),
+    expression[Sha2]("sha2"),
+    expression[SparkPartitionID]("spark_partition_id")
   )
 
   val builtin: FunctionRegistry = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala
similarity index 95%
rename from sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala
index eca36b3274420..291b7a5bc3af5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/MonotonicallyIncreasingID.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala
@@ -15,11 +15,10 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.execution.expressions
+package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Nondeterministic, LeafExpression}
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types.{LongType, DataType}
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala
similarity index 93%
rename from sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala
index 98c8eab8372aa..3f6480bbf0114 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/SparkPartitionID.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala
@@ -15,11 +15,10 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.execution.expressions
+package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Nondeterministic, LeafExpression}
 import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
 import org.apache.spark.sql.types.{IntegerType, DataType}
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NondeterministicSuite.scala
similarity index 83%
rename from sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
rename to sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NondeterministicSuite.scala
index b6e79ff9cc95d..82894822ab0f4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/expression/NondeterministicSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NondeterministicSuite.scala
@@ -15,11 +15,9 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.execution.expression
+package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.expressions. ExpressionEvalHelper
-import org.apache.spark.sql.execution.expressions.{SparkPartitionID, MonotonicallyIncreasingID}
 
 class NondeterministicSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("MonotonicallyIncreasingID") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 56cd8f22e7cf4..dbb2a09846548 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -31,8 +31,6 @@ import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.{expression => FunctionExpression, FunctionBuilder}
-import org.apache.spark.sql.execution.expressions.SparkPartitionID
 import org.apache.spark.sql.SQLConf.SQLConfEntry
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.errors.DialectException
@@ -142,14 +140,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
 
   // TODO how to handle the temp function per user session?
   @transient
-  protected[sql] lazy val functionRegistry: FunctionRegistry = {
-    val reg = FunctionRegistry.builtin
-    val extendedFunctions = List[(String, (ExpressionInfo, FunctionBuilder))](
-      FunctionExpression[SparkPartitionID]("spark__partition__id")
-    )
-    extendedFunctions.foreach { case(name, (info, fun)) => reg.registerFunction(name, info, fun) }
-    reg
-  }
+  protected[sql] lazy val functionRegistry: FunctionRegistry = FunctionRegistry.builtin
 
   @transient
   protected[sql] lazy val analyzer: Analyzer =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/package.scala
deleted file mode 100644
index 568b7ac2c5987..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/expressions/package.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.execution
-
-/**
- * Package containing expressions that are specific to Spark runtime.
- */
-package object expressions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 0148991512213..4261a5e7cbeb5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -634,7 +634,7 @@ object functions {
    * @group normal_funcs
    * @since 1.4.0
    */
-  def monotonicallyIncreasingId(): Column = execution.expressions.MonotonicallyIncreasingID()
+  def monotonicallyIncreasingId(): Column = MonotonicallyIncreasingID()
 
   /**
    * Return an alternative value `r` if `l` is NaN.
@@ -741,7 +741,7 @@ object functions {
    * @group normal_funcs
    * @since 1.4.0
    */
-  def sparkPartitionId(): Column = execution.expressions.SparkPartitionID()
+  def sparkPartitionId(): Column = SparkPartitionID()
 
   /**
    * Computes the square root of the specified float value.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index 9b326c16350c8..d9c8b380ef146 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -51,10 +51,10 @@ class UDFSuite extends QueryTest {
     df.selectExpr("count(distinct a)")
   }
 
-  test("SPARK-8003 spark__partition__id") {
+  test("SPARK-8003 spark_partition_id") {
     val df = Seq((1, "Tearing down the walls that divide us")).toDF("id", "saying")
     df.registerTempTable("tmp_table")
-    checkAnswer(ctx.sql("select spark__partition__id() from tmp_table").toDF(), Row(0))
+    checkAnswer(ctx.sql("select spark_partition_id() from tmp_table").toDF(), Row(0))
     ctx.dropTempTable("tmp_table")
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 8b35c1275f388..110f51a305861 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -38,9 +38,6 @@ import org.apache.spark.Logging
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.{expression => FunctionExpression, FunctionBuilder}
-import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
-import org.apache.spark.sql.execution.expressions.SparkPartitionID
 import org.apache.spark.sql.SQLConf.SQLConfEntry
 import org.apache.spark.sql.SQLConf.SQLConfEntry._
 import org.apache.spark.sql.catalyst.{TableIdentifier, ParserDialect}
@@ -375,14 +372,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging {
 
   // Note that HiveUDFs will be overridden by functions registered in this context.
   @transient
-  override protected[sql] lazy val functionRegistry: FunctionRegistry = {
-    val reg = new HiveFunctionRegistry(FunctionRegistry.builtin)
-    val extendedFunctions = List[(String, (ExpressionInfo, FunctionBuilder))](
-      FunctionExpression[SparkPartitionID]("spark__partition__id")
-    )
-    extendedFunctions.foreach { case(name, (info, fun)) => reg.registerFunction(name, info, fun) }
-    reg
-  }
+  override protected[sql] lazy val functionRegistry: FunctionRegistry =
+    new HiveFunctionRegistry(FunctionRegistry.builtin)
 
   /* An analyzer that uses the Hive metastore. */
   @transient
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
index 9cea5d413c817..37afc2142abf7 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
@@ -35,9 +35,9 @@ class UDFSuite extends QueryTest {
     assert(ctx.sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0) === 5)
   }
 
-  test("SPARK-8003 spark__partition__id") {
+  test("SPARK-8003 spark_partition_id") {
     val df = Seq((1, "Two Fiiiiive")).toDF("id", "saying")
     ctx.registerDataFrameAsTable(df, "test_table")
-    checkAnswer(ctx.sql("select spark__partition__id() from test_table LIMIT 1").toDF(), Row(0))
+    checkAnswer(ctx.sql("select spark_partition_id() from test_table LIMIT 1").toDF(), Row(0))
   }
 }

From 6662ee21244067180c1bcef0b16107b2979fd933 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Tue, 28 Jul 2015 17:42:35 -0700
Subject: [PATCH 436/549] [SPARK-9418][SQL] Use sort-merge join as the default
 shuffle join.

Sort-merge join is more robust in Spark since sorting can be made using the Tungsten sort operator.

Author: Reynold Xin <rxin@databricks.com>

Closes #7733 from rxin/smj and squashes the following commits:

61e4d34 [Reynold Xin] Fixed test case.
5ffd731 [Reynold Xin] Fixed JoinSuite.
a137dc0 [Reynold Xin] [SPARK-9418][SQL] Use sort-merge join as the default shuffle join.
---
 .../src/main/scala/org/apache/spark/sql/SQLConf.scala     | 2 +-
 .../src/test/scala/org/apache/spark/sql/JoinSuite.scala   | 6 +++---
 ...bilitySuite.scala => HashJoinCompatibilitySuite.scala} | 8 ++++----
 .../scala/org/apache/spark/sql/hive/StatisticsSuite.scala | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)
 rename sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/{SortMergeCompatibilitySuite.scala => HashJoinCompatibilitySuite.scala} (97%)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 40eba33f595ca..cdb0c7a1c07a7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -322,7 +322,7 @@ private[spark] object SQLConf {
       " memory.")
 
   val SORTMERGE_JOIN = booleanConf("spark.sql.planner.sortMergeJoin",
-    defaultValue = Some(false),
+    defaultValue = Some(true),
     doc = "When true, use sort merge join (as opposed to hash join) by default for large joins.")
 
   // This is only used for the thriftserver
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index dfb2a7e099748..666f26bf620e1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -79,9 +79,9 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
       ("SELECT * FROM testData FULL OUTER JOIN testData2 WHERE key = 2", classOf[CartesianProduct]),
       ("SELECT * FROM testData JOIN testData2 WHERE key > a", classOf[CartesianProduct]),
       ("SELECT * FROM testData FULL OUTER JOIN testData2 WHERE key > a", classOf[CartesianProduct]),
-      ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[ShuffledHashJoin]),
-      ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[ShuffledHashJoin]),
-      ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[ShuffledHashJoin]),
+      ("SELECT * FROM testData JOIN testData2 ON key = a", classOf[SortMergeJoin]),
+      ("SELECT * FROM testData JOIN testData2 ON key = a and key = 2", classOf[SortMergeJoin]),
+      ("SELECT * FROM testData JOIN testData2 ON key = a where key = 2", classOf[SortMergeJoin]),
       ("SELECT * FROM testData LEFT JOIN testData2 ON key = a", classOf[ShuffledHashOuterJoin]),
       ("SELECT * FROM testData RIGHT JOIN testData2 ON key = a where key = 2",
         classOf[ShuffledHashOuterJoin]),
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala
similarity index 97%
rename from sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala
rename to sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala
index 1fe4fe9629c02..1a5ba20404c4e 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/SortMergeCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HashJoinCompatibilitySuite.scala
@@ -23,16 +23,16 @@ import org.apache.spark.sql.SQLConf
 import org.apache.spark.sql.hive.test.TestHive
 
 /**
- * Runs the test cases that are included in the hive distribution with sort merge join is true.
+ * Runs the test cases that are included in the hive distribution with hash joins.
  */
-class SortMergeCompatibilitySuite extends HiveCompatibilitySuite {
+class HashJoinCompatibilitySuite extends HiveCompatibilitySuite {
   override def beforeAll() {
     super.beforeAll()
-    TestHive.setConf(SQLConf.SORTMERGE_JOIN, true)
+    TestHive.setConf(SQLConf.SORTMERGE_JOIN, false)
   }
 
   override def afterAll() {
-    TestHive.setConf(SQLConf.SORTMERGE_JOIN, false)
+    TestHive.setConf(SQLConf.SORTMERGE_JOIN, true)
     super.afterAll()
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index f067ea0d4fc75..bc72b0172a467 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -172,7 +172,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll {
         bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastHashJoin => j }
         assert(bhj.isEmpty, "BroadcastHashJoin still planned even though it is switched off")
 
-        val shj = df.queryExecution.sparkPlan.collect { case j: ShuffledHashJoin => j }
+        val shj = df.queryExecution.sparkPlan.collect { case j: SortMergeJoin => j }
         assert(shj.size === 1,
           "ShuffledHashJoin should be planned when BroadcastHashJoin is turned off")
 

From e78ec1a8fabfe409c92c4904208f53dbdcfcf139 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Tue, 28 Jul 2015 17:51:58 -0700
Subject: [PATCH 437/549] [SPARK-9421] Fix null-handling bugs in
 UnsafeRow.getDouble, getFloat(), and get(ordinal, dataType)

UnsafeRow.getDouble and getFloat() return NaN when called on columns that are null, which is inconsistent with the behavior of other row classes (which is to return 0.0).

In addition, the generic get(ordinal, dataType) method should always return null for a null literal, but currently it handles nulls by calling the type-specific accessors.

This patch addresses both of these issues and adds a regression test.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7736 from JoshRosen/unsafe-row-null-fixes and squashes the following commits:

c8eb2ee [Josh Rosen] Fix test in UnsafeRowConverterSuite
6214682 [Josh Rosen] Fixes to null handling in UnsafeRow
---
 .../sql/catalyst/expressions/UnsafeRow.java     | 14 +++-----------
 .../expressions/UnsafeRowConverterSuite.scala   |  4 ++--
 .../org/apache/spark/sql/UnsafeRowSuite.scala   | 17 ++++++++++++++++-
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 955fb4226fc0e..64a8edc34d681 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -239,7 +239,7 @@ public Object get(int ordinal) {
 
   @Override
   public Object get(int ordinal, DataType dataType) {
-    if (dataType instanceof NullType) {
+    if (isNullAt(ordinal) || dataType instanceof NullType) {
       return null;
     } else if (dataType instanceof BooleanType) {
       return getBoolean(ordinal);
@@ -313,21 +313,13 @@ public long getLong(int ordinal) {
   @Override
   public float getFloat(int ordinal) {
     assertIndexIsValid(ordinal);
-    if (isNullAt(ordinal)) {
-      return Float.NaN;
-    } else {
-      return PlatformDependent.UNSAFE.getFloat(baseObject, getFieldOffset(ordinal));
-    }
+    return PlatformDependent.UNSAFE.getFloat(baseObject, getFieldOffset(ordinal));
   }
 
   @Override
   public double getDouble(int ordinal) {
     assertIndexIsValid(ordinal);
-    if (isNullAt(ordinal)) {
-      return Float.NaN;
-    } else {
-      return PlatformDependent.UNSAFE.getDouble(baseObject, getFieldOffset(ordinal));
-    }
+    return PlatformDependent.UNSAFE.getDouble(baseObject, getFieldOffset(ordinal));
   }
 
   @Override
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index 2834b54e8fb2e..b7bc17f89e82f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -146,8 +146,8 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(createdFromNull.getShort(3) === 0)
     assert(createdFromNull.getInt(4) === 0)
     assert(createdFromNull.getLong(5) === 0)
-    assert(java.lang.Float.isNaN(createdFromNull.getFloat(6)))
-    assert(java.lang.Double.isNaN(createdFromNull.getDouble(7)))
+    assert(createdFromNull.getFloat(6) === 0.0f)
+    assert(createdFromNull.getDouble(7) === 0.0d)
     assert(createdFromNull.getUTF8String(8) === null)
     assert(createdFromNull.getBinary(9) === null)
     // assert(createdFromNull.get(10) === null)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
index ad3bb1744cb3c..e72a1bc6c4e20 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala
@@ -22,7 +22,7 @@ import java.io.ByteArrayOutputStream
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection}
-import org.apache.spark.sql.types.{DataType, IntegerType, StringType}
+import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.memory.MemoryAllocator
 import org.apache.spark.unsafe.types.UTF8String
@@ -67,4 +67,19 @@ class UnsafeRowSuite extends SparkFunSuite {
 
     assert(bytesFromArrayBackedRow === bytesFromOffheapRow)
   }
+
+  test("calling getDouble() and getFloat() on null columns") {
+    val row = InternalRow.apply(null, null)
+    val unsafeRow = UnsafeProjection.create(Array[DataType](FloatType, DoubleType)).apply(row)
+    assert(unsafeRow.getFloat(0) === row.getFloat(0))
+    assert(unsafeRow.getDouble(1) === row.getDouble(1))
+  }
+
+  test("calling get(ordinal, datatype) on null columns") {
+    val row = InternalRow.apply(null)
+    val unsafeRow = UnsafeProjection.create(Array[DataType](NullType)).apply(row)
+    for (dataType <- DataTypeTestUtils.atomicTypes) {
+      assert(unsafeRow.get(0, dataType) === null)
+    }
+  }
 }

From 3744b7fd42e52011af60cc205fcb4e4b23b35c68 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Tue, 28 Jul 2015 19:01:25 -0700
Subject: [PATCH 438/549] [SPARK-9422] [SQL] Remove the placeholder attributes
 used in the aggregation buffers

https://issues.apache.org/jira/browse/SPARK-9422

Author: Yin Huai <yhuai@databricks.com>

Closes #7737 from yhuai/removePlaceHolder and squashes the following commits:

ec29b44 [Yin Huai]  Remove placeholder attributes.
---
 .../expressions/aggregate/interfaces.scala    |  27 ++-
 .../aggregate/aggregateOperators.scala        |   4 +-
 .../aggregate/sortBasedIterators.scala        | 209 +++++++-----------
 .../spark/sql/execution/aggregate/udaf.scala  |  17 +-
 .../spark/sql/execution/aggregate/utils.scala |   4 +-
 5 files changed, 121 insertions(+), 140 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
index 10bd19c8a840f..9fb7623172e78 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
@@ -103,9 +103,30 @@ abstract class AggregateFunction2
   final override def foldable: Boolean = false
 
   /**
-   * The offset of this function's buffer in the underlying buffer shared with other functions.
+   * The offset of this function's start buffer value in the
+   * underlying shared mutable aggregation buffer.
+   * For example, we have two aggregate functions `avg(x)` and `avg(y)`, which share
+   * the same aggregation buffer. In this shared buffer, the position of the first
+   * buffer value of `avg(x)` will be 0 and the position of the first buffer value of `avg(y)`
+   * will be 2.
    */
-  var bufferOffset: Int = 0
+  var mutableBufferOffset: Int = 0
+
+  /**
+   * The offset of this function's start buffer value in the
+   * underlying shared input aggregation buffer. An input aggregation buffer is used
+   * when we merge two aggregation buffers and it is basically the immutable one
+   * (we merge an input aggregation buffer and a mutable aggregation buffer and
+   * then store the new buffer values to the mutable aggregation buffer).
+   * Usually, an input aggregation buffer also contain extra elements like grouping
+   * keys at the beginning. So, mutableBufferOffset and inputBufferOffset are often
+   * different.
+   * For example, we have a grouping expression `key``, and two aggregate functions
+   * `avg(x)` and `avg(y)`. In this shared input aggregation buffer, the position of the first
+   * buffer value of `avg(x)` will be 1 and the position of the first buffer value of `avg(y)`
+   * will be 3 (position 0 is used for the value of key`).
+   */
+  var inputBufferOffset: Int = 0
 
   /** The schema of the aggregation buffer. */
   def bufferSchema: StructType
@@ -176,7 +197,7 @@ abstract class AlgebraicAggregate extends AggregateFunction2 with Serializable w
   override def initialize(buffer: MutableRow): Unit = {
     var i = 0
     while (i < bufferAttributes.size) {
-      buffer(i + bufferOffset) = initialValues(i).eval()
+      buffer(i + mutableBufferOffset) = initialValues(i).eval()
       i += 1
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/aggregateOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/aggregateOperators.scala
index 0c9082897f390..98538c462bc89 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/aggregateOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/aggregateOperators.scala
@@ -72,8 +72,10 @@ case class Aggregate2Sort(
   protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
     child.execute().mapPartitions { iter =>
       if (aggregateExpressions.length == 0) {
-        new GroupingIterator(
+        new FinalSortAggregationIterator(
           groupingExpressions,
+          Nil,
+          Nil,
           resultExpressions,
           newMutableProjection,
           child.output,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
index 1b89edafa8dad..2ca0cb82c1aab 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/sortBasedIterators.scala
@@ -41,7 +41,8 @@ private[sql] abstract class SortAggregationIterator(
   ///////////////////////////////////////////////////////////////////////////
 
   protected val aggregateFunctions: Array[AggregateFunction2] = {
-    var bufferOffset = initialBufferOffset
+    var mutableBufferOffset = 0
+    var inputBufferOffset: Int = initialInputBufferOffset
     val functions = new Array[AggregateFunction2](aggregateExpressions.length)
     var i = 0
     while (i < aggregateExpressions.length) {
@@ -54,13 +55,18 @@ private[sql] abstract class SortAggregationIterator(
           // function's children in the update method of this aggregate function.
           // Those eval calls require BoundReferences to work.
           BindReferences.bindReference(func, inputAttributes)
-        case _ => func
+        case _ =>
+          // We only need to set inputBufferOffset for aggregate functions with mode
+          // PartialMerge and Final.
+          func.inputBufferOffset = inputBufferOffset
+          inputBufferOffset += func.bufferSchema.length
+          func
       }
-      // Set bufferOffset for this function. It is important that setting bufferOffset
-      // happens after all potential bindReference operations because bindReference
-      // will create a new instance of the function.
-      funcWithBoundReferences.bufferOffset = bufferOffset
-      bufferOffset += funcWithBoundReferences.bufferSchema.length
+      // Set mutableBufferOffset for this function. It is important that setting
+      // mutableBufferOffset happens after all potential bindReference operations
+      // because bindReference will create a new instance of the function.
+      funcWithBoundReferences.mutableBufferOffset = mutableBufferOffset
+      mutableBufferOffset += funcWithBoundReferences.bufferSchema.length
       functions(i) = funcWithBoundReferences
       i += 1
     }
@@ -97,25 +103,24 @@ private[sql] abstract class SortAggregationIterator(
     // The number of elements of the underlying buffer of this operator.
     // All aggregate functions are sharing this underlying buffer and they find their
     // buffer values through bufferOffset.
-    var size = initialBufferOffset
-    var i = 0
-    while (i < aggregateFunctions.length) {
-      size += aggregateFunctions(i).bufferSchema.length
-      i += 1
-    }
-    new GenericMutableRow(size)
+    // var size = 0
+    // var i = 0
+    // while (i < aggregateFunctions.length) {
+    //  size += aggregateFunctions(i).bufferSchema.length
+    //  i += 1
+    // }
+    new GenericMutableRow(aggregateFunctions.map(_.bufferSchema.length).sum)
   }
 
   protected val joinedRow = new JoinedRow
 
-  protected val placeholderExpressions = Seq.fill(initialBufferOffset)(NoOp)
-
   // This projection is used to initialize buffer values for all AlgebraicAggregates.
   protected val algebraicInitialProjection = {
-    val initExpressions = placeholderExpressions ++ aggregateFunctions.flatMap {
+    val initExpressions = aggregateFunctions.flatMap {
       case ae: AlgebraicAggregate => ae.initialValues
       case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
     }
+
     newMutableProjection(initExpressions, Nil)().target(buffer)
   }
 
@@ -132,10 +137,6 @@ private[sql] abstract class SortAggregationIterator(
   // Indicates if we has new group of rows to process.
   protected var hasNewGroup: Boolean = true
 
-  ///////////////////////////////////////////////////////////////////////////
-  // Private methods
-  ///////////////////////////////////////////////////////////////////////////
-
   /** Initializes buffer values for all aggregate functions. */
   protected def initializeBuffer(): Unit = {
     algebraicInitialProjection(EmptyRow)
@@ -160,6 +161,10 @@ private[sql] abstract class SortAggregationIterator(
     }
   }
 
+  ///////////////////////////////////////////////////////////////////////////
+  // Private methods
+  ///////////////////////////////////////////////////////////////////////////
+
   /** Processes rows in the current group. It will stop when it find a new group. */
   private def processCurrentGroup(): Unit = {
     currentGroupingKey = nextGroupingKey
@@ -218,10 +223,13 @@ private[sql] abstract class SortAggregationIterator(
   // Methods that need to be implemented
   ///////////////////////////////////////////////////////////////////////////
 
-  protected def initialBufferOffset: Int
+  /** The initial input buffer offset for `inputBufferOffset` of an [[AggregateFunction2]]. */
+  protected def initialInputBufferOffset: Int
 
+  /** The function used to process an input row. */
   protected def processRow(row: InternalRow): Unit
 
+  /** The function used to generate the result row. */
   protected def generateOutput(): InternalRow
 
   ///////////////////////////////////////////////////////////////////////////
@@ -231,37 +239,6 @@ private[sql] abstract class SortAggregationIterator(
   initialize()
 }
 
-/**
- * An iterator only used to group input rows according to values of `groupingExpressions`.
- * It assumes that input rows are already grouped by values of `groupingExpressions`.
- */
-class GroupingIterator(
-    groupingExpressions: Seq[NamedExpression],
-    resultExpressions: Seq[NamedExpression],
-    newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection),
-    inputAttributes: Seq[Attribute],
-    inputIter: Iterator[InternalRow])
-  extends SortAggregationIterator(
-    groupingExpressions,
-    Nil,
-    newMutableProjection,
-    inputAttributes,
-    inputIter) {
-
-  private val resultProjection =
-    newMutableProjection(resultExpressions, groupingExpressions.map(_.toAttribute))()
-
-  override protected def initialBufferOffset: Int = 0
-
-  override protected def processRow(row: InternalRow): Unit = {
-    // Since we only do grouping, there is nothing to do at here.
-  }
-
-  override protected def generateOutput(): InternalRow = {
-    resultProjection(currentGroupingKey)
-  }
-}
-
 /**
  * An iterator used to do partial aggregations (for those aggregate functions with mode Partial).
  * It assumes that input rows are already grouped by values of `groupingExpressions`.
@@ -291,7 +268,7 @@ class PartialSortAggregationIterator(
     newMutableProjection(updateExpressions, bufferSchema ++ inputAttributes)().target(buffer)
   }
 
-  override protected def initialBufferOffset: Int = 0
+  override protected def initialInputBufferOffset: Int = 0
 
   override protected def processRow(row: InternalRow): Unit = {
     // Process all algebraic aggregate functions.
@@ -318,11 +295,7 @@ class PartialSortAggregationIterator(
  * |groupingExpr1|...|groupingExprN|aggregationBuffer1|...|aggregationBufferN|
  *
  * The format of its internal buffer is:
- * |placeholder1|...|placeholderN|aggregationBuffer1|...|aggregationBufferN|
- * Every placeholder is for a grouping expression.
- * The actual buffers are stored after placeholderN.
- * The reason that we have placeholders at here is to make our underlying buffer have the same
- * length with a input row.
+ * |aggregationBuffer1|...|aggregationBufferN|
  *
  * The format of its output rows is:
  * |groupingExpr1|...|groupingExprN|aggregationBuffer1|...|aggregationBufferN|
@@ -340,33 +313,21 @@ class PartialMergeSortAggregationIterator(
     inputAttributes,
     inputIter) {
 
-  private val placeholderAttributes =
-    Seq.fill(initialBufferOffset)(AttributeReference("placeholder", NullType)())
-
   // This projection is used to merge buffer values for all AlgebraicAggregates.
   private val algebraicMergeProjection = {
-    val bufferSchemata =
-      placeholderAttributes ++ aggregateFunctions.flatMap(_.bufferAttributes) ++
-        placeholderAttributes ++ aggregateFunctions.flatMap(_.cloneBufferAttributes)
-    val mergeExpressions = placeholderExpressions ++ aggregateFunctions.flatMap {
+    val mergeInputSchema =
+      aggregateFunctions.flatMap(_.bufferAttributes) ++
+        groupingExpressions.map(_.toAttribute) ++
+        aggregateFunctions.flatMap(_.cloneBufferAttributes)
+    val mergeExpressions = aggregateFunctions.flatMap {
       case ae: AlgebraicAggregate => ae.mergeExpressions
       case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
     }
 
-    newMutableProjection(mergeExpressions, bufferSchemata)()
+    newMutableProjection(mergeExpressions, mergeInputSchema)()
   }
 
-  // This projection is used to extract aggregation buffers from the underlying buffer.
-  // We need it because the underlying buffer has placeholders at its beginning.
-  private val extractsBufferValues = {
-    val expressions = aggregateFunctions.flatMap {
-      case agg => agg.bufferAttributes
-    }
-
-    newMutableProjection(expressions, inputAttributes)()
-  }
-
-  override protected def initialBufferOffset: Int = groupingExpressions.length
+  override protected def initialInputBufferOffset: Int = groupingExpressions.length
 
   override protected def processRow(row: InternalRow): Unit = {
     // Process all algebraic aggregate functions.
@@ -381,7 +342,7 @@ class PartialMergeSortAggregationIterator(
 
   override protected def generateOutput(): InternalRow = {
     // We output grouping expressions and aggregation buffers.
-    joinedRow(currentGroupingKey, extractsBufferValues(buffer))
+    joinedRow(currentGroupingKey, buffer).copy()
   }
 }
 
@@ -393,11 +354,7 @@ class PartialMergeSortAggregationIterator(
  * |groupingExpr1|...|groupingExprN|aggregationBuffer1|...|aggregationBufferN|
  *
  * The format of its internal buffer is:
- * |placeholder1|...|placeholder N|aggregationBuffer1|...|aggregationBufferN|
- * Every placeholder is for a grouping expression.
- * The actual buffers are stored after placeholderN.
- * The reason that we have placeholders at here is to make our underlying buffer have the same
- * length with a input row.
+ * |aggregationBuffer1|...|aggregationBufferN|
  *
  * The format of its output rows is represented by the schema of `resultExpressions`.
  */
@@ -425,27 +382,23 @@ class FinalSortAggregationIterator(
     newMutableProjection(
       resultExpressions, groupingExpressions.map(_.toAttribute) ++ aggregateAttributes)()
 
-  private val offsetAttributes =
-    Seq.fill(initialBufferOffset)(AttributeReference("placeholder", NullType)())
-
   // This projection is used to merge buffer values for all AlgebraicAggregates.
   private val algebraicMergeProjection = {
-    val bufferSchemata =
-      offsetAttributes ++ aggregateFunctions.flatMap(_.bufferAttributes) ++
-        offsetAttributes ++ aggregateFunctions.flatMap(_.cloneBufferAttributes)
-    val mergeExpressions = placeholderExpressions ++ aggregateFunctions.flatMap {
+    val mergeInputSchema =
+      aggregateFunctions.flatMap(_.bufferAttributes) ++
+        groupingExpressions.map(_.toAttribute) ++
+        aggregateFunctions.flatMap(_.cloneBufferAttributes)
+    val mergeExpressions = aggregateFunctions.flatMap {
       case ae: AlgebraicAggregate => ae.mergeExpressions
       case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
     }
 
-    newMutableProjection(mergeExpressions, bufferSchemata)()
+    newMutableProjection(mergeExpressions, mergeInputSchema)()
   }
 
   // This projection is used to evaluate all AlgebraicAggregates.
   private val algebraicEvalProjection = {
-    val bufferSchemata =
-      offsetAttributes ++ aggregateFunctions.flatMap(_.bufferAttributes) ++
-        offsetAttributes ++ aggregateFunctions.flatMap(_.cloneBufferAttributes)
+    val bufferSchemata = aggregateFunctions.flatMap(_.bufferAttributes)
     val evalExpressions = aggregateFunctions.map {
       case ae: AlgebraicAggregate => ae.evaluateExpression
       case agg: AggregateFunction2 => NoOp
@@ -454,7 +407,7 @@ class FinalSortAggregationIterator(
     newMutableProjection(evalExpressions, bufferSchemata)()
   }
 
-  override protected def initialBufferOffset: Int = groupingExpressions.length
+  override protected def initialInputBufferOffset: Int = groupingExpressions.length
 
   override def initialize(): Unit = {
     if (inputIter.hasNext) {
@@ -471,7 +424,10 @@ class FinalSortAggregationIterator(
         // Right now, the buffer only contains initial buffer values. Because
         // merging two buffers with initial values will generate a row that
         // still store initial values. We set the currentRow as the copy of the current buffer.
-        val currentRow = buffer.copy()
+        // Because input aggregation buffer has initialInputBufferOffset extra values at the
+        // beginning, we create a dummy row for this part.
+        val currentRow =
+          joinedRow(new GenericInternalRow(initialInputBufferOffset), buffer).copy()
         nextGroupingKey = groupGenerator(currentRow).copy()
         firstRowInNextGroup = currentRow
       } else {
@@ -518,18 +474,15 @@ class FinalSortAggregationIterator(
  * Final mode.
  *
  * The format of its internal buffer is:
- * |placeholder1|...|placeholder(N+M)|aggregationBuffer1|...|aggregationBuffer(N+M)|
- * The first N placeholders represent slots of grouping expressions.
- * Then, next M placeholders represent slots of col1 to colM.
+ * |aggregationBuffer1|...|aggregationBuffer(N+M)|
  * For aggregation buffers, first N aggregation buffers are used by N aggregate functions with
  * mode Final. Then, the last M aggregation buffers are used by M aggregate functions with mode
- * Complete. The reason that we have placeholders at here is to make our underlying buffer
- * have the same length with a input row.
+ * Complete.
  *
  * The format of its output rows is represented by the schema of `resultExpressions`.
  */
 class FinalAndCompleteSortAggregationIterator(
-    override protected val initialBufferOffset: Int,
+    override protected val initialInputBufferOffset: Int,
     groupingExpressions: Seq[NamedExpression],
     finalAggregateExpressions: Seq[AggregateExpression2],
     finalAggregateAttributes: Seq[Attribute],
@@ -561,9 +514,6 @@ class FinalAndCompleteSortAggregationIterator(
     newMutableProjection(resultExpressions, inputSchema)()
   }
 
-  private val offsetAttributes =
-    Seq.fill(initialBufferOffset)(AttributeReference("placeholder", NullType)())
-
   // All aggregate functions with mode Final.
   private val finalAggregateFunctions: Array[AggregateFunction2] = {
     val functions = new Array[AggregateFunction2](finalAggregateExpressions.length)
@@ -601,38 +551,38 @@ class FinalAndCompleteSortAggregationIterator(
   // This projection is used to merge buffer values for all AlgebraicAggregates with mode
   // Final.
   private val finalAlgebraicMergeProjection = {
-    val numCompleteOffsetAttributes =
-      completeAggregateFunctions.map(_.bufferAttributes.length).sum
-    val completeOffsetAttributes =
-      Seq.fill(numCompleteOffsetAttributes)(AttributeReference("placeholder", NullType)())
-    val completeOffsetExpressions = Seq.fill(numCompleteOffsetAttributes)(NoOp)
-
-    val bufferSchemata =
-      offsetAttributes ++ finalAggregateFunctions.flatMap(_.bufferAttributes) ++
-        completeOffsetAttributes ++ offsetAttributes ++
-        finalAggregateFunctions.flatMap(_.cloneBufferAttributes) ++ completeOffsetAttributes
+    // The first initialInputBufferOffset values of the input aggregation buffer is
+    // for grouping expressions and distinct columns.
+    val groupingAttributesAndDistinctColumns = inputAttributes.take(initialInputBufferOffset)
+
+    val completeOffsetExpressions =
+      Seq.fill(completeAggregateFunctions.map(_.bufferAttributes.length).sum)(NoOp)
+
+    val mergeInputSchema =
+      finalAggregateFunctions.flatMap(_.bufferAttributes) ++
+        completeAggregateFunctions.flatMap(_.bufferAttributes) ++
+        groupingAttributesAndDistinctColumns ++
+        finalAggregateFunctions.flatMap(_.cloneBufferAttributes)
     val mergeExpressions =
-      placeholderExpressions ++ finalAggregateFunctions.flatMap {
+      finalAggregateFunctions.flatMap {
         case ae: AlgebraicAggregate => ae.mergeExpressions
         case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
       } ++ completeOffsetExpressions
-
-    newMutableProjection(mergeExpressions, bufferSchemata)()
+    newMutableProjection(mergeExpressions, mergeInputSchema)()
   }
 
   // This projection is used to update buffer values for all AlgebraicAggregates with mode
   // Complete.
   private val completeAlgebraicUpdateProjection = {
-    val numFinalOffsetAttributes = finalAggregateFunctions.map(_.bufferAttributes.length).sum
-    val finalOffsetAttributes =
-      Seq.fill(numFinalOffsetAttributes)(AttributeReference("placeholder", NullType)())
-    val finalOffsetExpressions = Seq.fill(numFinalOffsetAttributes)(NoOp)
+    // We do not touch buffer values of aggregate functions with the Final mode.
+    val finalOffsetExpressions =
+      Seq.fill(finalAggregateFunctions.map(_.bufferAttributes.length).sum)(NoOp)
 
     val bufferSchema =
-      offsetAttributes ++ finalOffsetAttributes ++
+      finalAggregateFunctions.flatMap(_.bufferAttributes) ++
         completeAggregateFunctions.flatMap(_.bufferAttributes)
     val updateExpressions =
-      placeholderExpressions ++ finalOffsetExpressions ++ completeAggregateFunctions.flatMap {
+      finalOffsetExpressions ++ completeAggregateFunctions.flatMap {
         case ae: AlgebraicAggregate => ae.updateExpressions
         case agg: AggregateFunction2 => Seq.fill(agg.bufferAttributes.length)(NoOp)
       }
@@ -641,9 +591,7 @@ class FinalAndCompleteSortAggregationIterator(
 
   // This projection is used to evaluate all AlgebraicAggregates.
   private val algebraicEvalProjection = {
-    val bufferSchemata =
-      offsetAttributes ++ aggregateFunctions.flatMap(_.bufferAttributes) ++
-        offsetAttributes ++ aggregateFunctions.flatMap(_.cloneBufferAttributes)
+    val bufferSchemata = aggregateFunctions.flatMap(_.bufferAttributes)
     val evalExpressions = aggregateFunctions.map {
       case ae: AlgebraicAggregate => ae.evaluateExpression
       case agg: AggregateFunction2 => NoOp
@@ -667,7 +615,10 @@ class FinalAndCompleteSortAggregationIterator(
         // Right now, the buffer only contains initial buffer values. Because
         // merging two buffers with initial values will generate a row that
         // still store initial values. We set the currentRow as the copy of the current buffer.
-        val currentRow = buffer.copy()
+        // Because input aggregation buffer has initialInputBufferOffset extra values at the
+        // beginning, we create a dummy row for this part.
+        val currentRow =
+          joinedRow(new GenericInternalRow(initialInputBufferOffset), buffer).copy()
         nextGroupingKey = groupGenerator(currentRow).copy()
         firstRowInNextGroup = currentRow
       } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
index 073c45ae2f9f2..cc54319171bdb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
@@ -184,7 +184,7 @@ private[sql] case class ScalaUDAF(
       bufferSchema,
       bufferValuesToCatalystConverters,
       bufferValuesToScalaConverters,
-      bufferOffset,
+      inputBufferOffset,
       null)
 
   lazy val mutableAggregateBuffer: MutableAggregationBufferImpl =
@@ -192,9 +192,16 @@ private[sql] case class ScalaUDAF(
       bufferSchema,
       bufferValuesToCatalystConverters,
       bufferValuesToScalaConverters,
-      bufferOffset,
+      mutableBufferOffset,
       null)
 
+  lazy val evalAggregateBuffer: InputAggregationBuffer =
+    new InputAggregationBuffer(
+      bufferSchema,
+      bufferValuesToCatalystConverters,
+      bufferValuesToScalaConverters,
+      mutableBufferOffset,
+      null)
 
   override def initialize(buffer: MutableRow): Unit = {
     mutableAggregateBuffer.underlyingBuffer = buffer
@@ -217,10 +224,10 @@ private[sql] case class ScalaUDAF(
     udaf.merge(mutableAggregateBuffer, inputAggregateBuffer)
   }
 
-  override def eval(buffer: InternalRow = null): Any = {
-    inputAggregateBuffer.underlyingInputBuffer = buffer
+  override def eval(buffer: InternalRow): Any = {
+    evalAggregateBuffer.underlyingInputBuffer = buffer
 
-    udaf.evaluate(inputAggregateBuffer)
+    udaf.evaluate(evalAggregateBuffer)
   }
 
   override def toString: String = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
index 5bbe6c162ff4b..6549c87752a7d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
@@ -292,8 +292,8 @@ object Utils {
         AggregateExpression2(aggregateFunction, PartialMerge, false)
     }
     val partialMergeAggregateAttributes =
-      partialMergeAggregateExpressions.map {
-        expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)
+      partialMergeAggregateExpressions.flatMap { agg =>
+        agg.aggregateFunction.bufferAttributes
       }
     val partialMergeAggregate =
       Aggregate2Sort(

From 429b2f0df4ef97a3b94cead06a7eb51581eabb18 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Tue, 28 Jul 2015 21:37:50 -0700
Subject: [PATCH 439/549] [SPARK-8608][SPARK-8609][SPARK-9083][SQL] reset
 mutable states of nondeterministic expression before evaluation and fix
 PullOutNondeterministic

We will do local projection for LocalRelation, and thus reuse the same Expression object among multiply evaluations. We should reset the mutable states of Expression before evaluate it.

Fix `PullOutNondeterministic` rule to make it work for `Sort`.

Also got a chance to cleanup the dataframe test suite.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7674 from cloud-fan/show and squashes the following commits:

888934f [Wenchen Fan] fix sort
c0e93e8 [Wenchen Fan] local DataFrame with random columns should return same value when call `show`
---
 .../sql/catalyst/analysis/Analyzer.scala      |  15 +-
 .../sql/catalyst/expressions/Expression.scala |   8 +-
 .../sql/catalyst/expressions/Projection.scala |   4 +-
 .../sql/catalyst/expressions/predicates.scala |   2 +-
 .../sql/catalyst/analysis/AnalysisSuite.scala |  12 +-
 .../expressions/ExpressionEvalHelper.scala    |   2 +-
 .../org/apache/spark/sql/DataFrameSuite.scala | 153 +++++++++++-------
 7 files changed, 120 insertions(+), 76 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index a309ee35ee582..a6ea0cc0a83a8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -928,12 +928,17 @@ class Analyzer(
       // from LogicalPlan, currently we only do it for UnaryNode which has same output
       // schema with its child.
       case p: UnaryNode if p.output == p.child.output && p.expressions.exists(!_.deterministic) =>
-        val nondeterministicExprs = p.expressions.filterNot(_.deterministic).map { e =>
-          val ne = e match {
-            case n: NamedExpression => n
-            case _ => Alias(e, "_nondeterministic")()
+        val nondeterministicExprs = p.expressions.filterNot(_.deterministic).flatMap { expr =>
+          val leafNondeterministic = expr.collect {
+            case n: Nondeterministic => n
+          }
+          leafNondeterministic.map { e =>
+            val ne = e match {
+              case n: NamedExpression => n
+              case _ => Alias(e, "_nondeterministic")()
+            }
+            new TreeNodeRef(e) -> ne
           }
-          new TreeNodeRef(e) -> ne
         }.toMap
         val newPlan = p.transformExpressions { case e =>
           nondeterministicExprs.get(new TreeNodeRef(e)).map(_.toAttribute).getOrElse(e)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 03e36c7871bcf..8fc182607ce68 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -201,11 +201,9 @@ trait Nondeterministic extends Expression {
 
   private[this] var initialized = false
 
-  final def initialize(): Unit = {
-    if (!initialized) {
-      initInternal()
-      initialized = true
-    }
+  final def setInitialValues(): Unit = {
+    initInternal()
+    initialized = true
   }
 
   protected def initInternal(): Unit
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index 27d6ff587ab71..b3beb7e28f208 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -32,7 +32,7 @@ class InterpretedProjection(expressions: Seq[Expression]) extends Projection {
     this(expressions.map(BindReferences.bindReference(_, inputSchema)))
 
   expressions.foreach(_.foreach {
-    case n: Nondeterministic => n.initialize()
+    case n: Nondeterministic => n.setInitialValues()
     case _ =>
   })
 
@@ -63,7 +63,7 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu
     this(expressions.map(BindReferences.bindReference(_, inputSchema)))
 
   expressions.foreach(_.foreach {
-    case n: Nondeterministic => n.initialize()
+    case n: Nondeterministic => n.setInitialValues()
     case _ =>
   })
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 5bfe1cad24a3e..ab7d3afce8f2e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -31,7 +31,7 @@ object InterpretedPredicate {
 
   def create(expression: Expression): (InternalRow => Boolean) = {
     expression.foreach {
-      case n: Nondeterministic => n.initialize()
+      case n: Nondeterministic => n.setInitialValues()
       case _ =>
     }
     (r: InternalRow) => expression.eval(r).asInstanceOf[Boolean]
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index ed645b618dc9b..4589facb49b76 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -153,7 +153,7 @@ class AnalysisSuite extends AnalysisTest {
     assert(pl(4).dataType == DoubleType)
   }
 
-  test("pull out nondeterministic expressions from unary LogicalPlan") {
+  test("pull out nondeterministic expressions from RepartitionByExpression") {
     val plan = RepartitionByExpression(Seq(Rand(33)), testRelation)
     val projected = Alias(Rand(33), "_nondeterministic")()
     val expected =
@@ -162,4 +162,14 @@ class AnalysisSuite extends AnalysisTest {
           Project(testRelation.output :+ projected, testRelation)))
     checkAnalysis(plan, expected)
   }
+
+  test("pull out nondeterministic expressions from Sort") {
+    val plan = Sort(Seq(SortOrder(Rand(33), Ascending)), false, testRelation)
+    val projected = Alias(Rand(33), "_nondeterministic")()
+    val expected =
+      Project(testRelation.output,
+        Sort(Seq(SortOrder(projected.toAttribute, Ascending)), false,
+          Project(testRelation.output :+ projected, testRelation)))
+    checkAnalysis(plan, expected)
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 0c8611d5ddefa..3c05e5c3b833c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -65,7 +65,7 @@ trait ExpressionEvalHelper {
 
   protected def evaluate(expression: Expression, inputRow: InternalRow = EmptyRow): Any = {
     expression.foreach {
-      case n: Nondeterministic => n.initialize()
+      case n: Nondeterministic => n.setInitialValues()
       case _ =>
     }
     expression.eval(inputRow)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 3151e071b19ea..97beae2f85c50 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -33,33 +33,28 @@ import org.apache.spark.sql.test.{ExamplePointUDT, ExamplePoint, SQLTestUtils}
 class DataFrameSuite extends QueryTest with SQLTestUtils {
   import org.apache.spark.sql.TestData._
 
-  lazy val ctx = org.apache.spark.sql.test.TestSQLContext
-  import ctx.implicits._
-
-  def sqlContext: SQLContext = ctx
+  lazy val sqlContext = org.apache.spark.sql.test.TestSQLContext
+  import sqlContext.implicits._
 
   test("analysis error should be eagerly reported") {
-    val oldSetting = ctx.conf.dataFrameEagerAnalysis
     // Eager analysis.
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, true)
-
-    intercept[Exception] { testData.select('nonExistentName) }
-    intercept[Exception] {
-      testData.groupBy('key).agg(Map("nonExistentName" -> "sum"))
-    }
-    intercept[Exception] {
-      testData.groupBy("nonExistentName").agg(Map("key" -> "sum"))
-    }
-    intercept[Exception] {
-      testData.groupBy($"abcd").agg(Map("key" -> "sum"))
+    withSQLConf(SQLConf.DATAFRAME_EAGER_ANALYSIS.key -> "true") {
+      intercept[Exception] { testData.select('nonExistentName) }
+      intercept[Exception] {
+        testData.groupBy('key).agg(Map("nonExistentName" -> "sum"))
+      }
+      intercept[Exception] {
+        testData.groupBy("nonExistentName").agg(Map("key" -> "sum"))
+      }
+      intercept[Exception] {
+        testData.groupBy($"abcd").agg(Map("key" -> "sum"))
+      }
     }
 
     // No more eager analysis once the flag is turned off
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, false)
-    testData.select('nonExistentName)
-
-    // Set the flag back to original value before this test.
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting)
+    withSQLConf(SQLConf.DATAFRAME_EAGER_ANALYSIS.key -> "false") {
+      testData.select('nonExistentName)
+    }
   }
 
   test("dataframe toString") {
@@ -77,21 +72,18 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
   }
 
   test("invalid plan toString, debug mode") {
-    val oldSetting = ctx.conf.dataFrameEagerAnalysis
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, true)
-
     // Turn on debug mode so we can see invalid query plans.
     import org.apache.spark.sql.execution.debug._
-    ctx.debug()
 
-    val badPlan = testData.select('badColumn)
+    withSQLConf(SQLConf.DATAFRAME_EAGER_ANALYSIS.key -> "true") {
+      sqlContext.debug()
 
-    assert(badPlan.toString contains badPlan.queryExecution.toString,
-      "toString on bad query plans should include the query execution but was:\n" +
-        badPlan.toString)
+      val badPlan = testData.select('badColumn)
 
-    // Set the flag back to original value before this test.
-    ctx.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting)
+      assert(badPlan.toString contains badPlan.queryExecution.toString,
+        "toString on bad query plans should include the query execution but was:\n" +
+          badPlan.toString)
+    }
   }
 
   test("access complex data") {
@@ -107,8 +99,8 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
   }
 
   test("empty data frame") {
-    assert(ctx.emptyDataFrame.columns.toSeq === Seq.empty[String])
-    assert(ctx.emptyDataFrame.count() === 0)
+    assert(sqlContext.emptyDataFrame.columns.toSeq === Seq.empty[String])
+    assert(sqlContext.emptyDataFrame.count() === 0)
   }
 
   test("head and take") {
@@ -344,7 +336,7 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
   }
 
   test("replace column using withColumn") {
-    val df2 = ctx.sparkContext.parallelize(Array(1, 2, 3)).toDF("x")
+    val df2 = sqlContext.sparkContext.parallelize(Array(1, 2, 3)).toDF("x")
     val df3 = df2.withColumn("x", df2("x") + 1)
     checkAnswer(
       df3.select("x"),
@@ -425,7 +417,7 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
 
   test("randomSplit") {
     val n = 600
-    val data = ctx.sparkContext.parallelize(1 to n, 2).toDF("id")
+    val data = sqlContext.sparkContext.parallelize(1 to n, 2).toDF("id")
     for (seed <- 1 to 5) {
       val splits = data.randomSplit(Array[Double](1, 2, 3), seed)
       assert(splits.length == 3, "wrong number of splits")
@@ -519,7 +511,7 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
 
   test("showString: truncate = [true, false]") {
     val longString = Array.fill(21)("1").mkString
-    val df = ctx.sparkContext.parallelize(Seq("1", longString)).toDF()
+    val df = sqlContext.sparkContext.parallelize(Seq("1", longString)).toDF()
     val expectedAnswerForFalse = """+---------------------+
                                    ||_1                   |
                                    |+---------------------+
@@ -609,21 +601,17 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
   }
 
   test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
-    val rowRDD = ctx.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
+    val rowRDD = sqlContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
     val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))
-    val df = ctx.createDataFrame(rowRDD, schema)
+    val df = sqlContext.createDataFrame(rowRDD, schema)
     df.rdd.collect()
   }
 
-  test("SPARK-6899") {
-    val originalValue = ctx.conf.codegenEnabled
-    ctx.setConf(SQLConf.CODEGEN_ENABLED, true)
-    try{
+  test("SPARK-6899: type should match when using codegen") {
+    withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "true") {
       checkAnswer(
         decimalData.agg(avg('a)),
         Row(new java.math.BigDecimal(2.0)))
-    } finally {
-      ctx.setConf(SQLConf.CODEGEN_ENABLED, originalValue)
     }
   }
 
@@ -635,14 +623,14 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
   }
 
   test("SPARK-7551: support backticks for DataFrame attribute resolution") {
-    val df = ctx.read.json(ctx.sparkContext.makeRDD(
+    val df = sqlContext.read.json(sqlContext.sparkContext.makeRDD(
       """{"a.b": {"c": {"d..e": {"f": 1}}}}""" :: Nil))
     checkAnswer(
       df.select(df("`a.b`.c.`d..e`.`f`")),
       Row(1)
     )
 
-    val df2 = ctx.read.json(ctx.sparkContext.makeRDD(
+    val df2 = sqlContext.read.json(sqlContext.sparkContext.makeRDD(
       """{"a  b": {"c": {"d  e": {"f": 1}}}}""" :: Nil))
     checkAnswer(
       df2.select(df2("`a  b`.c.d  e.f")),
@@ -662,7 +650,7 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
   }
 
   test("SPARK-7324 dropDuplicates") {
-    val testData = ctx.sparkContext.parallelize(
+    val testData = sqlContext.sparkContext.parallelize(
       (2, 1, 2) :: (1, 1, 1) ::
       (1, 2, 1) :: (2, 1, 2) ::
       (2, 2, 2) :: (2, 2, 1) ::
@@ -710,49 +698,49 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
 
   test("SPARK-7150 range api") {
     // numSlice is greater than length
-    val res1 = ctx.range(0, 10, 1, 15).select("id")
+    val res1 = sqlContext.range(0, 10, 1, 15).select("id")
     assert(res1.count == 10)
     assert(res1.agg(sum("id")).as("sumid").collect() === Seq(Row(45)))
 
-    val res2 = ctx.range(3, 15, 3, 2).select("id")
+    val res2 = sqlContext.range(3, 15, 3, 2).select("id")
     assert(res2.count == 4)
     assert(res2.agg(sum("id")).as("sumid").collect() === Seq(Row(30)))
 
-    val res3 = ctx.range(1, -2).select("id")
+    val res3 = sqlContext.range(1, -2).select("id")
     assert(res3.count == 0)
 
     // start is positive, end is negative, step is negative
-    val res4 = ctx.range(1, -2, -2, 6).select("id")
+    val res4 = sqlContext.range(1, -2, -2, 6).select("id")
     assert(res4.count == 2)
     assert(res4.agg(sum("id")).as("sumid").collect() === Seq(Row(0)))
 
     // start, end, step are negative
-    val res5 = ctx.range(-3, -8, -2, 1).select("id")
+    val res5 = sqlContext.range(-3, -8, -2, 1).select("id")
     assert(res5.count == 3)
     assert(res5.agg(sum("id")).as("sumid").collect() === Seq(Row(-15)))
 
     // start, end are negative, step is positive
-    val res6 = ctx.range(-8, -4, 2, 1).select("id")
+    val res6 = sqlContext.range(-8, -4, 2, 1).select("id")
     assert(res6.count == 2)
     assert(res6.agg(sum("id")).as("sumid").collect() === Seq(Row(-14)))
 
-    val res7 = ctx.range(-10, -9, -20, 1).select("id")
+    val res7 = sqlContext.range(-10, -9, -20, 1).select("id")
     assert(res7.count == 0)
 
-    val res8 = ctx.range(Long.MinValue, Long.MaxValue, Long.MaxValue, 100).select("id")
+    val res8 = sqlContext.range(Long.MinValue, Long.MaxValue, Long.MaxValue, 100).select("id")
     assert(res8.count == 3)
     assert(res8.agg(sum("id")).as("sumid").collect() === Seq(Row(-3)))
 
-    val res9 = ctx.range(Long.MaxValue, Long.MinValue, Long.MinValue, 100).select("id")
+    val res9 = sqlContext.range(Long.MaxValue, Long.MinValue, Long.MinValue, 100).select("id")
     assert(res9.count == 2)
     assert(res9.agg(sum("id")).as("sumid").collect() === Seq(Row(Long.MaxValue - 1)))
 
     // only end provided as argument
-    val res10 = ctx.range(10).select("id")
+    val res10 = sqlContext.range(10).select("id")
     assert(res10.count == 10)
     assert(res10.agg(sum("id")).as("sumid").collect() === Seq(Row(45)))
 
-    val res11 = ctx.range(-1).select("id")
+    val res11 = sqlContext.range(-1).select("id")
     assert(res11.count == 0)
   }
 
@@ -819,13 +807,13 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
 
       // pass case: parquet table (HadoopFsRelation)
       df.write.mode(SaveMode.Overwrite).parquet(tempParquetFile.getCanonicalPath)
-      val pdf = ctx.read.parquet(tempParquetFile.getCanonicalPath)
+      val pdf = sqlContext.read.parquet(tempParquetFile.getCanonicalPath)
       pdf.registerTempTable("parquet_base")
       insertion.write.insertInto("parquet_base")
 
       // pass case: json table (InsertableRelation)
       df.write.mode(SaveMode.Overwrite).json(tempJsonFile.getCanonicalPath)
-      val jdf = ctx.read.json(tempJsonFile.getCanonicalPath)
+      val jdf = sqlContext.read.json(tempJsonFile.getCanonicalPath)
       jdf.registerTempTable("json_base")
       insertion.write.mode(SaveMode.Overwrite).insertInto("json_base")
 
@@ -845,11 +833,54 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
       assert(e2.getMessage.contains("Inserting into an RDD-based table is not allowed."))
 
       // error case: insert into an OneRowRelation
-      new DataFrame(ctx, OneRowRelation).registerTempTable("one_row")
+      new DataFrame(sqlContext, OneRowRelation).registerTempTable("one_row")
       val e3 = intercept[AnalysisException] {
         insertion.write.insertInto("one_row")
       }
       assert(e3.getMessage.contains("Inserting into an RDD-based table is not allowed."))
     }
   }
+
+  test("SPARK-8608: call `show` on local DataFrame with random columns should return same value") {
+    // Make sure we can pass this test for both codegen mode and interpreted mode.
+    withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "true") {
+      val df = testData.select(rand(33))
+      assert(df.showString(5) == df.showString(5))
+    }
+
+    withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "false") {
+      val df = testData.select(rand(33))
+      assert(df.showString(5) == df.showString(5))
+    }
+
+    // We will reuse the same Expression object for LocalRelation.
+    val df = (1 to 10).map(Tuple1.apply).toDF().select(rand(33))
+    assert(df.showString(5) == df.showString(5))
+  }
+
+  test("SPARK-8609: local DataFrame with random columns should return same value after sort") {
+    // Make sure we can pass this test for both codegen mode and interpreted mode.
+    withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "true") {
+      checkAnswer(testData.sort(rand(33)), testData.sort(rand(33)))
+    }
+
+    withSQLConf(SQLConf.CODEGEN_ENABLED.key -> "false") {
+      checkAnswer(testData.sort(rand(33)), testData.sort(rand(33)))
+    }
+
+    // We will reuse the same Expression object for LocalRelation.
+    val df = (1 to 10).map(Tuple1.apply).toDF()
+    checkAnswer(df.sort(rand(33)), df.sort(rand(33)))
+  }
+
+  test("SPARK-9083: sort with non-deterministic expressions") {
+    import org.apache.spark.util.random.XORShiftRandom
+
+    val seed = 33
+    val df = (1 to 100).map(Tuple1.apply).toDF("i")
+    val random = new XORShiftRandom(seed)
+    val expected = (1 to 100).map(_ -> random.nextDouble()).sortBy(_._2).map(_._1)
+    val actual = df.sort(rand(seed)).collect().map(_.getInt(0))
+    assert(expected === actual)
+  }
 }

From ea49705bd4feb2f25e1b536f0b3ddcfc72a57101 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Tue, 28 Jul 2015 21:53:28 -0700
Subject: [PATCH 440/549] [SPARK-9419] ShuffleMemoryManager and MemoryStore
 should track memory on a per-task, not per-thread, basis

Spark's ShuffleMemoryManager and MemoryStore track memory on a per-thread basis, which causes problems in the handful of cases where we have tasks that use multiple threads. In PythonRDD, RRDD, ScriptTransformation, and PipedRDD we consume the input iterator in a separate thread in order to write it to an external process.  As a result, these RDD's input iterators are consumed in a different thread than the thread that created them, which can cause problems in our memory allocation tracking. For example, if allocations are performed in one thread but deallocations are performed in a separate thread then memory may be leaked or we may get errors complaining that more memory was allocated than was freed.

I think that the right way to fix this is to change our accounting to be performed on a per-task instead of per-thread basis.  Note that the current per-thread tracking has caused problems in the past; SPARK-3731 (#2668) fixes a memory leak in PythonRDD that was caused by this issue (that fix is no longer necessary as of this patch).

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7734 from JoshRosen/memory-tracking-fixes and squashes the following commits:

b4b1702 [Josh Rosen] Propagate TaskContext to writer threads.
57c9b4e [Josh Rosen] Merge remote-tracking branch 'origin/master' into memory-tracking-fixes
ed25d3b [Josh Rosen] Address minor PR review comments
44f6497 [Josh Rosen] Fix long line.
7b0f04b [Josh Rosen] Fix ShuffleMemoryManagerSuite
f57f3f2 [Josh Rosen] More thread -> task changes
fa78ee8 [Josh Rosen] Move Executor's cleanup into Task so that TaskContext is defined when cleanup is performed
5e2f01e [Josh Rosen] Fix capitalization
1b0083b [Josh Rosen] Roll back fix in PySpark, which is no longer necessary
2e1e0f8 [Josh Rosen] Use TaskAttemptIds to track shuffle memory
c9e8e54 [Josh Rosen] Use TaskAttemptIds to track unroll memory
---
 .../apache/spark/api/python/PythonRDD.scala   |  6 +-
 .../scala/org/apache/spark/api/r/RRDD.scala   |  2 +
 .../org/apache/spark/executor/Executor.scala  |  4 -
 .../scala/org/apache/spark/rdd/PipedRDD.scala |  1 +
 .../org/apache/spark/scheduler/Task.scala     | 15 ++-
 .../spark/shuffle/ShuffleMemoryManager.scala  | 88 +++++++++--------
 .../apache/spark/storage/MemoryStore.scala    | 95 ++++++++++---------
 .../shuffle/ShuffleMemoryManagerSuite.scala   | 41 +++++---
 .../spark/storage/BlockManagerSuite.scala     | 84 ++++++++--------
 9 files changed, 184 insertions(+), 152 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 598953ac3bcc8..55e563ee968be 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -207,6 +207,7 @@ private[spark] class PythonRDD(
 
     override def run(): Unit = Utils.logUncaughtExceptions {
       try {
+        TaskContext.setTaskContext(context)
         val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize)
         val dataOut = new DataOutputStream(stream)
         // Partition index
@@ -263,11 +264,6 @@ private[spark] class PythonRDD(
           if (!worker.isClosed) {
             Utils.tryLog(worker.shutdownOutput())
           }
-      } finally {
-        // Release memory used by this thread for shuffles
-        env.shuffleMemoryManager.releaseMemoryForThisThread()
-        // Release memory used by this thread for unrolling blocks
-        env.blockManager.memoryStore.releaseUnrollMemoryForThisThread()
       }
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
index 23a470d6afcae..1cf2824f862ee 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala
@@ -112,6 +112,7 @@ private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
     partition: Int): Unit = {
 
     val env = SparkEnv.get
+    val taskContext = TaskContext.get()
     val bufferSize = System.getProperty("spark.buffer.size", "65536").toInt
     val stream = new BufferedOutputStream(output, bufferSize)
 
@@ -119,6 +120,7 @@ private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
       override def run(): Unit = {
         try {
           SparkEnv.set(env)
+          TaskContext.setTaskContext(taskContext)
           val dataOut = new DataOutputStream(stream)
           dataOut.writeInt(partition)
 
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index e76664f1bd7b0..7bc7fce7ae8dd 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -313,10 +313,6 @@ private[spark] class Executor(
           }
 
       } finally {
-        // Release memory used by this thread for shuffles
-        env.shuffleMemoryManager.releaseMemoryForThisThread()
-        // Release memory used by this thread for unrolling blocks
-        env.blockManager.memoryStore.releaseUnrollMemoryForThisThread()
         runningTasks.remove(taskId)
       }
     }
diff --git a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala
index defdabf95ac4b..3bb9998e1db44 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PipedRDD.scala
@@ -133,6 +133,7 @@ private[spark] class PipedRDD[T: ClassTag](
     // Start a thread to feed the process input from our parent's iterator
     new Thread("stdin writer for " + command) {
       override def run() {
+        TaskContext.setTaskContext(context)
         val out = new PrintWriter(proc.getOutputStream)
 
         // scalastyle:off println
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
index d11a00956a9a9..1978305cfefbd 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -23,7 +23,7 @@ import java.nio.ByteBuffer
 import scala.collection.mutable.HashMap
 
 import org.apache.spark.metrics.MetricsSystem
-import org.apache.spark.{TaskContextImpl, TaskContext}
+import org.apache.spark.{SparkEnv, TaskContextImpl, TaskContext}
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.serializer.SerializerInstance
 import org.apache.spark.unsafe.memory.TaskMemoryManager
@@ -86,7 +86,18 @@ private[spark] abstract class Task[T](
       (runTask(context), context.collectAccumulators())
     } finally {
       context.markTaskCompleted()
-      TaskContext.unset()
+      try {
+        Utils.tryLogNonFatalError {
+          // Release memory used by this thread for shuffles
+          SparkEnv.get.shuffleMemoryManager.releaseMemoryForThisTask()
+        }
+        Utils.tryLogNonFatalError {
+          // Release memory used by this thread for unrolling blocks
+          SparkEnv.get.blockManager.memoryStore.releaseUnrollMemoryForThisTask()
+        }
+      } finally {
+        TaskContext.unset()
+      }
     }
   }
 
diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala
index 3bcc7178a3d8b..f038b722957b8 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala
@@ -19,95 +19,101 @@ package org.apache.spark.shuffle
 
 import scala.collection.mutable
 
-import org.apache.spark.{Logging, SparkException, SparkConf}
+import org.apache.spark.{Logging, SparkException, SparkConf, TaskContext}
 
 /**
- * Allocates a pool of memory to task threads for use in shuffle operations. Each disk-spilling
+ * Allocates a pool of memory to tasks for use in shuffle operations. Each disk-spilling
  * collection (ExternalAppendOnlyMap or ExternalSorter) used by these tasks can acquire memory
  * from this pool and release it as it spills data out. When a task ends, all its memory will be
  * released by the Executor.
  *
- * This class tries to ensure that each thread gets a reasonable share of memory, instead of some
- * thread ramping up to a large amount first and then causing others to spill to disk repeatedly.
- * If there are N threads, it ensures that each thread can acquire at least 1 / 2N of the memory
+ * This class tries to ensure that each task gets a reasonable share of memory, instead of some
+ * task ramping up to a large amount first and then causing others to spill to disk repeatedly.
+ * If there are N tasks, it ensures that each tasks can acquire at least 1 / 2N of the memory
  * before it has to spill, and at most 1 / N. Because N varies dynamically, we keep track of the
- * set of active threads and redo the calculations of 1 / 2N and 1 / N in waiting threads whenever
+ * set of active tasks and redo the calculations of 1 / 2N and 1 / N in waiting tasks whenever
  * this set changes. This is all done by synchronizing access on "this" to mutate state and using
  * wait() and notifyAll() to signal changes.
  */
 private[spark] class ShuffleMemoryManager(maxMemory: Long) extends Logging {
-  private val threadMemory = new mutable.HashMap[Long, Long]()  // threadId -> memory bytes
+  private val taskMemory = new mutable.HashMap[Long, Long]()  // taskAttemptId -> memory bytes
 
   def this(conf: SparkConf) = this(ShuffleMemoryManager.getMaxMemory(conf))
 
+  private def currentTaskAttemptId(): Long = {
+    // In case this is called on the driver, return an invalid task attempt id.
+    Option(TaskContext.get()).map(_.taskAttemptId()).getOrElse(-1L)
+  }
+
   /**
-   * Try to acquire up to numBytes memory for the current thread, and return the number of bytes
+   * Try to acquire up to numBytes memory for the current task, and return the number of bytes
    * obtained, or 0 if none can be allocated. This call may block until there is enough free memory
-   * in some situations, to make sure each thread has a chance to ramp up to at least 1 / 2N of the
-   * total memory pool (where N is the # of active threads) before it is forced to spill. This can
-   * happen if the number of threads increases but an older thread had a lot of memory already.
+   * in some situations, to make sure each task has a chance to ramp up to at least 1 / 2N of the
+   * total memory pool (where N is the # of active tasks) before it is forced to spill. This can
+   * happen if the number of tasks increases but an older task had a lot of memory already.
    */
   def tryToAcquire(numBytes: Long): Long = synchronized {
-    val threadId = Thread.currentThread().getId
+    val taskAttemptId = currentTaskAttemptId()
     assert(numBytes > 0, "invalid number of bytes requested: " + numBytes)
 
-    // Add this thread to the threadMemory map just so we can keep an accurate count of the number
-    // of active threads, to let other threads ramp down their memory in calls to tryToAcquire
-    if (!threadMemory.contains(threadId)) {
-      threadMemory(threadId) = 0L
-      notifyAll()  // Will later cause waiting threads to wake up and check numThreads again
+    // Add this task to the taskMemory map just so we can keep an accurate count of the number
+    // of active tasks, to let other tasks ramp down their memory in calls to tryToAcquire
+    if (!taskMemory.contains(taskAttemptId)) {
+      taskMemory(taskAttemptId) = 0L
+      notifyAll()  // Will later cause waiting tasks to wake up and check numThreads again
     }
 
     // Keep looping until we're either sure that we don't want to grant this request (because this
-    // thread would have more than 1 / numActiveThreads of the memory) or we have enough free
-    // memory to give it (we always let each thread get at least 1 / (2 * numActiveThreads)).
+    // task would have more than 1 / numActiveTasks of the memory) or we have enough free
+    // memory to give it (we always let each task get at least 1 / (2 * numActiveTasks)).
     while (true) {
-      val numActiveThreads = threadMemory.keys.size
-      val curMem = threadMemory(threadId)
-      val freeMemory = maxMemory - threadMemory.values.sum
+      val numActiveTasks = taskMemory.keys.size
+      val curMem = taskMemory(taskAttemptId)
+      val freeMemory = maxMemory - taskMemory.values.sum
 
-      // How much we can grant this thread; don't let it grow to more than 1 / numActiveThreads;
+      // How much we can grant this task; don't let it grow to more than 1 / numActiveTasks;
       // don't let it be negative
-      val maxToGrant = math.min(numBytes, math.max(0, (maxMemory / numActiveThreads) - curMem))
+      val maxToGrant = math.min(numBytes, math.max(0, (maxMemory / numActiveTasks) - curMem))
 
-      if (curMem < maxMemory / (2 * numActiveThreads)) {
-        // We want to let each thread get at least 1 / (2 * numActiveThreads) before blocking;
-        // if we can't give it this much now, wait for other threads to free up memory
-        // (this happens if older threads allocated lots of memory before N grew)
-        if (freeMemory >= math.min(maxToGrant, maxMemory / (2 * numActiveThreads) - curMem)) {
+      if (curMem < maxMemory / (2 * numActiveTasks)) {
+        // We want to let each task get at least 1 / (2 * numActiveTasks) before blocking;
+        // if we can't give it this much now, wait for other tasks to free up memory
+        // (this happens if older tasks allocated lots of memory before N grew)
+        if (freeMemory >= math.min(maxToGrant, maxMemory / (2 * numActiveTasks) - curMem)) {
           val toGrant = math.min(maxToGrant, freeMemory)
-          threadMemory(threadId) += toGrant
+          taskMemory(taskAttemptId) += toGrant
           return toGrant
         } else {
-          logInfo(s"Thread $threadId waiting for at least 1/2N of shuffle memory pool to be free")
+          logInfo(
+            s"Thread $taskAttemptId waiting for at least 1/2N of shuffle memory pool to be free")
           wait()
         }
       } else {
         // Only give it as much memory as is free, which might be none if it reached 1 / numThreads
         val toGrant = math.min(maxToGrant, freeMemory)
-        threadMemory(threadId) += toGrant
+        taskMemory(taskAttemptId) += toGrant
         return toGrant
       }
     }
     0L  // Never reached
   }
 
-  /** Release numBytes bytes for the current thread. */
+  /** Release numBytes bytes for the current task. */
   def release(numBytes: Long): Unit = synchronized {
-    val threadId = Thread.currentThread().getId
-    val curMem = threadMemory.getOrElse(threadId, 0L)
+    val taskAttemptId = currentTaskAttemptId()
+    val curMem = taskMemory.getOrElse(taskAttemptId, 0L)
     if (curMem < numBytes) {
       throw new SparkException(
-        s"Internal error: release called on ${numBytes} bytes but thread only has ${curMem}")
+        s"Internal error: release called on ${numBytes} bytes but task only has ${curMem}")
     }
-    threadMemory(threadId) -= numBytes
+    taskMemory(taskAttemptId) -= numBytes
     notifyAll()  // Notify waiters who locked "this" in tryToAcquire that memory has been freed
   }
 
-  /** Release all memory for the current thread and mark it as inactive (e.g. when a task ends). */
-  def releaseMemoryForThisThread(): Unit = synchronized {
-    val threadId = Thread.currentThread().getId
-    threadMemory.remove(threadId)
+  /** Release all memory for the current task and mark it as inactive (e.g. when a task ends). */
+  def releaseMemoryForThisTask(): Unit = synchronized {
+    val taskAttemptId = currentTaskAttemptId()
+    taskMemory.remove(taskAttemptId)
     notifyAll()  // Notify waiters who locked "this" in tryToAcquire that memory has been freed
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala
index ed609772e6979..6f27f00307f8c 100644
--- a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala
+++ b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala
@@ -23,6 +23,7 @@ import java.util.LinkedHashMap
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
+import org.apache.spark.TaskContext
 import org.apache.spark.util.{SizeEstimator, Utils}
 import org.apache.spark.util.collection.SizeTrackingVector
 
@@ -43,11 +44,11 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
   // Ensure only one thread is putting, and if necessary, dropping blocks at any given time
   private val accountingLock = new Object
 
-  // A mapping from thread ID to amount of memory used for unrolling a block (in bytes)
+  // A mapping from taskAttemptId to amount of memory used for unrolling a block (in bytes)
   // All accesses of this map are assumed to have manually synchronized on `accountingLock`
   private val unrollMemoryMap = mutable.HashMap[Long, Long]()
   // Same as `unrollMemoryMap`, but for pending unroll memory as defined below.
-  // Pending unroll memory refers to the intermediate memory occupied by a thread
+  // Pending unroll memory refers to the intermediate memory occupied by a task
   // after the unroll but before the actual putting of the block in the cache.
   // This chunk of memory is expected to be released *as soon as* we finish
   // caching the corresponding block as opposed to until after the task finishes.
@@ -250,21 +251,21 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
     var elementsUnrolled = 0
     // Whether there is still enough memory for us to continue unrolling this block
     var keepUnrolling = true
-    // Initial per-thread memory to request for unrolling blocks (bytes). Exposed for testing.
+    // Initial per-task memory to request for unrolling blocks (bytes). Exposed for testing.
     val initialMemoryThreshold = unrollMemoryThreshold
     // How often to check whether we need to request more memory
     val memoryCheckPeriod = 16
-    // Memory currently reserved by this thread for this particular unrolling operation
+    // Memory currently reserved by this task for this particular unrolling operation
     var memoryThreshold = initialMemoryThreshold
     // Memory to request as a multiple of current vector size
     val memoryGrowthFactor = 1.5
-    // Previous unroll memory held by this thread, for releasing later (only at the very end)
-    val previousMemoryReserved = currentUnrollMemoryForThisThread
+    // Previous unroll memory held by this task, for releasing later (only at the very end)
+    val previousMemoryReserved = currentUnrollMemoryForThisTask
     // Underlying vector for unrolling the block
     var vector = new SizeTrackingVector[Any]
 
     // Request enough memory to begin unrolling
-    keepUnrolling = reserveUnrollMemoryForThisThread(initialMemoryThreshold)
+    keepUnrolling = reserveUnrollMemoryForThisTask(initialMemoryThreshold)
 
     if (!keepUnrolling) {
       logWarning(s"Failed to reserve initial memory threshold of " +
@@ -283,7 +284,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
             // Hold the accounting lock, in case another thread concurrently puts a block that
             // takes up the unrolling space we just ensured here
             accountingLock.synchronized {
-              if (!reserveUnrollMemoryForThisThread(amountToRequest)) {
+              if (!reserveUnrollMemoryForThisTask(amountToRequest)) {
                 // If the first request is not granted, try again after ensuring free space
                 // If there is still not enough space, give up and drop the partition
                 val spaceToEnsure = maxUnrollMemory - currentUnrollMemory
@@ -291,7 +292,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
                   val result = ensureFreeSpace(blockId, spaceToEnsure)
                   droppedBlocks ++= result.droppedBlocks
                 }
-                keepUnrolling = reserveUnrollMemoryForThisThread(amountToRequest)
+                keepUnrolling = reserveUnrollMemoryForThisTask(amountToRequest)
               }
             }
             // New threshold is currentSize * memoryGrowthFactor
@@ -317,9 +318,9 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
       // later when the task finishes.
       if (keepUnrolling) {
         accountingLock.synchronized {
-          val amountToRelease = currentUnrollMemoryForThisThread - previousMemoryReserved
-          releaseUnrollMemoryForThisThread(amountToRelease)
-          reservePendingUnrollMemoryForThisThread(amountToRelease)
+          val amountToRelease = currentUnrollMemoryForThisTask - previousMemoryReserved
+          releaseUnrollMemoryForThisTask(amountToRelease)
+          reservePendingUnrollMemoryForThisTask(amountToRelease)
         }
       }
     }
@@ -397,7 +398,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
         droppedBlockStatus.foreach { status => droppedBlocks += ((blockId, status)) }
       }
       // Release the unroll memory used because we no longer need the underlying Array
-      releasePendingUnrollMemoryForThisThread()
+      releasePendingUnrollMemoryForThisTask()
     }
     ResultWithDroppedBlocks(putSuccess, droppedBlocks)
   }
@@ -427,9 +428,9 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
 
     // Take into account the amount of memory currently occupied by unrolling blocks
     // and minus the pending unroll memory for that block on current thread.
-    val threadId = Thread.currentThread().getId
+    val taskAttemptId = currentTaskAttemptId()
     val actualFreeMemory = freeMemory - currentUnrollMemory +
-      pendingUnrollMemoryMap.getOrElse(threadId, 0L)
+      pendingUnrollMemoryMap.getOrElse(taskAttemptId, 0L)
 
     if (actualFreeMemory < space) {
       val rddToAdd = getRddId(blockIdToAdd)
@@ -455,7 +456,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
         logInfo(s"${selectedBlocks.size} blocks selected for dropping")
         for (blockId <- selectedBlocks) {
           val entry = entries.synchronized { entries.get(blockId) }
-          // This should never be null as only one thread should be dropping
+          // This should never be null as only one task should be dropping
           // blocks and removing entries. However the check is still here for
           // future safety.
           if (entry != null) {
@@ -482,79 +483,85 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
     entries.synchronized { entries.containsKey(blockId) }
   }
 
+  private def currentTaskAttemptId(): Long = {
+    // In case this is called on the driver, return an invalid task attempt id.
+    Option(TaskContext.get()).map(_.taskAttemptId()).getOrElse(-1L)
+  }
+
   /**
-   * Reserve additional memory for unrolling blocks used by this thread.
+   * Reserve additional memory for unrolling blocks used by this task.
    * Return whether the request is granted.
    */
-  def reserveUnrollMemoryForThisThread(memory: Long): Boolean = {
+  def reserveUnrollMemoryForThisTask(memory: Long): Boolean = {
     accountingLock.synchronized {
       val granted = freeMemory > currentUnrollMemory + memory
       if (granted) {
-        val threadId = Thread.currentThread().getId
-        unrollMemoryMap(threadId) = unrollMemoryMap.getOrElse(threadId, 0L) + memory
+        val taskAttemptId = currentTaskAttemptId()
+        unrollMemoryMap(taskAttemptId) = unrollMemoryMap.getOrElse(taskAttemptId, 0L) + memory
       }
       granted
     }
   }
 
   /**
-   * Release memory used by this thread for unrolling blocks.
-   * If the amount is not specified, remove the current thread's allocation altogether.
+   * Release memory used by this task for unrolling blocks.
+   * If the amount is not specified, remove the current task's allocation altogether.
    */
-  def releaseUnrollMemoryForThisThread(memory: Long = -1L): Unit = {
-    val threadId = Thread.currentThread().getId
+  def releaseUnrollMemoryForThisTask(memory: Long = -1L): Unit = {
+    val taskAttemptId = currentTaskAttemptId()
     accountingLock.synchronized {
       if (memory < 0) {
-        unrollMemoryMap.remove(threadId)
+        unrollMemoryMap.remove(taskAttemptId)
       } else {
-        unrollMemoryMap(threadId) = unrollMemoryMap.getOrElse(threadId, memory) - memory
-        // If this thread claims no more unroll memory, release it completely
-        if (unrollMemoryMap(threadId) <= 0) {
-          unrollMemoryMap.remove(threadId)
+        unrollMemoryMap(taskAttemptId) = unrollMemoryMap.getOrElse(taskAttemptId, memory) - memory
+        // If this task claims no more unroll memory, release it completely
+        if (unrollMemoryMap(taskAttemptId) <= 0) {
+          unrollMemoryMap.remove(taskAttemptId)
         }
       }
     }
   }
 
   /**
-   * Reserve the unroll memory of current unroll successful block used by this thread
+   * Reserve the unroll memory of current unroll successful block used by this task
    * until actually put the block into memory entry.
    */
-  def reservePendingUnrollMemoryForThisThread(memory: Long): Unit = {
-    val threadId = Thread.currentThread().getId
+  def reservePendingUnrollMemoryForThisTask(memory: Long): Unit = {
+    val taskAttemptId = currentTaskAttemptId()
     accountingLock.synchronized {
-       pendingUnrollMemoryMap(threadId) = pendingUnrollMemoryMap.getOrElse(threadId, 0L) + memory
+       pendingUnrollMemoryMap(taskAttemptId) =
+         pendingUnrollMemoryMap.getOrElse(taskAttemptId, 0L) + memory
     }
   }
 
   /**
-   * Release pending unroll memory of current unroll successful block used by this thread
+   * Release pending unroll memory of current unroll successful block used by this task
    */
-  def releasePendingUnrollMemoryForThisThread(): Unit = {
-    val threadId = Thread.currentThread().getId
+  def releasePendingUnrollMemoryForThisTask(): Unit = {
+    val taskAttemptId = currentTaskAttemptId()
     accountingLock.synchronized {
-      pendingUnrollMemoryMap.remove(threadId)
+      pendingUnrollMemoryMap.remove(taskAttemptId)
     }
   }
 
   /**
-   * Return the amount of memory currently occupied for unrolling blocks across all threads.
+   * Return the amount of memory currently occupied for unrolling blocks across all tasks.
    */
   def currentUnrollMemory: Long = accountingLock.synchronized {
     unrollMemoryMap.values.sum + pendingUnrollMemoryMap.values.sum
   }
 
   /**
-   * Return the amount of memory currently occupied for unrolling blocks by this thread.
+   * Return the amount of memory currently occupied for unrolling blocks by this task.
    */
-  def currentUnrollMemoryForThisThread: Long = accountingLock.synchronized {
-    unrollMemoryMap.getOrElse(Thread.currentThread().getId, 0L)
+  def currentUnrollMemoryForThisTask: Long = accountingLock.synchronized {
+    unrollMemoryMap.getOrElse(currentTaskAttemptId(), 0L)
   }
 
   /**
-   * Return the number of threads currently unrolling blocks.
+   * Return the number of tasks currently unrolling blocks.
    */
-  def numThreadsUnrolling: Int = accountingLock.synchronized { unrollMemoryMap.keys.size }
+  def numTasksUnrolling: Int = accountingLock.synchronized { unrollMemoryMap.keys.size }
 
   /**
    * Log information about current memory usage.
@@ -566,7 +573,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long)
     logInfo(
       s"Memory use = ${Utils.bytesToString(blocksMemory)} (blocks) + " +
       s"${Utils.bytesToString(unrollMemory)} (scratch space shared across " +
-      s"$numThreadsUnrolling thread(s)) = ${Utils.bytesToString(totalMemory)}. " +
+      s"$numTasksUnrolling tasks(s)) = ${Utils.bytesToString(totalMemory)}. " +
       s"Storage limit = ${Utils.bytesToString(maxMemory)}."
     )
   }
diff --git a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala
index 96778c9ebafb1..f495b6a037958 100644
--- a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala
@@ -17,26 +17,39 @@
 
 package org.apache.spark.shuffle
 
+import java.util.concurrent.CountDownLatch
+import java.util.concurrent.atomic.AtomicInteger
+
+import org.mockito.Mockito._
 import org.scalatest.concurrent.Timeouts
 import org.scalatest.time.SpanSugar._
-import java.util.concurrent.atomic.AtomicBoolean
-import java.util.concurrent.CountDownLatch
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.{SparkFunSuite, TaskContext}
 
 class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts {
+
+  val nextTaskAttemptId = new AtomicInteger()
+
   /** Launch a thread with the given body block and return it. */
   private def startThread(name: String)(body: => Unit): Thread = {
     val thread = new Thread("ShuffleMemorySuite " + name) {
       override def run() {
-        body
+        try {
+          val taskAttemptId = nextTaskAttemptId.getAndIncrement
+          val mockTaskContext = mock(classOf[TaskContext], RETURNS_SMART_NULLS)
+          when(mockTaskContext.taskAttemptId()).thenReturn(taskAttemptId)
+          TaskContext.setTaskContext(mockTaskContext)
+          body
+        } finally {
+          TaskContext.unset()
+        }
       }
     }
     thread.start()
     thread
   }
 
-  test("single thread requesting memory") {
+  test("single task requesting memory") {
     val manager = new ShuffleMemoryManager(1000L)
 
     assert(manager.tryToAcquire(100L) === 100L)
@@ -50,7 +63,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts {
     assert(manager.tryToAcquire(300L) === 300L)
     assert(manager.tryToAcquire(300L) === 200L)
 
-    manager.releaseMemoryForThisThread()
+    manager.releaseMemoryForThisTask()
     assert(manager.tryToAcquire(1000L) === 1000L)
     assert(manager.tryToAcquire(100L) === 0L)
   }
@@ -107,8 +120,8 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts {
   }
 
 
-  test("threads cannot grow past 1 / N") {
-    // Two threads request 250 bytes first, wait for each other to get it, and then request
+  test("tasks cannot grow past 1 / N") {
+    // Two tasks request 250 bytes first, wait for each other to get it, and then request
     // 500 more; we should only grant 250 bytes to each of them on this second request
 
     val manager = new ShuffleMemoryManager(1000L)
@@ -158,7 +171,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts {
     assert(state.t2Result2 === 250L)
   }
 
-  test("threads can block to get at least 1 / 2N memory") {
+  test("tasks can block to get at least 1 / 2N memory") {
     // t1 grabs 1000 bytes and then waits until t2 is ready to make a request. It sleeps
     // for a bit and releases 250 bytes, which should then be granted to t2. Further requests
     // by t2 will return false right away because it now has 1 / 2N of the memory.
@@ -224,7 +237,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts {
     }
   }
 
-  test("releaseMemoryForThisThread") {
+  test("releaseMemoryForThisTask") {
     // t1 grabs 1000 bytes and then waits until t2 is ready to make a request. It sleeps
     // for a bit and releases all its memory. t2 should now be able to grab all the memory.
 
@@ -251,9 +264,9 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts {
         }
       }
       // Sleep a bit before releasing our memory; this is hacky but it would be difficult to make
-      // sure the other thread blocks for some time otherwise
+      // sure the other task blocks for some time otherwise
       Thread.sleep(300)
-      manager.releaseMemoryForThisThread()
+      manager.releaseMemoryForThisTask()
     }
 
     val t2 = startThread("t2") {
@@ -282,7 +295,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts {
       t2.join()
     }
 
-    // Both threads should've been able to acquire their memory; the second one will have waited
+    // Both tasks should've been able to acquire their memory; the second one will have waited
     // until the first one acquired 1000 bytes and then released all of it
     state.synchronized {
       assert(state.t1Result === 1000L, "t1 could not allocate memory")
@@ -293,7 +306,7 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts {
     }
   }
 
-  test("threads should not be granted a negative size") {
+  test("tasks should not be granted a negative size") {
     val manager = new ShuffleMemoryManager(1000L)
     manager.tryToAcquire(700L)
 
diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala
index bcee901f5dd5f..f480fd107a0c2 100644
--- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala
@@ -1004,32 +1004,32 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     store = makeBlockManager(12000)
     val memoryStore = store.memoryStore
     assert(memoryStore.currentUnrollMemory === 0)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
 
     // Reserve
-    memoryStore.reserveUnrollMemoryForThisThread(100)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 100)
-    memoryStore.reserveUnrollMemoryForThisThread(200)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 300)
-    memoryStore.reserveUnrollMemoryForThisThread(500)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 800)
-    memoryStore.reserveUnrollMemoryForThisThread(1000000)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 800) // not granted
+    memoryStore.reserveUnrollMemoryForThisTask(100)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 100)
+    memoryStore.reserveUnrollMemoryForThisTask(200)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 300)
+    memoryStore.reserveUnrollMemoryForThisTask(500)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 800)
+    memoryStore.reserveUnrollMemoryForThisTask(1000000)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 800) // not granted
     // Release
-    memoryStore.releaseUnrollMemoryForThisThread(100)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 700)
-    memoryStore.releaseUnrollMemoryForThisThread(100)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 600)
+    memoryStore.releaseUnrollMemoryForThisTask(100)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 700)
+    memoryStore.releaseUnrollMemoryForThisTask(100)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 600)
     // Reserve again
-    memoryStore.reserveUnrollMemoryForThisThread(4400)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 5000)
-    memoryStore.reserveUnrollMemoryForThisThread(20000)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 5000) // not granted
+    memoryStore.reserveUnrollMemoryForThisTask(4400)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 5000)
+    memoryStore.reserveUnrollMemoryForThisTask(20000)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 5000) // not granted
     // Release again
-    memoryStore.releaseUnrollMemoryForThisThread(1000)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 4000)
-    memoryStore.releaseUnrollMemoryForThisThread() // release all
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    memoryStore.releaseUnrollMemoryForThisTask(1000)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 4000)
+    memoryStore.releaseUnrollMemoryForThisTask() // release all
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
   }
 
   /**
@@ -1060,24 +1060,24 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     val bigList = List.fill(40)(new Array[Byte](1000))
     val memoryStore = store.memoryStore
     val droppedBlocks = new ArrayBuffer[(BlockId, BlockStatus)]
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
 
     // Unroll with all the space in the world. This should succeed and return an array.
     var unrollResult = memoryStore.unrollSafely("unroll", smallList.iterator, droppedBlocks)
     verifyUnroll(smallList.iterator, unrollResult, shouldBeArray = true)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
-    memoryStore.releasePendingUnrollMemoryForThisThread()
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
+    memoryStore.releasePendingUnrollMemoryForThisTask()
 
     // Unroll with not enough space. This should succeed after kicking out someBlock1.
     store.putIterator("someBlock1", smallList.iterator, StorageLevel.MEMORY_ONLY)
     store.putIterator("someBlock2", smallList.iterator, StorageLevel.MEMORY_ONLY)
     unrollResult = memoryStore.unrollSafely("unroll", smallList.iterator, droppedBlocks)
     verifyUnroll(smallList.iterator, unrollResult, shouldBeArray = true)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
     assert(droppedBlocks.size === 1)
     assert(droppedBlocks.head._1 === TestBlockId("someBlock1"))
     droppedBlocks.clear()
-    memoryStore.releasePendingUnrollMemoryForThisThread()
+    memoryStore.releasePendingUnrollMemoryForThisTask()
 
     // Unroll huge block with not enough space. Even after ensuring free space of 12000 * 0.4 =
     // 4800 bytes, there is still not enough room to unroll this block. This returns an iterator.
@@ -1085,7 +1085,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     store.putIterator("someBlock3", smallList.iterator, StorageLevel.MEMORY_ONLY)
     unrollResult = memoryStore.unrollSafely("unroll", bigList.iterator, droppedBlocks)
     verifyUnroll(bigList.iterator, unrollResult, shouldBeArray = false)
-    assert(memoryStore.currentUnrollMemoryForThisThread > 0) // we returned an iterator
+    assert(memoryStore.currentUnrollMemoryForThisTask > 0) // we returned an iterator
     assert(droppedBlocks.size === 1)
     assert(droppedBlocks.head._1 === TestBlockId("someBlock2"))
     droppedBlocks.clear()
@@ -1099,7 +1099,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     val bigList = List.fill(40)(new Array[Byte](1000))
     def smallIterator: Iterator[Any] = smallList.iterator.asInstanceOf[Iterator[Any]]
     def bigIterator: Iterator[Any] = bigList.iterator.asInstanceOf[Iterator[Any]]
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
 
     // Unroll with plenty of space. This should succeed and cache both blocks.
     val result1 = memoryStore.putIterator("b1", smallIterator, memOnly, returnValues = true)
@@ -1110,7 +1110,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     assert(result2.size > 0)
     assert(result1.data.isLeft) // unroll did not drop this block to disk
     assert(result2.data.isLeft)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
 
     // Re-put these two blocks so block manager knows about them too. Otherwise, block manager
     // would not know how to drop them from memory later.
@@ -1126,7 +1126,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     assert(!memoryStore.contains("b1"))
     assert(memoryStore.contains("b2"))
     assert(memoryStore.contains("b3"))
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
     memoryStore.remove("b3")
     store.putIterator("b3", smallIterator, memOnly)
 
@@ -1138,7 +1138,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     assert(!memoryStore.contains("b2"))
     assert(memoryStore.contains("b3"))
     assert(!memoryStore.contains("b4"))
-    assert(memoryStore.currentUnrollMemoryForThisThread > 0) // we returned an iterator
+    assert(memoryStore.currentUnrollMemoryForThisTask > 0) // we returned an iterator
   }
 
   /**
@@ -1153,7 +1153,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     val bigList = List.fill(40)(new Array[Byte](1000))
     def smallIterator: Iterator[Any] = smallList.iterator.asInstanceOf[Iterator[Any]]
     def bigIterator: Iterator[Any] = bigList.iterator.asInstanceOf[Iterator[Any]]
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
 
     store.putIterator("b1", smallIterator, memAndDisk)
     store.putIterator("b2", smallIterator, memAndDisk)
@@ -1170,7 +1170,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     assert(!diskStore.contains("b3"))
     memoryStore.remove("b3")
     store.putIterator("b3", smallIterator, StorageLevel.MEMORY_ONLY)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
 
     // Unroll huge block with not enough space. This should fail and drop the new block to disk
     // directly in addition to kicking out b2 in the process. Memory store should contain only
@@ -1186,7 +1186,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     assert(diskStore.contains("b2"))
     assert(!diskStore.contains("b3"))
     assert(diskStore.contains("b4"))
-    assert(memoryStore.currentUnrollMemoryForThisThread > 0) // we returned an iterator
+    assert(memoryStore.currentUnrollMemoryForThisTask > 0) // we returned an iterator
   }
 
   test("multiple unrolls by the same thread") {
@@ -1195,32 +1195,32 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
     val memoryStore = store.memoryStore
     val smallList = List.fill(40)(new Array[Byte](100))
     def smallIterator: Iterator[Any] = smallList.iterator.asInstanceOf[Iterator[Any]]
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
 
     // All unroll memory used is released because unrollSafely returned an array
     memoryStore.putIterator("b1", smallIterator, memOnly, returnValues = true)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
     memoryStore.putIterator("b2", smallIterator, memOnly, returnValues = true)
-    assert(memoryStore.currentUnrollMemoryForThisThread === 0)
+    assert(memoryStore.currentUnrollMemoryForThisTask === 0)
 
     // Unroll memory is not released because unrollSafely returned an iterator
     // that still depends on the underlying vector used in the process
     memoryStore.putIterator("b3", smallIterator, memOnly, returnValues = true)
-    val unrollMemoryAfterB3 = memoryStore.currentUnrollMemoryForThisThread
+    val unrollMemoryAfterB3 = memoryStore.currentUnrollMemoryForThisTask
     assert(unrollMemoryAfterB3 > 0)
 
     // The unroll memory owned by this thread builds on top of its value after the previous unrolls
     memoryStore.putIterator("b4", smallIterator, memOnly, returnValues = true)
-    val unrollMemoryAfterB4 = memoryStore.currentUnrollMemoryForThisThread
+    val unrollMemoryAfterB4 = memoryStore.currentUnrollMemoryForThisTask
     assert(unrollMemoryAfterB4 > unrollMemoryAfterB3)
 
     // ... but only to a certain extent (until we run out of free space to grant new unroll memory)
     memoryStore.putIterator("b5", smallIterator, memOnly, returnValues = true)
-    val unrollMemoryAfterB5 = memoryStore.currentUnrollMemoryForThisThread
+    val unrollMemoryAfterB5 = memoryStore.currentUnrollMemoryForThisTask
     memoryStore.putIterator("b6", smallIterator, memOnly, returnValues = true)
-    val unrollMemoryAfterB6 = memoryStore.currentUnrollMemoryForThisThread
+    val unrollMemoryAfterB6 = memoryStore.currentUnrollMemoryForThisTask
     memoryStore.putIterator("b7", smallIterator, memOnly, returnValues = true)
-    val unrollMemoryAfterB7 = memoryStore.currentUnrollMemoryForThisThread
+    val unrollMemoryAfterB7 = memoryStore.currentUnrollMemoryForThisTask
     assert(unrollMemoryAfterB5 === unrollMemoryAfterB4)
     assert(unrollMemoryAfterB6 === unrollMemoryAfterB4)
     assert(unrollMemoryAfterB7 === unrollMemoryAfterB4)

From 6309b93467b06f27cd76d4662b51b47de100c677 Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Tue, 28 Jul 2015 22:38:28 -0700
Subject: [PATCH 441/549] [SPARK-9398] [SQL] Datetime cleanup

JIRA: https://issues.apache.org/jira/browse/SPARK-9398

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7725 from yjshen/date_null_check and squashes the following commits:

b4eade1 [Yijie Shen] inline daysToMonthEnd
d09acc1 [Yijie Shen] implement getLastDayOfMonth to avoid repeated evaluation
d857ec3 [Yijie Shen] add null check in DateExpressionSuite
---
 .../expressions/datetimeFunctions.scala       | 45 ++++++-------------
 .../sql/catalyst/util/DateTimeUtils.scala     | 43 +++++++++++++-----
 .../expressions/DateExpressionsSuite.scala    |  2 +
 3 files changed, 47 insertions(+), 43 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index c37afc13f2d17..efecb771f2f5d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -74,9 +74,7 @@ case class Hour(child: Expression) extends UnaryExpression with ImplicitCastInpu
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, (c) =>
-      s"""$dtu.getHours($c)"""
-    )
+    defineCodeGen(ctx, ev, c => s"$dtu.getHours($c)")
   }
 }
 
@@ -92,9 +90,7 @@ case class Minute(child: Expression) extends UnaryExpression with ImplicitCastIn
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, (c) =>
-      s"""$dtu.getMinutes($c)"""
-    )
+    defineCodeGen(ctx, ev, c => s"$dtu.getMinutes($c)")
   }
 }
 
@@ -110,9 +106,7 @@ case class Second(child: Expression) extends UnaryExpression with ImplicitCastIn
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, (c) =>
-      s"""$dtu.getSeconds($c)"""
-    )
+    defineCodeGen(ctx, ev, c => s"$dtu.getSeconds($c)")
   }
 }
 
@@ -128,9 +122,7 @@ case class DayOfYear(child: Expression) extends UnaryExpression with ImplicitCas
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, (c) =>
-      s"""$dtu.getDayInYear($c)"""
-    )
+    defineCodeGen(ctx, ev, c => s"$dtu.getDayInYear($c)")
   }
 }
 
@@ -147,9 +139,7 @@ case class Year(child: Expression) extends UnaryExpression with ImplicitCastInpu
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, c =>
-      s"""$dtu.getYear($c)"""
-    )
+    defineCodeGen(ctx, ev, c => s"$dtu.getYear($c)")
   }
 }
 
@@ -165,9 +155,7 @@ case class Quarter(child: Expression) extends UnaryExpression with ImplicitCastI
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, (c) =>
-      s"""$dtu.getQuarter($c)"""
-    )
+    defineCodeGen(ctx, ev, c => s"$dtu.getQuarter($c)")
   }
 }
 
@@ -183,9 +171,7 @@ case class Month(child: Expression) extends UnaryExpression with ImplicitCastInp
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, (c) =>
-      s"""$dtu.getMonth($c)"""
-    )
+    defineCodeGen(ctx, ev, c => s"$dtu.getMonth($c)")
   }
 }
 
@@ -201,9 +187,7 @@ case class DayOfMonth(child: Expression) extends UnaryExpression with ImplicitCa
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, (c) =>
-      s"""$dtu.getDayOfMonth($c)"""
-    )
+    defineCodeGen(ctx, ev, c => s"$dtu.getDayOfMonth($c)")
   }
 }
 
@@ -226,7 +210,7 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, (time) => {
+    nullSafeCodeGen(ctx, ev, time => {
       val cal = classOf[Calendar].getName
       val c = ctx.freshName("cal")
       ctx.addMutableState(cal, c,
@@ -250,8 +234,6 @@ case class DateFormatClass(left: Expression, right: Expression) extends BinaryEx
 
   override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringType)
 
-  override def prettyName: String = "date_format"
-
   override protected def nullSafeEval(timestamp: Any, format: Any): Any = {
     val sdf = new SimpleDateFormat(format.toString)
     UTF8String.fromString(sdf.format(new Date(timestamp.asInstanceOf[Long] / 1000)))
@@ -264,6 +246,8 @@ case class DateFormatClass(left: Expression, right: Expression) extends BinaryEx
           .format(new java.sql.Date($timestamp / 1000)))"""
     })
   }
+
+  override def prettyName: String = "date_format"
 }
 
 /**
@@ -277,15 +261,12 @@ case class LastDay(startDate: Expression) extends UnaryExpression with ImplicitC
   override def dataType: DataType = DateType
 
   override def nullSafeEval(date: Any): Any = {
-    val days = date.asInstanceOf[Int]
-    DateTimeUtils.getLastDayOfMonth(days)
+    DateTimeUtils.getLastDayOfMonth(date.asInstanceOf[Int])
   }
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
-    defineCodeGen(ctx, ev, (sd) => {
-      s"$dtu.getLastDayOfMonth($sd)"
-    })
+    defineCodeGen(ctx, ev, sd => s"$dtu.getLastDayOfMonth($sd)")
   }
 
   override def prettyName: String = "last_day"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 8b0b80c26db17..93966a503c27c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -600,23 +600,44 @@ object DateTimeUtils {
     startDate + 1 + ((dayOfWeek - 1 - startDate) % 7 + 7) % 7
   }
 
-  /**
-   * number of days in a non-leap year.
-   */
-  private[this] val daysInNormalYear = Array(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
-
   /**
    * Returns last day of the month for the given date. The date is expressed in days
    * since 1.1.1970.
    */
   def getLastDayOfMonth(date: Int): Int = {
-    val dayOfMonth = getDayOfMonth(date)
-    val month = getMonth(date)
-    if (month == 2 && isLeapYear(getYear(date))) {
-      date + daysInNormalYear(month - 1) + 1 - dayOfMonth
+    var (year, dayInYear) = getYearAndDayInYear(date)
+    if (isLeapYear(year)) {
+      if (dayInYear > 31 && dayInYear <= 60) {
+        return date + (60 - dayInYear)
+      } else if (dayInYear > 60) {
+        dayInYear = dayInYear - 1
+      }
+    }
+    val lastDayOfMonthInYear = if (dayInYear <= 31) {
+      31
+    } else if (dayInYear <= 59) {
+      59
+    } else if (dayInYear <= 90) {
+      90
+    } else if (dayInYear <= 120) {
+      120
+    } else if (dayInYear <= 151) {
+      151
+    } else if (dayInYear <= 181) {
+      181
+    } else if (dayInYear <= 212) {
+      212
+    } else if (dayInYear <= 243) {
+      243
+    } else if (dayInYear <= 273) {
+      273
+    } else if (dayInYear <= 304) {
+      304
+    } else if (dayInYear <= 334) {
+      334
     } else {
-      date + daysInNormalYear(month - 1) - dayOfMonth
+      365
     }
+    date + (lastDayOfMonthInYear - dayInYear)
   }
-
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index 30c5769424bd7..aca8d6eb3500c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -106,6 +106,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
         }
       }
     }
+    checkEvaluation(DayOfYear(Literal.create(null, DateType)), null)
   }
 
   test("Year") {
@@ -274,6 +275,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(LastDay(Literal(Date.valueOf("2015-12-05"))), Date.valueOf("2015-12-31"))
     checkEvaluation(LastDay(Literal(Date.valueOf("2016-01-06"))), Date.valueOf("2016-01-31"))
     checkEvaluation(LastDay(Literal(Date.valueOf("2016-02-07"))), Date.valueOf("2016-02-29"))
+    checkEvaluation(LastDay(Literal.create(null, DateType)), null)
   }
 
   test("next_day") {

From 15667a0afa5fb17f4cc6fbf32b2ddb573630f20a Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Tue, 28 Jul 2015 22:51:08 -0700
Subject: [PATCH 442/549] [SPARK-9281] [SQL] use decimal or double when parsing
 SQL

Right now, we use double to parse all the float number in SQL. When it's used in expression together with DecimalType, it will turn the decimal into double as well. Also it will loss some precision when using double.

This PR change to parse float number to decimal or double, based on it's  using scientific notation or not, see https://msdn.microsoft.com/en-us/library/ms179899.aspx

This is a break change, should we doc it somewhere?

Author: Davies Liu <davies@databricks.com>

Closes #7642 from davies/parse_decimal and squashes the following commits:

1f576d9 [Davies Liu] Merge branch 'master' of github.com:apache/spark into parse_decimal
5e142b6 [Davies Liu] fix scala style
eca99de [Davies Liu] fix tests
2afe702 [Davies Liu] Merge branch 'master' of github.com:apache/spark into parse_decimal
f4a320b [Davies Liu] Update SqlParser.scala
1c48e34 [Davies Liu] use decimal or double when parsing SQL
---
 .../apache/spark/sql/catalyst/SqlParser.scala | 14 +++++-
 .../catalyst/analysis/HiveTypeCoercion.scala  | 50 ++++++++++++-------
 .../sql/catalyst/analysis/AnalysisSuite.scala |  4 +-
 .../spark/sql/MathExpressionsSuite.scala      |  3 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  | 14 +++---
 .../org/apache/spark/sql/json/JsonSuite.scala | 14 +++---
 6 files changed, 62 insertions(+), 37 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index b423f0fa04f69..e5f115f74bf3b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -332,8 +332,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
   protected lazy val numericLiteral: Parser[Literal] =
     ( integral  ^^ { case i => Literal(toNarrowestIntegerType(i)) }
     | sign.? ~ unsignedFloat ^^ {
-      // TODO(davies): some precisions may loss, we should create decimal literal
-      case s ~ f => Literal(BigDecimal(s.getOrElse("") + f).doubleValue())
+      case s ~ f => Literal(toDecimalOrDouble(s.getOrElse("") + f))
     }
     )
 
@@ -420,6 +419,17 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     }
   }
 
+  private def toDecimalOrDouble(value: String): Any = {
+    val decimal = BigDecimal(value)
+    // follow the behavior in MS SQL Server
+    // https://msdn.microsoft.com/en-us/library/ms179899.aspx
+    if (value.contains('E') || value.contains('e')) {
+      decimal.doubleValue()
+    } else {
+      decimal.underlying()
+    }
+  }
+
   protected lazy val baseExpression: Parser[Expression] =
     ( "*" ^^^ UnresolvedStar(None)
     | ident <~ "." ~ "*" ^^ { case tableName => UnresolvedStar(Option(tableName)) }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index e0527503442f0..ecc48986e35d8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -109,13 +109,35 @@ object HiveTypeCoercion {
    * Find the tightest common type of a set of types by continuously applying
    * `findTightestCommonTypeOfTwo` on these types.
    */
-  private def findTightestCommonType(types: Seq[DataType]) = {
+  private def findTightestCommonType(types: Seq[DataType]): Option[DataType] = {
     types.foldLeft[Option[DataType]](Some(NullType))((r, c) => r match {
       case None => None
       case Some(d) => findTightestCommonTypeOfTwo(d, c)
     })
   }
 
+  private def findWiderTypeForTwo(t1: DataType, t2: DataType): Option[DataType] = (t1, t2) match {
+    case (t1: DecimalType, t2: DecimalType) =>
+      Some(DecimalPrecision.widerDecimalType(t1, t2))
+    case (t: IntegralType, d: DecimalType) =>
+      Some(DecimalPrecision.widerDecimalType(DecimalType.forType(t), d))
+    case (d: DecimalType, t: IntegralType) =>
+      Some(DecimalPrecision.widerDecimalType(DecimalType.forType(t), d))
+    case (t: FractionalType, d: DecimalType) =>
+      Some(DoubleType)
+    case (d: DecimalType, t: FractionalType) =>
+      Some(DoubleType)
+    case _ =>
+      findTightestCommonTypeToString(t1, t2)
+  }
+
+  private def findWiderCommonType(types: Seq[DataType]) = {
+    types.foldLeft[Option[DataType]](Some(NullType))((r, c) => r match {
+      case Some(d) => findWiderTypeForTwo(d, c)
+      case None => None
+    })
+  }
+
   /**
    * Applies any changes to [[AttributeReference]] data types that are made by other rules to
    * instances higher in the query tree.
@@ -182,20 +204,7 @@ object HiveTypeCoercion {
 
       val castedTypes = left.output.zip(right.output).map {
         case (lhs, rhs) if lhs.dataType != rhs.dataType =>
-          (lhs.dataType, rhs.dataType) match {
-            case (t1: DecimalType, t2: DecimalType) =>
-              Some(DecimalPrecision.widerDecimalType(t1, t2))
-            case (t: IntegralType, d: DecimalType) =>
-              Some(DecimalPrecision.widerDecimalType(DecimalType.forType(t), d))
-            case (d: DecimalType, t: IntegralType) =>
-              Some(DecimalPrecision.widerDecimalType(DecimalType.forType(t), d))
-            case (t: FractionalType, d: DecimalType) =>
-              Some(DoubleType)
-            case (d: DecimalType, t: FractionalType) =>
-              Some(DoubleType)
-            case _ =>
-              findTightestCommonTypeToString(lhs.dataType, rhs.dataType)
-          }
+          findWiderTypeForTwo(lhs.dataType, rhs.dataType)
         case other => None
       }
 
@@ -236,8 +245,13 @@ object HiveTypeCoercion {
       // Skip nodes who's children have not been resolved yet.
       case e if !e.childrenResolved => e
 
-      case a @ BinaryArithmetic(left @ StringType(), r) =>
-        a.makeCopy(Array(Cast(left, DoubleType), r))
+      case a @ BinaryArithmetic(left @ StringType(), right @ DecimalType.Expression(_, _)) =>
+        a.makeCopy(Array(Cast(left, DecimalType.SYSTEM_DEFAULT), right))
+      case a @ BinaryArithmetic(left @ DecimalType.Expression(_, _), right @ StringType()) =>
+        a.makeCopy(Array(left, Cast(right, DecimalType.SYSTEM_DEFAULT)))
+
+      case a @ BinaryArithmetic(left @ StringType(), right) =>
+        a.makeCopy(Array(Cast(left, DoubleType), right))
       case a @ BinaryArithmetic(left, right @ StringType()) =>
         a.makeCopy(Array(left, Cast(right, DoubleType)))
 
@@ -543,7 +557,7 @@ object HiveTypeCoercion {
       // compatible with every child column.
       case c @ Coalesce(es) if es.map(_.dataType).distinct.size > 1 =>
         val types = es.map(_.dataType)
-        findTightestCommonTypeAndPromoteToString(types) match {
+        findWiderCommonType(types) match {
           case Some(finalDataType) => Coalesce(es.map(Cast(_, finalDataType)))
           case None => c
         }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 4589facb49b76..221b4e92f086c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -145,11 +145,11 @@ class AnalysisSuite extends AnalysisTest {
         'e / 'e as 'div5))
     val pl = plan.asInstanceOf[Project].projectList
 
-    // StringType will be promoted into Double
     assert(pl(0).dataType == DoubleType)
     assert(pl(1).dataType == DoubleType)
     assert(pl(2).dataType == DoubleType)
-    assert(pl(3).dataType == DoubleType)
+    // StringType will be promoted into Decimal(38, 18)
+    assert(pl(3).dataType == DecimalType(38, 29))
     assert(pl(4).dataType == DoubleType)
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
index 21256704a5b16..8cf2ef5957d8d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/MathExpressionsSuite.scala
@@ -216,7 +216,8 @@ class MathExpressionsSuite extends QueryTest {
     checkAnswer(
       ctx.sql(s"SELECT round($pi, -3), round($pi, -2), round($pi, -1), " +
         s"round($pi, 0), round($pi, 1), round($pi, 2), round($pi, 3)"),
-      Seq(Row(0.0, 0.0, 0.0, 3.0, 3.1, 3.14, 3.142))
+      Seq(Row(BigDecimal("0E3"), BigDecimal("0E2"), BigDecimal("0E1"), BigDecimal(3),
+        BigDecimal("3.1"), BigDecimal("3.14"), BigDecimal("3.142")))
     )
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 42724ed766af5..d13dde1cdc8b2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -368,7 +368,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
       Row(1))
     checkAnswer(
       sql("SELECT COALESCE(null, 1, 1.5)"),
-      Row(1.toDouble))
+      Row(BigDecimal(1)))
     checkAnswer(
       sql("SELECT COALESCE(null, null, null)"),
       Row(null))
@@ -1234,19 +1234,19 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
 
   test("Floating point number format") {
     checkAnswer(
-      sql("SELECT 0.3"), Row(0.3)
+      sql("SELECT 0.3"), Row(BigDecimal(0.3).underlying())
     )
 
     checkAnswer(
-      sql("SELECT -0.8"), Row(-0.8)
+      sql("SELECT -0.8"), Row(BigDecimal(-0.8).underlying())
     )
 
     checkAnswer(
-      sql("SELECT .5"), Row(0.5)
+      sql("SELECT .5"), Row(BigDecimal(0.5))
     )
 
     checkAnswer(
-      sql("SELECT -.18"), Row(-0.18)
+      sql("SELECT -.18"), Row(BigDecimal(-0.18))
     )
   }
 
@@ -1279,11 +1279,11 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     )
 
     checkAnswer(
-      sql("SELECT -5.2"), Row(-5.2)
+      sql("SELECT -5.2"), Row(BigDecimal(-5.2))
     )
 
     checkAnswer(
-      sql("SELECT +6.8"), Row(6.8)
+      sql("SELECT +6.8"), Row(BigDecimal(6.8))
     )
 
     checkAnswer(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 3ac312d6f4c50..f19f22fca7d54 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -422,14 +422,14 @@ class JsonSuite extends QueryTest with TestJsonData {
       Row(-89) :: Row(21474836370L) :: Row(21474836470L) :: Nil
     )
 
-    // Widening to DoubleType
+    // Widening to DecimalType
     checkAnswer(
       sql("select num_num_2 + 1.3 from jsonTable where num_num_2 > 1.1"),
-      Row(21474836472.2) ::
-        Row(92233720368547758071.3) :: Nil
+      Row(BigDecimal("21474836472.2")) ::
+        Row(BigDecimal("92233720368547758071.3")) :: Nil
     )
 
-    // Widening to DoubleType
+    // Widening to Double
     checkAnswer(
       sql("select num_num_3 + 1.2 from jsonTable where num_num_3 > 1.1"),
       Row(101.2) :: Row(21474836471.2) :: Nil
@@ -438,13 +438,13 @@ class JsonSuite extends QueryTest with TestJsonData {
     // Number and String conflict: resolve the type as number in this query.
     checkAnswer(
       sql("select num_str + 1.2 from jsonTable where num_str > 14"),
-      Row(92233720368547758071.2)
+      Row(BigDecimal("92233720368547758071.2"))
     )
 
     // Number and String conflict: resolve the type as number in this query.
     checkAnswer(
       sql("select num_str + 1.2 from jsonTable where num_str >= 92233720368547758060"),
-      Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue)
+      Row(new java.math.BigDecimal("92233720368547758071.2"))
     )
 
     // String and Boolean conflict: resolve the type as string.
@@ -503,7 +503,7 @@ class JsonSuite extends QueryTest with TestJsonData {
     // Number and String conflict: resolve the type as number in this query.
     checkAnswer(
       sql("select num_str + 1.2 from jsonTable where num_str > 13"),
-      Row(14.3) :: Row(92233720368547758071.2) :: Nil
+      Row(BigDecimal("14.3")) :: Row(BigDecimal("92233720368547758071.2")) :: Nil
     )
   }
 

From 708794e8aae2c66bd291bab4f12117c33b57840c Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Wed, 29 Jul 2015 00:08:45 -0700
Subject: [PATCH 443/549] [SPARK-9251][SQL] do not order by expressions which
 still need evaluation

as an offline discussion with rxin , it's weird to be computing stuff while doing sorting, we should only order by bound reference during execution.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7593 from cloud-fan/sort and squashes the following commits:

7b1bef7 [Wenchen Fan] add test
daf206d [Wenchen Fan] add more comments
289bee0 [Wenchen Fan] do not order by expressions which still need evaluation
---
 .../sql/catalyst/analysis/Analyzer.scala      | 58 +++++++++++++++++++
 .../sql/catalyst/expressions/random.scala     |  4 +-
 .../plans/logical/basicOperators.scala        | 13 +++--
 .../sql/catalyst/analysis/AnalysisSuite.scala | 36 ++++++++++--
 .../scala/org/apache/spark/sql/TestData.scala |  2 -
 5 files changed, 101 insertions(+), 12 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index a6ea0cc0a83a8..265f3d1e41765 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -79,6 +79,7 @@ class Analyzer(
       ExtractWindowExpressions ::
       GlobalAggregates ::
       UnresolvedHavingClauseAttributes ::
+      RemoveEvaluationFromSort ::
       HiveTypeCoercion.typeCoercionRules ++
       extendedResolutionRules : _*),
     Batch("Nondeterministic", Once,
@@ -947,6 +948,63 @@ class Analyzer(
         Project(p.output, newPlan.withNewChildren(newChild :: Nil))
     }
   }
+
+  /**
+   * Removes all still-need-evaluate ordering expressions from sort and use an inner project to
+   * materialize them, finally use a outer project to project them away to keep the result same.
+   * Then we can make sure we only sort by [[AttributeReference]]s.
+   *
+   * As an example,
+   * {{{
+   *   Sort('a, 'b + 1,
+   *     Relation('a, 'b))
+   * }}}
+   * will be turned into:
+   * {{{
+   *   Project('a, 'b,
+   *     Sort('a, '_sortCondition,
+   *       Project('a, 'b, ('b + 1).as("_sortCondition"),
+   *         Relation('a, 'b))))
+   * }}}
+   */
+  object RemoveEvaluationFromSort extends Rule[LogicalPlan] {
+    private def hasAlias(expr: Expression) = {
+      expr.find {
+        case a: Alias => true
+        case _ => false
+      }.isDefined
+    }
+
+    override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+      // The ordering expressions have no effect to the output schema of `Sort`,
+      // so `Alias`s in ordering expressions are unnecessary and we should remove them.
+      case s @ Sort(ordering, _, _) if ordering.exists(hasAlias) =>
+        val newOrdering = ordering.map(_.transformUp {
+          case Alias(child, _) => child
+        }.asInstanceOf[SortOrder])
+        s.copy(order = newOrdering)
+
+      case s @ Sort(ordering, global, child)
+        if s.expressions.forall(_.resolved) && s.childrenResolved && !s.hasNoEvaluation =>
+
+        val (ref, needEval) = ordering.partition(_.child.isInstanceOf[AttributeReference])
+
+        val namedExpr = needEval.map(_.child match {
+          case n: NamedExpression => n
+          case e => Alias(e, "_sortCondition")()
+        })
+
+        val newOrdering = ref ++ needEval.zip(namedExpr).map { case (order, ne) =>
+          order.copy(child = ne.toAttribute)
+        }
+
+        // Add still-need-evaluate ordering expressions into inner project and then project
+        // them away after the sort.
+        Project(child.output,
+          Sort(newOrdering, global,
+            Project(child.output ++ namedExpr, child)))
+    }
+  }
 }
 
 /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
index 8f30519697a37..62d3d204ca872 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/random.scala
@@ -66,7 +66,7 @@ case class Rand(seed: Long) extends RDG {
     val rngTerm = ctx.freshName("rng")
     val className = classOf[XORShiftRandom].getName
     ctx.addMutableState(className, rngTerm,
-      s"$rngTerm = new $className($seed + org.apache.spark.TaskContext.getPartitionId());")
+      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
     ev.isNull = "false"
     s"""
       final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextDouble();
@@ -89,7 +89,7 @@ case class Randn(seed: Long) extends RDG {
     val rngTerm = ctx.freshName("rng")
     val className = classOf[XORShiftRandom].getName
     ctx.addMutableState(className, rngTerm,
-      s"$rngTerm = new $className($seed + org.apache.spark.TaskContext.getPartitionId());")
+      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
     ev.isNull = "false"
     s"""
       final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextGaussian();
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index af68358daf5f1..ad5af19578f33 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -33,7 +33,7 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) extend
       }.nonEmpty
     )
 
-    !expressions.exists(!_.resolved) && childrenResolved && !hasSpecialExpressions
+    expressions.forall(_.resolved) && childrenResolved && !hasSpecialExpressions
   }
 }
 
@@ -67,7 +67,7 @@ case class Generate(
     generator.resolved &&
       childrenResolved &&
       generator.elementTypes.length == generatorOutput.length &&
-      !generatorOutput.exists(!_.resolved)
+      generatorOutput.forall(_.resolved)
   }
 
   // we don't want the gOutput to be taken as part of the expressions
@@ -187,7 +187,7 @@ case class WithWindowDefinition(
 }
 
 /**
- * @param order  The ordering expressions
+ * @param order  The ordering expressions, should all be [[AttributeReference]]
  * @param global True means global sorting apply for entire data set,
  *               False means sorting only apply within the partition.
  * @param child  Child logical plan
@@ -197,6 +197,11 @@ case class Sort(
     global: Boolean,
     child: LogicalPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
+
+  def hasNoEvaluation: Boolean = order.forall(_.child.isInstanceOf[AttributeReference])
+
+  override lazy val resolved: Boolean =
+    expressions.forall(_.resolved) && childrenResolved && hasNoEvaluation
 }
 
 case class Aggregate(
@@ -211,7 +216,7 @@ case class Aggregate(
       }.nonEmpty
     )
 
-    !expressions.exists(!_.resolved) && childrenResolved && !hasWindowExpressions
+    expressions.forall(_.resolved) && childrenResolved && !hasWindowExpressions
   }
 
   override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 221b4e92f086c..a86cefe941e8e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -165,11 +165,39 @@ class AnalysisSuite extends AnalysisTest {
 
   test("pull out nondeterministic expressions from Sort") {
     val plan = Sort(Seq(SortOrder(Rand(33), Ascending)), false, testRelation)
-    val projected = Alias(Rand(33), "_nondeterministic")()
+    val analyzed = caseSensitiveAnalyzer.execute(plan)
+    analyzed.transform {
+      case s: Sort if s.expressions.exists(!_.deterministic) =>
+        fail("nondeterministic expressions are not allowed in Sort")
+    }
+  }
+
+  test("remove still-need-evaluate ordering expressions from sort") {
+    val a = testRelation2.output(0)
+    val b = testRelation2.output(1)
+
+    def makeOrder(e: Expression): SortOrder = SortOrder(e, Ascending)
+
+    val noEvalOrdering = makeOrder(a)
+    val noEvalOrderingWithAlias = makeOrder(Alias(Alias(b, "name1")(), "name2")())
+
+    val needEvalExpr = Coalesce(Seq(a, Literal("1")))
+    val needEvalExpr2 = Coalesce(Seq(a, b))
+    val needEvalOrdering = makeOrder(needEvalExpr)
+    val needEvalOrdering2 = makeOrder(needEvalExpr2)
+
+    val plan = Sort(
+      Seq(noEvalOrdering, noEvalOrderingWithAlias, needEvalOrdering, needEvalOrdering2),
+      false, testRelation2)
+
+    val evaluatedOrdering = makeOrder(AttributeReference("_sortCondition", StringType)())
+    val materializedExprs = Seq(needEvalExpr, needEvalExpr2).map(e => Alias(e, "_sortCondition")())
+
     val expected =
-      Project(testRelation.output,
-        Sort(Seq(SortOrder(projected.toAttribute, Ascending)), false,
-          Project(testRelation.output :+ projected, testRelation)))
+      Project(testRelation2.output,
+        Sort(Seq(makeOrder(a), makeOrder(b), evaluatedOrdering, evaluatedOrdering), false,
+          Project(testRelation2.output ++ materializedExprs, testRelation2)))
+
     checkAnalysis(plan, expected)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
index 207d7a352c7b3..e340f54850bcc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql
 
-import java.sql.Timestamp
-
 import org.apache.spark.sql.test.TestSQLContext.implicits._
 import org.apache.spark.sql.test._
 

From 97906944e133dec13068f16520b6abbcdc79e84f Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 29 Jul 2015 09:36:22 -0700
Subject: [PATCH 444/549] [SPARK-9127][SQL] Rand/Randn codegen fails with long
 seed.

Author: Reynold Xin <rxin@databricks.com>

Closes #7747 from rxin/SPARK-9127 and squashes the following commits:

e851418 [Reynold Xin] [SPARK-9127][SQL] Rand/Randn codegen fails with long seed.
---
 .../spark/sql/catalyst/expressions/RandomSuite.scala       | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
index 698c81ba24482..5db992654811a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
@@ -20,8 +20,6 @@ package org.apache.spark.sql.catalyst.expressions
 import org.scalatest.Matchers._
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.DoubleType
 
 
 class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -30,4 +28,9 @@ class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkDoubleEvaluation(Rand(30), 0.7363714192755834 +- 0.001)
     checkDoubleEvaluation(Randn(30), 0.5181478766595276 +- 0.001)
   }
+
+  test("SPARK-9127 codegen with long seed") {
+    checkDoubleEvaluation(Rand(5419823303878592871L), 0.4061913198963727 +- 0.001)
+    checkDoubleEvaluation(Randn(5419823303878592871L), -0.24417152005343168 +- 0.001)
+  }
 }

From 069a4c414db4612d7bdb6f5615c1ba36998e5a49 Mon Sep 17 00:00:00 2001
From: Joseph Batchik <joseph.batchik@cloudera.com>
Date: Wed, 29 Jul 2015 14:02:32 -0500
Subject: [PATCH 445/549] [SPARK-746] [CORE] Added Avro Serialization to Kryo

Added a custom Kryo serializer for generic Avro records to reduce the network IO
involved during a shuffle. This compresses the schema and allows for users to
register their schemas ahead of time to further reduce traffic.

Currently Kryo tries to use its default serializer for generic Records, which will include
a lot of unneeded data in each record.

Author: Joseph Batchik <joseph.batchik@cloudera.com>
Author: Joseph Batchik <josephbatchik@gmail.com>

Closes #7004 from JDrit/Avro_serialization and squashes the following commits:

8158d51 [Joseph Batchik] updated per feedback
c0cf329 [Joseph Batchik] implemented @squito suggestion for SparkEnv
dd71efe [Joseph Batchik] fixed bug with serializing
1183a48 [Joseph Batchik] updated codec settings
fa9298b [Joseph Batchik] forgot a couple of fixes
c5fe794 [Joseph Batchik] implemented @squito suggestion
0f5471a [Joseph Batchik] implemented @squito suggestion to use a codec that is already in spark
6d1925c [Joseph Batchik] fixed to changes suggested by @squito
d421bf5 [Joseph Batchik] updated pom to removed versions
ab46d10 [Joseph Batchik] Changed Avro dependency to be similar to parent
f4ae251 [Joseph Batchik] fixed serialization error in that SparkConf cannot be serialized
2b545cc [Joseph Batchik] started working on fixes for pr
97fba62 [Joseph Batchik] Added a custom Kryo serializer for generic Avro records to reduce the network IO involved during a shuffle. This compresses the schema and allows for users to register their schemas ahead of time to further reduce traffic.
---
 core/pom.xml                                  |   5 +
 .../scala/org/apache/spark/SparkConf.scala    |  23 ++-
 .../serializer/GenericAvroSerializer.scala    | 150 ++++++++++++++++++
 .../spark/serializer/KryoSerializer.scala     |   6 +
 .../GenericAvroSerializerSuite.scala          |  84 ++++++++++
 5 files changed, 267 insertions(+), 1 deletion(-)
 create mode 100644 core/src/main/scala/org/apache/spark/serializer/GenericAvroSerializer.scala
 create mode 100644 core/src/test/scala/org/apache/spark/serializer/GenericAvroSerializerSuite.scala

diff --git a/core/pom.xml b/core/pom.xml
index 95f36eb348698..6fa87ec6a24af 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -34,6 +34,11 @@
   <name>Spark Project Core</name>
   <url>http://spark.apache.org/</url>
   <dependencies>
+    <dependency>
+      <groupId>org.apache.avro</groupId>
+      <artifactId>avro-mapred</artifactId>
+      <classifier>${avro.mapred.classifier}</classifier>
+    </dependency>
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 6cf36fbbd6254..4161792976c7b 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -18,11 +18,12 @@
 package org.apache.spark
 
 import java.util.concurrent.ConcurrentHashMap
-import java.util.concurrent.atomic.AtomicBoolean
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable.LinkedHashSet
 
+import org.apache.avro.{SchemaNormalization, Schema}
+
 import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.util.Utils
 
@@ -161,6 +162,26 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
     this
   }
 
+  private final val avroNamespace = "avro.schema."
+
+  /**
+   * Use Kryo serialization and register the given set of Avro schemas so that the generic
+   * record serializer can decrease network IO
+   */
+  def registerAvroSchemas(schemas: Schema*): SparkConf = {
+    for (schema <- schemas) {
+      set(avroNamespace + SchemaNormalization.parsingFingerprint64(schema), schema.toString)
+    }
+    this
+  }
+
+  /** Gets all the avro schemas in the configuration used in the generic Avro record serializer */
+  def getAvroSchema: Map[Long, String] = {
+    getAll.filter { case (k, v) => k.startsWith(avroNamespace) }
+      .map { case (k, v) => (k.substring(avroNamespace.length).toLong, v) }
+      .toMap
+  }
+
   /** Remove a parameter from the configuration */
   def remove(key: String): SparkConf = {
     settings.remove(key)
diff --git a/core/src/main/scala/org/apache/spark/serializer/GenericAvroSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/GenericAvroSerializer.scala
new file mode 100644
index 0000000000000..62f8aae7f2126
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/serializer/GenericAvroSerializer.scala
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.serializer
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+import java.nio.ByteBuffer
+
+import scala.collection.mutable
+
+import com.esotericsoftware.kryo.{Kryo, Serializer => KSerializer}
+import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput}
+import org.apache.avro.{Schema, SchemaNormalization}
+import org.apache.avro.generic.{GenericData, GenericRecord}
+import org.apache.avro.io._
+import org.apache.commons.io.IOUtils
+
+import org.apache.spark.{SparkException, SparkEnv}
+import org.apache.spark.io.CompressionCodec
+
+/**
+ * Custom serializer used for generic Avro records. If the user registers the schemas
+ * ahead of time, then the schema's fingerprint will be sent with each message instead of the actual
+ * schema, as to reduce network IO.
+ * Actions like parsing or compressing schemas are computationally expensive so the serializer
+ * caches all previously seen values as to reduce the amount of work needed to do.
+ * @param schemas a map where the keys are unique IDs for Avro schemas and the values are the
+ *                string representation of the Avro schema, used to decrease the amount of data
+ *                that needs to be serialized.
+ */
+private[serializer] class GenericAvroSerializer(schemas: Map[Long, String])
+  extends KSerializer[GenericRecord] {
+
+  /** Used to reduce the amount of effort to compress the schema */
+  private val compressCache = new mutable.HashMap[Schema, Array[Byte]]()
+  private val decompressCache = new mutable.HashMap[ByteBuffer, Schema]()
+
+  /** Reuses the same datum reader/writer since the same schema will be used many times */
+  private val writerCache = new mutable.HashMap[Schema, DatumWriter[_]]()
+  private val readerCache = new mutable.HashMap[Schema, DatumReader[_]]()
+
+  /** Fingerprinting is very expensive so this alleviates most of the work */
+  private val fingerprintCache = new mutable.HashMap[Schema, Long]()
+  private val schemaCache = new mutable.HashMap[Long, Schema]()
+
+  // GenericAvroSerializer can't take a SparkConf in the constructor b/c then it would become
+  // a member of KryoSerializer, which would make KryoSerializer not Serializable.  We make
+  // the codec lazy here just b/c in some unit tests, we use a KryoSerializer w/out having
+  // the SparkEnv set (note those tests would fail if they tried to serialize avro data).
+  private lazy val codec = CompressionCodec.createCodec(SparkEnv.get.conf)
+
+  /**
+   * Used to compress Schemas when they are being sent over the wire.
+   * The compression results are memoized to reduce the compression time since the
+   * same schema is compressed many times over
+   */
+  def compress(schema: Schema): Array[Byte] = compressCache.getOrElseUpdate(schema, {
+    val bos = new ByteArrayOutputStream()
+    val out = codec.compressedOutputStream(bos)
+    out.write(schema.toString.getBytes("UTF-8"))
+    out.close()
+    bos.toByteArray
+  })
+
+  /**
+   * Decompresses the schema into the actual in-memory object. Keeps an internal cache of already
+   * seen values so to limit the number of times that decompression has to be done.
+   */
+  def decompress(schemaBytes: ByteBuffer): Schema = decompressCache.getOrElseUpdate(schemaBytes, {
+    val bis = new ByteArrayInputStream(schemaBytes.array())
+    val bytes = IOUtils.toByteArray(codec.compressedInputStream(bis))
+    new Schema.Parser().parse(new String(bytes, "UTF-8"))
+  })
+
+  /**
+   * Serializes a record to the given output stream. It caches a lot of the internal data as
+   * to not redo work
+   */
+  def serializeDatum[R <: GenericRecord](datum: R, output: KryoOutput): Unit = {
+    val encoder = EncoderFactory.get.binaryEncoder(output, null)
+    val schema = datum.getSchema
+    val fingerprint = fingerprintCache.getOrElseUpdate(schema, {
+      SchemaNormalization.parsingFingerprint64(schema)
+    })
+    schemas.get(fingerprint) match {
+      case Some(_) =>
+        output.writeBoolean(true)
+        output.writeLong(fingerprint)
+      case None =>
+        output.writeBoolean(false)
+        val compressedSchema = compress(schema)
+        output.writeInt(compressedSchema.length)
+        output.writeBytes(compressedSchema)
+    }
+
+    writerCache.getOrElseUpdate(schema, GenericData.get.createDatumWriter(schema))
+      .asInstanceOf[DatumWriter[R]]
+      .write(datum, encoder)
+    encoder.flush()
+  }
+
+  /**
+   * Deserializes generic records into their in-memory form. There is internal
+   * state to keep a cache of already seen schemas and datum readers.
+   */
+  def deserializeDatum(input: KryoInput): GenericRecord = {
+    val schema = {
+      if (input.readBoolean()) {
+        val fingerprint = input.readLong()
+        schemaCache.getOrElseUpdate(fingerprint, {
+          schemas.get(fingerprint) match {
+            case Some(s) => new Schema.Parser().parse(s)
+            case None =>
+              throw new SparkException(
+                "Error reading attempting to read avro data -- encountered an unknown " +
+                  s"fingerprint: $fingerprint, not sure what schema to use.  This could happen " +
+                  "if you registered additional schemas after starting your spark context.")
+          }
+        })
+      } else {
+        val length = input.readInt()
+        decompress(ByteBuffer.wrap(input.readBytes(length)))
+      }
+    }
+    val decoder = DecoderFactory.get.directBinaryDecoder(input, null)
+    readerCache.getOrElseUpdate(schema, GenericData.get.createDatumReader(schema))
+      .asInstanceOf[DatumReader[GenericRecord]]
+      .read(null, decoder)
+  }
+
+  override def write(kryo: Kryo, output: KryoOutput, datum: GenericRecord): Unit =
+    serializeDatum(datum, output)
+
+  override def read(kryo: Kryo, input: KryoInput, datumClass: Class[GenericRecord]): GenericRecord =
+    deserializeDatum(input)
+}
diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
index 7cb6e080533ad..0ff7562e912ca 100644
--- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
@@ -27,6 +27,7 @@ import com.esotericsoftware.kryo.{Kryo, KryoException}
 import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput}
 import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer}
 import com.twitter.chill.{AllScalaRegistrar, EmptyScalaKryoInstantiator}
+import org.apache.avro.generic.{GenericData, GenericRecord}
 import org.roaringbitmap.{ArrayContainer, BitmapContainer, RoaringArray, RoaringBitmap}
 
 import org.apache.spark._
@@ -73,6 +74,8 @@ class KryoSerializer(conf: SparkConf)
     .split(',')
     .filter(!_.isEmpty)
 
+  private val avroSchemas = conf.getAvroSchema
+
   def newKryoOutput(): KryoOutput = new KryoOutput(bufferSize, math.max(bufferSize, maxBufferSize))
 
   def newKryo(): Kryo = {
@@ -101,6 +104,9 @@ class KryoSerializer(conf: SparkConf)
     kryo.register(classOf[HttpBroadcast[_]], new KryoJavaSerializer())
     kryo.register(classOf[PythonBroadcast], new KryoJavaSerializer())
 
+    kryo.register(classOf[GenericRecord], new GenericAvroSerializer(avroSchemas))
+    kryo.register(classOf[GenericData.Record], new GenericAvroSerializer(avroSchemas))
+
     try {
       // scalastyle:off classforname
       // Use the default classloader when calling the user registrator.
diff --git a/core/src/test/scala/org/apache/spark/serializer/GenericAvroSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/GenericAvroSerializerSuite.scala
new file mode 100644
index 0000000000000..bc9f3708ed69d
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/serializer/GenericAvroSerializerSuite.scala
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.serializer
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+import java.nio.ByteBuffer
+
+import com.esotericsoftware.kryo.io.{Output, Input}
+import org.apache.avro.{SchemaBuilder, Schema}
+import org.apache.avro.generic.GenericData.Record
+
+import org.apache.spark.{SparkFunSuite, SharedSparkContext}
+
+class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
+  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+
+  val schema : Schema = SchemaBuilder
+    .record("testRecord").fields()
+    .requiredString("data")
+    .endRecord()
+  val record = new Record(schema)
+  record.put("data", "test data")
+
+  test("schema compression and decompression") {
+    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
+    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
+  }
+
+  test("record serialization and deserialization") {
+    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
+
+    val outputStream = new ByteArrayOutputStream()
+    val output = new Output(outputStream)
+    genericSer.serializeDatum(record, output)
+    output.flush()
+    output.close()
+
+    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
+    assert(genericSer.deserializeDatum(input) === record)
+  }
+
+  test("uses schema fingerprint to decrease message size") {
+    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)
+
+    val output = new Output(new ByteArrayOutputStream())
+
+    val beginningNormalPosition = output.total()
+    genericSerFull.serializeDatum(record, output)
+    output.flush()
+    val normalLength = output.total - beginningNormalPosition
+
+    conf.registerAvroSchemas(schema)
+    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
+    val beginningFingerprintPosition = output.total()
+    genericSerFinger.serializeDatum(record, output)
+    val fingerprintLength = output.total - beginningFingerprintPosition
+
+    assert(fingerprintLength < normalLength)
+  }
+
+  test("caches previously seen schemas") {
+    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
+    val compressedSchema = genericSer.compress(schema)
+    val decompressedScheam = genericSer.decompress(ByteBuffer.wrap(compressedSchema))
+
+    assert(compressedSchema.eq(genericSer.compress(schema)))
+    assert(decompressedScheam.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
+  }
+}

From 819be46e5a73f2d19230354ebba30c58538590f5 Mon Sep 17 00:00:00 2001
From: Iulian Dragos <jaguarul@gmail.com>
Date: Wed, 29 Jul 2015 13:47:37 -0700
Subject: [PATCH 446/549] [SPARK-8977] [STREAMING] Defines the RateEstimator
 interface, and impements the RateController
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on #7471.

- [x] add a test that exercises the publish path from driver to receiver
- [ ] remove Serializable from `RateController` and `RateEstimator`

Author: Iulian Dragos <jaguarul@gmail.com>
Author: François Garillot <francois@garillot.net>

Closes #7600 from dragos/topic/streaming-bp/rate-controller and squashes the following commits:

f168c94 [Iulian Dragos] Latest review round.
5125e60 [Iulian Dragos] Fix style.
a2eb3b9 [Iulian Dragos] Merge remote-tracking branch 'upstream/master' into topic/streaming-bp/rate-controller
475e346 [Iulian Dragos] Latest round of reviews.
e9fb45e [Iulian Dragos] - Add a test for checkpointing - fixed serialization for RateController.executionContext
715437a [Iulian Dragos] Review comments and added a `reset` call in ReceiverTrackerTest.
e57c66b [Iulian Dragos] Added a couple of tests for the full scenario from driver to receivers, with several rate updates.
b425d32 [Iulian Dragos] Removed DeveloperAPI, removed rateEstimator field, removed Noop rate estimator, changed logic for initialising rate estimator.
238cfc6 [Iulian Dragos] Merge remote-tracking branch 'upstream/master' into topic/streaming-bp/rate-controller
34a389d [Iulian Dragos] Various style changes and a first test for the rate controller.
d32ca36 [François Garillot] [SPARK-8977][Streaming] Defines the RateEstimator interface, and implements the ReceiverRateController
8941cf9 [Iulian Dragos] Renames and other nitpicks.
162d9e5 [Iulian Dragos] Use Reflection for accessing truly private `executor` method and use the listener bus to know when receivers have registered (`onStart` is called before receivers have registered, leading to flaky behavior).
210f495 [Iulian Dragos] Revert "Added a few tests that measure the receiver’s rate."
0c51959 [Iulian Dragos] Added a few tests that measure the receiver’s rate.
261a051 [Iulian Dragos] - removed field to hold the current rate limit in rate limiter - made rate limit a Long and default to Long.MaxValue (consequence of the above) - removed custom `waitUntil` and replaced it by `eventually`
cd1397d [Iulian Dragos] Add a test for the propagation of a new rate limit from driver to receivers.
6369b30 [Iulian Dragos] Merge pull request #15 from huitseeker/SPARK-8975
d15de42 [François Garillot] [SPARK-8975][Streaming] Adds Ratelimiter unit tests w.r.t. spark.streaming.receiver.maxRate
4721c7d [François Garillot] [SPARK-8975][Streaming] Add a mechanism to send a new rate from the driver to the block generator
---
 .../streaming/dstream/InputDStream.scala      |   7 +-
 .../dstream/ReceiverInputDStream.scala        |  26 ++++-
 .../streaming/scheduler/JobScheduler.scala    |   6 +
 .../streaming/scheduler/RateController.scala  |  90 +++++++++++++++
 .../scheduler/rate/RateEstimator.scala        |  59 ++++++++++
 .../spark/streaming/CheckpointSuite.scala     |  28 +++++
 .../scheduler/RateControllerSuite.scala       | 103 ++++++++++++++++++
 .../ReceiverSchedulingPolicySuite.scala       |  10 +-
 .../scheduler/ReceiverTrackerSuite.scala      |  41 +++++--
 9 files changed, 355 insertions(+), 15 deletions(-)
 create mode 100644 streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala
 create mode 100644 streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
 create mode 100644 streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
index d58c99a8ff321..a6c4cd220e42f 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala
@@ -21,7 +21,9 @@ import scala.reflect.ClassTag
 
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDDOperationScope
-import org.apache.spark.streaming.{Time, Duration, StreamingContext}
+import org.apache.spark.streaming.{Duration, StreamingContext, Time}
+import org.apache.spark.streaming.scheduler.RateController
+import org.apache.spark.streaming.scheduler.rate.RateEstimator
 import org.apache.spark.util.Utils
 
 /**
@@ -47,6 +49,9 @@ abstract class InputDStream[T: ClassTag] (@transient ssc_ : StreamingContext)
   /** This is an unique identifier for the input stream. */
   val id = ssc.getNewInputStreamId()
 
+  // Keep track of the freshest rate for this stream using the rateEstimator
+  protected[streaming] val rateController: Option[RateController] = None
+
   /** A human-readable name of this InputDStream */
   private[streaming] def name: String = {
     // e.g. FlumePollingDStream -> "Flume polling stream"
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
index a50f0efc030ce..646a8c3530a62 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
@@ -21,10 +21,11 @@ import scala.reflect.ClassTag
 
 import org.apache.spark.rdd.{BlockRDD, RDD}
 import org.apache.spark.storage.BlockId
-import org.apache.spark.streaming._
+import org.apache.spark.streaming.{StreamingContext, Time}
 import org.apache.spark.streaming.rdd.WriteAheadLogBackedBlockRDD
 import org.apache.spark.streaming.receiver.Receiver
-import org.apache.spark.streaming.scheduler.StreamInputInfo
+import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo}
+import org.apache.spark.streaming.scheduler.rate.RateEstimator
 import org.apache.spark.streaming.util.WriteAheadLogUtils
 
 /**
@@ -40,6 +41,17 @@ import org.apache.spark.streaming.util.WriteAheadLogUtils
 abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingContext)
   extends InputDStream[T](ssc_) {
 
+  /**
+   * Asynchronously maintains & sends new rate limits to the receiver through the receiver tracker.
+   */
+  override protected[streaming] val rateController: Option[RateController] = {
+    if (RateController.isBackPressureEnabled(ssc.conf)) {
+      RateEstimator.create(ssc.conf).map { new ReceiverRateController(id, _) }
+    } else {
+      None
+    }
+  }
+
   /**
    * Gets the receiver object that will be sent to the worker nodes
    * to receive data. This method needs to defined by any specific implementation
@@ -110,4 +122,14 @@ abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingCont
     }
     Some(blockRDD)
   }
+
+  /**
+   * A RateController that sends the new rate to receivers, via the receiver tracker.
+   */
+  private[streaming] class ReceiverRateController(id: Int, estimator: RateEstimator)
+      extends RateController(id, estimator) {
+    override def publish(rate: Long): Unit =
+      ssc.scheduler.receiverTracker.sendRateUpdate(id, rate)
+  }
 }
+
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala
index 4af9b6d3b56ab..58bdda7794bf2 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala
@@ -66,6 +66,12 @@ class JobScheduler(val ssc: StreamingContext) extends Logging {
     }
     eventLoop.start()
 
+    // attach rate controllers of input streams to receive batch completion updates
+    for {
+      inputDStream <- ssc.graph.getInputStreams
+      rateController <- inputDStream.rateController
+    } ssc.addStreamingListener(rateController)
+
     listenerBus.start(ssc.sparkContext)
     receiverTracker = new ReceiverTracker(ssc)
     inputInfoTracker = new InputInfoTracker(ssc)
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala
new file mode 100644
index 0000000000000..882ca0676b6ad
--- /dev/null
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler
+
+import java.io.ObjectInputStream
+import java.util.concurrent.atomic.AtomicLong
+
+import scala.concurrent.{ExecutionContext, Future}
+
+import org.apache.spark.SparkConf
+import org.apache.spark.streaming.scheduler.rate.RateEstimator
+import org.apache.spark.util.{ThreadUtils, Utils}
+
+/**
+ * A StreamingListener that receives batch completion updates, and maintains
+ * an estimate of the speed at which this stream should ingest messages,
+ * given an estimate computation from a `RateEstimator`
+ */
+private[streaming] abstract class RateController(val streamUID: Int, rateEstimator: RateEstimator)
+    extends StreamingListener with Serializable {
+
+  init()
+
+  protected def publish(rate: Long): Unit
+
+  @transient
+  implicit private var executionContext: ExecutionContext = _
+
+  @transient
+  private var rateLimit: AtomicLong = _
+
+  /**
+   * An initialization method called both from the constructor and Serialization code.
+   */
+  private def init() {
+    executionContext = ExecutionContext.fromExecutorService(
+      ThreadUtils.newDaemonSingleThreadExecutor("stream-rate-update"))
+    rateLimit = new AtomicLong(-1L)
+  }
+
+  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
+    ois.defaultReadObject()
+    init()
+  }
+
+  /**
+   * Compute the new rate limit and publish it asynchronously.
+   */
+  private def computeAndPublish(time: Long, elems: Long, workDelay: Long, waitDelay: Long): Unit =
+    Future[Unit] {
+      val newRate = rateEstimator.compute(time, elems, workDelay, waitDelay)
+      newRate.foreach { s =>
+        rateLimit.set(s.toLong)
+        publish(getLatestRate())
+      }
+    }
+
+  def getLatestRate(): Long = rateLimit.get()
+
+  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) {
+    val elements = batchCompleted.batchInfo.streamIdToInputInfo
+
+    for {
+      processingEnd <- batchCompleted.batchInfo.processingEndTime;
+      workDelay <- batchCompleted.batchInfo.processingDelay;
+      waitDelay <- batchCompleted.batchInfo.schedulingDelay;
+      elems <- elements.get(streamUID).map(_.numRecords)
+    } computeAndPublish(processingEnd, elems, workDelay, waitDelay)
+  }
+}
+
+object RateController {
+  def isBackPressureEnabled(conf: SparkConf): Boolean =
+    conf.getBoolean("spark.streaming.backpressure.enable", false)
+}
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
new file mode 100644
index 0000000000000..a08685119e5d5
--- /dev/null
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler.rate
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkException
+
+/**
+ * A component that estimates the rate at wich an InputDStream should ingest
+ * elements, based on updates at every batch completion.
+ */
+private[streaming] trait RateEstimator extends Serializable {
+
+  /**
+   * Computes the number of elements the stream attached to this `RateEstimator`
+   * should ingest per second, given an update on the size and completion
+   * times of the latest batch.
+   *
+   * @param time The timetamp of the current batch interval that just finished
+   * @param elements The number of elements that were processed in this batch
+   * @param processingDelay The time in ms that took for the job to complete
+   * @param schedulingDelay The time in ms that the job spent in the scheduling queue
+   */
+  def compute(
+      time: Long,
+      elements: Long,
+      processingDelay: Long,
+      schedulingDelay: Long): Option[Double]
+}
+
+object RateEstimator {
+
+  /**
+   * Return a new RateEstimator based on the value of `spark.streaming.RateEstimator`.
+   *
+   * @return None if there is no configured estimator, otherwise an instance of RateEstimator
+   * @throws IllegalArgumentException if there is a configured RateEstimator that doesn't match any
+   *         known estimators.
+   */
+  def create(conf: SparkConf): Option[RateEstimator] =
+    conf.getOption("spark.streaming.backpressure.rateEstimator").map { estimator =>
+      throw new IllegalArgumentException(s"Unkown rate estimator: $estimator")
+    }
+}
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala
index d308ac05a54fe..67c2d900940ab 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala
@@ -30,8 +30,10 @@ import org.apache.hadoop.io.{IntWritable, Text}
 import org.apache.hadoop.mapred.TextOutputFormat
 import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
 import org.scalatest.concurrent.Eventually._
+import org.scalatest.time.SpanSugar._
 
 import org.apache.spark.streaming.dstream.{DStream, FileInputDStream}
+import org.apache.spark.streaming.scheduler.{RateLimitInputDStream, ConstantEstimator, SingletonTestRateReceiver}
 import org.apache.spark.util.{Clock, ManualClock, Utils}
 
 /**
@@ -391,6 +393,32 @@ class CheckpointSuite extends TestSuiteBase {
     testCheckpointedOperation(input, operation, output, 7)
   }
 
+  test("recovery maintains rate controller") {
+    ssc = new StreamingContext(conf, batchDuration)
+    ssc.checkpoint(checkpointDir)
+
+    val dstream = new RateLimitInputDStream(ssc) {
+      override val rateController =
+        Some(new ReceiverRateController(id, new ConstantEstimator(200.0)))
+    }
+    SingletonTestRateReceiver.reset()
+
+    val output = new TestOutputStreamWithPartitions(dstream.checkpoint(batchDuration * 2))
+    output.register()
+    runStreams(ssc, 5, 5)
+
+    SingletonTestRateReceiver.reset()
+    ssc = new StreamingContext(checkpointDir)
+    ssc.start()
+    val outputNew = advanceTimeWithRealDelay(ssc, 2)
+
+    eventually(timeout(5.seconds)) {
+      assert(dstream.getCurrentRateLimit === Some(200))
+    }
+    ssc.stop()
+    ssc = null
+  }
+
   // This tests whether file input stream remembers what files were seen before
   // the master failure and uses them again to process a large window operation.
   // It also tests whether batches, whose processing was incomplete due to the
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala
new file mode 100644
index 0000000000000..921da773f6c11
--- /dev/null
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler
+
+import scala.collection.mutable
+import scala.reflect.ClassTag
+import scala.util.control.NonFatal
+
+import org.scalatest.Matchers._
+import org.scalatest.concurrent.Eventually._
+import org.scalatest.time.SpanSugar._
+
+import org.apache.spark.streaming._
+import org.apache.spark.streaming.scheduler.rate.RateEstimator
+
+class RateControllerSuite extends TestSuiteBase {
+
+  override def useManualClock: Boolean = false
+
+  test("rate controller publishes updates") {
+    val ssc = new StreamingContext(conf, batchDuration)
+    withStreamingContext(ssc) { ssc =>
+      val dstream = new RateLimitInputDStream(ssc)
+      dstream.register()
+      ssc.start()
+
+      eventually(timeout(10.seconds)) {
+        assert(dstream.publishCalls > 0)
+      }
+    }
+  }
+
+  test("publish rates reach receivers") {
+    val ssc = new StreamingContext(conf, batchDuration)
+    withStreamingContext(ssc) { ssc =>
+      val dstream = new RateLimitInputDStream(ssc) {
+        override val rateController =
+          Some(new ReceiverRateController(id, new ConstantEstimator(200.0)))
+      }
+      dstream.register()
+      SingletonTestRateReceiver.reset()
+      ssc.start()
+
+      eventually(timeout(10.seconds)) {
+        assert(dstream.getCurrentRateLimit === Some(200))
+      }
+    }
+  }
+
+  test("multiple publish rates reach receivers") {
+    val ssc = new StreamingContext(conf, batchDuration)
+    withStreamingContext(ssc) { ssc =>
+      val rates = Seq(100L, 200L, 300L)
+
+      val dstream = new RateLimitInputDStream(ssc) {
+        override val rateController =
+          Some(new ReceiverRateController(id, new ConstantEstimator(rates.map(_.toDouble): _*)))
+      }
+      SingletonTestRateReceiver.reset()
+      dstream.register()
+
+      val observedRates = mutable.HashSet.empty[Long]
+      ssc.start()
+
+      eventually(timeout(20.seconds)) {
+        dstream.getCurrentRateLimit.foreach(observedRates += _)
+        // Long.MaxValue (essentially, no rate limit) is the initial rate limit for any Receiver
+        observedRates should contain theSameElementsAs (rates :+ Long.MaxValue)
+      }
+    }
+  }
+}
+
+private[streaming] class ConstantEstimator(rates: Double*) extends RateEstimator {
+  private var idx: Int = 0
+
+  private def nextRate(): Double = {
+    val rate = rates(idx)
+    idx = (idx + 1) % rates.size
+    rate
+  }
+
+  def compute(
+      time: Long,
+      elements: Long,
+      processingDelay: Long,
+      schedulingDelay: Long): Option[Double] = Some(nextRate())
+}
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala
index 93f920fdc71f1..0418d776ecc9a 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicySuite.scala
@@ -64,7 +64,7 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite {
 
   test("scheduleReceivers: " +
     "schedule receivers evenly when there are more receivers than executors") {
-    val receivers = (0 until 6).map(new DummyReceiver(_))
+    val receivers = (0 until 6).map(new RateTestReceiver(_))
     val executors = (10000 until 10003).map(port => s"localhost:${port}")
     val scheduledExecutors = receiverSchedulingPolicy.scheduleReceivers(receivers, executors)
     val numReceiversOnExecutor = mutable.HashMap[String, Int]()
@@ -79,7 +79,7 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite {
 
   test("scheduleReceivers: " +
     "schedule receivers evenly when there are more executors than receivers") {
-    val receivers = (0 until 3).map(new DummyReceiver(_))
+    val receivers = (0 until 3).map(new RateTestReceiver(_))
     val executors = (10000 until 10006).map(port => s"localhost:${port}")
     val scheduledExecutors = receiverSchedulingPolicy.scheduleReceivers(receivers, executors)
     val numReceiversOnExecutor = mutable.HashMap[String, Int]()
@@ -94,8 +94,8 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite {
   }
 
   test("scheduleReceivers: schedule receivers evenly when the preferredLocations are even") {
-    val receivers = (0 until 3).map(new DummyReceiver(_)) ++
-      (3 until 6).map(new DummyReceiver(_, Some("localhost")))
+    val receivers = (0 until 3).map(new RateTestReceiver(_)) ++
+      (3 until 6).map(new RateTestReceiver(_, Some("localhost")))
     val executors = (10000 until 10003).map(port => s"localhost:${port}") ++
       (10003 until 10006).map(port => s"localhost2:${port}")
     val scheduledExecutors = receiverSchedulingPolicy.scheduleReceivers(receivers, executors)
@@ -121,7 +121,7 @@ class ReceiverSchedulingPolicySuite extends SparkFunSuite {
   }
 
   test("scheduleReceivers: return empty scheduled executors if no executors") {
-    val receivers = (0 until 3).map(new DummyReceiver(_))
+    val receivers = (0 until 3).map(new RateTestReceiver(_))
     val scheduledExecutors = receiverSchedulingPolicy.scheduleReceivers(receivers, Seq.empty)
     scheduledExecutors.foreach { case (receiverId, executors) =>
       assert(executors.isEmpty)
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
index b039233f36316..aff8b53f752fa 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
@@ -43,6 +43,7 @@ class ReceiverTrackerSuite extends TestSuiteBase {
 
     ssc.addStreamingListener(ReceiverStartedWaiter)
     ssc.scheduler.listenerBus.start(ssc.sc)
+    SingletonTestRateReceiver.reset()
 
     val newRateLimit = 100L
     val inputDStream = new RateLimitInputDStream(ssc)
@@ -62,36 +63,62 @@ class ReceiverTrackerSuite extends TestSuiteBase {
   }
 }
 
-/** An input DStream with a hard-coded receiver that gives access to internals for testing. */
-private class RateLimitInputDStream(@transient ssc_ : StreamingContext)
+/**
+ * An input DStream with a hard-coded receiver that gives access to internals for testing.
+ *
+ * @note Make sure to call {{{SingletonDummyReceiver.reset()}}} before using this in a test,
+ *       or otherwise you may get {{{NotSerializableException}}} when trying to serialize
+ *       the receiver.
+ * @see [[[SingletonDummyReceiver]]].
+ */
+private[streaming] class RateLimitInputDStream(@transient ssc_ : StreamingContext)
   extends ReceiverInputDStream[Int](ssc_) {
 
-  override def getReceiver(): DummyReceiver = SingletonDummyReceiver
+  override def getReceiver(): RateTestReceiver = SingletonTestRateReceiver
 
   def getCurrentRateLimit: Option[Long] = {
     invokeExecutorMethod.getCurrentRateLimit
   }
 
+  @volatile
+  var publishCalls = 0
+
+  override val rateController: Option[RateController] = {
+    Some(new RateController(id, new ConstantEstimator(100.0)) {
+      override def publish(rate: Long): Unit = {
+        publishCalls += 1
+      }
+    })
+  }
+
   private def invokeExecutorMethod: ReceiverSupervisor = {
     val c = classOf[Receiver[_]]
     val ex = c.getDeclaredMethod("executor")
     ex.setAccessible(true)
-    ex.invoke(SingletonDummyReceiver).asInstanceOf[ReceiverSupervisor]
+    ex.invoke(SingletonTestRateReceiver).asInstanceOf[ReceiverSupervisor]
   }
 }
 
 /**
- * A Receiver as an object so we can read its rate limit.
+ * A Receiver as an object so we can read its rate limit. Make sure to call `reset()` when
+ * reusing this receiver, otherwise a non-null `executor_` field will prevent it from being
+ * serialized when receivers are installed on executors.
  *
  * @note It's necessary to be a top-level object, or else serialization would create another
  *       one on the executor side and we won't be able to read its rate limit.
  */
-private object SingletonDummyReceiver extends DummyReceiver(0)
+private[streaming] object SingletonTestRateReceiver extends RateTestReceiver(0) {
+
+  /** Reset the object to be usable in another test. */
+  def reset(): Unit = {
+    executor_ = null
+  }
+}
 
 /**
  * Dummy receiver implementation
  */
-private class DummyReceiver(receiverId: Int, host: Option[String] = None)
+private[streaming] class RateTestReceiver(receiverId: Int, host: Option[String] = None)
   extends Receiver[Int](StorageLevel.MEMORY_ONLY) {
 
   setReceiverId(receiverId)

From 5340dfaf94a3c54199f8cc3c78e11f61e34d0a67 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 29 Jul 2015 13:49:22 -0700
Subject: [PATCH 447/549] [SPARK-9430][SQL] Rename IntervalType to
 CalendarIntervalType.

We want to introduce a new IntervalType in 1.6 that is based on only the number of microseoncds,
so interval can be compared.

Renaming the existing IntervalType to CalendarIntervalType so we can do that in the future.

Author: Reynold Xin <rxin@databricks.com>

Closes #7745 from rxin/calendarintervaltype and squashes the following commits:

99f64e8 [Reynold Xin] One more line ...
13466c8 [Reynold Xin] Fixed tests.
e20f24e [Reynold Xin] [SPARK-9430][SQL] Rename IntervalType to CalendarIntervalType.
---
 .../expressions/SpecializedGetters.java       |   4 +-
 .../sql/catalyst/expressions/UnsafeRow.java   |  10 +-
 .../expressions/UnsafeRowWriters.java         |   4 +-
 .../org/apache/spark/sql/types/DataTypes.java |   4 +-
 .../spark/sql/catalyst/InternalRow.scala      |   5 +-
 .../apache/spark/sql/catalyst/SqlParser.scala |  16 +-
 .../catalyst/expressions/BoundAttribute.scala |   2 +-
 .../spark/sql/catalyst/expressions/Cast.scala |  12 +-
 .../sql/catalyst/expressions/arithmetic.scala |  20 +--
 .../expressions/codegen/CodeGenerator.scala   |   6 +-
 .../codegen/GenerateUnsafeProjection.scala    |  10 +-
 .../sql/catalyst/expressions/literals.scala   |   2 +-
 .../spark/sql/types/AbstractDataType.scala    |   2 +-
 ...lType.scala => CalendarIntervalType.scala} |  15 +-
 .../ExpressionTypeCheckingSuite.scala         |   7 +-
 .../analysis/HiveTypeCoercionSuite.scala      |   2 +-
 .../sql/catalyst/expressions/CastSuite.scala  |   9 +-
 .../spark/sql/execution/basicOperators.scala  | 131 ---------------
 .../spark/sql/execution/datasources/ddl.scala |   2 +-
 .../org/apache/spark/sql/execution/sort.scala | 159 ++++++++++++++++++
 .../org/apache/spark/sql/SQLQuerySuite.scala  |  20 +--
 .../{Interval.java => CalendarInterval.java}  |  24 +--
 .../spark/unsafe/types/IntervalSuite.java     |  72 ++++----
 23 files changed, 286 insertions(+), 252 deletions(-)
 rename sql/catalyst/src/main/scala/org/apache/spark/sql/types/{IntervalType.scala => CalendarIntervalType.scala} (64%)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
 rename unsafe/src/main/java/org/apache/spark/unsafe/types/{Interval.java => CalendarInterval.java} (87%)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
index 5f28d52a94bd7..bc345dcd00e49 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
@@ -19,7 +19,7 @@
 
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.types.Decimal;
-import org.apache.spark.unsafe.types.Interval;
+import org.apache.spark.unsafe.types.CalendarInterval;
 import org.apache.spark.unsafe.types.UTF8String;
 
 public interface SpecializedGetters {
@@ -46,7 +46,7 @@ public interface SpecializedGetters {
 
   byte[] getBinary(int ordinal);
 
-  Interval getInterval(int ordinal);
+  CalendarInterval getInterval(int ordinal);
 
   InternalRow getStruct(int ordinal, int numFields);
 
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 64a8edc34d681..6d684bac37573 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -29,7 +29,7 @@
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.bitset.BitSetMethods;
 import org.apache.spark.unsafe.hash.Murmur3_x86_32;
-import org.apache.spark.unsafe.types.Interval;
+import org.apache.spark.unsafe.types.CalendarInterval;
 import org.apache.spark.unsafe.types.UTF8String;
 
 import static org.apache.spark.sql.types.DataTypes.*;
@@ -92,7 +92,7 @@ public static int calculateBitSetWidthInBytes(int numFields) {
       Arrays.asList(new DataType[]{
         StringType,
         BinaryType,
-        IntervalType
+        CalendarIntervalType
       }));
     _readableFieldTypes.addAll(settableFieldTypes);
     readableFieldTypes = Collections.unmodifiableSet(_readableFieldTypes);
@@ -265,7 +265,7 @@ public Object get(int ordinal, DataType dataType) {
       return getBinary(ordinal);
     } else if (dataType instanceof StringType) {
       return getUTF8String(ordinal);
-    } else if (dataType instanceof IntervalType) {
+    } else if (dataType instanceof CalendarIntervalType) {
       return getInterval(ordinal);
     } else if (dataType instanceof StructType) {
       return getStruct(ordinal, ((StructType) dataType).size());
@@ -350,7 +350,7 @@ public byte[] getBinary(int ordinal) {
   }
 
   @Override
-  public Interval getInterval(int ordinal) {
+  public CalendarInterval getInterval(int ordinal) {
     if (isNullAt(ordinal)) {
       return null;
     } else {
@@ -359,7 +359,7 @@ public Interval getInterval(int ordinal) {
       final int months = (int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset);
       final long microseconds =
         PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offset + 8);
-      return new Interval(months, microseconds);
+      return new CalendarInterval(months, microseconds);
     }
   }
 
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
index 32faad374015c..c3259e21c4a78 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
@@ -21,7 +21,7 @@
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.types.ByteArray;
-import org.apache.spark.unsafe.types.Interval;
+import org.apache.spark.unsafe.types.CalendarInterval;
 import org.apache.spark.unsafe.types.UTF8String;
 
 /**
@@ -131,7 +131,7 @@ public static int write(UnsafeRow target, int ordinal, int cursor, InternalRow i
   /** Writer for interval type. */
   public static class IntervalWriter {
 
-    public static int write(UnsafeRow target, int ordinal, int cursor, Interval input) {
+    public static int write(UnsafeRow target, int ordinal, int cursor, CalendarInterval input) {
       final long offset = target.getBaseOffset() + cursor;
 
       // Write the months and microseconds fields of Interval to the variable length portion.
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java b/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java
index 5703de42393de..17659d7d960b0 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java
@@ -50,9 +50,9 @@ public class DataTypes {
   public static final DataType TimestampType = TimestampType$.MODULE$;
 
   /**
-   * Gets the IntervalType object.
+   * Gets the CalendarIntervalType object.
    */
-  public static final DataType IntervalType = IntervalType$.MODULE$;
+  public static final DataType CalendarIntervalType = CalendarIntervalType$.MODULE$;
 
   /**
    * Gets the DoubleType object.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index e395a67434fa7..a5999e64ec554 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.{Interval, UTF8String}
+import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
 
 /**
  * An abstract class for row used internal in Spark SQL, which only contain the columns as
@@ -61,7 +61,8 @@ abstract class InternalRow extends Serializable with SpecializedGetters {
   override def getDecimal(ordinal: Int): Decimal =
     getAs[Decimal](ordinal, DecimalType.SYSTEM_DEFAULT)
 
-  override def getInterval(ordinal: Int): Interval = getAs[Interval](ordinal, IntervalType)
+  override def getInterval(ordinal: Int): CalendarInterval =
+    getAs[CalendarInterval](ordinal, CalendarIntervalType)
 
   // This is only use for test and will throw a null pointer exception if the position is null.
   def getString(ordinal: Int): String = getUTF8String(ordinal).toString
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index e5f115f74bf3b..f2498861c9573 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.Interval
+import org.apache.spark.unsafe.types.CalendarInterval
 
 /**
  * A very simple SQL parser.  Based loosely on:
@@ -365,32 +365,32 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
 
   protected lazy val millisecond: Parser[Long] =
     integral <~ intervalUnit("millisecond") ^^ {
-      case num => num.toLong * Interval.MICROS_PER_MILLI
+      case num => num.toLong * CalendarInterval.MICROS_PER_MILLI
     }
 
   protected lazy val second: Parser[Long] =
     integral <~ intervalUnit("second") ^^ {
-      case num => num.toLong * Interval.MICROS_PER_SECOND
+      case num => num.toLong * CalendarInterval.MICROS_PER_SECOND
     }
 
   protected lazy val minute: Parser[Long] =
     integral <~ intervalUnit("minute") ^^ {
-      case num => num.toLong * Interval.MICROS_PER_MINUTE
+      case num => num.toLong * CalendarInterval.MICROS_PER_MINUTE
     }
 
   protected lazy val hour: Parser[Long] =
     integral <~ intervalUnit("hour") ^^ {
-      case num => num.toLong * Interval.MICROS_PER_HOUR
+      case num => num.toLong * CalendarInterval.MICROS_PER_HOUR
     }
 
   protected lazy val day: Parser[Long] =
     integral <~ intervalUnit("day") ^^ {
-      case num => num.toLong * Interval.MICROS_PER_DAY
+      case num => num.toLong * CalendarInterval.MICROS_PER_DAY
     }
 
   protected lazy val week: Parser[Long] =
     integral <~ intervalUnit("week") ^^ {
-      case num => num.toLong * Interval.MICROS_PER_WEEK
+      case num => num.toLong * CalendarInterval.MICROS_PER_WEEK
     }
 
   protected lazy val intervalLiteral: Parser[Literal] =
@@ -406,7 +406,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
           val months = Seq(year, month).map(_.getOrElse(0)).sum
           val microseconds = Seq(week, day, hour, minute, second, millisecond, microsecond)
             .map(_.getOrElse(0L)).sum
-          Literal.create(new Interval(months, microseconds), IntervalType)
+          Literal.create(new CalendarInterval(months, microseconds), CalendarIntervalType)
       }
 
   private def toNarrowestIntegerType(value: String): Any = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 8304d4ccd47f7..371681b5d494f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -48,7 +48,7 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
         case DoubleType => input.getDouble(ordinal)
         case StringType => input.getUTF8String(ordinal)
         case BinaryType => input.getBinary(ordinal)
-        case IntervalType => input.getInterval(ordinal)
+        case CalendarIntervalType => input.getInterval(ordinal)
         case t: StructType => input.getStruct(ordinal, t.size)
         case _ => input.get(ordinal, dataType)
       }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index bd8b0177eb00e..c6e8af27667ee 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.{Interval, UTF8String}
+import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
 
 import scala.collection.mutable
 
@@ -55,7 +55,7 @@ object Cast {
 
     case (_, DateType) => true
 
-    case (StringType, IntervalType) => true
+    case (StringType, CalendarIntervalType) => true
 
     case (StringType, _: NumericType) => true
     case (BooleanType, _: NumericType) => true
@@ -225,7 +225,7 @@ case class Cast(child: Expression, dataType: DataType)
   // IntervalConverter
   private[this] def castToInterval(from: DataType): Any => Any = from match {
     case StringType =>
-      buildCast[UTF8String](_, s => Interval.fromString(s.toString))
+      buildCast[UTF8String](_, s => CalendarInterval.fromString(s.toString))
     case _ => _ => null
   }
 
@@ -398,7 +398,7 @@ case class Cast(child: Expression, dataType: DataType)
     case DateType => castToDate(from)
     case decimal: DecimalType => castToDecimal(from, decimal)
     case TimestampType => castToTimestamp(from)
-    case IntervalType => castToInterval(from)
+    case CalendarIntervalType => castToInterval(from)
     case BooleanType => castToBoolean(from)
     case ByteType => castToByte(from)
     case ShortType => castToShort(from)
@@ -438,7 +438,7 @@ case class Cast(child: Expression, dataType: DataType)
     case DateType => castToDateCode(from, ctx)
     case decimal: DecimalType => castToDecimalCode(from, decimal)
     case TimestampType => castToTimestampCode(from, ctx)
-    case IntervalType => castToIntervalCode(from)
+    case CalendarIntervalType => castToIntervalCode(from)
     case BooleanType => castToBooleanCode(from)
     case ByteType => castToByteCode(from)
     case ShortType => castToShortCode(from)
@@ -630,7 +630,7 @@ case class Cast(child: Expression, dataType: DataType)
   private[this] def castToIntervalCode(from: DataType): CastFunction = from match {
     case StringType =>
       (c, evPrim, evNull) =>
-        s"$evPrim = Interval.fromString($c.toString());"
+        s"$evPrim = CalendarInterval.fromString($c.toString());"
   }
 
   private[this] def decimalToTimestampCode(d: String): String =
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 4ec866475f8b0..6f8f4dd230f12 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.Interval
+import org.apache.spark.unsafe.types.CalendarInterval
 
 
 case class UnaryMinus(child: Expression) extends UnaryExpression with ExpectsInputTypes {
@@ -37,12 +37,12 @@ case class UnaryMinus(child: Expression) extends UnaryExpression with ExpectsInp
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = dataType match {
     case dt: DecimalType => defineCodeGen(ctx, ev, c => s"$c.unary_$$minus()")
     case dt: NumericType => defineCodeGen(ctx, ev, c => s"(${ctx.javaType(dt)})(-($c))")
-    case dt: IntervalType => defineCodeGen(ctx, ev, c => s"$c.negate()")
+    case dt: CalendarIntervalType => defineCodeGen(ctx, ev, c => s"$c.negate()")
   }
 
   protected override def nullSafeEval(input: Any): Any = {
-    if (dataType.isInstanceOf[IntervalType]) {
-      input.asInstanceOf[Interval].negate()
+    if (dataType.isInstanceOf[CalendarIntervalType]) {
+      input.asInstanceOf[CalendarInterval].negate()
     } else {
       numeric.negate(input)
     }
@@ -121,8 +121,8 @@ case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
-    if (dataType.isInstanceOf[IntervalType]) {
-      input1.asInstanceOf[Interval].add(input2.asInstanceOf[Interval])
+    if (dataType.isInstanceOf[CalendarIntervalType]) {
+      input1.asInstanceOf[CalendarInterval].add(input2.asInstanceOf[CalendarInterval])
     } else {
       numeric.plus(input1, input2)
     }
@@ -134,7 +134,7 @@ case class Add(left: Expression, right: Expression) extends BinaryArithmetic {
     case ByteType | ShortType =>
       defineCodeGen(ctx, ev,
         (eval1, eval2) => s"(${ctx.javaType(dataType)})($eval1 $symbol $eval2)")
-    case IntervalType =>
+    case CalendarIntervalType =>
       defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1.add($eval2)")
     case _ =>
       defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1 $symbol $eval2")
@@ -150,8 +150,8 @@ case class Subtract(left: Expression, right: Expression) extends BinaryArithmeti
   private lazy val numeric = TypeUtils.getNumeric(dataType)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
-    if (dataType.isInstanceOf[IntervalType]) {
-      input1.asInstanceOf[Interval].subtract(input2.asInstanceOf[Interval])
+    if (dataType.isInstanceOf[CalendarIntervalType]) {
+      input1.asInstanceOf[CalendarInterval].subtract(input2.asInstanceOf[CalendarInterval])
     } else {
       numeric.minus(input1, input2)
     }
@@ -163,7 +163,7 @@ case class Subtract(left: Expression, right: Expression) extends BinaryArithmeti
     case ByteType | ShortType =>
       defineCodeGen(ctx, ev,
         (eval1, eval2) => s"(${ctx.javaType(dataType)})($eval1 $symbol $eval2)")
-    case IntervalType =>
+    case CalendarIntervalType =>
       defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1.subtract($eval2)")
     case _ =>
       defineCodeGen(ctx, ev, (eval1, eval2) => s"$eval1 $symbol $eval2")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 2f02c90b1d5b3..092f4c9fb0bd2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -108,7 +108,7 @@ class CodeGenContext {
       case _ if isPrimitiveType(jt) => s"$row.get${primitiveTypeName(jt)}($ordinal)"
       case StringType => s"$row.getUTF8String($ordinal)"
       case BinaryType => s"$row.getBinary($ordinal)"
-      case IntervalType => s"$row.getInterval($ordinal)"
+      case CalendarIntervalType => s"$row.getInterval($ordinal)"
       case t: StructType => s"$row.getStruct($ordinal, ${t.size})"
       case _ => s"($jt)$row.get($ordinal)"
     }
@@ -150,7 +150,7 @@ class CodeGenContext {
     case dt: DecimalType => "Decimal"
     case BinaryType => "byte[]"
     case StringType => "UTF8String"
-    case IntervalType => "Interval"
+    case CalendarIntervalType => "CalendarInterval"
     case _: StructType => "InternalRow"
     case _: ArrayType => s"scala.collection.Seq"
     case _: MapType => s"scala.collection.Map"
@@ -293,7 +293,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
       classOf[UnsafeRow].getName,
       classOf[UTF8String].getName,
       classOf[Decimal].getName,
-      classOf[Interval].getName
+      classOf[CalendarInterval].getName
     ))
     evaluator.setExtendedClass(classOf[GeneratedClass])
     try {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index 9a4c00e86a3ec..dc725c28aaa27 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -39,7 +39,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
   /** Returns true iff we support this data type. */
   def canSupport(dataType: DataType): Boolean = dataType match {
     case t: AtomicType if !t.isInstanceOf[DecimalType] => true
-    case _: IntervalType => true
+    case _: CalendarIntervalType => true
     case t: StructType => t.toSeq.forall(field => canSupport(field.dataType))
     case NullType => true
     case _ => false
@@ -75,7 +75,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           s" + (${exprs(i).isNull} ? 0 : $StringWriter.getSize(${exprs(i).primitive}))"
         case BinaryType =>
           s" + (${exprs(i).isNull} ? 0 : $BinaryWriter.getSize(${exprs(i).primitive}))"
-        case IntervalType =>
+        case CalendarIntervalType =>
           s" + (${exprs(i).isNull} ? 0 : 16)"
         case _: StructType =>
           s" + (${exprs(i).isNull} ? 0 : $StructWriter.getSize(${exprs(i).primitive}))"
@@ -91,7 +91,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           s"$cursor += $StringWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
         case BinaryType =>
           s"$cursor += $BinaryWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
-        case IntervalType =>
+        case CalendarIntervalType =>
           s"$cursor += $IntervalWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
         case t: StructType =>
           s"$cursor += $StructWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
@@ -173,7 +173,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           s" + (${ev.isNull} ? 0 : $StringWriter.getSize(${ev.primitive}))"
         case BinaryType =>
           s" + (${ev.isNull} ? 0 : $BinaryWriter.getSize(${ev.primitive}))"
-        case IntervalType =>
+        case CalendarIntervalType =>
           s" + (${ev.isNull} ? 0 : 16)"
         case _: StructType =>
           s" + (${ev.isNull} ? 0 : $StructWriter.getSize(${ev.primitive}))"
@@ -189,7 +189,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           s"$cursor += $StringWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
         case BinaryType =>
           s"$cursor += $BinaryWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
-        case IntervalType =>
+        case CalendarIntervalType =>
           s"$cursor += $IntervalWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
         case t: StructType =>
           s"$cursor += $StructWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 064a1720c36e8..34bad23802ba4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -42,7 +42,7 @@ object Literal {
     case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType)
     case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType)
     case a: Array[Byte] => Literal(a, BinaryType)
-    case i: Interval => Literal(i, IntervalType)
+    case i: CalendarInterval => Literal(i, CalendarIntervalType)
     case null => Literal(null, NullType)
     case _ =>
       throw new RuntimeException("Unsupported literal type " + v.getClass + " " + v)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
index 40bf4b299c990..e0667c629486d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala
@@ -95,7 +95,7 @@ private[sql] object TypeCollection {
    * Types that include numeric types and interval type. They are only used in unary_minus,
    * unary_positive, add and subtract operations.
    */
-  val NumericAndInterval = TypeCollection(NumericType, IntervalType)
+  val NumericAndInterval = TypeCollection(NumericType, CalendarIntervalType)
 
   def apply(types: AbstractDataType*): TypeCollection = new TypeCollection(types)
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntervalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala
similarity index 64%
rename from sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntervalType.scala
rename to sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala
index 87c6e9e6e5e2c..3565f52c21f69 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/IntervalType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala
@@ -22,16 +22,19 @@ import org.apache.spark.annotation.DeveloperApi
 
 /**
  * :: DeveloperApi ::
- * The data type representing time intervals.
+ * The data type representing calendar time intervals. The calendar time interval is stored
+ * internally in two components: number of months the number of microseconds.
  *
- * Please use the singleton [[DataTypes.IntervalType]].
+ * Note that calendar intervals are not comparable.
+ *
+ * Please use the singleton [[DataTypes.CalendarIntervalType]].
  */
 @DeveloperApi
-class IntervalType private() extends DataType {
+class CalendarIntervalType private() extends DataType {
 
-  override def defaultSize: Int = 4096
+  override def defaultSize: Int = 16
 
-  private[spark] override def asNullable: IntervalType = this
+  private[spark] override def asNullable: CalendarIntervalType = this
 }
 
-case object IntervalType extends IntervalType
+case object CalendarIntervalType extends CalendarIntervalType
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
index ad15136ee9a2f..8acd4c685e2bc 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
@@ -53,7 +53,7 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
   }
 
   test("check types for unary arithmetic") {
-    assertError(UnaryMinus('stringField), "type (numeric or interval)")
+    assertError(UnaryMinus('stringField), "type (numeric or calendarinterval)")
     assertError(Abs('stringField), "expected to be of type numeric")
     assertError(BitwiseNot('stringField), "expected to be of type integral")
   }
@@ -78,8 +78,9 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
     assertErrorForDifferingTypes(MaxOf('intField, 'booleanField))
     assertErrorForDifferingTypes(MinOf('intField, 'booleanField))
 
-    assertError(Add('booleanField, 'booleanField), "accepts (numeric or interval) type")
-    assertError(Subtract('booleanField, 'booleanField), "accepts (numeric or interval) type")
+    assertError(Add('booleanField, 'booleanField), "accepts (numeric or calendarinterval) type")
+    assertError(Subtract('booleanField, 'booleanField),
+      "accepts (numeric or calendarinterval) type")
     assertError(Multiply('booleanField, 'booleanField), "accepts numeric type")
     assertError(Divide('booleanField, 'booleanField), "accepts numeric type")
     assertError(Remainder('booleanField, 'booleanField), "accepts numeric type")
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 4454d51b75877..1d9ee5ddf3a5a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -116,7 +116,7 @@ class HiveTypeCoercionSuite extends PlanTest {
     shouldNotCast(IntegerType, MapType)
     shouldNotCast(IntegerType, StructType)
 
-    shouldNotCast(IntervalType, StringType)
+    shouldNotCast(CalendarIntervalType, StringType)
 
     // Don't implicitly cast complex types to string.
     shouldNotCast(ArrayType(StringType), StringType)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 408353cf70a49..0e0213be0f57b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -719,12 +719,13 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("case between string and interval") {
-    import org.apache.spark.unsafe.types.Interval
+    import org.apache.spark.unsafe.types.CalendarInterval
 
-    checkEvaluation(Cast(Literal("interval -3 month 7 hours"), IntervalType),
-      new Interval(-3, 7 * Interval.MICROS_PER_HOUR))
+    checkEvaluation(Cast(Literal("interval -3 month 7 hours"), CalendarIntervalType),
+      new CalendarInterval(-3, 7 * CalendarInterval.MICROS_PER_HOUR))
     checkEvaluation(Cast(Literal.create(
-      new Interval(15, -3 * Interval.MICROS_PER_DAY), IntervalType), StringType),
+      new CalendarInterval(15, -3 * CalendarInterval.MICROS_PER_DAY), CalendarIntervalType),
+      StringType),
       "interval 1 years 3 months -3 days")
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
index b02e60dc85cdd..2294a670c735f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala
@@ -220,137 +220,6 @@ case class TakeOrderedAndProject(
   override def outputOrdering: Seq[SortOrder] = sortOrder
 }
 
-/**
- * :: DeveloperApi ::
- * Performs a sort on-heap.
- * @param global when true performs a global sort of all partitions by shuffling the data first
- *               if necessary.
- */
-@DeveloperApi
-case class Sort(
-    sortOrder: Seq[SortOrder],
-    global: Boolean,
-    child: SparkPlan)
-  extends UnaryNode {
-  override def requiredChildDistribution: Seq[Distribution] =
-    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
-
-  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
-    child.execute().mapPartitions( { iterator =>
-      val ordering = newOrdering(sortOrder, child.output)
-      iterator.map(_.copy()).toArray.sorted(ordering).iterator
-    }, preservesPartitioning = true)
-  }
-
-  override def output: Seq[Attribute] = child.output
-
-  override def outputOrdering: Seq[SortOrder] = sortOrder
-}
-
-/**
- * :: DeveloperApi ::
- * Performs a sort, spilling to disk as needed.
- * @param global when true performs a global sort of all partitions by shuffling the data first
- *               if necessary.
- */
-@DeveloperApi
-case class ExternalSort(
-    sortOrder: Seq[SortOrder],
-    global: Boolean,
-    child: SparkPlan)
-  extends UnaryNode {
-
-  override def requiredChildDistribution: Seq[Distribution] =
-    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
-
-  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
-    child.execute().mapPartitions( { iterator =>
-      val ordering = newOrdering(sortOrder, child.output)
-      val sorter = new ExternalSorter[InternalRow, Null, InternalRow](ordering = Some(ordering))
-      sorter.insertAll(iterator.map(r => (r.copy, null)))
-      val baseIterator = sorter.iterator.map(_._1)
-      // TODO(marmbrus): The complex type signature below thwarts inference for no reason.
-      CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop())
-    }, preservesPartitioning = true)
-  }
-
-  override def output: Seq[Attribute] = child.output
-
-  override def outputOrdering: Seq[SortOrder] = sortOrder
-}
-
-/**
- * :: DeveloperApi ::
- * Optimized version of [[ExternalSort]] that operates on binary data (implemented as part of
- * Project Tungsten).
- *
- * @param global when true performs a global sort of all partitions by shuffling the data first
- *               if necessary.
- * @param testSpillFrequency Method for configuring periodic spilling in unit tests. If set, will
- *                           spill every `frequency` records.
- */
-@DeveloperApi
-case class UnsafeExternalSort(
-    sortOrder: Seq[SortOrder],
-    global: Boolean,
-    child: SparkPlan,
-    testSpillFrequency: Int = 0)
-  extends UnaryNode {
-
-  private[this] val schema: StructType = child.schema
-
-  override def requiredChildDistribution: Seq[Distribution] =
-    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
-
-  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
-    assert(codegenEnabled, "UnsafeExternalSort requires code generation to be enabled")
-    def doSort(iterator: Iterator[InternalRow]): Iterator[InternalRow] = {
-      val ordering = newOrdering(sortOrder, child.output)
-      val boundSortExpression = BindReferences.bindReference(sortOrder.head, child.output)
-      // Hack until we generate separate comparator implementations for ascending vs. descending
-      // (or choose to codegen them):
-      val prefixComparator = {
-        val comp = SortPrefixUtils.getPrefixComparator(boundSortExpression)
-        if (sortOrder.head.direction == Descending) {
-          new PrefixComparator {
-            override def compare(p1: Long, p2: Long): Int = -1 * comp.compare(p1, p2)
-          }
-        } else {
-          comp
-        }
-      }
-      val prefixComputer = {
-        val prefixComputer = SortPrefixUtils.getPrefixComputer(boundSortExpression)
-        new UnsafeExternalRowSorter.PrefixComputer {
-          override def computePrefix(row: InternalRow): Long = prefixComputer(row)
-        }
-      }
-      val sorter = new UnsafeExternalRowSorter(schema, ordering, prefixComparator, prefixComputer)
-      if (testSpillFrequency > 0) {
-        sorter.setTestSpillFrequency(testSpillFrequency)
-      }
-      sorter.sort(iterator)
-    }
-    child.execute().mapPartitions(doSort, preservesPartitioning = true)
-  }
-
-  override def output: Seq[Attribute] = child.output
-
-  override def outputOrdering: Seq[SortOrder] = sortOrder
-
-  override def outputsUnsafeRows: Boolean = true
-}
-
-@DeveloperApi
-object UnsafeExternalSort {
-  /**
-   * Return true if UnsafeExternalSort can sort rows with the given schema, false otherwise.
-   */
-  def supportsSchema(schema: StructType): Boolean = {
-    UnsafeExternalRowSorter.supportsSchema(schema)
-  }
-}
-
 
 /**
  * :: DeveloperApi ::
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
index e73b3704d4dfe..0cdb407ad57b9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala
@@ -308,7 +308,7 @@ private[sql] object ResolvedDataSource {
       mode: SaveMode,
       options: Map[String, String],
       data: DataFrame): ResolvedDataSource = {
-    if (data.schema.map(_.dataType).exists(_.isInstanceOf[IntervalType])) {
+    if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
       throw new AnalysisException("Cannot save interval data type into external storage.")
     }
     val clazz: Class[_] = lookupDataSource(provider)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
new file mode 100644
index 0000000000000..f82208868c3e3
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.errors._
+import org.apache.spark.sql.catalyst.expressions.{Descending, BindReferences, Attribute, SortOrder}
+import org.apache.spark.sql.catalyst.plans.physical.{UnspecifiedDistribution, OrderedDistribution, Distribution}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.util.CompletionIterator
+import org.apache.spark.util.collection.ExternalSorter
+import org.apache.spark.util.collection.unsafe.sort.PrefixComparator
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// This file defines various sort operators.
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/**
+ * Performs a sort on-heap.
+ * @param global when true performs a global sort of all partitions by shuffling the data first
+ *               if necessary.
+ */
+case class Sort(
+    sortOrder: Seq[SortOrder],
+    global: Boolean,
+    child: SparkPlan)
+  extends UnaryNode {
+  override def requiredChildDistribution: Seq[Distribution] =
+    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
+
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
+    child.execute().mapPartitions( { iterator =>
+      val ordering = newOrdering(sortOrder, child.output)
+      iterator.map(_.copy()).toArray.sorted(ordering).iterator
+    }, preservesPartitioning = true)
+  }
+
+  override def output: Seq[Attribute] = child.output
+
+  override def outputOrdering: Seq[SortOrder] = sortOrder
+}
+
+/**
+ * Performs a sort, spilling to disk as needed.
+ * @param global when true performs a global sort of all partitions by shuffling the data first
+ *               if necessary.
+ */
+case class ExternalSort(
+    sortOrder: Seq[SortOrder],
+    global: Boolean,
+    child: SparkPlan)
+  extends UnaryNode {
+
+  override def requiredChildDistribution: Seq[Distribution] =
+    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
+
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
+    child.execute().mapPartitions( { iterator =>
+      val ordering = newOrdering(sortOrder, child.output)
+      val sorter = new ExternalSorter[InternalRow, Null, InternalRow](ordering = Some(ordering))
+      sorter.insertAll(iterator.map(r => (r.copy(), null)))
+      val baseIterator = sorter.iterator.map(_._1)
+      // TODO(marmbrus): The complex type signature below thwarts inference for no reason.
+      CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop())
+    }, preservesPartitioning = true)
+  }
+
+  override def output: Seq[Attribute] = child.output
+
+  override def outputOrdering: Seq[SortOrder] = sortOrder
+}
+
+/**
+ * Optimized version of [[ExternalSort]] that operates on binary data (implemented as part of
+ * Project Tungsten).
+ *
+ * @param global when true performs a global sort of all partitions by shuffling the data first
+ *               if necessary.
+ * @param testSpillFrequency Method for configuring periodic spilling in unit tests. If set, will
+ *                           spill every `frequency` records.
+ */
+case class UnsafeExternalSort(
+    sortOrder: Seq[SortOrder],
+    global: Boolean,
+    child: SparkPlan,
+    testSpillFrequency: Int = 0)
+  extends UnaryNode {
+
+  private[this] val schema: StructType = child.schema
+
+  override def requiredChildDistribution: Seq[Distribution] =
+    if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
+
+  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
+    assert(codegenEnabled, "UnsafeExternalSort requires code generation to be enabled")
+    def doSort(iterator: Iterator[InternalRow]): Iterator[InternalRow] = {
+      val ordering = newOrdering(sortOrder, child.output)
+      val boundSortExpression = BindReferences.bindReference(sortOrder.head, child.output)
+      // Hack until we generate separate comparator implementations for ascending vs. descending
+      // (or choose to codegen them):
+      val prefixComparator = {
+        val comp = SortPrefixUtils.getPrefixComparator(boundSortExpression)
+        if (sortOrder.head.direction == Descending) {
+          new PrefixComparator {
+            override def compare(p1: Long, p2: Long): Int = -1 * comp.compare(p1, p2)
+          }
+        } else {
+          comp
+        }
+      }
+      val prefixComputer = {
+        val prefixComputer = SortPrefixUtils.getPrefixComputer(boundSortExpression)
+        new UnsafeExternalRowSorter.PrefixComputer {
+          override def computePrefix(row: InternalRow): Long = prefixComputer(row)
+        }
+      }
+      val sorter = new UnsafeExternalRowSorter(schema, ordering, prefixComparator, prefixComputer)
+      if (testSpillFrequency > 0) {
+        sorter.setTestSpillFrequency(testSpillFrequency)
+      }
+      sorter.sort(iterator)
+    }
+    child.execute().mapPartitions(doSort, preservesPartitioning = true)
+  }
+
+  override def output: Seq[Attribute] = child.output
+
+  override def outputOrdering: Seq[SortOrder] = sortOrder
+
+  override def outputsUnsafeRows: Boolean = true
+}
+
+@DeveloperApi
+object UnsafeExternalSort {
+  /**
+   * Return true if UnsafeExternalSort can sort rows with the given schema, false otherwise.
+   */
+  def supportsSchema(schema: StructType): Boolean = {
+    UnsafeExternalRowSorter.supportsSchema(schema)
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index d13dde1cdc8b2..535011fe3db5b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -1577,10 +1577,10 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SPARK-8753: add interval type") {
-    import org.apache.spark.unsafe.types.Interval
+    import org.apache.spark.unsafe.types.CalendarInterval
 
     val df = sql("select interval 3 years -3 month 7 week 123 microseconds")
-    checkAnswer(df, Row(new Interval(12 * 3 - 3, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123 )))
+    checkAnswer(df, Row(new CalendarInterval(12 * 3 - 3, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123 )))
     withTempPath(f => {
       // Currently we don't yet support saving out values of interval data type.
       val e = intercept[AnalysisException] {
@@ -1602,20 +1602,20 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("SPARK-8945: add and subtract expressions for interval type") {
-    import org.apache.spark.unsafe.types.Interval
-    import org.apache.spark.unsafe.types.Interval.MICROS_PER_WEEK
+    import org.apache.spark.unsafe.types.CalendarInterval
+    import org.apache.spark.unsafe.types.CalendarInterval.MICROS_PER_WEEK
 
     val df = sql("select interval 3 years -3 month 7 week 123 microseconds as i")
-    checkAnswer(df, Row(new Interval(12 * 3 - 3, 7L * MICROS_PER_WEEK + 123)))
+    checkAnswer(df, Row(new CalendarInterval(12 * 3 - 3, 7L * MICROS_PER_WEEK + 123)))
 
-    checkAnswer(df.select(df("i") + new Interval(2, 123)),
-      Row(new Interval(12 * 3 - 3 + 2, 7L * MICROS_PER_WEEK + 123 + 123)))
+    checkAnswer(df.select(df("i") + new CalendarInterval(2, 123)),
+      Row(new CalendarInterval(12 * 3 - 3 + 2, 7L * MICROS_PER_WEEK + 123 + 123)))
 
-    checkAnswer(df.select(df("i") - new Interval(2, 123)),
-      Row(new Interval(12 * 3 - 3 - 2, 7L * MICROS_PER_WEEK + 123 - 123)))
+    checkAnswer(df.select(df("i") - new CalendarInterval(2, 123)),
+      Row(new CalendarInterval(12 * 3 - 3 - 2, 7L * MICROS_PER_WEEK + 123 - 123)))
 
     // unary minus
     checkAnswer(df.select(-df("i")),
-      Row(new Interval(-(12 * 3 - 3), -(7L * MICROS_PER_WEEK + 123))))
+      Row(new CalendarInterval(-(12 * 3 - 3), -(7L * MICROS_PER_WEEK + 123))))
   }
 }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java
similarity index 87%
rename from unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
rename to unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java
index 71b1a85a818ea..92a5e4f86f234 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/Interval.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java
@@ -24,7 +24,7 @@
 /**
  * The internal representation of interval type.
  */
-public final class Interval implements Serializable {
+public final class CalendarInterval implements Serializable {
   public static final long MICROS_PER_MILLI = 1000L;
   public static final long MICROS_PER_SECOND = MICROS_PER_MILLI * 1000;
   public static final long MICROS_PER_MINUTE = MICROS_PER_SECOND * 60;
@@ -58,7 +58,7 @@ private static long toLong(String s) {
     }
   }
 
-  public static Interval fromString(String s) {
+  public static CalendarInterval fromString(String s) {
     if (s == null) {
       return null;
     }
@@ -75,40 +75,40 @@ public static Interval fromString(String s) {
       microseconds += toLong(m.group(7)) * MICROS_PER_SECOND;
       microseconds += toLong(m.group(8)) * MICROS_PER_MILLI;
       microseconds += toLong(m.group(9));
-      return new Interval((int) months, microseconds);
+      return new CalendarInterval((int) months, microseconds);
     }
   }
 
   public final int months;
   public final long microseconds;
 
-  public Interval(int months, long microseconds) {
+  public CalendarInterval(int months, long microseconds) {
     this.months = months;
     this.microseconds = microseconds;
   }
 
-  public Interval add(Interval that) {
+  public CalendarInterval add(CalendarInterval that) {
     int months = this.months + that.months;
     long microseconds = this.microseconds + that.microseconds;
-    return new Interval(months, microseconds);
+    return new CalendarInterval(months, microseconds);
   }
 
-  public Interval subtract(Interval that) {
+  public CalendarInterval subtract(CalendarInterval that) {
     int months = this.months - that.months;
     long microseconds = this.microseconds - that.microseconds;
-    return new Interval(months, microseconds);
+    return new CalendarInterval(months, microseconds);
   }
 
-  public Interval negate() {
-    return new Interval(-this.months, -this.microseconds);
+  public CalendarInterval negate() {
+    return new CalendarInterval(-this.months, -this.microseconds);
   }
 
   @Override
   public boolean equals(Object other) {
     if (this == other) return true;
-    if (other == null || !(other instanceof Interval)) return false;
+    if (other == null || !(other instanceof CalendarInterval)) return false;
 
-    Interval o = (Interval) other;
+    CalendarInterval o = (CalendarInterval) other;
     return this.months == o.months && this.microseconds == o.microseconds;
   }
 
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
index d29517cda66a3..e6733a7aae6f5 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/IntervalSuite.java
@@ -20,16 +20,16 @@
 import org.junit.Test;
 
 import static junit.framework.Assert.*;
-import static org.apache.spark.unsafe.types.Interval.*;
+import static org.apache.spark.unsafe.types.CalendarInterval.*;
 
 public class IntervalSuite {
 
   @Test
   public void equalsTest() {
-    Interval i1 = new Interval(3, 123);
-    Interval i2 = new Interval(3, 321);
-    Interval i3 = new Interval(1, 123);
-    Interval i4 = new Interval(3, 123);
+    CalendarInterval i1 = new CalendarInterval(3, 123);
+    CalendarInterval i2 = new CalendarInterval(3, 321);
+    CalendarInterval i3 = new CalendarInterval(1, 123);
+    CalendarInterval i4 = new CalendarInterval(3, 123);
 
     assertNotSame(i1, i2);
     assertNotSame(i1, i3);
@@ -39,21 +39,21 @@ public void equalsTest() {
 
   @Test
   public void toStringTest() {
-    Interval i;
+    CalendarInterval i;
 
-    i = new Interval(34, 0);
+    i = new CalendarInterval(34, 0);
     assertEquals(i.toString(), "interval 2 years 10 months");
 
-    i = new Interval(-34, 0);
+    i = new CalendarInterval(-34, 0);
     assertEquals(i.toString(), "interval -2 years -10 months");
 
-    i = new Interval(0, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
+    i = new CalendarInterval(0, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
     assertEquals(i.toString(), "interval 3 weeks 13 hours 123 microseconds");
 
-    i = new Interval(0, -3 * MICROS_PER_WEEK - 13 * MICROS_PER_HOUR - 123);
+    i = new CalendarInterval(0, -3 * MICROS_PER_WEEK - 13 * MICROS_PER_HOUR - 123);
     assertEquals(i.toString(), "interval -3 weeks -13 hours -123 microseconds");
 
-    i = new Interval(34, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
+    i = new CalendarInterval(34, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
     assertEquals(i.toString(), "interval 2 years 10 months 3 weeks 13 hours 123 microseconds");
   }
 
@@ -72,33 +72,33 @@ public void fromStringTest() {
     String input;
 
     input = "interval   -5  years  23   month";
-    Interval result = new Interval(-5 * 12 + 23, 0);
-    assertEquals(Interval.fromString(input), result);
+    CalendarInterval result = new CalendarInterval(-5 * 12 + 23, 0);
+    assertEquals(CalendarInterval.fromString(input), result);
 
     input = "interval   -5  years  23   month   ";
-    assertEquals(Interval.fromString(input), result);
+    assertEquals(CalendarInterval.fromString(input), result);
 
     input = "  interval   -5  years  23   month   ";
-    assertEquals(Interval.fromString(input), result);
+    assertEquals(CalendarInterval.fromString(input), result);
 
     // Error cases
     input = "interval   3month 1 hour";
-    assertEquals(Interval.fromString(input), null);
+    assertEquals(CalendarInterval.fromString(input), null);
 
     input = "interval 3 moth 1 hour";
-    assertEquals(Interval.fromString(input), null);
+    assertEquals(CalendarInterval.fromString(input), null);
 
     input = "interval";
-    assertEquals(Interval.fromString(input), null);
+    assertEquals(CalendarInterval.fromString(input), null);
 
     input = "int";
-    assertEquals(Interval.fromString(input), null);
+    assertEquals(CalendarInterval.fromString(input), null);
 
     input = "";
-    assertEquals(Interval.fromString(input), null);
+    assertEquals(CalendarInterval.fromString(input), null);
 
     input = null;
-    assertEquals(Interval.fromString(input), null);
+    assertEquals(CalendarInterval.fromString(input), null);
   }
 
   @Test
@@ -106,18 +106,18 @@ public void addTest() {
     String input = "interval 3 month 1 hour";
     String input2 = "interval 2 month 100 hour";
 
-    Interval interval = Interval.fromString(input);
-    Interval interval2 = Interval.fromString(input2);
+    CalendarInterval interval = CalendarInterval.fromString(input);
+    CalendarInterval interval2 = CalendarInterval.fromString(input2);
 
-    assertEquals(interval.add(interval2), new Interval(5, 101 * MICROS_PER_HOUR));
+    assertEquals(interval.add(interval2), new CalendarInterval(5, 101 * MICROS_PER_HOUR));
 
     input = "interval -10 month -81 hour";
     input2 = "interval 75 month 200 hour";
 
-    interval = Interval.fromString(input);
-    interval2 = Interval.fromString(input2);
+    interval = CalendarInterval.fromString(input);
+    interval2 = CalendarInterval.fromString(input2);
 
-    assertEquals(interval.add(interval2), new Interval(65, 119 * MICROS_PER_HOUR));
+    assertEquals(interval.add(interval2), new CalendarInterval(65, 119 * MICROS_PER_HOUR));
   }
 
   @Test
@@ -125,25 +125,25 @@ public void subtractTest() {
     String input = "interval 3 month 1 hour";
     String input2 = "interval 2 month 100 hour";
 
-    Interval interval = Interval.fromString(input);
-    Interval interval2 = Interval.fromString(input2);
+    CalendarInterval interval = CalendarInterval.fromString(input);
+    CalendarInterval interval2 = CalendarInterval.fromString(input2);
 
-    assertEquals(interval.subtract(interval2), new Interval(1, -99 * MICROS_PER_HOUR));
+    assertEquals(interval.subtract(interval2), new CalendarInterval(1, -99 * MICROS_PER_HOUR));
 
     input = "interval -10 month -81 hour";
     input2 = "interval 75 month 200 hour";
 
-    interval = Interval.fromString(input);
-    interval2 = Interval.fromString(input2);
+    interval = CalendarInterval.fromString(input);
+    interval2 = CalendarInterval.fromString(input2);
 
-    assertEquals(interval.subtract(interval2), new Interval(-85, -281 * MICROS_PER_HOUR));
+    assertEquals(interval.subtract(interval2), new CalendarInterval(-85, -281 * MICROS_PER_HOUR));
   }
 
   private void testSingleUnit(String unit, int number, int months, long microseconds) {
     String input1 = "interval " + number + " " + unit;
     String input2 = "interval " + number + " " + unit + "s";
-    Interval result = new Interval(months, microseconds);
-    assertEquals(Interval.fromString(input1), result);
-    assertEquals(Interval.fromString(input2), result);
+    CalendarInterval result = new CalendarInterval(months, microseconds);
+    assertEquals(CalendarInterval.fromString(input1), result);
+    assertEquals(CalendarInterval.fromString(input2), result);
   }
 }

From b715933fc69a49653abdb2fba0818dfc4f35d358 Mon Sep 17 00:00:00 2001
From: Alexander Ulanov <nashb@yandex.ru>
Date: Wed, 29 Jul 2015 13:59:00 -0700
Subject: [PATCH 448/549] [SPARK-9436] [GRAPHX] Pregel simplification patch

Pregel code contains two consecutive joins:
```
g.vertices.innerJoin(messages)(vprog)
...
g = g.outerJoinVertices(newVerts)
{ (vid, old, newOpt) => newOpt.getOrElse(old) }
```
This can be simplified with one join. ankurdave proposed a patch based on our discussion in the mailing list: https://www.mail-archive.com/devspark.apache.org/msg10316.html

Author: Alexander Ulanov <nashb@yandex.ru>

Closes #7749 from avulanov/SPARK-9436-pregel and squashes the following commits:

8568e06 [Alexander Ulanov] Pregel simplification patch
---
 .../org/apache/spark/graphx/Pregel.scala      | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala
index cfcf7244eaed5..2ca60d51f8331 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala
@@ -127,28 +127,25 @@ object Pregel extends Logging {
     var prevG: Graph[VD, ED] = null
     var i = 0
     while (activeMessages > 0 && i < maxIterations) {
-      // Receive the messages. Vertices that didn't get any messages do not appear in newVerts.
-      val newVerts = g.vertices.innerJoin(messages)(vprog).cache()
-      // Update the graph with the new vertices.
+      // Receive the messages and update the vertices.
       prevG = g
-      g = g.outerJoinVertices(newVerts) { (vid, old, newOpt) => newOpt.getOrElse(old) }
-      g.cache()
+      g = g.joinVertices(messages)(vprog).cache()
 
       val oldMessages = messages
-      // Send new messages. Vertices that didn't get any messages don't appear in newVerts, so don't
-      // get to send messages. We must cache messages so it can be materialized on the next line,
-      // allowing us to uncache the previous iteration.
-      messages = g.mapReduceTriplets(sendMsg, mergeMsg, Some((newVerts, activeDirection))).cache()
-      // The call to count() materializes `messages`, `newVerts`, and the vertices of `g`. This
-      // hides oldMessages (depended on by newVerts), newVerts (depended on by messages), and the
-      // vertices of prevG (depended on by newVerts, oldMessages, and the vertices of g).
+      // Send new messages, skipping edges where neither side received a message. We must cache
+      // messages so it can be materialized on the next line, allowing us to uncache the previous
+      // iteration.
+      messages = g.mapReduceTriplets(
+        sendMsg, mergeMsg, Some((oldMessages, activeDirection))).cache()
+      // The call to count() materializes `messages` and the vertices of `g`. This hides oldMessages
+      // (depended on by the vertices of g) and the vertices of prevG (depended on by oldMessages
+      // and the vertices of g).
       activeMessages = messages.count()
 
       logInfo("Pregel finished iteration " + i)
 
       // Unpersist the RDDs hidden by newly-materialized RDDs
       oldMessages.unpersist(blocking = false)
-      newVerts.unpersist(blocking = false)
       prevG.unpersistVertices(blocking = false)
       prevG.edges.unpersist(blocking = false)
       // count the iteration

From 1b0099fc62d02ff6216a76fbfe17a4ec5b2f3536 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Wed, 29 Jul 2015 16:00:30 -0700
Subject: [PATCH 449/549] [SPARK-9411] [SQL] Make Tungsten page sizes
 configurable

We need to make page sizes configurable so we can reduce them in unit tests and increase them in real production workloads.  These sizes are now controlled by a new configuration, `spark.buffer.pageSize`.  The new default is 64 megabytes.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7741 from JoshRosen/SPARK-9411 and squashes the following commits:

a43c4db [Josh Rosen] Fix pow
2c0eefc [Josh Rosen] Fix MAXIMUM_PAGE_SIZE_BYTES comment + value
bccfb51 [Josh Rosen] Lower page size to 4MB in TestHive
ba54d4b [Josh Rosen] Make UnsafeExternalSorter's page size configurable
0045aa2 [Josh Rosen] Make UnsafeShuffle's page size configurable
bc734f0 [Josh Rosen] Rename configuration
e614858 [Josh Rosen] Makes BytesToBytesMap page size configurable
---
 .../unsafe/UnsafeShuffleExternalSorter.java   | 35 +++++++++------
 .../shuffle/unsafe/UnsafeShuffleWriter.java   |  5 +++
 .../unsafe/sort/UnsafeExternalSorter.java     | 30 +++++++------
 .../unsafe/UnsafeShuffleWriterSuite.java      |  6 +--
 .../UnsafeFixedWidthAggregationMap.java       |  5 ++-
 .../UnsafeFixedWidthAggregationMapSuite.scala |  6 ++-
 .../sql/execution/GeneratedAggregate.scala    |  4 +-
 .../sql/execution/joins/HashedRelation.scala  |  7 ++-
 .../apache/spark/sql/hive/test/TestHive.scala |  1 +
 .../spark/unsafe/map/BytesToBytesMap.java     | 43 ++++++++++++-------
 .../unsafe/memory/TaskMemoryManager.java      | 13 ++++--
 .../map/AbstractBytesToBytesMapSuite.java     | 22 +++++-----
 12 files changed, 112 insertions(+), 65 deletions(-)

diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
index 1d460432be9ff..1aa6ba4201261 100644
--- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleExternalSorter.java
@@ -59,14 +59,14 @@ final class UnsafeShuffleExternalSorter {
 
   private final Logger logger = LoggerFactory.getLogger(UnsafeShuffleExternalSorter.class);
 
-  private static final int PAGE_SIZE = PackedRecordPointer.MAXIMUM_PAGE_SIZE_BYTES;
   @VisibleForTesting
   static final int DISK_WRITE_BUFFER_SIZE = 1024 * 1024;
-  @VisibleForTesting
-  static final int MAX_RECORD_SIZE = PAGE_SIZE - 4;
 
   private final int initialSize;
   private final int numPartitions;
+  private final int pageSizeBytes;
+  @VisibleForTesting
+  final int maxRecordSizeBytes;
   private final TaskMemoryManager memoryManager;
   private final ShuffleMemoryManager shuffleMemoryManager;
   private final BlockManager blockManager;
@@ -109,7 +109,10 @@ public UnsafeShuffleExternalSorter(
     this.numPartitions = numPartitions;
     // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
     this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
-
+    this.pageSizeBytes = (int) Math.min(
+      PackedRecordPointer.MAXIMUM_PAGE_SIZE_BYTES,
+      conf.getSizeAsBytes("spark.buffer.pageSize", "64m"));
+    this.maxRecordSizeBytes = pageSizeBytes - 4;
     this.writeMetrics = writeMetrics;
     initializeForWriting();
   }
@@ -272,7 +275,11 @@ void spill() throws IOException {
   }
 
   private long getMemoryUsage() {
-    return sorter.getMemoryUsage() + (allocatedPages.size() * (long) PAGE_SIZE);
+    long totalPageSize = 0;
+    for (MemoryBlock page : allocatedPages) {
+      totalPageSize += page.size();
+    }
+    return sorter.getMemoryUsage() + totalPageSize;
   }
 
   private long freeMemory() {
@@ -346,23 +353,23 @@ private void allocateSpaceForRecord(int requiredSpace) throws IOException {
       // TODO: we should track metrics on the amount of space wasted when we roll over to a new page
       // without using the free space at the end of the current page. We should also do this for
       // BytesToBytesMap.
-      if (requiredSpace > PAGE_SIZE) {
+      if (requiredSpace > pageSizeBytes) {
         throw new IOException("Required space " + requiredSpace + " is greater than page size (" +
-          PAGE_SIZE + ")");
+          pageSizeBytes + ")");
       } else {
-        final long memoryAcquired = shuffleMemoryManager.tryToAcquire(PAGE_SIZE);
-        if (memoryAcquired < PAGE_SIZE) {
+        final long memoryAcquired = shuffleMemoryManager.tryToAcquire(pageSizeBytes);
+        if (memoryAcquired < pageSizeBytes) {
           shuffleMemoryManager.release(memoryAcquired);
           spill();
-          final long memoryAcquiredAfterSpilling = shuffleMemoryManager.tryToAcquire(PAGE_SIZE);
-          if (memoryAcquiredAfterSpilling != PAGE_SIZE) {
+          final long memoryAcquiredAfterSpilling = shuffleMemoryManager.tryToAcquire(pageSizeBytes);
+          if (memoryAcquiredAfterSpilling != pageSizeBytes) {
             shuffleMemoryManager.release(memoryAcquiredAfterSpilling);
-            throw new IOException("Unable to acquire " + PAGE_SIZE + " bytes of memory");
+            throw new IOException("Unable to acquire " + pageSizeBytes + " bytes of memory");
           }
         }
-        currentPage = memoryManager.allocatePage(PAGE_SIZE);
+        currentPage = memoryManager.allocatePage(pageSizeBytes);
         currentPagePosition = currentPage.getBaseOffset();
-        freeSpaceInCurrentPage = PAGE_SIZE;
+        freeSpaceInCurrentPage = pageSizeBytes;
         allocatedPages.add(currentPage);
       }
     }
diff --git a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
index 764578b181422..d47d6fc9c2ac4 100644
--- a/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
+++ b/core/src/main/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriter.java
@@ -129,6 +129,11 @@ public UnsafeShuffleWriter(
     open();
   }
 
+  @VisibleForTesting
+  public int maxRecordSizeBytes() {
+    return sorter.maxRecordSizeBytes;
+  }
+
   /**
    * This convenience method should only be called in test code.
    */
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
index 80b03d7e99e2b..c21990f4e4778 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
@@ -41,10 +41,7 @@ public final class UnsafeExternalSorter {
 
   private final Logger logger = LoggerFactory.getLogger(UnsafeExternalSorter.class);
 
-  private static final int PAGE_SIZE = 1 << 27;  // 128 megabytes
-  @VisibleForTesting
-  static final int MAX_RECORD_SIZE = PAGE_SIZE - 4;
-
+  private final long pageSizeBytes;
   private final PrefixComparator prefixComparator;
   private final RecordComparator recordComparator;
   private final int initialSize;
@@ -91,6 +88,7 @@ public UnsafeExternalSorter(
     this.initialSize = initialSize;
     // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units
     this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
+    this.pageSizeBytes = conf.getSizeAsBytes("spark.buffer.pageSize", "64m");
     initializeForWriting();
   }
 
@@ -147,7 +145,11 @@ public void spill() throws IOException {
   }
 
   private long getMemoryUsage() {
-    return sorter.getMemoryUsage() + (allocatedPages.size() * (long) PAGE_SIZE);
+    long totalPageSize = 0;
+    for (MemoryBlock page : allocatedPages) {
+      totalPageSize += page.size();
+    }
+    return sorter.getMemoryUsage() + totalPageSize;
   }
 
   @VisibleForTesting
@@ -214,23 +216,23 @@ private void allocateSpaceForRecord(int requiredSpace) throws IOException {
       // TODO: we should track metrics on the amount of space wasted when we roll over to a new page
       // without using the free space at the end of the current page. We should also do this for
       // BytesToBytesMap.
-      if (requiredSpace > PAGE_SIZE) {
+      if (requiredSpace > pageSizeBytes) {
         throw new IOException("Required space " + requiredSpace + " is greater than page size (" +
-          PAGE_SIZE + ")");
+          pageSizeBytes + ")");
       } else {
-        final long memoryAcquired = shuffleMemoryManager.tryToAcquire(PAGE_SIZE);
-        if (memoryAcquired < PAGE_SIZE) {
+        final long memoryAcquired = shuffleMemoryManager.tryToAcquire(pageSizeBytes);
+        if (memoryAcquired < pageSizeBytes) {
           shuffleMemoryManager.release(memoryAcquired);
           spill();
-          final long memoryAcquiredAfterSpilling = shuffleMemoryManager.tryToAcquire(PAGE_SIZE);
-          if (memoryAcquiredAfterSpilling != PAGE_SIZE) {
+          final long memoryAcquiredAfterSpilling = shuffleMemoryManager.tryToAcquire(pageSizeBytes);
+          if (memoryAcquiredAfterSpilling != pageSizeBytes) {
             shuffleMemoryManager.release(memoryAcquiredAfterSpilling);
-            throw new IOException("Unable to acquire " + PAGE_SIZE + " bytes of memory");
+            throw new IOException("Unable to acquire " + pageSizeBytes + " bytes of memory");
           }
         }
-        currentPage = memoryManager.allocatePage(PAGE_SIZE);
+        currentPage = memoryManager.allocatePage(pageSizeBytes);
         currentPagePosition = currentPage.getBaseOffset();
-        freeSpaceInCurrentPage = PAGE_SIZE;
+        freeSpaceInCurrentPage = pageSizeBytes;
         allocatedPages.add(currentPage);
       }
     }
diff --git a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
index 10c3eedbf4b46..04fc09b323dbb 100644
--- a/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/unsafe/UnsafeShuffleWriterSuite.java
@@ -111,7 +111,7 @@ public void setUp() throws IOException {
     mergedOutputFile = File.createTempFile("mergedoutput", "", tempDir);
     partitionSizesInMergedFile = null;
     spillFilesCreated.clear();
-    conf = new SparkConf();
+    conf = new SparkConf().set("spark.buffer.pageSize", "128m");
     taskMetrics = new TaskMetrics();
 
     when(shuffleMemoryManager.tryToAcquire(anyLong())).then(returnsFirstArg());
@@ -512,12 +512,12 @@ public void close() { }
     writer.insertRecordIntoSorter(new Tuple2<Object, Object>(new byte[1], new byte[1]));
     writer.forceSorterToSpill();
     // We should be able to write a record that's right _at_ the max record size
-    final byte[] atMaxRecordSize = new byte[UnsafeShuffleExternalSorter.MAX_RECORD_SIZE];
+    final byte[] atMaxRecordSize = new byte[writer.maxRecordSizeBytes()];
     new Random(42).nextBytes(atMaxRecordSize);
     writer.insertRecordIntoSorter(new Tuple2<Object, Object>(new byte[0], atMaxRecordSize));
     writer.forceSorterToSpill();
     // Inserting a record that's larger than the max record size should fail:
-    final byte[] exceedsMaxRecordSize = new byte[UnsafeShuffleExternalSorter.MAX_RECORD_SIZE + 1];
+    final byte[] exceedsMaxRecordSize = new byte[writer.maxRecordSizeBytes() + 1];
     new Random(42).nextBytes(exceedsMaxRecordSize);
     Product2<Object, Object> hugeRecord =
       new Tuple2<Object, Object>(new byte[0], exceedsMaxRecordSize);
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
index 684de6e81d67c..03f4c3ed8e6bb 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
@@ -95,6 +95,7 @@ public static boolean supportsAggregationBufferSchema(StructType schema) {
    * @param groupingKeySchema the schema of the grouping key, used for row conversion.
    * @param memoryManager the memory manager used to allocate our Unsafe memory structures.
    * @param initialCapacity the initial capacity of the map (a sizing hint to avoid re-hashing).
+   * @param pageSizeBytes the data page size, in bytes; limits the maximum record size.
    * @param enablePerfMetrics if true, performance metrics will be recorded (has minor perf impact)
    */
   public UnsafeFixedWidthAggregationMap(
@@ -103,11 +104,13 @@ public UnsafeFixedWidthAggregationMap(
       StructType groupingKeySchema,
       TaskMemoryManager memoryManager,
       int initialCapacity,
+      long pageSizeBytes,
       boolean enablePerfMetrics) {
     this.aggregationBufferSchema = aggregationBufferSchema;
     this.groupingKeyProjection = UnsafeProjection.create(groupingKeySchema);
     this.groupingKeySchema = groupingKeySchema;
-    this.map = new BytesToBytesMap(memoryManager, initialCapacity, enablePerfMetrics);
+    this.map =
+      new BytesToBytesMap(memoryManager, initialCapacity, pageSizeBytes, enablePerfMetrics);
     this.enablePerfMetrics = enablePerfMetrics;
 
     // Initialize the buffer for aggregation value
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
index 48b7dc57451a3..6a907290f2dbe 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
@@ -39,6 +39,7 @@ class UnsafeFixedWidthAggregationMapSuite
   private val groupKeySchema = StructType(StructField("product", StringType) :: Nil)
   private val aggBufferSchema = StructType(StructField("salePrice", IntegerType) :: Nil)
   private def emptyAggregationBuffer: InternalRow = InternalRow(0)
+  private val PAGE_SIZE_BYTES: Long = 1L << 26; // 64 megabytes
 
   private var memoryManager: TaskMemoryManager = null
 
@@ -69,7 +70,8 @@ class UnsafeFixedWidthAggregationMapSuite
       aggBufferSchema,
       groupKeySchema,
       memoryManager,
-      1024, // initial capacity
+      1024, // initial capacity,
+      PAGE_SIZE_BYTES,
       false // disable perf metrics
     )
     assert(!map.iterator().hasNext)
@@ -83,6 +85,7 @@ class UnsafeFixedWidthAggregationMapSuite
       groupKeySchema,
       memoryManager,
       1024, // initial capacity
+      PAGE_SIZE_BYTES,
       false // disable perf metrics
     )
     val groupKey = InternalRow(UTF8String.fromString("cats"))
@@ -109,6 +112,7 @@ class UnsafeFixedWidthAggregationMapSuite
       groupKeySchema,
       memoryManager,
       128, // initial capacity
+      PAGE_SIZE_BYTES,
       false // disable perf metrics
     )
     val rand = new Random(42)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 1cd1420480f03..b85aada9d9d4c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.TaskContext
+import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
@@ -260,12 +260,14 @@ case class GeneratedAggregate(
       } else if (unsafeEnabled && schemaSupportsUnsafe) {
         assert(iter.hasNext, "There should be at least one row for this path")
         log.info("Using Unsafe-based aggregator")
+        val pageSizeBytes = SparkEnv.get.conf.getSizeAsBytes("spark.buffer.pageSize", "64m")
         val aggregationMap = new UnsafeFixedWidthAggregationMap(
           newAggregationBuffer(EmptyRow),
           aggregationBufferSchema,
           groupKeySchema,
           TaskContext.get.taskMemoryManager(),
           1024 * 16, // initial capacity
+          pageSizeBytes,
           false // disable tracking of performance metrics
         )
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index 9c058f1f72fe4..7a507391316a9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -21,6 +21,7 @@ import java.io.{Externalizable, ObjectInput, ObjectOutput}
 import java.nio.ByteOrder
 import java.util.{HashMap => JavaHashMap}
 
+import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.SparkSqlSerializer
@@ -259,7 +260,11 @@ private[joins] final class UnsafeHashedRelation(
     val nKeys = in.readInt()
     // This is used in Broadcast, shared by multiple tasks, so we use on-heap memory
     val memoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP))
-    binaryMap = new BytesToBytesMap(memoryManager, nKeys * 2) // reduce hash collision
+    val pageSizeBytes = SparkEnv.get.conf.getSizeAsBytes("spark.buffer.pageSize", "64m")
+    binaryMap = new BytesToBytesMap(
+      memoryManager,
+      nKeys * 2, // reduce hash collision
+      pageSizeBytes)
 
     var i = 0
     var keyBuffer = new Array[Byte](1024)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 3662a4352f55d..7bbdef90cd6b9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -56,6 +56,7 @@ object TestHive
         .set("spark.sql.test", "")
         .set("spark.sql.hive.metastore.barrierPrefixes",
           "org.apache.spark.sql.hive.execution.PairSerDe")
+        .set("spark.buffer.pageSize", "4m")
         // SPARK-8910
         .set("spark.ui.enabled", "false")))
 
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
index d0bde69cc1068..198e0684f32f8 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
@@ -74,12 +74,6 @@ public final class BytesToBytesMap {
    */
   private long pageCursor = 0;
 
-  /**
-   * The size of the data pages that hold key and value data. Map entries cannot span multiple
-   * pages, so this limits the maximum entry size.
-   */
-  private static final long PAGE_SIZE_BYTES = 1L << 26; // 64 megabytes
-
   /**
    * The maximum number of keys that BytesToBytesMap supports. The hash table has to be
    * power-of-2-sized and its backing Java array can contain at most (1 << 30) elements, since
@@ -117,6 +111,12 @@ public final class BytesToBytesMap {
 
   private final double loadFactor;
 
+  /**
+   * The size of the data pages that hold key and value data. Map entries cannot span multiple
+   * pages, so this limits the maximum entry size.
+   */
+  private final long pageSizeBytes;
+
   /**
    * Number of keys defined in the map.
    */
@@ -153,10 +153,12 @@ public BytesToBytesMap(
       TaskMemoryManager memoryManager,
       int initialCapacity,
       double loadFactor,
+      long pageSizeBytes,
       boolean enablePerfMetrics) {
     this.memoryManager = memoryManager;
     this.loadFactor = loadFactor;
     this.loc = new Location();
+    this.pageSizeBytes = pageSizeBytes;
     this.enablePerfMetrics = enablePerfMetrics;
     if (initialCapacity <= 0) {
       throw new IllegalArgumentException("Initial capacity must be greater than 0");
@@ -165,18 +167,26 @@ public BytesToBytesMap(
       throw new IllegalArgumentException(
         "Initial capacity " + initialCapacity + " exceeds maximum capacity of " + MAX_CAPACITY);
     }
+    if (pageSizeBytes > TaskMemoryManager.MAXIMUM_PAGE_SIZE_BYTES) {
+      throw new IllegalArgumentException("Page size " + pageSizeBytes + " cannot exceed " +
+        TaskMemoryManager.MAXIMUM_PAGE_SIZE_BYTES);
+    }
     allocate(initialCapacity);
   }
 
-  public BytesToBytesMap(TaskMemoryManager memoryManager, int initialCapacity) {
-    this(memoryManager, initialCapacity, 0.70, false);
+  public BytesToBytesMap(
+      TaskMemoryManager memoryManager,
+      int initialCapacity,
+      long pageSizeBytes) {
+    this(memoryManager, initialCapacity, 0.70, pageSizeBytes, false);
   }
 
   public BytesToBytesMap(
       TaskMemoryManager memoryManager,
       int initialCapacity,
+      long pageSizeBytes,
       boolean enablePerfMetrics) {
-    this(memoryManager, initialCapacity, 0.70, enablePerfMetrics);
+    this(memoryManager, initialCapacity, 0.70, pageSizeBytes, enablePerfMetrics);
   }
 
   /**
@@ -443,20 +453,20 @@ public void putNewKey(
       // must be stored in the same memory page.
       // (8 byte key length) (key) (8 byte value length) (value)
       final long requiredSize = 8 + keyLengthBytes + 8 + valueLengthBytes;
-      assert (requiredSize <= PAGE_SIZE_BYTES - 8); // Reserve 8 bytes for the end-of-page marker.
+      assert (requiredSize <= pageSizeBytes - 8); // Reserve 8 bytes for the end-of-page marker.
       size++;
       bitset.set(pos);
 
       // If there's not enough space in the current page, allocate a new page (8 bytes are reserved
       // for the end-of-page marker).
-      if (currentDataPage == null || PAGE_SIZE_BYTES - 8 - pageCursor < requiredSize) {
+      if (currentDataPage == null || pageSizeBytes - 8 - pageCursor < requiredSize) {
         if (currentDataPage != null) {
           // There wasn't enough space in the current page, so write an end-of-page marker:
           final Object pageBaseObject = currentDataPage.getBaseObject();
           final long lengthOffsetInPage = currentDataPage.getBaseOffset() + pageCursor;
           PlatformDependent.UNSAFE.putLong(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER);
         }
-        MemoryBlock newPage = memoryManager.allocatePage(PAGE_SIZE_BYTES);
+        MemoryBlock newPage = memoryManager.allocatePage(pageSizeBytes);
         dataPages.add(newPage);
         pageCursor = 0;
         currentDataPage = newPage;
@@ -538,10 +548,11 @@ public void free() {
 
   /** Returns the total amount of memory, in bytes, consumed by this map's managed structures. */
   public long getTotalMemoryConsumption() {
-    return (
-      dataPages.size() * PAGE_SIZE_BYTES +
-      bitset.memoryBlock().size() +
-      longArray.memoryBlock().size());
+    long totalDataPagesSize = 0L;
+    for (MemoryBlock dataPage : dataPages) {
+      totalDataPagesSize += dataPage.size();
+    }
+    return totalDataPagesSize + bitset.memoryBlock().size() + longArray.memoryBlock().size();
   }
 
   /**
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java
index 10881969dbc78..dd70df3b1f791 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java
@@ -58,8 +58,13 @@ public class TaskMemoryManager {
   /** The number of entries in the page table. */
   private static final int PAGE_TABLE_SIZE = 1 << PAGE_NUMBER_BITS;
 
-  /** Maximum supported data page size */
-  private static final long MAXIMUM_PAGE_SIZE = (1L << OFFSET_BITS);
+  /**
+   * Maximum supported data page size (in bytes). In principle, the maximum addressable page size is
+   * (1L << OFFSET_BITS) bytes, which is 2+ petabytes. However, the on-heap allocator's maximum page
+   * size is limited by the maximum amount of data that can be stored in a  long[] array, which is
+   * (2^32 - 1) * 8 bytes (or 16 gigabytes). Therefore, we cap this at 16 gigabytes.
+   */
+  public static final long MAXIMUM_PAGE_SIZE_BYTES = ((1L << 31) - 1) * 8L;
 
   /** Bit mask for the lower 51 bits of a long. */
   private static final long MASK_LONG_LOWER_51_BITS = 0x7FFFFFFFFFFFFL;
@@ -110,9 +115,9 @@ public TaskMemoryManager(ExecutorMemoryManager executorMemoryManager) {
    * intended for allocating large blocks of memory that will be shared between operators.
    */
   public MemoryBlock allocatePage(long size) {
-    if (size > MAXIMUM_PAGE_SIZE) {
+    if (size > MAXIMUM_PAGE_SIZE_BYTES) {
       throw new IllegalArgumentException(
-        "Cannot allocate a page with more than " + MAXIMUM_PAGE_SIZE + " bytes");
+        "Cannot allocate a page with more than " + MAXIMUM_PAGE_SIZE_BYTES + " bytes");
     }
 
     final int pageNumber;
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
index dae47e4bab0cb..0be94ad371255 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
@@ -43,6 +43,7 @@ public abstract class AbstractBytesToBytesMapSuite {
 
   private TaskMemoryManager memoryManager;
   private TaskMemoryManager sizeLimitedMemoryManager;
+  private final long PAGE_SIZE_BYTES = 1L << 26; // 64 megabytes
 
   @Before
   public void setup() {
@@ -110,7 +111,7 @@ private static boolean arrayEquals(
 
   @Test
   public void emptyMap() {
-    BytesToBytesMap map = new BytesToBytesMap(memoryManager, 64);
+    BytesToBytesMap map = new BytesToBytesMap(memoryManager, 64, PAGE_SIZE_BYTES);
     try {
       Assert.assertEquals(0, map.size());
       final int keyLengthInWords = 10;
@@ -125,7 +126,7 @@ public void emptyMap() {
 
   @Test
   public void setAndRetrieveAKey() {
-    BytesToBytesMap map = new BytesToBytesMap(memoryManager, 64);
+    BytesToBytesMap map = new BytesToBytesMap(memoryManager, 64, PAGE_SIZE_BYTES);
     final int recordLengthWords = 10;
     final int recordLengthBytes = recordLengthWords * 8;
     final byte[] keyData = getRandomByteArray(recordLengthWords);
@@ -177,7 +178,7 @@ public void setAndRetrieveAKey() {
   @Test
   public void iteratorTest() throws Exception {
     final int size = 4096;
-    BytesToBytesMap map = new BytesToBytesMap(memoryManager, size / 2);
+    BytesToBytesMap map = new BytesToBytesMap(memoryManager, size / 2, PAGE_SIZE_BYTES);
     try {
       for (long i = 0; i < size; i++) {
         final long[] value = new long[] { i };
@@ -235,7 +236,7 @@ public void iteratingOverDataPagesWithWastedSpace() throws Exception {
     final int NUM_ENTRIES = 1000 * 1000;
     final int KEY_LENGTH = 16;
     final int VALUE_LENGTH = 40;
-    final BytesToBytesMap map = new BytesToBytesMap(memoryManager, NUM_ENTRIES);
+    final BytesToBytesMap map = new BytesToBytesMap(memoryManager, NUM_ENTRIES, PAGE_SIZE_BYTES);
     // Each record will take 8 + 8 + 16 + 40 = 72 bytes of space in the data page. Our 64-megabyte
     // pages won't be evenly-divisible by records of this size, which will cause us to waste some
     // space at the end of the page. This is necessary in order for us to take the end-of-record
@@ -304,7 +305,7 @@ public void randomizedStressTest() {
     // Java arrays' hashCodes() aren't based on the arrays' contents, so we need to wrap arrays
     // into ByteBuffers in order to use them as keys here.
     final Map<ByteBuffer, byte[]> expected = new HashMap<ByteBuffer, byte[]>();
-    final BytesToBytesMap map = new BytesToBytesMap(memoryManager, size);
+    final BytesToBytesMap map = new BytesToBytesMap(memoryManager, size, PAGE_SIZE_BYTES);
 
     try {
       // Fill the map to 90% full so that we can trigger probing
@@ -353,14 +354,15 @@ public void randomizedStressTest() {
   @Test
   public void initialCapacityBoundsChecking() {
     try {
-      new BytesToBytesMap(sizeLimitedMemoryManager, 0);
+      new BytesToBytesMap(sizeLimitedMemoryManager, 0, PAGE_SIZE_BYTES);
       Assert.fail("Expected IllegalArgumentException to be thrown");
     } catch (IllegalArgumentException e) {
       // expected exception
     }
 
     try {
-      new BytesToBytesMap(sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY + 1);
+      new BytesToBytesMap(
+        sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY + 1, PAGE_SIZE_BYTES);
       Assert.fail("Expected IllegalArgumentException to be thrown");
     } catch (IllegalArgumentException e) {
       // expected exception
@@ -368,15 +370,15 @@ public void initialCapacityBoundsChecking() {
 
    // Can allocate _at_ the max capacity
     BytesToBytesMap map =
-      new BytesToBytesMap(sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY);
+      new BytesToBytesMap(sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY, PAGE_SIZE_BYTES);
     map.free();
   }
 
   @Test
   public void resizingLargeMap() {
     // As long as a map's capacity is below the max, we should be able to resize up to the max
-    BytesToBytesMap map =
-      new BytesToBytesMap(sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY - 64);
+    BytesToBytesMap map = new BytesToBytesMap(
+      sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY - 64, PAGE_SIZE_BYTES);
     map.growAndRehash();
     map.free();
   }

From 2cc212d56a1d50fe68d5816f71b27803de1f6389 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Wed, 29 Jul 2015 16:20:20 -0700
Subject: [PATCH 450/549] [SPARK-6793] [MLLIB] OnlineLDAOptimizer LDA
 perplexity

Implements `logPerplexity` in `OnlineLDAOptimizer`. Also refactors inference code into companion object to enable future reuse (e.g. `predict` method).

Author: Feynman Liang <fliang@databricks.com>

Closes #7705 from feynmanliang/SPARK-6793-perplexity and squashes the following commits:

6da2c99 [Feynman Liang] Remove get* from LDAModel public API
8381da6 [Feynman Liang] Code review comments
17f7000 [Feynman Liang] Documentation typo fixes
2f452a4 [Feynman Liang] Remove auxillary DistributedLDAModel constructor
a275914 [Feynman Liang] Prevent empty counts calls to variationalInference
06d02d9 [Feynman Liang] Remove deprecated LocalLDAModel constructor
afecb46 [Feynman Liang] Fix regression bug in sstats accumulator
5a327a0 [Feynman Liang] Code review quick fixes
998c03e [Feynman Liang] Fix style
1cbb67d [Feynman Liang] Fix access modifier bug
4362daa [Feynman Liang] Organize imports
4f171f7 [Feynman Liang] Fix indendation
2f049ce [Feynman Liang] Fix failing save/load tests
7415e96 [Feynman Liang] Pick changes from big PR
11e7c33 [Feynman Liang] Merge remote-tracking branch 'apache/master' into SPARK-6793-perplexity
f8adc48 [Feynman Liang] Add logPerplexity, refactor variationalBound into a method
cd521d6 [Feynman Liang] Refactor methods into companion class
7f62a55 [Feynman Liang] --amend
c62cb1e [Feynman Liang] Outer product for stats, revert Range slicing
aead650 [Feynman Liang] Range slice, in-place update, reduce transposes
---
 .../spark/mllib/clustering/LDAModel.scala     | 200 ++++++++++++++----
 .../spark/mllib/clustering/LDAOptimizer.scala | 138 +++++++-----
 .../spark/mllib/clustering/LDAUtils.scala     |  55 +++++
 .../spark/mllib/clustering/JavaLDASuite.java  |   6 +-
 .../spark/mllib/clustering/LDASuite.scala     |  53 ++++-
 5 files changed, 348 insertions(+), 104 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 31c1d520fd659..059b52ef20a98 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.mllib.clustering
 
-import breeze.linalg.{DenseMatrix => BDM, normalize, sum => brzSum, DenseVector => BDV}
-
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum}
+import breeze.numerics.{exp, lgamma}
 import org.apache.hadoop.fs.Path
-
 import org.json4s.DefaultFormats
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
@@ -28,14 +27,13 @@ import org.json4s.jackson.JsonMethods._
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaPairRDD
-import org.apache.spark.graphx.{VertexId, Edge, EdgeContext, Graph}
-import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix, DenseVector}
-import org.apache.spark.mllib.util.{Saveable, Loader}
+import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId}
+import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
+import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{SQLContext, Row}
+import org.apache.spark.sql.{Row, SQLContext}
 import org.apache.spark.util.BoundedPriorityQueue
 
-
 /**
  * :: Experimental ::
  *
@@ -53,6 +51,31 @@ abstract class LDAModel private[clustering] extends Saveable {
   /** Vocabulary size (number of terms or terms in the vocabulary) */
   def vocabSize: Int
 
+  /**
+   * Concentration parameter (commonly named "alpha") for the prior placed on documents'
+   * distributions over topics ("theta").
+   *
+   * This is the parameter to a Dirichlet distribution.
+   */
+  def docConcentration: Vector
+
+  /**
+   * Concentration parameter (commonly named "beta" or "eta") for the prior placed on topics'
+   * distributions over terms.
+   *
+   * This is the parameter to a symmetric Dirichlet distribution.
+   *
+   * Note: The topics' distributions over terms are called "beta" in the original LDA paper
+   * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
+   */
+  def topicConcentration: Double
+
+  /**
+  * Shape parameter for random initialization of variational parameter gamma.
+  * Used for variational inference for perplexity and other test-time computations.
+  */
+  protected def gammaShape: Double
+
   /**
    * Inferred topics, where each topic is represented by a distribution over terms.
    * This is a matrix of size vocabSize x k, where each column is a topic.
@@ -168,7 +191,10 @@ abstract class LDAModel private[clustering] extends Saveable {
  */
 @Experimental
 class LocalLDAModel private[clustering] (
-    private val topics: Matrix) extends LDAModel with Serializable {
+    val topics: Matrix,
+    override val docConcentration: Vector,
+    override val topicConcentration: Double,
+    override protected[clustering] val gammaShape: Double) extends LDAModel with Serializable {
 
   override def k: Int = topics.numCols
 
@@ -197,8 +223,82 @@ class LocalLDAModel private[clustering] (
   // TODO:
   // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ???
 
+  /**
+   * Calculate the log variational bound on perplexity. See Equation (16) in original Online
+   * LDA paper.
+   * @param documents test corpus to use for calculating perplexity
+   * @return the log perplexity per word
+   */
+  def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
+    val corpusWords = documents
+      .map { case (_, termCounts) => termCounts.toArray.sum }
+      .sum()
+    val batchVariationalBound = bound(documents, docConcentration,
+      topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize)
+    val perWordBound = batchVariationalBound / corpusWords
+
+    perWordBound
+  }
+
+  /**
+   * Estimate the variational likelihood bound of from `documents`:
+   *    log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)]
+   * This bound is derived by decomposing the LDA model to:
+   *    log p(documents) = E_q[log p(documents)] - E_q[log q(documents)] + D(q|p)
+   * and noting that the KL-divergence D(q|p) >= 0. See Equation (16) in original Online LDA paper.
+   * @param documents a subset of the test corpus
+   * @param alpha document-topic Dirichlet prior parameters
+   * @param eta topic-word Dirichlet prior parameters
+   * @param lambda parameters for variational q(beta | lambda) topic-word distributions
+   * @param gammaShape shape parameter for random initialization of variational q(theta | gamma)
+   *                   topic mixture distributions
+   * @param k number of topics
+   * @param vocabSize number of unique terms in the entire test corpus
+   */
+  private def bound(
+      documents: RDD[(Long, Vector)],
+      alpha: Vector,
+      eta: Double,
+      lambda: BDM[Double],
+      gammaShape: Double,
+      k: Int,
+      vocabSize: Long): Double = {
+    val brzAlpha = alpha.toBreeze.toDenseVector
+    // transpose because dirichletExpectation normalizes by row and we need to normalize
+    // by topic (columns of lambda)
+    val Elogbeta = LDAUtils.dirichletExpectation(lambda.t).t
+
+    var score = documents.filter(_._2.numActives > 0).map { case (id: Long, termCounts: Vector) =>
+      var docScore = 0.0D
+      val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference(
+        termCounts, exp(Elogbeta), brzAlpha, gammaShape, k)
+      val Elogthetad: BDV[Double] = LDAUtils.dirichletExpectation(gammad)
+
+      // E[log p(doc | theta, beta)]
+      termCounts.foreachActive { case (idx, count) =>
+        docScore += LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
+      }
+      // E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
+      docScore += sum((brzAlpha - gammad) :* Elogthetad)
+      docScore += sum(lgamma(gammad) - lgamma(brzAlpha))
+      docScore += lgamma(sum(brzAlpha)) - lgamma(sum(gammad))
+
+      docScore
+    }.sum()
+
+    // E[log p(beta | eta) - log q (beta | lambda)]; assumes eta is a scalar
+    score += sum((eta - lambda) :* Elogbeta)
+    score += sum(lgamma(lambda) - lgamma(eta))
+
+    val sumEta = eta * vocabSize
+    score += sum(lgamma(sumEta) - lgamma(sum(lambda(::, breeze.linalg.*))))
+
+    score
+  }
+
 }
 
+
 @Experimental
 object LocalLDAModel extends Loader[LocalLDAModel] {
 
@@ -212,6 +312,8 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
     // as a Row in data.
     case class Data(topic: Vector, index: Int)
 
+    // TODO: explicitly save docConcentration, topicConcentration, and gammaShape for use in
+    // model.predict()
     def save(sc: SparkContext, path: String, topicsMatrix: Matrix): Unit = {
       val sqlContext = SQLContext.getOrCreate(sc)
       import sqlContext.implicits._
@@ -219,7 +321,7 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
       val k = topicsMatrix.numCols
       val metadata = compact(render
         (("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
-         ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows)))
+          ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows)))
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
 
       val topicsDenseMatrix = topicsMatrix.toBreeze.toDenseMatrix
@@ -243,7 +345,11 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
       topics.foreach { case Row(vec: Vector, ind: Int) =>
         brzTopics(::, ind) := vec.toBreeze
       }
-      new LocalLDAModel(Matrices.fromBreeze(brzTopics))
+      val topicsMat = Matrices.fromBreeze(brzTopics)
+
+      // TODO: initialize with docConcentration, topicConcentration, and gammaShape after SPARK-9940
+      new LocalLDAModel(topicsMat,
+        Vectors.dense(Array.fill(topicsMat.numRows)(1.0 / topicsMat.numRows)), 1D, 100D)
     }
   }
 
@@ -259,8 +365,8 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
         SaveLoadV1_0.load(sc, path)
       case _ => throw new Exception(
         s"LocalLDAModel.load did not recognize model with (className, format version):" +
-        s"($loadedClassName, $loadedVersion).  Supported:\n" +
-        s"  ($classNameV1_0, 1.0)")
+          s"($loadedClassName, $loadedVersion).  Supported:\n" +
+          s"  ($classNameV1_0, 1.0)")
     }
 
     val topicsMatrix = model.topicsMatrix
@@ -268,7 +374,7 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
       s"LocalLDAModel requires $expectedK topics, got ${topicsMatrix.numCols} topics")
     require(expectedVocabSize == topicsMatrix.numRows,
       s"LocalLDAModel requires $expectedVocabSize terms for each topic, " +
-      s"but got ${topicsMatrix.numRows}")
+        s"but got ${topicsMatrix.numRows}")
     model
   }
 }
@@ -282,28 +388,25 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
  * than the [[LocalLDAModel]].
  */
 @Experimental
-class DistributedLDAModel private (
+class DistributedLDAModel private[clustering] (
     private[clustering] val graph: Graph[LDA.TopicCounts, LDA.TokenCount],
     private[clustering] val globalTopicTotals: LDA.TopicCounts,
     val k: Int,
     val vocabSize: Int,
-    private[clustering] val docConcentration: Double,
-    private[clustering] val topicConcentration: Double,
+    override val docConcentration: Vector,
+    override val topicConcentration: Double,
+    override protected[clustering] val gammaShape: Double,
     private[spark] val iterationTimes: Array[Double]) extends LDAModel {
 
   import LDA._
 
-  private[clustering] def this(state: EMLDAOptimizer, iterationTimes: Array[Double]) = {
-    this(state.graph, state.globalTopicTotals, state.k, state.vocabSize, state.docConcentration,
-      state.topicConcentration, iterationTimes)
-  }
-
   /**
    * Convert model to a local model.
    * The local model stores the inferred topics but not the topic distributions for training
    * documents.
    */
-  def toLocal: LocalLDAModel = new LocalLDAModel(topicsMatrix)
+  def toLocal: LocalLDAModel = new LocalLDAModel(topicsMatrix, docConcentration, topicConcentration,
+    gammaShape)
 
   /**
    * Inferred topics, where each topic is represented by a distribution over terms.
@@ -375,8 +478,9 @@ class DistributedLDAModel private (
    *    hyperparameters.
    */
   lazy val logLikelihood: Double = {
-    val eta = topicConcentration
-    val alpha = docConcentration
+    // TODO: generalize this for asymmetric (non-scalar) alpha
+    val alpha = this.docConcentration(0) // To avoid closure capture of enclosing object
+    val eta = this.topicConcentration
     assert(eta > 1.0)
     assert(alpha > 1.0)
     val N_k = globalTopicTotals
@@ -400,8 +504,9 @@ class DistributedLDAModel private (
    *  log P(topics, topic distributions for docs | alpha, eta)
    */
   lazy val logPrior: Double = {
-    val eta = topicConcentration
-    val alpha = docConcentration
+    // TODO: generalize this for asymmetric (non-scalar) alpha
+    val alpha = this.docConcentration(0) // To avoid closure capture of enclosing object
+    val eta = this.topicConcentration
     // Term vertices: Compute phi_{wk}.  Use to compute prior log probability.
     // Doc vertex: Compute theta_{kj}.  Use to compute prior log probability.
     val N_k = globalTopicTotals
@@ -412,12 +517,12 @@ class DistributedLDAModel private (
           val N_wk = vertex._2
           val smoothed_N_wk: TopicCounts = N_wk + (eta - 1.0)
           val phi_wk: TopicCounts = smoothed_N_wk :/ smoothed_N_k
-          (eta - 1.0) * brzSum(phi_wk.map(math.log))
+          (eta - 1.0) * sum(phi_wk.map(math.log))
         } else {
           val N_kj = vertex._2
           val smoothed_N_kj: TopicCounts = N_kj + (alpha - 1.0)
           val theta_kj: TopicCounts = normalize(smoothed_N_kj, 1.0)
-          (alpha - 1.0) * brzSum(theta_kj.map(math.log))
+          (alpha - 1.0) * sum(theta_kj.map(math.log))
         }
     }
     graph.vertices.aggregate(0.0)(seqOp, _ + _)
@@ -448,7 +553,7 @@ class DistributedLDAModel private (
   override def save(sc: SparkContext, path: String): Unit = {
     DistributedLDAModel.SaveLoadV1_0.save(
       sc, path, graph, globalTopicTotals, k, vocabSize, docConcentration, topicConcentration,
-      iterationTimes)
+      iterationTimes, gammaShape)
   }
 }
 
@@ -478,17 +583,20 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] {
         globalTopicTotals: LDA.TopicCounts,
         k: Int,
         vocabSize: Int,
-        docConcentration: Double,
+        docConcentration: Vector,
         topicConcentration: Double,
-        iterationTimes: Array[Double]): Unit = {
+        iterationTimes: Array[Double],
+        gammaShape: Double): Unit = {
       val sqlContext = SQLContext.getOrCreate(sc)
       import sqlContext.implicits._
 
       val metadata = compact(render
         (("class" -> classNameV1_0) ~ ("version" -> thisFormatVersion) ~
-         ("k" -> k) ~ ("vocabSize" -> vocabSize) ~ ("docConcentration" -> docConcentration) ~
-         ("topicConcentration" -> topicConcentration) ~
-         ("iterationTimes" -> iterationTimes.toSeq)))
+          ("k" -> k) ~ ("vocabSize" -> vocabSize) ~
+          ("docConcentration" -> docConcentration.toArray.toSeq) ~
+          ("topicConcentration" -> topicConcentration) ~
+          ("iterationTimes" -> iterationTimes.toSeq) ~
+          ("gammaShape" -> gammaShape)))
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
 
       val newPath = new Path(Loader.dataPath(path), "globalTopicTotals").toUri.toString
@@ -510,9 +618,10 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] {
         sc: SparkContext,
         path: String,
         vocabSize: Int,
-        docConcentration: Double,
+        docConcentration: Vector,
         topicConcentration: Double,
-        iterationTimes: Array[Double]): DistributedLDAModel = {
+        iterationTimes: Array[Double],
+        gammaShape: Double): DistributedLDAModel = {
       val dataPath = new Path(Loader.dataPath(path), "globalTopicTotals").toUri.toString
       val vertexDataPath = new Path(Loader.dataPath(path), "topicCounts").toUri.toString
       val edgeDataPath = new Path(Loader.dataPath(path), "tokenCounts").toUri.toString
@@ -536,7 +645,7 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] {
       val graph: Graph[LDA.TopicCounts, LDA.TokenCount] = Graph(vertices, edges)
 
       new DistributedLDAModel(graph, globalTopicTotals, globalTopicTotals.length, vocabSize,
-        docConcentration, topicConcentration, iterationTimes)
+        docConcentration, topicConcentration, gammaShape, iterationTimes)
     }
 
   }
@@ -546,32 +655,35 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] {
     implicit val formats = DefaultFormats
     val expectedK = (metadata \ "k").extract[Int]
     val vocabSize = (metadata \ "vocabSize").extract[Int]
-    val docConcentration = (metadata \ "docConcentration").extract[Double]
+    val docConcentration =
+      Vectors.dense((metadata \ "docConcentration").extract[Seq[Double]].toArray)
     val topicConcentration = (metadata \ "topicConcentration").extract[Double]
     val iterationTimes = (metadata \ "iterationTimes").extract[Seq[Double]]
+    val gammaShape = (metadata \ "gammaShape").extract[Double]
     val classNameV1_0 = SaveLoadV1_0.classNameV1_0
 
     val model = (loadedClassName, loadedVersion) match {
       case (className, "1.0") if className == classNameV1_0 => {
-        DistributedLDAModel.SaveLoadV1_0.load(
-          sc, path, vocabSize, docConcentration, topicConcentration, iterationTimes.toArray)
+        DistributedLDAModel.SaveLoadV1_0.load(sc, path, vocabSize, docConcentration,
+          topicConcentration, iterationTimes.toArray, gammaShape)
       }
       case _ => throw new Exception(
         s"DistributedLDAModel.load did not recognize model with (className, format version):" +
-        s"($loadedClassName, $loadedVersion).  Supported: ($classNameV1_0, 1.0)")
+          s"($loadedClassName, $loadedVersion).  Supported: ($classNameV1_0, 1.0)")
     }
 
     require(model.vocabSize == vocabSize,
       s"DistributedLDAModel requires $vocabSize vocabSize, got ${model.vocabSize} vocabSize")
     require(model.docConcentration == docConcentration,
       s"DistributedLDAModel requires $docConcentration docConcentration, " +
-      s"got ${model.docConcentration} docConcentration")
+        s"got ${model.docConcentration} docConcentration")
     require(model.topicConcentration == topicConcentration,
       s"DistributedLDAModel requires $topicConcentration docConcentration, " +
-      s"got ${model.topicConcentration} docConcentration")
+        s"got ${model.topicConcentration} docConcentration")
     require(expectedK == model.k,
       s"DistributedLDAModel requires $expectedK topics, got ${model.k} topics")
     model
   }
 
 }
+
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index f4170a3d98dd8..7e75e7083acb5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering
 import java.util.Random
 
 import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum}
-import breeze.numerics.{abs, digamma, exp}
+import breeze.numerics.{abs, exp}
 import breeze.stats.distributions.{Gamma, RandBasis}
 
 import org.apache.spark.annotation.DeveloperApi
@@ -208,7 +208,11 @@ final class EMLDAOptimizer extends LDAOptimizer {
   override private[clustering] def getLDAModel(iterationTimes: Array[Double]): LDAModel = {
     require(graph != null, "graph is null, EMLDAOptimizer not initialized.")
     this.graphCheckpointer.deleteAllCheckpoints()
-    new DistributedLDAModel(this, iterationTimes)
+    // This assumes gammaShape = 100 in OnlineLDAOptimizer to ensure equivalence in LDAModel.toLocal
+    // conversion
+    new DistributedLDAModel(this.graph, this.globalTopicTotals, this.k, this.vocabSize,
+      Vectors.dense(Array.fill(this.k)(this.docConcentration)), this.topicConcentration,
+      100, iterationTimes)
   }
 }
 
@@ -385,71 +389,52 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     iteration += 1
     val k = this.k
     val vocabSize = this.vocabSize
-    val Elogbeta = dirichletExpectation(lambda).t
-    val expElogbeta = exp(Elogbeta)
+    val expElogbeta = exp(LDAUtils.dirichletExpectation(lambda)).t
     val alpha = this.alpha.toBreeze
     val gammaShape = this.gammaShape
 
-    val stats: RDD[BDM[Double]] = batch.mapPartitions { docs =>
-      val stat = BDM.zeros[Double](k, vocabSize)
-      docs.foreach { doc =>
-        val termCounts = doc._2
-        val (ids: List[Int], cts: Array[Double]) = termCounts match {
-          case v: DenseVector => ((0 until v.size).toList, v.values)
-          case v: SparseVector => (v.indices.toList, v.values)
-          case v => throw new IllegalArgumentException("Online LDA does not support vector type "
-            + v.getClass)
-        }
-        if (!ids.isEmpty) {
-
-          // Initialize the variational distribution q(theta|gamma) for the mini-batch
-          val gammad: BDV[Double] =
-            new Gamma(gammaShape, 1.0 / gammaShape).samplesVector(k) // K
-          val expElogthetad: BDV[Double] = exp(digamma(gammad) - digamma(sum(gammad))) // K
-          val expElogbetad: BDM[Double] = expElogbeta(ids, ::).toDenseMatrix // ids * K
-
-          val phinorm: BDV[Double] = expElogbetad * expElogthetad :+ 1e-100 // ids
-          var meanchange = 1D
-          val ctsVector = new BDV[Double](cts) // ids
-
-          // Iterate between gamma and phi until convergence
-          while (meanchange > 1e-3) {
-            val lastgamma = gammad.copy
-            //        K                  K * ids               ids
-            gammad := (expElogthetad :* (expElogbetad.t * (ctsVector :/ phinorm))) :+ alpha
-            expElogthetad := exp(digamma(gammad) - digamma(sum(gammad)))
-            phinorm := expElogbetad * expElogthetad :+ 1e-100
-            meanchange = sum(abs(gammad - lastgamma)) / k
-          }
+    val stats: RDD[(BDM[Double], List[BDV[Double]])] = batch.mapPartitions { docs =>
+      val nonEmptyDocs = docs.filter(_._2.numActives > 0)
 
-          stat(::, ids) := expElogthetad.asDenseMatrix.t * (ctsVector :/ phinorm).asDenseMatrix
+      val stat = BDM.zeros[Double](k, vocabSize)
+      var gammaPart = List[BDV[Double]]()
+      nonEmptyDocs.zipWithIndex.foreach { case ((_, termCounts: Vector), idx: Int) =>
+        val ids: List[Int] = termCounts match {
+          case v: DenseVector => (0 until v.size).toList
+          case v: SparseVector => v.indices.toList
         }
+        val (gammad, sstats) = OnlineLDAOptimizer.variationalTopicInference(
+          termCounts, expElogbeta, alpha, gammaShape, k)
+        stat(::, ids) := stat(::, ids).toDenseMatrix + sstats
+        gammaPart = gammad :: gammaPart
       }
-      Iterator(stat)
+      Iterator((stat, gammaPart))
     }
-
-    val statsSum: BDM[Double] = stats.reduce(_ += _)
+    val statsSum: BDM[Double] = stats.map(_._1).reduce(_ += _)
+    val gammat: BDM[Double] = breeze.linalg.DenseMatrix.vertcat(
+      stats.map(_._2).reduce(_ ++ _).map(_.toDenseMatrix): _*)
     val batchResult = statsSum :* expElogbeta.t
 
     // Note that this is an optimization to avoid batch.count
-    update(batchResult, iteration, (miniBatchFraction * corpusSize).ceil.toInt)
+    updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt)
     this
   }
 
-  override private[clustering] def getLDAModel(iterationTimes: Array[Double]): LDAModel = {
-    new LocalLDAModel(Matrices.fromBreeze(lambda).transpose)
-  }
-
   /**
    * Update lambda based on the batch submitted. batchSize can be different for each iteration.
    */
-  private[clustering] def update(stat: BDM[Double], iter: Int, batchSize: Int): Unit = {
+  private def updateLambda(stat: BDM[Double], batchSize: Int): Unit = {
     // weight of the mini-batch.
-    val weight = math.pow(getTau0 + iter, -getKappa)
+    val weight = rho()
 
     // Update lambda based on documents.
-    lambda = lambda * (1 - weight) +
-      (stat * (corpusSize.toDouble / batchSize.toDouble) + eta) * weight
+    lambda := (1 - weight) * lambda +
+      weight * (stat * (corpusSize.toDouble / batchSize.toDouble) + eta)
+  }
+
+  /** Calculates learning rate rho, which decays as a function of [[iteration]] */
+  private def rho(): Double = {
+    math.pow(getTau0 + this.iteration, -getKappa)
   }
 
   /**
@@ -463,15 +448,56 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     new BDM[Double](col, row, temp).t
   }
 
+  override private[clustering] def getLDAModel(iterationTimes: Array[Double]): LDAModel = {
+    new LocalLDAModel(Matrices.fromBreeze(lambda).transpose, alpha, eta, gammaShape)
+  }
+
+}
+
+/**
+ * Serializable companion object containing helper methods and shared code for
+ * [[OnlineLDAOptimizer]] and [[LocalLDAModel]].
+ */
+private[clustering] object OnlineLDAOptimizer {
   /**
-   * For theta ~ Dir(alpha), computes E[log(theta)] given alpha. Currently the implementation
-   * uses digamma which is accurate but expensive.
+   * Uses variational inference to infer the topic distribution `gammad` given the term counts
+   * for a document. `termCounts` must be non-empty, otherwise Breeze will throw a BLAS error.
+   *
+   * An optimization (Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001)
+   * avoids explicit computation of variational parameter `phi`.
+   * @see [[http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566]]
    */
-  private def dirichletExpectation(alpha: BDM[Double]): BDM[Double] = {
-    val rowSum = sum(alpha(breeze.linalg.*, ::))
-    val digAlpha = digamma(alpha)
-    val digRowSum = digamma(rowSum)
-    val result = digAlpha(::, breeze.linalg.*) - digRowSum
-    result
+  private[clustering] def variationalTopicInference(
+      termCounts: Vector,
+      expElogbeta: BDM[Double],
+      alpha: breeze.linalg.Vector[Double],
+      gammaShape: Double,
+      k: Int): (BDV[Double], BDM[Double]) = {
+    val (ids: List[Int], cts: Array[Double]) = termCounts match {
+      case v: DenseVector => ((0 until v.size).toList, v.values)
+      case v: SparseVector => (v.indices.toList, v.values)
+    }
+    // Initialize the variational distribution q(theta|gamma) for the mini-batch
+    val gammad: BDV[Double] =
+      new Gamma(gammaShape, 1.0 / gammaShape).samplesVector(k)                   // K
+    val expElogthetad: BDV[Double] = exp(LDAUtils.dirichletExpectation(gammad))  // K
+    val expElogbetad = expElogbeta(ids, ::).toDenseMatrix                        // ids * K
+
+    val phinorm: BDV[Double] = expElogbetad * expElogthetad :+ 1e-100            // ids
+    var meanchange = 1D
+    val ctsVector = new BDV[Double](cts)                                         // ids
+
+    // Iterate between gamma and phi until convergence
+    while (meanchange > 1e-3) {
+      val lastgamma = gammad.copy
+      //        K                  K * ids               ids
+      gammad := (expElogthetad :* (expElogbetad.t * (ctsVector :/ phinorm))) :+ alpha
+      expElogthetad := exp(LDAUtils.dirichletExpectation(gammad))
+      phinorm := expElogbetad * expElogthetad :+ 1e-100
+      meanchange = sum(abs(gammad - lastgamma)) / k
+    }
+
+    val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector :/ phinorm).asDenseMatrix
+    (gammad, sstatsd)
   }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
new file mode 100644
index 0000000000000..f7e5ce1665fe6
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.mllib.clustering
+
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, max, sum}
+import breeze.numerics._
+
+/**
+ * Utility methods for LDA.
+ */
+object LDAUtils {
+  /**
+   * Log Sum Exp with overflow protection using the identity:
+   * For any a: \log \sum_{n=1}^N \exp\{x_n\} = a + \log \sum_{n=1}^N \exp\{x_n - a\}
+   */
+  private[clustering] def logSumExp(x: BDV[Double]): Double = {
+    val a = max(x)
+    a + log(sum(exp(x :- a)))
+  }
+
+  /**
+   * For theta ~ Dir(alpha), computes E[log(theta)] given alpha. Currently the implementation
+   * uses [[breeze.numerics.digamma]] which is accurate but expensive.
+   */
+  private[clustering] def dirichletExpectation(alpha: BDV[Double]): BDV[Double] = {
+    digamma(alpha) - digamma(sum(alpha))
+  }
+
+  /**
+   * Computes [[dirichletExpectation()]] row-wise, assuming each row of alpha are
+   * Dirichlet parameters.
+   */
+  private[clustering] def dirichletExpectation(alpha: BDM[Double]): BDM[Double] = {
+    val rowSum = sum(alpha(breeze.linalg.*, ::))
+    val digAlpha = digamma(alpha)
+    val digRowSum = digamma(rowSum)
+    val result = digAlpha(::, breeze.linalg.*) - digRowSum
+    result
+  }
+
+}
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index b48f190f599a2..d272a42c8576f 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -19,6 +19,7 @@
 
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Arrays;
 
 import scala.Tuple2;
 
@@ -59,7 +60,10 @@ public void tearDown() {
 
   @Test
   public void localLDAModel() {
-    LocalLDAModel model = new LocalLDAModel(LDASuite$.MODULE$.tinyTopics());
+    Matrix topics = LDASuite$.MODULE$.tinyTopics();
+    double[] topicConcentration = new double[topics.numRows()];
+    Arrays.fill(topicConcentration, 1.0D / topics.numRows());
+    LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1D, 100D);
 
     // Check: basic parameters
     assertEquals(model.k(), tinyK);
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 376a87f0511b4..aa36336ebbee6 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.clustering
 
-import breeze.linalg.{DenseMatrix => BDM}
+import breeze.linalg.{DenseMatrix => BDM, max, argmax}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.graphx.Edge
@@ -31,7 +31,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
   import LDASuite._
 
   test("LocalLDAModel") {
-    val model = new LocalLDAModel(tinyTopics)
+    val model = new LocalLDAModel(tinyTopics,
+      Vectors.dense(Array.fill(tinyTopics.numRows)(1.0 / tinyTopics.numRows)), 1D, 100D)
 
     // Check: basic parameters
     assert(model.k === tinyK)
@@ -235,6 +236,51 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("LocalLDAModel logPerplexity") {
+    val k = 2
+    val vocabSize = 6
+    val alpha = 0.01
+    val eta = 0.01
+    val gammaShape = 100
+    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
+      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
+      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+
+    def toydata: Array[(Long, Vector)] = Array(
+      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
+      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
+      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
+      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
+      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
+      Vectors.sparse(6, Array(4, 5), Array(1, 1))
+    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
+    val docs = sc.parallelize(toydata)
+
+
+    val ldaModel: LocalLDAModel = new LocalLDAModel(
+      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+
+    /* Verify results using gensim:
+       import numpy as np
+       from gensim import models
+       corpus = [
+          [(0, 1.0), (1, 1.0)],
+          [(1, 1.0), (2, 1.0)],
+          [(0, 1.0), (2, 1.0)],
+          [(3, 1.0), (4, 1.0)],
+          [(3, 1.0), (5, 1.0)],
+          [(4, 1.0), (5, 1.0)]]
+       np.random.seed(2345)
+       lda = models.ldamodel.LdaModel(
+          corpus=corpus, alpha=0.01, eta=0.01, num_topics=2, update_every=0, passes=100,
+          decay=0.51, offset=1024)
+       print(lda.log_perplexity(corpus))
+       > -3.69051285096
+     */
+
+    assert(ldaModel.logPerplexity(docs) ~== -3.690D relTol 1E-3D)
+  }
+
   test("OnlineLDAOptimizer with asymmetric prior") {
     def toydata: Array[(Long, Vector)] = Array(
       Vectors.sparse(6, Array(0, 1), Array(1, 1)),
@@ -287,7 +333,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
 
   test("model save/load") {
     // Test for LocalLDAModel.
-    val localModel = new LocalLDAModel(tinyTopics)
+    val localModel = new LocalLDAModel(tinyTopics,
+      Vectors.dense(Array.fill(tinyTopics.numRows)(1.0 / tinyTopics.numRows)), 1D, 100D)
     val tempDir1 = Utils.createTempDir()
     val path1 = tempDir1.toURI.toString
 

From 86505962e6c9da1ee18c6a3533e169a22e4f1665 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 29 Jul 2015 16:49:02 -0700
Subject: [PATCH 451/549] [SPARK-9448][SQL] GenerateUnsafeProjection should not
 share expressions across instances.

We accidentally moved the list of expressions from the generated code instance to the class wrapper, and as a result, different threads are sharing the same set of expressions, which cause problems for expressions with mutable state.

This pull request fixed that problem, and also added unit tests for all codegen classes, except GeneratedOrdering (which will never need any expressions since sort now only accepts bound references.

Author: Reynold Xin <rxin@databricks.com>

Closes #7759 from rxin/SPARK-9448 and squashes the following commits:

c09b50f [Reynold Xin] [SPARK-9448][SQL] GenerateUnsafeProjection should not share expressions across instances.
---
 .../codegen/GenerateUnsafeProjection.scala    | 12 +--
 .../CodegenExpressionCachingSuite.scala       | 90 +++++++++++++++++++
 2 files changed, 96 insertions(+), 6 deletions(-)
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index dc725c28aaa27..7be60114ce674 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -256,18 +256,18 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
     eval.code = createCode(ctx, eval, expressions)
 
     val code = s"""
-      private $exprType[] expressions;
-
-      public Object generate($exprType[] expr) {
-        this.expressions = expr;
-        return new SpecificProjection();
+      public Object generate($exprType[] exprs) {
+        return new SpecificProjection(exprs);
       }
 
       class SpecificProjection extends ${classOf[UnsafeProjection].getName} {
 
+        private $exprType[] expressions;
+
         ${declareMutableStates(ctx)}
 
-        public SpecificProjection() {
+        public SpecificProjection($exprType[] expressions) {
+          this.expressions = expressions;
           ${initMutableStates(ctx)}
         }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala
new file mode 100644
index 0000000000000..866bf904e4a4c
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, LeafExpression}
+import org.apache.spark.sql.types.{BooleanType, DataType}
+
+/**
+ * A test suite that makes sure code generation handles expression internally states correctly.
+ */
+class CodegenExpressionCachingSuite extends SparkFunSuite {
+
+  test("GenerateUnsafeProjection") {
+    val expr1 = MutableExpression()
+    val instance1 = UnsafeProjection.create(Seq(expr1))
+    assert(instance1.apply(null).getBoolean(0) === false)
+
+    val expr2 = MutableExpression()
+    expr2.mutableState = true
+    val instance2 = UnsafeProjection.create(Seq(expr2))
+    assert(instance1.apply(null).getBoolean(0) === false)
+    assert(instance2.apply(null).getBoolean(0) === true)
+  }
+
+  test("GenerateProjection") {
+    val expr1 = MutableExpression()
+    val instance1 = GenerateProjection.generate(Seq(expr1))
+    assert(instance1.apply(null).getBoolean(0) === false)
+
+    val expr2 = MutableExpression()
+    expr2.mutableState = true
+    val instance2 = GenerateProjection.generate(Seq(expr2))
+    assert(instance1.apply(null).getBoolean(0) === false)
+    assert(instance2.apply(null).getBoolean(0) === true)
+  }
+
+  test("GenerateMutableProjection") {
+    val expr1 = MutableExpression()
+    val instance1 = GenerateMutableProjection.generate(Seq(expr1))()
+    assert(instance1.apply(null).getBoolean(0) === false)
+
+    val expr2 = MutableExpression()
+    expr2.mutableState = true
+    val instance2 = GenerateMutableProjection.generate(Seq(expr2))()
+    assert(instance1.apply(null).getBoolean(0) === false)
+    assert(instance2.apply(null).getBoolean(0) === true)
+  }
+
+  test("GeneratePredicate") {
+    val expr1 = MutableExpression()
+    val instance1 = GeneratePredicate.generate(expr1)
+    assert(instance1.apply(null) === false)
+
+    val expr2 = MutableExpression()
+    expr2.mutableState = true
+    val instance2 = GeneratePredicate.generate(expr2)
+    assert(instance1.apply(null) === false)
+    assert(instance2.apply(null) === true)
+  }
+
+}
+
+
+/**
+ * An expression with mutable state so we can change it freely in our test suite.
+ */
+case class MutableExpression() extends LeafExpression with CodegenFallback {
+  var mutableState: Boolean = false
+  override def eval(input: InternalRow): Any = mutableState
+
+  override def nullable: Boolean = false
+  override def dataType: DataType = BooleanType
+}

From 103d8cce78533b38b4f8060b30f7f455113bc6b5 Mon Sep 17 00:00:00 2001
From: Bimal Tandel <bimal@bimal-MBP.local>
Date: Wed, 29 Jul 2015 16:54:58 -0700
Subject: [PATCH 452/549] [SPARK-8921] [MLLIB] Add @since tags to mllib.stat

Author: Bimal Tandel <bimal@bimal-MBP.local>

Closes #7730 from BimalTandel/branch_spark_8921 and squashes the following commits:

3ea230a [Bimal Tandel] Spark 8921 add @since tags
---
 .../spark/mllib/stat/KernelDensity.scala      |  5 ++++
 .../stat/MultivariateOnlineSummarizer.scala   | 27 +++++++++++++++++++
 .../stat/MultivariateStatisticalSummary.scala |  9 +++++++
 .../apache/spark/mllib/stat/Statistics.scala  | 20 ++++++++++++--
 .../distribution/MultivariateGaussian.scala   |  9 +++++--
 5 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
index 58a50f9c19f14..93a6753efd4d9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
@@ -37,6 +37,7 @@ import org.apache.spark.rdd.RDD
  *   .setBandwidth(3.0)
  * val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
  * }}}
+ * @since 1.4.0
  */
 @Experimental
 class KernelDensity extends Serializable {
@@ -51,6 +52,7 @@ class KernelDensity extends Serializable {
 
   /**
    * Sets the bandwidth (standard deviation) of the Gaussian kernel (default: `1.0`).
+   * @since 1.4.0
    */
   def setBandwidth(bandwidth: Double): this.type = {
     require(bandwidth > 0, s"Bandwidth must be positive, but got $bandwidth.")
@@ -60,6 +62,7 @@ class KernelDensity extends Serializable {
 
   /**
    * Sets the sample to use for density estimation.
+   * @since 1.4.0
    */
   def setSample(sample: RDD[Double]): this.type = {
     this.sample = sample
@@ -68,6 +71,7 @@ class KernelDensity extends Serializable {
 
   /**
    * Sets the sample to use for density estimation (for Java users).
+   * @since 1.4.0
    */
   def setSample(sample: JavaRDD[java.lang.Double]): this.type = {
     this.sample = sample.rdd.asInstanceOf[RDD[Double]]
@@ -76,6 +80,7 @@ class KernelDensity extends Serializable {
 
   /**
    * Estimates probability density function at the given array of points.
+   * @since 1.4.0
    */
   def estimate(points: Array[Double]): Array[Double] = {
     val sample = this.sample
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
index d321cc554c1cc..62da9f2ef22a3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
@@ -33,6 +33,7 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector}
  * Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]
  * Zero elements (including explicit zero values) are skipped when calling add(),
  * to have time complexity O(nnz) instead of O(n) for each column.
+ * @since 1.1.0
  */
 @DeveloperApi
 class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with Serializable {
@@ -52,6 +53,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
    *
    * @param sample The sample in dense/sparse vector format to be added into this summarizer.
    * @return This MultivariateOnlineSummarizer object.
+   * @since 1.1.0
    */
   def add(sample: Vector): this.type = {
     if (n == 0) {
@@ -107,6 +109,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
    *
    * @param other The other MultivariateOnlineSummarizer to be merged.
    * @return This MultivariateOnlineSummarizer object.
+   * @since 1.1.0
    */
   def merge(other: MultivariateOnlineSummarizer): this.type = {
     if (this.totalCnt != 0 && other.totalCnt != 0) {
@@ -149,6 +152,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     this
   }
 
+  /**
+   * @since 1.1.0
+   */
   override def mean: Vector = {
     require(totalCnt > 0, s"Nothing has been added to this summarizer.")
 
@@ -161,6 +167,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     Vectors.dense(realMean)
   }
 
+  /**
+   * @since 1.1.0
+   */
   override def variance: Vector = {
     require(totalCnt > 0, s"Nothing has been added to this summarizer.")
 
@@ -183,14 +192,23 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     Vectors.dense(realVariance)
   }
 
+  /**
+   * @since 1.1.0
+   */
   override def count: Long = totalCnt
 
+  /**
+   * @since 1.1.0
+   */
   override def numNonzeros: Vector = {
     require(totalCnt > 0, s"Nothing has been added to this summarizer.")
 
     Vectors.dense(nnz)
   }
 
+  /**
+   * @since 1.1.0
+   */
   override def max: Vector = {
     require(totalCnt > 0, s"Nothing has been added to this summarizer.")
 
@@ -202,6 +220,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     Vectors.dense(currMax)
   }
 
+  /**
+   * @since 1.1.0
+   */
   override def min: Vector = {
     require(totalCnt > 0, s"Nothing has been added to this summarizer.")
 
@@ -213,6 +234,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     Vectors.dense(currMin)
   }
 
+  /**
+   * @since 1.2.0
+   */
   override def normL2: Vector = {
     require(totalCnt > 0, s"Nothing has been added to this summarizer.")
 
@@ -227,6 +251,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     Vectors.dense(realMagnitude)
   }
 
+  /**
+   * @since 1.2.0
+   */
   override def normL1: Vector = {
     require(totalCnt > 0, s"Nothing has been added to this summarizer.")
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala
index 6a364c93284af..3bb49f12289e1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala
@@ -21,46 +21,55 @@ import org.apache.spark.mllib.linalg.Vector
 
 /**
  * Trait for multivariate statistical summary of a data matrix.
+ * @since 1.0.0
  */
 trait MultivariateStatisticalSummary {
 
   /**
    * Sample mean vector.
+   * @since 1.0.0
    */
   def mean: Vector
 
   /**
    * Sample variance vector. Should return a zero vector if the sample size is 1.
+   * @since 1.0.0
    */
   def variance: Vector
 
   /**
    * Sample size.
+   * @since 1.0.0
    */
   def count: Long
 
   /**
    * Number of nonzero elements (including explicitly presented zero values) in each column.
+   * @since 1.0.0
    */
   def numNonzeros: Vector
 
   /**
    * Maximum value of each column.
+   * @since 1.0.0
    */
   def max: Vector
 
   /**
    * Minimum value of each column.
+   * @since 1.0.0
    */
   def min: Vector
 
   /**
    * Euclidean magnitude of each column
+   * @since 1.2.0
    */
   def normL2: Vector
 
   /**
    * L1 norm of each column
+   * @since 1.2.0
    */
   def normL1: Vector
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 90332028cfb3a..f84502919e381 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -32,6 +32,7 @@ import org.apache.spark.rdd.RDD
 /**
  * :: Experimental ::
  * API for statistical functions in MLlib.
+ * @since 1.1.0
  */
 @Experimental
 object Statistics {
@@ -41,6 +42,7 @@ object Statistics {
    *
    * @param X an RDD[Vector] for which column-wise summary statistics are to be computed.
    * @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics.
+   * @since 1.1.0
    */
   def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = {
     new RowMatrix(X).computeColumnSummaryStatistics()
@@ -52,6 +54,7 @@ object Statistics {
    *
    * @param X an RDD[Vector] for which the correlation matrix is to be computed.
    * @return Pearson correlation matrix comparing columns in X.
+   * @since 1.1.0
    */
   def corr(X: RDD[Vector]): Matrix = Correlations.corrMatrix(X)
 
@@ -68,6 +71,7 @@ object Statistics {
    * @param method String specifying the method to use for computing correlation.
    *               Supported: `pearson` (default), `spearman`
    * @return Correlation matrix comparing columns in X.
+   * @since 1.1.0
    */
   def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method)
 
@@ -81,10 +85,14 @@ object Statistics {
    * @param x RDD[Double] of the same cardinality as y.
    * @param y RDD[Double] of the same cardinality as x.
    * @return A Double containing the Pearson correlation between the two input RDD[Double]s
+   * @since 1.1.0
    */
   def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y)
 
-  /** Java-friendly version of [[corr()]] */
+  /**
+   * Java-friendly version of [[corr()]]
+   * @since 1.4.1
+   */
   def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double =
     corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]])
 
@@ -101,10 +109,14 @@ object Statistics {
    *               Supported: `pearson` (default), `spearman`
    * @return A Double containing the correlation between the two input RDD[Double]s using the
    *         specified method.
+   * @since 1.1.0
    */
   def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
 
-  /** Java-friendly version of [[corr()]] */
+  /**
+   * Java-friendly version of [[corr()]]
+   * @since 1.4.1
+   */
   def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double =
     corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]], method)
 
@@ -121,6 +133,7 @@ object Statistics {
    *                 `expected` is rescaled if the `expected` sum differs from the `observed` sum.
    * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
    *         the method used, and the null hypothesis.
+   * @since 1.1.0
    */
   def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
     ChiSqTest.chiSquared(observed, expected)
@@ -135,6 +148,7 @@ object Statistics {
    * @param observed Vector containing the observed categorical counts/relative frequencies.
    * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
    *         the method used, and the null hypothesis.
+   * @since 1.1.0
    */
   def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed)
 
@@ -145,6 +159,7 @@ object Statistics {
    * @param observed The contingency matrix (containing either counts or relative frequencies).
    * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
    *         the method used, and the null hypothesis.
+   * @since 1.1.0
    */
   def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed)
 
@@ -157,6 +172,7 @@ object Statistics {
    *             Real-valued features will be treated as categorical for each distinct value.
    * @return an array containing the ChiSquaredTestResult for every feature against the label.
    *         The order of the elements in the returned array reflects the order of input features.
+   * @since 1.1.0
    */
   def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
     ChiSqTest.chiSquaredFeatures(data)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
index cf51b24ff777f..9aa7763d7890d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
@@ -32,6 +32,7 @@ import org.apache.spark.mllib.util.MLUtils
  *
  * @param mu The mean vector of the distribution
  * @param sigma The covariance matrix of the distribution
+ * @since 1.3.0
  */
 @DeveloperApi
 class MultivariateGaussian (
@@ -60,12 +61,16 @@ class MultivariateGaussian (
    */
   private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants
 
-  /** Returns density of this multivariate Gaussian at given point, x */
+  /** Returns density of this multivariate Gaussian at given point, x
+    * @since 1.3.0
+    */
   def pdf(x: Vector): Double = {
     pdf(x.toBreeze)
   }
 
-  /** Returns the log-density of this multivariate Gaussian at given point, x */
+  /** Returns the log-density of this multivariate Gaussian at given point, x
+    * @since 1.3.0
+    */
   def logpdf(x: Vector): Double = {
     logpdf(x.toBreeze)
   }

From 37c2d1927cebdd19a14c054f670cb0fb9a263586 Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Wed, 29 Jul 2015 18:18:29 -0700
Subject: [PATCH 453/549] [SPARK-9016] [ML] make random forest classifiers
 implement classification trait

Implement the classification trait for RandomForestClassifiers. The plan is to use this in the future to providing thresholding for RandomForestClassifiers (as well as other classifiers that implement that trait).

Author: Holden Karau <holden@pigscanfly.ca>

Closes #7432 from holdenk/SPARK-9016-make-random-forest-classifiers-implement-classification-trait and squashes the following commits:

bf22fa6 [Holden Karau] Add missing imports for testing suite
e948f0d [Holden Karau] Check the prediction generation from rawprediciton
25320c3 [Holden Karau] Don't supply numClasses when not needed, assert model classes are as expected
1a67e04 [Holden Karau] Use old decission tree stuff instead
673e0c3 [Holden Karau] Merge branch 'master' into SPARK-9016-make-random-forest-classifiers-implement-classification-trait
0d15b96 [Holden Karau] FIx typo
5eafad4 [Holden Karau] add a constructor for rootnode + num classes
fc6156f [Holden Karau] scala style fix
2597915 [Holden Karau] take num classes in constructor
3ccfe4a [Holden Karau] Merge in master, make pass numClasses through randomforest for training
222a10b [Holden Karau] Increase numtrees to 3 in the python test since before the two were equal and the argmax was selecting the last one
16aea1c [Holden Karau] Make tests match the new models
b454a02 [Holden Karau] Make the Tree classifiers extends the Classifier base class
77b4114 [Holden Karau] Import vectors lib
---
 .../RandomForestClassifier.scala              | 30 ++++++++++---------
 .../RandomForestClassifierSuite.scala         | 18 ++++++++---
 python/pyspark/ml/classification.py           |  4 +--
 3 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index fc0693f67cc2e..bc19bd6df894f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -25,7 +25,7 @@ import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, RandomForestParams, TreeClassifierParams, TreeEnsembleModel}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.tree.model.{RandomForestModel => OldRandomForestModel}
@@ -43,7 +43,7 @@ import org.apache.spark.sql.types.DoubleType
  */
 @Experimental
 final class RandomForestClassifier(override val uid: String)
-  extends Predictor[Vector, RandomForestClassifier, RandomForestClassificationModel]
+  extends Classifier[Vector, RandomForestClassifier, RandomForestClassificationModel]
   with RandomForestParams with TreeClassifierParams {
 
   def this() = this(Identifiable.randomUID("rfc"))
@@ -98,7 +98,7 @@ final class RandomForestClassifier(override val uid: String)
     val trees =
       RandomForest.run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed)
         .map(_.asInstanceOf[DecisionTreeClassificationModel])
-    new RandomForestClassificationModel(trees)
+    new RandomForestClassificationModel(trees, numClasses)
   }
 
   override def copy(extra: ParamMap): RandomForestClassifier = defaultCopy(extra)
@@ -125,8 +125,9 @@ object RandomForestClassifier {
 @Experimental
 final class RandomForestClassificationModel private[ml] (
     override val uid: String,
-    private val _trees: Array[DecisionTreeClassificationModel])
-  extends PredictionModel[Vector, RandomForestClassificationModel]
+    private val _trees: Array[DecisionTreeClassificationModel],
+    override val numClasses: Int)
+  extends ClassificationModel[Vector, RandomForestClassificationModel]
   with TreeEnsembleModel with Serializable {
 
   require(numTrees > 0, "RandomForestClassificationModel requires at least 1 tree.")
@@ -135,8 +136,8 @@ final class RandomForestClassificationModel private[ml] (
    * Construct a random forest classification model, with all trees weighted equally.
    * @param trees  Component trees
    */
-  def this(trees: Array[DecisionTreeClassificationModel]) =
-    this(Identifiable.randomUID("rfc"), trees)
+  def this(trees: Array[DecisionTreeClassificationModel], numClasses: Int) =
+    this(Identifiable.randomUID("rfc"), trees, numClasses)
 
   override def trees: Array[DecisionTreeModel] = _trees.asInstanceOf[Array[DecisionTreeModel]]
 
@@ -153,20 +154,20 @@ final class RandomForestClassificationModel private[ml] (
     dataset.withColumn($(predictionCol), predictUDF(col($(featuresCol))))
   }
 
-  override protected def predict(features: Vector): Double = {
+  override protected def predictRaw(features: Vector): Vector = {
     // TODO: When we add a generic Bagging class, handle transform there: SPARK-7128
     // Classifies using majority votes.
     // Ignore the weights since all are 1.0 for now.
-    val votes = mutable.Map.empty[Int, Double]
+    val votes = new Array[Double](numClasses)
     _trees.view.foreach { tree =>
       val prediction = tree.rootNode.predict(features).toInt
-      votes(prediction) = votes.getOrElse(prediction, 0.0) + 1.0 // 1.0 = weight
+      votes(prediction) = votes(prediction) + 1.0 // 1.0 = weight
     }
-    votes.maxBy(_._2)._1
+    Vectors.dense(votes)
   }
 
   override def copy(extra: ParamMap): RandomForestClassificationModel = {
-    copyValues(new RandomForestClassificationModel(uid, _trees), extra)
+    copyValues(new RandomForestClassificationModel(uid, _trees, numClasses), extra)
   }
 
   override def toString: String = {
@@ -185,7 +186,8 @@ private[ml] object RandomForestClassificationModel {
   def fromOld(
       oldModel: OldRandomForestModel,
       parent: RandomForestClassifier,
-      categoricalFeatures: Map[Int, Int]): RandomForestClassificationModel = {
+      categoricalFeatures: Map[Int, Int],
+      numClasses: Int): RandomForestClassificationModel = {
     require(oldModel.algo == OldAlgo.Classification, "Cannot convert RandomForestModel" +
       s" with algo=${oldModel.algo} (old API) to RandomForestClassificationModel (new API).")
     val newTrees = oldModel.trees.map { tree =>
@@ -193,6 +195,6 @@ private[ml] object RandomForestClassificationModel {
       DecisionTreeClassificationModel.fromOld(tree, null, categoricalFeatures)
     }
     val uid = if (parent != null) parent.uid else Identifiable.randomUID("rfc")
-    new RandomForestClassificationModel(uid, newTrees)
+    new RandomForestClassificationModel(uid, newTrees, numClasses)
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
index 1b6b69c7dc71e..ab711c8e4b215 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
@@ -21,13 +21,13 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.impl.TreeTests
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.ml.tree.LeafNode
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest}
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, Row}
 
 /**
  * Test suite for [[RandomForestClassifier]].
@@ -66,7 +66,7 @@ class RandomForestClassifierSuite extends SparkFunSuite with MLlibTestSparkConte
   test("params") {
     ParamsSuite.checkParams(new RandomForestClassifier)
     val model = new RandomForestClassificationModel("rfc",
-      Array(new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0))))
+      Array(new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0))), 2)
     ParamsSuite.checkParams(model)
   }
 
@@ -167,9 +167,19 @@ private object RandomForestClassifierSuite {
     val newModel = rf.fit(newData)
     // Use parent from newTree since this is not checked anyways.
     val oldModelAsNew = RandomForestClassificationModel.fromOld(
-      oldModel, newModel.parent.asInstanceOf[RandomForestClassifier], categoricalFeatures)
+      oldModel, newModel.parent.asInstanceOf[RandomForestClassifier], categoricalFeatures,
+      numClasses)
     TreeTests.checkEqual(oldModelAsNew, newModel)
     assert(newModel.hasParent)
     assert(!newModel.trees.head.asInstanceOf[DecisionTreeClassificationModel].hasParent)
+    assert(newModel.numClasses == numClasses)
+    val results = newModel.transform(newData)
+    results.select("rawPrediction", "prediction").collect().foreach {
+      case Row(raw: Vector, prediction: Double) => {
+        assert(raw.size == numClasses)
+        val predFromRaw = raw.toArray.zipWithIndex.maxBy(_._1)._2
+        assert(predFromRaw == prediction)
+      }
+    }
   }
 }
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 89117e492846b..5a82bc286d1e8 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -299,9 +299,9 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
     >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
     >>> si_model = stringIndexer.fit(df)
     >>> td = si_model.transform(df)
-    >>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42)
+    >>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42)
     >>> model = rf.fit(td)
-    >>> allclose(model.treeWeights, [1.0, 1.0])
+    >>> allclose(model.treeWeights, [1.0, 1.0, 1.0])
     True
     >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
     >>> model.transform(test0).head().prediction

From 2a9fe4a4e7acbe4c9d3b6c6e61ff46d1472ee5f4 Mon Sep 17 00:00:00 2001
From: sethah <seth.hendrickson16@gmail.com>
Date: Wed, 29 Jul 2015 18:23:07 -0700
Subject: [PATCH 454/549] [SPARK-6129] [MLLIB] [DOCS] Added user guide for
 evaluation metrics

Author: sethah <seth.hendrickson16@gmail.com>

Closes #7655 from sethah/Working_on_6129 and squashes the following commits:

253db2d [sethah] removed number formatting from example code
b769cab [sethah] rewording threshold section
d5dad4d [sethah] adding some explanations of concepts to the eval metrics user guide
3a61ff9 [sethah] Removing unnecessary latex commands from metrics guide
c9dd058 [sethah] Cleaning up and formatting metrics user guide section
6f31c21 [sethah] All example code for metrics section done
98813fe [sethah] Most java and python example code added. Further latex formatting
53a24fc [sethah] Adding documentations of metrics for ML algorithms to user guide
---
 docs/mllib-evaluation-metrics.md | 1497 ++++++++++++++++++++++++++++++
 docs/mllib-guide.md              |    1 +
 2 files changed, 1498 insertions(+)
 create mode 100644 docs/mllib-evaluation-metrics.md

diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md
new file mode 100644
index 0000000000000..4ca0bb06b26a6
--- /dev/null
+++ b/docs/mllib-evaluation-metrics.md
@@ -0,0 +1,1497 @@
+---
+layout: global
+title: Evaluation Metrics - MLlib
+displayTitle: <a href="mllib-guide.html">MLlib</a> - Evaluation Metrics
+---
+
+* Table of contents
+{:toc}
+
+Spark's MLlib comes with a number of machine learning algorithms that can be used to learn from and make predictions
+on data. When these algorithms are applied to build machine learning models, there is a need to evaluate the performance
+of the model on some criteria, which depends on the application and its requirements. Spark's MLlib also provides a
+suite of metrics for the purpose of evaluating the performance of machine learning models.
+
+Specific machine learning algorithms fall under broader types of machine learning applications like classification,
+regression, clustering, etc. Each of these types have well established metrics for performance evaluation and those
+metrics that are currently available in Spark's MLlib are detailed in this section.
+
+## Classification model evaluation
+
+While there are many different types of classification algorithms, the evaluation of classification models all share
+similar principles. In a [supervised classification problem](https://en.wikipedia.org/wiki/Statistical_classification),
+there exists a true output and a model-generated predicted output for each data point. For this reason, the results for
+each data point can be assigned to one of four categories:
+
+* True Positive (TP) - label is positive and prediction is also positive
+* True Negative (TN) - label is negative and prediction is also negative
+* False Positive (FP) - label is negative but prediction is positive
+* False Negative (FN) - label is positive but prediction is negative
+
+These four numbers are the building blocks for most classifier evaluation metrics. A fundamental point when considering
+classifier evaluation is that pure accuracy (i.e. was the prediction correct or incorrect) is not generally a good metric. The
+reason for this is because a dataset may be highly unbalanced. For example, if a model is designed to predict fraud from
+a dataset where 95% of the data points are _not fraud_ and 5% of the data points are _fraud_, then a naive classifier
+that predicts _not fraud_, regardless of input, will be 95% accurate. For this reason, metrics like
+[precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) are typically used because they take into
+account the *type* of error. In most applications there is some desired balance between precision and recall, which can
+be captured by combining the two into a single metric, called the [F-measure](https://en.wikipedia.org/wiki/F1_score).
+
+### Binary classification
+
+[Binary classifiers](https://en.wikipedia.org/wiki/Binary_classification) are used to separate the elements of a given
+dataset into one of two possible groups (e.g. fraud or not fraud) and is a special case of multiclass classification.
+Most binary classification metrics can be generalized to multiclass classification metrics.
+
+#### Threshold tuning
+
+It is import to understand that many classification models actually output a "score" (often times a probability) for
+each class, where a higher score indicates higher likelihood. In the binary case, the model may output a probability for
+each class: $P(Y=1|X)$ and $P(Y=0|X)$. Instead of simply taking the higher probability, there may be some cases where
+the model might need to be tuned so that it only predicts a class when the probability is very high (e.g. only block a
+credit card transaction if the model predicts fraud with >90% probability). Therefore, there is a prediction *threshold*
+which determines what the predicted class will be based on the probabilities that the model outputs.
+
+Tuning the prediction threshold will change the precision and recall of the model and is an important part of model
+optimization. In order to visualize how precision, recall, and other metrics change as a function of the threshold it is
+common practice to plot competing metrics against one another, parameterized by threshold. A P-R curve plots (precision,
+recall) points for different threshold values, while a
+[receiver operating characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic), or ROC, curve
+plots (recall, false positive rate) points.
+
+**Available metrics**
+
+<table class="table">
+  <thead>
+    <tr><th>Metric</th><th>Definition</th></tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Precision (Postive Predictive Value)</td>
+      <td>$PPV=\frac{TP}{TP + FP}$</td>
+    </tr>
+    <tr>
+      <td>Recall (True Positive Rate)</td>
+      <td>$TPR=\frac{TP}{P}=\frac{TP}{TP + FN}$</td>
+    </tr>
+    <tr>
+      <td>F-measure</td>
+      <td>$F(\beta) = \left(1 + \beta^2\right) \cdot \left(\frac{PPV \cdot TPR}
+          {\beta^2 \cdot PPV + TPR}\right)$</td>
+    </tr>
+    <tr>
+      <td>Receiver Operating Characteristic (ROC)</td>
+      <td>$FPR(T)=\int^\infty_{T} P_0(T)\,dT \\ TPR(T)=\int^\infty_{T} P_1(T)\,dT$</td>
+    </tr>
+    <tr>
+      <td>Area Under ROC Curve</td>
+      <td>$AUROC=\int^1_{0} \frac{TP}{P} d\left(\frac{FP}{N}\right)$</td>
+    </tr>
+    <tr>
+      <td>Area Under Precision-Recall Curve</td>
+      <td>$AUPRC=\int^1_{0} \frac{TP}{TP+FP} d\left(\frac{TP}{P}\right)$</td>
+    </tr>
+  </tbody>
+</table>
+
+
+**Examples**
+
+<div class="codetabs">
+The following code snippets illustrate how to load a sample dataset, train a binary classification algorithm on the
+data, and evaluate the performance of the algorithm by several binary evaluation metrics.
+
+<div data-lang="scala" markdown="1">
+
+{% highlight scala %}
+import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
+import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.util.MLUtils
+
+// Load training data in LIBSVM format
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt")
+
+// Split data into training (60%) and test (40%)
+val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)
+training.cache()
+
+// Run training algorithm to build the model
+val model = new LogisticRegressionWithLBFGS()
+  .setNumClasses(2)
+  .run(training)
+
+// Clear the prediction threshold so the model will return probabilities
+model.clearThreshold
+
+// Compute raw scores on the test set
+val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
+  val prediction = model.predict(features)
+  (prediction, label)
+}
+
+// Instantiate metrics object
+val metrics = new BinaryClassificationMetrics(predictionAndLabels)
+
+// Precision by threshold
+val precision = metrics.precisionByThreshold
+precision.foreach { case (t, p) =>
+    println(s"Threshold: $t, Precision: $p")
+}
+
+// Recall by threshold
+val recall = metrics.precisionByThreshold
+recall.foreach { case (t, r) =>
+    println(s"Threshold: $t, Recall: $r")
+}
+
+// Precision-Recall Curve
+val PRC = metrics.pr
+
+// F-measure
+val f1Score = metrics.fMeasureByThreshold
+f1Score.foreach { case (t, f) =>
+    println(s"Threshold: $t, F-score: $f, Beta = 1")
+}
+
+val beta = 0.5
+val fScore = metrics.fMeasureByThreshold(beta)
+f1Score.foreach { case (t, f) =>
+    println(s"Threshold: $t, F-score: $f, Beta = 0.5")
+}
+
+// AUPRC
+val auPRC = metrics.areaUnderPR
+println("Area under precision-recall curve = " + auPRC)
+
+// Compute thresholds used in ROC and PR curves
+val thresholds = precision.map(_._1)
+
+// ROC Curve
+val roc = metrics.roc
+
+// AUROC
+val auROC = metrics.areaUnderROC
+println("Area under ROC = " + auROC)
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="java" markdown="1">
+
+{% highlight java %}
+import scala.Tuple2;
+
+import org.apache.spark.api.java.*;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.classification.LogisticRegressionModel;
+import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
+import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+
+public class BinaryClassification {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics");
+    SparkContext sc = new SparkContext(conf);
+    String path = "data/mllib/sample_binary_classification_data.txt";
+    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();
+
+    // Split initial RDD into two... [60% training data, 40% testing data].
+    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
+    JavaRDD<LabeledPoint> training = splits[0].cache();
+    JavaRDD<LabeledPoint> test = splits[1];
+
+    // Run training algorithm to build the model.
+    final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
+      .setNumClasses(2)
+      .run(training.rdd());
+
+    // Clear the prediction threshold so the model will return probabilities
+    model.clearThreshold();
+
+    // Compute raw scores on the test set.
+    JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
+      new Function<LabeledPoint, Tuple2<Object, Object>>() {
+        public Tuple2<Object, Object> call(LabeledPoint p) {
+          Double prediction = model.predict(p.features());
+          return new Tuple2<Object, Object>(prediction, p.label());
+        }
+      }
+    );
+
+    // Get evaluation metrics.
+    BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd());
+
+    // Precision by threshold
+    JavaRDD<Tuple2<Object, Object>> precision = metrics.precisionByThreshold().toJavaRDD();
+    System.out.println("Precision by threshold: " + precision.toArray());
+
+    // Recall by threshold
+    JavaRDD<Tuple2<Object, Object>> recall = metrics.recallByThreshold().toJavaRDD();
+    System.out.println("Recall by threshold: " + recall.toArray());
+
+    // F Score by threshold
+    JavaRDD<Tuple2<Object, Object>> f1Score = metrics.fMeasureByThreshold().toJavaRDD();
+    System.out.println("F1 Score by threshold: " + f1Score.toArray());
+
+    JavaRDD<Tuple2<Object, Object>> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD();
+    System.out.println("F2 Score by threshold: " + f2Score.toArray());
+
+    // Precision-recall curve
+    JavaRDD<Tuple2<Object, Object>> prc = metrics.pr().toJavaRDD();
+    System.out.println("Precision-recall curve: " + prc.toArray());
+
+    // Thresholds
+    JavaRDD<Double> thresholds = precision.map(
+      new Function<Tuple2<Object, Object>, Double>() {
+        public Double call (Tuple2<Object, Object> t) {
+          return new Double(t._1().toString());
+        }
+      }
+    );
+
+    // ROC Curve
+    JavaRDD<Tuple2<Object, Object>> roc = metrics.roc().toJavaRDD();
+    System.out.println("ROC curve: " + roc.toArray());
+
+    // AUPRC
+    System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR());
+
+    // AUROC
+    System.out.println("Area under ROC = " + metrics.areaUnderROC());
+
+    // Save and load model
+    model.save(sc, "myModelPath");
+    LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath");
+  }
+}
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="python" markdown="1">
+
+{% highlight python %}
+from pyspark.mllib.classification import LogisticRegressionWithLBFGS
+from pyspark.mllib.evaluation import BinaryClassificationMetrics
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils
+
+# Several of the methods available in scala are currently missing from pyspark
+
+# Load training data in LIBSVM format
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt")
+
+# Split data into training (60%) and test (40%)
+training, test = data.randomSplit([0.6, 0.4], seed = 11L)
+training.cache()
+
+# Run training algorithm to build the model
+model = LogisticRegressionWithLBFGS.train(training)
+
+# Compute raw scores on the test set
+predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))
+
+# Instantiate metrics object
+metrics = BinaryClassificationMetrics(predictionAndLabels)
+
+# Area under precision-recall curve
+print "Area under PR = %s" % metrics.areaUnderPR
+
+# Area under ROC curve
+print "Area under ROC = %s" % metrics.areaUnderROC
+
+{% endhighlight %}
+
+</div>
+</div>
+
+
+### Multiclass classification
+
+A [multiclass classification](https://en.wikipedia.org/wiki/Multiclass_classification) describes a classification
+problem where there are $M \gt 2$ possible labels for each data point (the case where $M=2$ is the binary
+classification problem). For example, classifying handwriting samples to the digits 0 to 9, having 10 possible classes.
+
+For multiclass metrics, the notion of positives and negatives is slightly different. Predictions and labels can still
+be positive or negative, but they must be considered under the context of a particular class. Each label and prediction
+take on the value of one of the multiple classes and so they are said to be positive for their particular class and negative
+for all other classes. So, a true positive occurs whenever the prediction and the label match, while a true negative
+occurs when neither the prediction nor the label take on the value of a given class. By this convention, there can be
+multiple true negatives for a given data sample. The extension of false negatives and false positives from the former
+definitions of positive and negative labels is straightforward.
+
+#### Label based metrics
+
+Opposed to binary classification where there are only two possible labels, multiclass classification problems have many
+possible labels and so the concept of label-based metrics is introduced. Overall precision measures precision across all
+labels -  the number of times any class was predicted correctly (true positives) normalized by the number of data
+points. Precision by label considers only one class, and measures the number of time a specific label was predicted
+correctly normalized by the number of times that label appears in the output.
+
+**Available metrics**
+
+Define the class, or label, set as
+
+$$L = \{\ell_0, \ell_1, \ldots, \ell_{M-1} \} $$
+
+The true output vector $\mathbf{y}$ consists of $N$ elements
+
+$$\mathbf{y}_0, \mathbf{y}_1, \ldots, \mathbf{y}_{N-1} \in L $$
+
+A multiclass prediction algorithm generates a prediction vector $\hat{\mathbf{y}}$ of $N$ elements
+
+$$\hat{\mathbf{y}}_0, \hat{\mathbf{y}}_1, \ldots, \hat{\mathbf{y}}_{N-1} \in L $$
+
+For this section, a modified delta function $\hat{\delta}(x)$ will prove useful
+
+$$\hat{\delta}(x) = \begin{cases}1 & \text{if $x = 0$}, \\ 0 & \text{otherwise}.\end{cases}$$
+
+<table class="table">
+  <thead>
+    <tr><th>Metric</th><th>Definition</th></tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Confusion Matrix</td>
+      <td>
+        $C_{ij} = \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_i) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_j)\\ \\
+         \left( \begin{array}{ccc}
+         \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_1) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_1) & \ldots &
+         \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_1) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_N) \\
+         \vdots & \ddots & \vdots \\
+         \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_N) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_1) & \ldots &
+         \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_N) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_N)
+         \end{array} \right)$
+      </td>
+    </tr>
+    <tr>
+      <td>Overall Precision</td>
+      <td>$PPV = \frac{TP}{TP + FP} = \frac{1}{N}\sum_{i=0}^{N-1} \hat{\delta}\left(\hat{\mathbf{y}}_i -
+        \mathbf{y}_i\right)$</td>
+    </tr>
+    <tr>
+      <td>Overall Recall</td>
+      <td>$TPR = \frac{TP}{TP + FN} = \frac{1}{N}\sum_{i=0}^{N-1} \hat{\delta}\left(\hat{\mathbf{y}}_i -
+        \mathbf{y}_i\right)$</td>
+    </tr>
+    <tr>
+      <td>Overall F1-measure</td>
+      <td>$F1 = 2 \cdot \left(\frac{PPV \cdot TPR}
+          {PPV + TPR}\right)$</td>
+    </tr>
+    <tr>
+      <td>Precision by label</td>
+      <td>$PPV(\ell) = \frac{TP}{TP + FP} =
+          \frac{\sum_{i=0}^{N-1} \hat{\delta}(\hat{\mathbf{y}}_i - \ell) \cdot \hat{\delta}(\mathbf{y}_i - \ell)}
+          {\sum_{i=0}^{N-1} \hat{\delta}(\hat{\mathbf{y}}_i - \ell)}$</td>
+    </tr>
+    <tr>
+      <td>Recall by label</td>
+      <td>$TPR(\ell)=\frac{TP}{P} =
+          \frac{\sum_{i=0}^{N-1} \hat{\delta}(\hat{\mathbf{y}}_i - \ell) \cdot \hat{\delta}(\mathbf{y}_i - \ell)}
+          {\sum_{i=0}^{N-1} \hat{\delta}(\mathbf{y}_i - \ell)}$</td>
+    </tr>
+    <tr>
+      <td>F-measure by label</td>
+      <td>$F(\beta, \ell) = \left(1 + \beta^2\right) \cdot \left(\frac{PPV(\ell) \cdot TPR(\ell)}
+          {\beta^2 \cdot PPV(\ell) + TPR(\ell)}\right)$</td>
+    </tr>
+    <tr>
+      <td>Weighted precision</td>
+      <td>$PPV_{w}= \frac{1}{N} \sum\nolimits_{\ell \in L} PPV(\ell)
+          \cdot \sum_{i=0}^{N-1} \hat{\delta}(\mathbf{y}_i-\ell)$</td>
+    </tr>
+    <tr>
+      <td>Weighted recall</td>
+      <td>$TPR_{w}= \frac{1}{N} \sum\nolimits_{\ell \in L} TPR(\ell)
+          \cdot \sum_{i=0}^{N-1} \hat{\delta}(\mathbf{y}_i-\ell)$</td>
+    </tr>
+    <tr>
+      <td>Weighted F-measure</td>
+      <td>$F_{w}(\beta)= \frac{1}{N} \sum\nolimits_{\ell \in L} F(\beta, \ell)
+          \cdot \sum_{i=0}^{N-1} \hat{\delta}(\mathbf{y}_i-\ell)$</td>
+    </tr>
+  </tbody>
+</table>
+
+**Examples**
+
+<div class="codetabs">
+The following code snippets illustrate how to load a sample dataset, train a multiclass classification algorithm on
+the data, and evaluate the performance of the algorithm by several multiclass classification evaluation metrics.
+
+<div data-lang="scala" markdown="1">
+
+{% highlight scala %}
+import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
+import org.apache.spark.mllib.evaluation.MulticlassMetrics
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.util.MLUtils
+
+// Load training data in LIBSVM format
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")
+
+// Split data into training (60%) and test (40%)
+val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L)
+training.cache()
+
+// Run training algorithm to build the model
+val model = new LogisticRegressionWithLBFGS()
+  .setNumClasses(3)
+  .run(training)
+
+// Compute raw scores on the test set
+val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
+  val prediction = model.predict(features)
+  (prediction, label)
+}
+
+// Instantiate metrics object
+val metrics = new MulticlassMetrics(predictionAndLabels)
+
+// Confusion matrix
+println("Confusion matrix:")
+println(metrics.confusionMatrix)
+
+// Overall Statistics
+val precision = metrics.precision
+val recall = metrics.recall // same as true positive rate
+val f1Score = metrics.fMeasure
+println("Summary Statistics")
+println(s"Precision = $precision")
+println(s"Recall = $recall")
+println(s"F1 Score = $f1Score")
+
+// Precision by label
+val labels = metrics.labels
+labels.foreach { l =>
+    println(s"Precision($l) = " + metrics.precision(l))
+}
+
+// Recall by label
+labels.foreach { l =>
+    println(s"Recall($l) = " + metrics.recall(l))
+}
+
+// False positive rate by label
+labels.foreach { l =>
+    println(s"FPR($l) = " + metrics.falsePositiveRate(l))
+}
+
+// F-measure by label
+labels.foreach { l =>
+    println(s"F1-Score($l) = " + metrics.fMeasure(l))
+}
+
+// Weighted stats
+println(s"Weighted precision: ${metrics.weightedPrecision}")
+println(s"Weighted recall: ${metrics.weightedRecall}")
+println(s"Weighted F1 score: ${metrics.weightedFMeasure}")
+println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}")
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="java" markdown="1">
+
+{% highlight java %}
+import scala.Tuple2;
+
+import org.apache.spark.api.java.*;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.classification.LogisticRegressionModel;
+import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
+import org.apache.spark.mllib.evaluation.MulticlassMetrics;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+
+public class MulticlassClassification {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("Multiclass Classification Metrics");
+    SparkContext sc = new SparkContext(conf);
+    String path = "data/mllib/sample_multiclass_classification_data.txt";
+    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();
+
+    // Split initial RDD into two... [60% training data, 40% testing data].
+    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
+    JavaRDD<LabeledPoint> training = splits[0].cache();
+    JavaRDD<LabeledPoint> test = splits[1];
+
+    // Run training algorithm to build the model.
+    final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
+      .setNumClasses(3)
+      .run(training.rdd());
+
+    // Compute raw scores on the test set.
+    JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
+      new Function<LabeledPoint, Tuple2<Object, Object>>() {
+        public Tuple2<Object, Object> call(LabeledPoint p) {
+          Double prediction = model.predict(p.features());
+          return new Tuple2<Object, Object>(prediction, p.label());
+        }
+      }
+    );
+
+    // Get evaluation metrics.
+    MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
+
+    // Confusion matrix
+    Matrix confusion = metrics.confusionMatrix();
+    System.out.println("Confusion matrix: \n" + confusion);
+
+    // Overall statistics
+    System.out.println("Precision = " + metrics.precision());
+    System.out.println("Recall = " + metrics.recall());
+    System.out.println("F1 Score = " + metrics.fMeasure());
+
+    // Stats by labels
+    for (int i = 0; i < metrics.labels().length; i++) {
+        System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i]));
+        System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i]));
+        System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i]));
+    }
+
+    //Weighted stats
+    System.out.format("Weighted precision = %f\n", metrics.weightedPrecision());
+    System.out.format("Weighted recall = %f\n", metrics.weightedRecall());
+    System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure());
+    System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate());
+
+    // Save and load model
+    model.save(sc, "myModelPath");
+    LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath");
+  }
+}
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="python" markdown="1">
+
+{% highlight python %}
+from pyspark.mllib.classification import LogisticRegressionWithLBFGS
+from pyspark.mllib.util import MLUtils
+from pyspark.mllib.evaluation import MulticlassMetrics
+
+# Load training data in LIBSVM format
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")
+
+# Split data into training (60%) and test (40%)
+training, test = data.randomSplit([0.6, 0.4], seed = 11L)
+training.cache()
+
+# Run training algorithm to build the model
+model = LogisticRegressionWithLBFGS.train(training, numClasses=3)
+
+# Compute raw scores on the test set
+predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))
+
+# Instantiate metrics object
+metrics = MulticlassMetrics(predictionAndLabels)
+
+# Overall statistics
+precision = metrics.precision()
+recall = metrics.recall()
+f1Score = metrics.fMeasure()
+print "Summary Stats"
+print "Precision = %s" % precision
+print "Recall = %s" % recall
+print "F1 Score = %s" % f1Score
+
+# Statistics by class
+labels = data.map(lambda lp: lp.label).distinct().collect()
+for label in sorted(labels):
+    print "Class %s precision = %s" % (label, metrics.precision(label))
+    print "Class %s recall = %s" % (label, metrics.recall(label))
+    print "Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))
+
+# Weighted stats
+print "Weighted recall = %s" % metrics.weightedRecall
+print "Weighted precision = %s" % metrics.weightedPrecision
+print "Weighted F(1) Score = %s" % metrics.weightedFMeasure()
+print "Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)
+print "Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate
+{% endhighlight %}
+
+</div>
+</div>
+
+### Multilabel classification
+
+A [multilabel classification](https://en.wikipedia.org/wiki/Multi-label_classification) problem involves mapping
+each sample in a dataset to a set of class labels. In this type of classification problem, the labels are not
+mutually exclusive. For example, when classifying a set of news articles into topics, a single article might be both
+science and politics.
+
+Because the labels are not mutually exclusive, the predictions and true labels are now vectors of label *sets*, rather
+than vectors of labels. Multilabel metrics, therefore, extend the fundamental ideas of precision, recall, etc. to
+operations on sets. For example, a true positive for a given class now occurs when that class exists in the predicted
+set and it exists in the true label set, for a specific data point.
+
+**Available metrics**
+
+Here we define a set $D$ of $N$ documents
+
+$$D = \left\{d_0, d_1, ..., d_{N-1}\right\}$$
+
+Define $L_0, L_1, ..., L_{N-1}$ to be a family of label sets and $P_0, P_1, ..., P_{N-1}$
+to be a family of prediction sets where $L_i$ and $P_i$ are the label set and prediction set, respectively, that
+correspond to document $d_i$.
+
+The set of all unique labels is given by
+
+$$L = \bigcup_{k=0}^{N-1} L_k$$
+
+The following definition of indicator function $I_A(x)$ on a set $A$ will be necessary
+
+$$I_A(x) = \begin{cases}1 & \text{if $x \in A$}, \\ 0 & \text{otherwise}.\end{cases}$$
+
+<table class="table">
+  <thead>
+    <tr><th>Metric</th><th>Definition</th></tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Precision</td><td>$\frac{1}{N} \sum_{i=0}^{N-1} \frac{\left|P_i \cap L_i\right|}{\left|P_i\right|}$</td>
+    </tr>
+    <tr>
+      <td>Recall</td><td>$\frac{1}{N} \sum_{i=0}^{N-1} \frac{\left|L_i \cap P_i\right|}{\left|L_i\right|}$</td>
+    </tr>
+    <tr>
+      <td>Accuracy</td>
+      <td>
+        $\frac{1}{N} \sum_{i=0}^{N - 1} \frac{\left|L_i \cap P_i \right|}
+        {\left|L_i\right| + \left|P_i\right| - \left|L_i \cap P_i \right|}$
+      </td>
+    </tr>
+    <tr>
+      <td>Precision by label</td><td>$PPV(\ell)=\frac{TP}{TP + FP}=
+          \frac{\sum_{i=0}^{N-1} I_{P_i}(\ell) \cdot I_{L_i}(\ell)}
+          {\sum_{i=0}^{N-1} I_{P_i}(\ell)}$</td>
+    </tr>
+    <tr>
+      <td>Recall by label</td><td>$TPR(\ell)=\frac{TP}{P}=
+          \frac{\sum_{i=0}^{N-1} I_{P_i}(\ell) \cdot I_{L_i}(\ell)}
+          {\sum_{i=0}^{N-1} I_{L_i}(\ell)}$</td>
+    </tr>
+    <tr>
+      <td>F1-measure by label</td><td>$F1(\ell) = 2
+                            \cdot \left(\frac{PPV(\ell) \cdot TPR(\ell)}
+                            {PPV(\ell) + TPR(\ell)}\right)$</td>
+    </tr>
+    <tr>
+      <td>Hamming Loss</td>
+      <td>
+        $\frac{1}{N \cdot \left|L\right|} \sum_{i=0}^{N - 1} \left|L_i\right| + \left|P_i\right| - 2\left|L_i
+          \cap P_i\right|$
+      </td>
+    </tr>
+    <tr>
+      <td>Subset Accuracy</td>
+      <td>$\frac{1}{N} \sum_{i=0}^{N-1} I_{\{L_i\}}(P_i)$</td>
+    </tr>
+    <tr>
+      <td>F1 Measure</td>
+      <td>$\frac{1}{N} \sum_{i=0}^{N-1} 2 \frac{\left|P_i \cap L_i\right|}{\left|P_i\right| \cdot \left|L_i\right|}$</td>
+    </tr>
+    <tr>
+      <td>Micro precision</td>
+      <td>$\frac{TP}{TP + FP}=\frac{\sum_{i=0}^{N-1} \left|P_i \cap L_i\right|}
+          {\sum_{i=0}^{N-1} \left|P_i \cap L_i\right| + \sum_{i=0}^{N-1} \left|P_i - L_i\right|}$</td>
+    </tr>
+    <tr>
+      <td>Micro recall</td>
+      <td>$\frac{TP}{TP + FN}=\frac{\sum_{i=0}^{N-1} \left|P_i \cap L_i\right|}
+        {\sum_{i=0}^{N-1} \left|P_i \cap L_i\right| + \sum_{i=0}^{N-1} \left|L_i - P_i\right|}$</td>
+    </tr>
+    <tr>
+      <td>Micro F1 Measure</td>
+      <td>
+        $2 \cdot \frac{TP}{2 \cdot TP + FP + FN}=2 \cdot \frac{\sum_{i=0}^{N-1} \left|P_i \cap L_i\right|}{2 \cdot
+        \sum_{i=0}^{N-1} \left|P_i \cap L_i\right| + \sum_{i=0}^{N-1} \left|L_i - P_i\right| + \sum_{i=0}^{N-1}
+        \left|P_i - L_i\right|}$
+      </td>
+    </tr>
+  </tbody>
+</table>
+
+**Examples**
+
+The following code snippets illustrate how to evaluate the performance of a multilabel classifer. The examples
+use the fake prediction and label data for multilabel classification that is shown below.
+
+Document predictions:
+
+* doc 0 - predict 0, 1 - class 0, 2
+* doc 1 - predict 0, 2 - class 0, 1
+* doc 2 - predict none - class 0
+* doc 3 - predict 2 - class 2
+* doc 4 - predict 2, 0 - class 2, 0
+* doc 5 - predict 0, 1, 2 - class 0, 1
+* doc 6 - predict 1 - class 1, 2
+
+Predicted classes:
+
+* class 0 - doc 0, 1, 4, 5 (total 4)
+* class 1 - doc 0, 5, 6 (total 3)
+* class 2 - doc 1, 3, 4, 5 (total 4)
+
+True classes:
+
+* class 0 - doc 0, 1, 2, 4, 5 (total 5)
+* class 1 - doc 1, 5, 6 (total 3)
+* class 2 - doc 0, 3, 4, 6 (total 4)
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+
+{% highlight scala %}
+import org.apache.spark.mllib.evaluation.MultilabelMetrics
+import org.apache.spark.rdd.RDD;
+
+val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize(
+  Seq((Array(0.0, 1.0), Array(0.0, 2.0)),
+    (Array(0.0, 2.0), Array(0.0, 1.0)),
+    (Array(), Array(0.0)),
+    (Array(2.0), Array(2.0)),
+    (Array(2.0, 0.0), Array(2.0, 0.0)),
+    (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)),
+    (Array(1.0), Array(1.0, 2.0))), 2)
+
+// Instantiate metrics object
+val metrics = new MultilabelMetrics(scoreAndLabels)
+
+// Summary stats
+println(s"Recall = ${metrics.recall}")
+println(s"Precision = ${metrics.precision}")
+println(s"F1 measure = ${metrics.f1Measure}")
+println(s"Accuracy = ${metrics.accuracy}")
+
+// Individual label stats
+metrics.labels.foreach(label => println(s"Class $label precision = ${metrics.precision(label)}"))
+metrics.labels.foreach(label => println(s"Class $label recall = ${metrics.recall(label)}"))
+metrics.labels.foreach(label => println(s"Class $label F1-score = ${metrics.f1Measure(label)}"))
+
+// Micro stats
+println(s"Micro recall = ${metrics.microRecall}")
+println(s"Micro precision = ${metrics.microPrecision}")
+println(s"Micro F1 measure = ${metrics.microF1Measure}")
+
+// Hamming loss
+println(s"Hamming loss = ${metrics.hammingLoss}")
+
+// Subset accuracy
+println(s"Subset accuracy = ${metrics.subsetAccuracy}")
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="java" markdown="1">
+
+{% highlight java %}
+import scala.Tuple2;
+
+import org.apache.spark.api.java.*;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.mllib.evaluation.MultilabelMetrics;
+import org.apache.spark.SparkConf;
+import java.util.Arrays;
+import java.util.List;
+
+public class MultilabelClassification {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics");
+    JavaSparkContext sc = new JavaSparkContext(conf);
+
+    List<Tuple2<double[], double[]>> data = Arrays.asList(
+        new Tuple2<double[], double[]>(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}),
+        new Tuple2<double[], double[]>(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}),
+        new Tuple2<double[], double[]>(new double[]{}, new double[]{0.0}),
+        new Tuple2<double[], double[]>(new double[]{2.0}, new double[]{2.0}),
+        new Tuple2<double[], double[]>(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}),
+        new Tuple2<double[], double[]>(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}),
+        new Tuple2<double[], double[]>(new double[]{1.0}, new double[]{1.0, 2.0})
+        );
+    JavaRDD<Tuple2<double[], double[]>> scoreAndLabels = sc.parallelize(data);
+
+    // Instantiate metrics object
+    MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd());
+
+    // Summary stats
+    System.out.format("Recall = %f\n", metrics.recall());
+    System.out.format("Precision = %f\n", metrics.precision());
+    System.out.format("F1 measure = %f\n", metrics.f1Measure());
+    System.out.format("Accuracy = %f\n", metrics.accuracy());
+
+    // Stats by labels
+    for (int i = 0; i < metrics.labels().length - 1; i++) {
+        System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i]));
+        System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i]));
+        System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i]));
+    }
+
+    // Micro stats
+    System.out.format("Micro recall = %f\n", metrics.microRecall());
+    System.out.format("Micro precision = %f\n", metrics.microPrecision());
+    System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure());
+
+    // Hamming loss
+    System.out.format("Hamming loss = %f\n", metrics.hammingLoss());
+
+    // Subset accuracy
+    System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy());
+
+  }
+}
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="python" markdown="1">
+
+{% highlight python %}
+from pyspark.mllib.evaluation import MultilabelMetrics
+
+scoreAndLabels = sc.parallelize([
+    ([0.0, 1.0], [0.0, 2.0]),
+    ([0.0, 2.0], [0.0, 1.0]),
+    ([], [0.0]),
+    ([2.0], [2.0]),
+    ([2.0, 0.0], [2.0, 0.0]),
+    ([0.0, 1.0, 2.0], [0.0, 1.0]),
+    ([1.0], [1.0, 2.0])])
+
+# Instantiate metrics object
+metrics = MultilabelMetrics(scoreAndLabels)
+
+# Summary stats
+print "Recall = %s" % metrics.recall()
+print "Precision = %s" % metrics.precision()
+print "F1 measure = %s" % metrics.f1Measure()
+print "Accuracy = %s" % metrics.accuracy
+
+# Individual label stats
+labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect()
+for label in labels:
+    print "Class %s precision = %s" % (label, metrics.precision(label))
+    print "Class %s recall = %s" % (label, metrics.recall(label))
+    print "Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))
+
+# Micro stats
+print "Micro precision = %s" % metrics.microPrecision
+print "Micro recall = %s" % metrics.microRecall
+print "Micro F1 measure = %s" % metrics.microF1Measure
+
+# Hamming loss
+print "Hamming loss = %s" % metrics.hammingLoss
+
+# Subset accuracy
+print "Subset accuracy = %s" % metrics.subsetAccuracy
+
+{% endhighlight %}
+
+</div>
+</div>
+
+### Ranking systems
+
+The role of a ranking algorithm (often thought of as a [recommender system](https://en.wikipedia.org/wiki/Recommender_system))
+is to return to the user a set of relevant items or documents based on some training data. The definition of relevance
+may vary and is usually application specific. Ranking system metrics aim to quantify the effectiveness of these
+rankings or recommendations in various contexts. Some metrics compare a set of recommended documents to a ground truth
+set of relevant documents, while other metrics may incorporate numerical ratings explicitly.
+
+**Available metrics**
+
+A ranking system usually deals with a set of $M$ users
+
+$$U = \left\{u_0, u_1, ..., u_{M-1}\right\}$$
+
+Each user ($u_i$) having a set of $N$ ground truth relevant documents
+
+$$D_i = \left\{d_0, d_1, ..., d_{N-1}\right\}$$
+
+And a list of $Q$ recommended documents, in order of decreasing relevance
+
+$$R_i = \left[r_0, r_1, ..., r_{Q-1}\right]$$
+
+The goal of the ranking system is to produce the most relevant set of documents for each user. The relevance of the
+sets and the effectiveness of the algorithms can be measured using the metrics listed below.
+
+It is necessary to define a function which, provided a recommended document and a set of ground truth relevant
+documents, returns a relevance score for the recommended document.
+
+$$rel_D(r) = \begin{cases}1 & \text{if $r \in D$}, \\ 0 & \text{otherwise}.\end{cases}$$
+
+<table class="table">
+  <thead>
+    <tr><th>Metric</th><th>Definition</th><th>Notes</th></tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>
+        Precision at k
+      </td>
+      <td>
+        $p(k)=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{k} \sum_{j=0}^{\text{min}(\left|D\right|, k) - 1} rel_{D_i}(R_i(j))}$
+      </td>
+      <td>
+        <a href="https://en.wikipedia.org/wiki/Information_retrieval#Precision_at_K">Precision at k</a> is a measure of
+         how many of the first k recommended documents are in the set of true relevant documents averaged across all
+         users. In this metric, the order of the recommendations is not taken into account.
+      </td>
+    </tr>
+    <tr>
+      <td>Mean Average Precision</td>
+      <td>
+        $MAP=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{\left|D_i\right|} \sum_{j=0}^{Q-1} \frac{rel_{D_i}(R_i(j))}{j + 1}}$
+      </td>
+      <td>
+        <a href="https://en.wikipedia.org/wiki/Information_retrieval#Mean_average_precision">MAP</a> is a measure of how
+         many of the recommended documents are in the set of true relevant documents, where the
+        order of the recommendations is taken into account (i.e. penalty for highly relevant documents is higher).
+      </td>
+    </tr>
+    <tr>
+      <td>Normalized Discounted Cumulative Gain</td>
+      <td>
+        $NDCG(k)=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{IDCG(D_i, k)}\sum_{j=0}^{n-1}
+          \frac{rel_{D_i}(R_i(j))}{\text{ln}(j+1)}} \\
+        \text{Where} \\
+        \hspace{5 mm} n = \text{min}\left(\text{max}\left(|R_i|,|D_i|\right),k\right) \\
+        \hspace{5 mm} IDCG(D, k) = \sum_{j=0}^{\text{min}(\left|D\right|, k) - 1} \frac{1}{\text{ln}(j+1)}$
+      </td>
+      <td>
+        <a href="https://en.wikipedia.org/wiki/Information_retrieval#Discounted_cumulative_gain">NDCG at k</a> is a
+        measure of how many of the first k recommended documents are in the set of true relevant documents averaged
+        across all users. In contrast to precision at k, this metric takes into account the order of the recommendations
+        (documents are assumed to be in order of decreasing relevance).
+      </td>
+    </tr>
+  </tbody>
+</table>
+
+**Examples**
+
+The following code snippets illustrate how to load a sample dataset, train an alternating least squares recommendation
+model on the data, and evaluate the performance of the recommender by several ranking metrics. A brief summary of the
+methodology is provided below.
+
+MovieLens ratings are on a scale of 1-5:
+
+ * 5: Must see
+ * 4: Will enjoy
+ * 3: It's okay
+ * 2: Fairly bad
+ * 1: Awful
+
+So we should not recommend a movie if the predicted rating is less than 3.
+To map ratings to confidence scores, we use:
+
+ * 5 -> 2.5
+ * 4 -> 1.5
+ * 3 -> 0.5
+ * 2 -> -0.5
+ * 1 -> -1.5.
+
+This mappings means unobserved entries are generally between It's okay and Fairly bad. The semantics of 0 in this
+expanded world of non-positive weights are "the same as never having interacted at all."
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+
+{% highlight scala %}
+import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics}
+import org.apache.spark.mllib.recommendation.{ALS, Rating}
+
+// Read in the ratings data
+val ratings = sc.textFile("data/mllib/sample_movielens_data.txt").map { line =>
+  val fields = line.split("::")
+  Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5)
+}.cache()
+
+// Map ratings to 1 or 0, 1 indicating a movie that should be recommended
+val binarizedRatings = ratings.map(r => Rating(r.user, r.product, if (r.rating > 0) 1.0 else 0.0)).cache()
+
+// Summarize ratings
+val numRatings = ratings.count()
+val numUsers = ratings.map(_.user).distinct().count()
+val numMovies = ratings.map(_.product).distinct().count()
+println(s"Got $numRatings ratings from $numUsers users on $numMovies movies.")
+
+// Build the model
+val numIterations = 10
+val rank = 10
+val lambda = 0.01
+val model = ALS.train(ratings, rank, numIterations, lambda)
+
+// Define a function to scale ratings from 0 to 1
+def scaledRating(r: Rating): Rating = {
+  val scaledRating = math.max(math.min(r.rating, 1.0), 0.0)
+  Rating(r.user, r.product, scaledRating)
+}
+
+// Get sorted top ten predictions for each user and then scale from [0, 1]
+val userRecommended = model.recommendProductsForUsers(10).map{ case (user, recs) =>
+  (user, recs.map(scaledRating))
+}
+
+// Assume that any movie a user rated 3 or higher (which maps to a 1) is a relevant document
+// Compare with top ten most relevant documents
+val userMovies = binarizedRatings.groupBy(_.user)
+val relevantDocuments = userMovies.join(userRecommended).map{ case (user, (actual, predictions)) =>
+  (predictions.map(_.product), actual.filter(_.rating > 0.0).map(_.product).toArray)
+}
+
+// Instantiate metrics object
+val metrics = new RankingMetrics(relevantDocuments)
+
+// Precision at K
+Array(1, 3, 5).foreach{ k =>
+  println(s"Precision at $k = ${metrics.precisionAt(k)}")
+}
+
+// Mean average precision
+println(s"Mean average precision = ${metrics.meanAveragePrecision}")
+
+// Normalized discounted cumulative gain
+Array(1, 3, 5).foreach{ k =>
+  println(s"NDCG at $k = ${metrics.ndcgAt(k)}")
+}
+
+// Get predictions for each data point
+val allPredictions = model.predict(ratings.map(r => (r.user, r.product))).map(r => ((r.user, r.product), r.rating))
+val allRatings = ratings.map(r => ((r.user, r.product), r.rating))
+val predictionsAndLabels = allPredictions.join(allRatings).map{ case ((user, product), (predicted, actual)) =>
+  (predicted, actual)
+}
+
+// Get the RMSE using regression metrics
+val regressionMetrics = new RegressionMetrics(predictionsAndLabels)
+println(s"RMSE = ${regressionMetrics.rootMeanSquaredError}")
+
+// R-squared
+println(s"R-squared = ${regressionMetrics.r2}")
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="java" markdown="1">
+
+{% highlight java %}
+import scala.Tuple2;
+
+import org.apache.spark.api.java.*;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.Function;
+import java.util.*;
+import org.apache.spark.mllib.evaluation.RegressionMetrics;
+import org.apache.spark.mllib.evaluation.RankingMetrics;
+import org.apache.spark.mllib.recommendation.ALS;
+import org.apache.spark.mllib.recommendation.Rating;
+
+// Read in the ratings data
+public class Ranking {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("Ranking Metrics");
+    JavaSparkContext sc = new JavaSparkContext(conf);
+    String path = "data/mllib/sample_movielens_data.txt";
+    JavaRDD<String> data = sc.textFile(path);
+    JavaRDD<Rating> ratings = data.map(
+      new Function<String, Rating>() {
+        public Rating call(String line) {
+          String[] parts = line.split("::");
+          return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5);
+        }
+      }
+    );
+    ratings.cache();
+
+    // Train an ALS model
+    final MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01);
+
+    // Get top 10 recommendations for every user and scale ratings from 0 to 1
+    JavaRDD<Tuple2<Object, Rating[]>> userRecs = model.recommendProductsForUsers(10).toJavaRDD();
+    JavaRDD<Tuple2<Object, Rating[]>> userRecsScaled = userRecs.map(
+      new Function<Tuple2<Object, Rating[]>, Tuple2<Object, Rating[]>>() {
+        public Tuple2<Object, Rating[]> call(Tuple2<Object, Rating[]> t) {
+          Rating[] scaledRatings = new Rating[t._2().length];
+          for (int i = 0; i < scaledRatings.length; i++) {
+            double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0);
+            scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating);
+          }
+          return new Tuple2<Object, Rating[]>(t._1(), scaledRatings);
+        }
+      }
+    );
+    JavaPairRDD<Object, Rating[]> userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled);
+
+    // Map ratings to 1 or 0, 1 indicating a movie that should be recommended
+    JavaRDD<Rating> binarizedRatings = ratings.map(
+      new Function<Rating, Rating>() {
+        public Rating call(Rating r) {
+          double binaryRating;
+          if (r.rating() > 0.0) {
+            binaryRating = 1.0;
+          }
+          else {
+            binaryRating = 0.0;
+          }
+          return new Rating(r.user(), r.product(), binaryRating);
+        }
+      }
+    );
+
+    // Group ratings by common user
+    JavaPairRDD<Object, Iterable<Rating>> userMovies = binarizedRatings.groupBy(
+      new Function<Rating, Object>() {
+        public Object call(Rating r) {
+          return r.user();
+        }
+      }
+    );
+
+    // Get true relevant documents from all user ratings
+    JavaPairRDD<Object, List<Integer>> userMoviesList = userMovies.mapValues(
+      new Function<Iterable<Rating>, List<Integer>>() {
+        public List<Integer> call(Iterable<Rating> docs) {
+          List<Integer> products = new ArrayList<Integer>();
+          for (Rating r : docs) {
+            if (r.rating() > 0.0) {
+              products.add(r.product());
+            }
+          }
+          return products;
+        }
+      }
+    );
+
+    // Extract the product id from each recommendation
+    JavaPairRDD<Object, List<Integer>> userRecommendedList = userRecommended.mapValues(
+      new Function<Rating[], List<Integer>>() {
+        public List<Integer> call(Rating[] docs) {
+          List<Integer> products = new ArrayList<Integer>();
+          for (Rating r : docs) {
+            products.add(r.product());
+          }
+          return products;
+        }
+      }
+    );
+    JavaRDD<Tuple2<List<Integer>, List<Integer>>> relevantDocs = userMoviesList.join(userRecommendedList).values();
+
+    // Instantiate the metrics object
+    RankingMetrics metrics = RankingMetrics.of(relevantDocs);
+
+    // Precision and NDCG at k
+    Integer[] kVector = {1, 3, 5};
+    for (Integer k : kVector) {
+      System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k));
+      System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k));
+    }
+
+    // Mean average precision
+    System.out.format("Mean average precision = %f\n", metrics.meanAveragePrecision());
+
+    // Evaluate the model using numerical ratings and regression metrics
+    JavaRDD<Tuple2<Object, Object>> userProducts = ratings.map(
+      new Function<Rating, Tuple2<Object, Object>>() {
+        public Tuple2<Object, Object> call(Rating r) {
+          return new Tuple2<Object, Object>(r.user(), r.product());
+        }
+      }
+    );
+    JavaPairRDD<Tuple2<Integer, Integer>, Object> predictions = JavaPairRDD.fromJavaRDD(
+      model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(
+        new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Object>>() {
+          public Tuple2<Tuple2<Integer, Integer>, Object> call(Rating r){
+            return new Tuple2<Tuple2<Integer, Integer>, Object>(
+              new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
+          }
+        }
+    ));
+    JavaRDD<Tuple2<Object, Object>> ratesAndPreds =
+      JavaPairRDD.fromJavaRDD(ratings.map(
+        new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Object>>() {
+          public Tuple2<Tuple2<Integer, Integer>, Object> call(Rating r){
+            return new Tuple2<Tuple2<Integer, Integer>, Object>(
+              new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
+          }
+        }
+    )).join(predictions).values();
+
+    // Create regression metrics object
+    RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd());
+
+    // Root mean squared error
+    System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError());
+
+    // R-squared
+    System.out.format("R-squared = %f\n", regressionMetrics.r2());
+  }
+}
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="python" markdown="1">
+
+{% highlight python %}
+from pyspark.mllib.recommendation import ALS, Rating
+from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
+
+#  Read in the ratings data
+lines = sc.textFile("data/mllib/sample_movielens_data.txt")
+
+def parseLine(line):
+    fields = line.split("::")
+    return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5)
+ratings = lines.map(lambda r: parseLine(r))
+
+# Train a model on to predict user-product ratings
+model = ALS.train(ratings, 10, 10, 0.01)
+
+# Get predicted ratings on all existing user-product pairs
+testData = ratings.map(lambda p: (p.user, p.product))
+predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating))
+
+ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating))
+scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1])
+
+# Instantiate regression metrics to compare predicted and actual ratings
+metrics = RegressionMetrics(scoreAndLabels)
+
+# Root mean sqaured error
+print "RMSE = %s" % metrics.rootMeanSquaredError
+
+# R-squared
+print "R-squared = %s" % metrics.r2
+
+{% endhighlight %}
+
+</div>
+</div>
+
+## Regression model evaluation
+
+[Regression analysis](https://en.wikipedia.org/wiki/Regression_analysis) is used when predicting a continuous output
+variable from a number of independent variables.
+
+**Available metrics**
+
+<table class="table">
+  <thead>
+    <tr><th>Metric</th><th>Definition</th></tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Mean Squared Error (MSE)</td>
+      <td>$MSE = \frac{\sum_{i=0}^{N-1} (\mathbf{y}_i - \hat{\mathbf{y}}_i)^2}{N}$</td>
+    </tr>
+    <tr>
+      <td>Root Mean Squared Error (RMSE)</td>
+      <td>$RMSE = \sqrt{\frac{\sum_{i=0}^{N-1} (\mathbf{y}_i - \hat{\mathbf{y}}_i)^2}{N}}$</td>
+    </tr>
+    <tr>
+      <td>Mean Absoloute Error (MAE)</td>
+      <td>$MAE=\sum_{i=0}^{N-1} \left|\mathbf{y}_i - \hat{\mathbf{y}}_i\right|$</td>
+    </tr>
+    <tr>
+      <td>Coefficient of Determination $(R^2)$</td>
+      <td>$R^2=1 - \frac{MSE}{\text{VAR}(\mathbf{y}) \cdot (N-1)}=1-\frac{\sum_{i=0}^{N-1}
+        (\mathbf{y}_i - \hat{\mathbf{y}}_i)^2}{\sum_{i=0}^{N-1}(\mathbf{y}_i-\bar{\mathbf{y}})^2}$</td>
+    </tr>
+    <tr>
+      <td>Explained Variance</td>
+      <td>$1 - \frac{\text{VAR}(\mathbf{y} - \mathbf{\hat{y}})}{\text{VAR}(\mathbf{y})}$</td>
+    </tr>
+  </tbody>
+</table>
+
+**Examples**
+
+<div class="codetabs">
+The following code snippets illustrate how to load a sample dataset, train a linear regression algorithm on the data,
+and evaluate the performance of the algorithm by several regression metrics.
+
+<div data-lang="scala" markdown="1">
+
+{% highlight scala %}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.regression.LinearRegressionModel
+import org.apache.spark.mllib.regression.LinearRegressionWithSGD
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.evaluation.RegressionMetrics
+import org.apache.spark.mllib.util.MLUtils
+
+// Load the data
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_linear_regression_data.txt").cache()
+
+// Build the model
+val numIterations = 100
+val model = LinearRegressionWithSGD.train(data, numIterations)
+
+// Get predictions
+val valuesAndPreds = data.map{ point =>
+  val prediction = model.predict(point.features)
+  (prediction, point.label)
+}
+
+// Instantiate metrics object
+val metrics = new RegressionMetrics(valuesAndPreds)
+
+// Squared error
+println(s"MSE = ${metrics.meanSquaredError}")
+println(s"RMSE = ${metrics.rootMeanSquaredError}")
+
+// R-squared
+println(s"R-squared = ${metrics.r2}")
+
+// Mean absolute error
+println(s"MAE = ${metrics.meanAbsoluteError}")
+
+// Explained variance
+println(s"Explained variance = ${metrics.explainedVariance}")
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="java" markdown="1">
+
+{% highlight java %}
+import scala.Tuple2;
+
+import org.apache.spark.api.java.*;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.regression.LinearRegressionModel;
+import org.apache.spark.mllib.regression.LinearRegressionWithSGD;
+import org.apache.spark.mllib.evaluation.RegressionMetrics;
+import org.apache.spark.SparkConf;
+
+public class LinearRegression {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("Linear Regression Example");
+    JavaSparkContext sc = new JavaSparkContext(conf);
+
+    // Load and parse the data
+    String path = "data/mllib/sample_linear_regression_data.txt";
+    JavaRDD<String> data = sc.textFile(path);
+    JavaRDD<LabeledPoint> parsedData = data.map(
+      new Function<String, LabeledPoint>() {
+        public LabeledPoint call(String line) {
+          String[] parts = line.split(" ");
+          double[] v = new double[parts.length - 1];
+          for (int i = 1; i < parts.length - 1; i++)
+            v[i - 1] = Double.parseDouble(parts[i].split(":")[1]);
+          return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
+        }
+      }
+    );
+    parsedData.cache();
+
+    // Building the model
+    int numIterations = 100;
+    final LinearRegressionModel model =
+      LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations);
+
+    // Evaluate model on training examples and compute training error
+    JavaRDD<Tuple2<Object, Object>> valuesAndPreds = parsedData.map(
+      new Function<LabeledPoint, Tuple2<Object, Object>>() {
+        public Tuple2<Object, Object> call(LabeledPoint point) {
+          double prediction = model.predict(point.features());
+          return new Tuple2<Object, Object>(prediction, point.label());
+        }
+      }
+    );
+
+    // Instantiate metrics object
+    RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd());
+
+    // Squared error
+    System.out.format("MSE = %f\n", metrics.meanSquaredError());
+    System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError());
+
+    // R-squared
+    System.out.format("R Squared = %f\n", metrics.r2());
+
+    // Mean absolute error
+    System.out.format("MAE = %f\n", metrics.meanAbsoluteError());
+
+    // Explained variance
+    System.out.format("Explained Variance = %f\n", metrics.explainedVariance());
+
+    // Save and load model
+    model.save(sc.sc(), "myModelPath");
+    LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath");
+  }
+}
+
+{% endhighlight %}
+
+</div>
+
+<div data-lang="python" markdown="1">
+
+{% highlight python %}
+from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
+from pyspark.mllib.evaluation import RegressionMetrics
+from pyspark.mllib.linalg import DenseVector
+
+# Load and parse the data
+def parsePoint(line):
+    values = line.split()
+    return LabeledPoint(float(values[0]), DenseVector([float(x.split(':')[1]) for x in values[1:]]))
+
+data = sc.textFile("data/mllib/sample_linear_regression_data.txt")
+parsedData = data.map(parsePoint)
+
+# Build the model
+model = LinearRegressionWithSGD.train(parsedData)
+
+# Get predictions
+valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label))
+
+# Instantiate metrics object
+metrics = RegressionMetrics(valuesAndPreds)
+
+# Squared Error
+print "MSE = %s" % metrics.meanSquaredError
+print "RMSE = %s" % metrics.rootMeanSquaredError
+
+# R-squared
+print "R-squared = %s" % metrics.r2
+
+# Mean absolute error
+print "MAE = %s" % metrics.meanAbsoluteError
+
+# Explained variance
+print "Explained variance = %s" % metrics.explainedVariance
+
+{% endhighlight %}
+
+</div>
+</div>
\ No newline at end of file
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index d2d1cc93fe006..eea864eacf7c4 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -48,6 +48,7 @@ This lists functionality included in `spark.mllib`, the main MLlib API.
 * [Feature extraction and transformation](mllib-feature-extraction.html)
 * [Frequent pattern mining](mllib-frequent-pattern-mining.html)
   * FP-growth
+* [Evaluation Metrics](mllib-evaluation-metrics.html)
 * [Optimization (developer)](mllib-optimization.html)
   * stochastic gradient descent
   * limited-memory BFGS (L-BFGS)

From a200e64561c8803731578267df16906f6773cbea Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Wed, 29 Jul 2015 19:02:15 -0700
Subject: [PATCH 455/549] [SPARK-9440] [MLLIB] Add hyperparameters to
 LocalLDAModel save/load

jkbradley MechCoder

Resolves blocking issue for SPARK-6793. Please review after #7705 is merged.

Author: Feynman Liang <fliang@databricks.com>

Closes #7757 from feynmanliang/SPARK-9940-localSaveLoad and squashes the following commits:

d0d8cf4 [Feynman Liang] Fix thisClassName
0f30109 [Feynman Liang] Fix tests after changing LDAModel public API
dc61981 [Feynman Liang] Add hyperparams to LocalLDAModel save/load
---
 .../spark/mllib/clustering/LDAModel.scala     | 40 +++++++++++++------
 .../spark/mllib/clustering/LDASuite.scala     |  6 ++-
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 059b52ef20a98..ece28848aa02c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -215,7 +215,8 @@ class LocalLDAModel private[clustering] (
   override protected def formatVersion = "1.0"
 
   override def save(sc: SparkContext, path: String): Unit = {
-    LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix)
+    LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration,
+      gammaShape)
   }
   // TODO
   // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
@@ -312,16 +313,23 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
     // as a Row in data.
     case class Data(topic: Vector, index: Int)
 
-    // TODO: explicitly save docConcentration, topicConcentration, and gammaShape for use in
-    // model.predict()
-    def save(sc: SparkContext, path: String, topicsMatrix: Matrix): Unit = {
+    def save(
+        sc: SparkContext,
+        path: String,
+        topicsMatrix: Matrix,
+        docConcentration: Vector,
+        topicConcentration: Double,
+        gammaShape: Double): Unit = {
       val sqlContext = SQLContext.getOrCreate(sc)
       import sqlContext.implicits._
 
       val k = topicsMatrix.numCols
       val metadata = compact(render
         (("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
-          ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows)))
+          ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows) ~
+          ("docConcentration" -> docConcentration.toArray.toSeq) ~
+          ("topicConcentration" -> topicConcentration) ~
+          ("gammaShape" -> gammaShape)))
       sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
 
       val topicsDenseMatrix = topicsMatrix.toBreeze.toDenseMatrix
@@ -331,7 +339,12 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
       sc.parallelize(topics, 1).toDF().write.parquet(Loader.dataPath(path))
     }
 
-    def load(sc: SparkContext, path: String): LocalLDAModel = {
+    def load(
+        sc: SparkContext,
+        path: String,
+        docConcentration: Vector,
+        topicConcentration: Double,
+        gammaShape: Double): LocalLDAModel = {
       val dataPath = Loader.dataPath(path)
       val sqlContext = SQLContext.getOrCreate(sc)
       val dataFrame = sqlContext.read.parquet(dataPath)
@@ -348,8 +361,7 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
       val topicsMat = Matrices.fromBreeze(brzTopics)
 
       // TODO: initialize with docConcentration, topicConcentration, and gammaShape after SPARK-9940
-      new LocalLDAModel(topicsMat,
-        Vectors.dense(Array.fill(topicsMat.numRows)(1.0 / topicsMat.numRows)), 1D, 100D)
+      new LocalLDAModel(topicsMat, docConcentration, topicConcentration, gammaShape)
     }
   }
 
@@ -358,11 +370,15 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
     implicit val formats = DefaultFormats
     val expectedK = (metadata \ "k").extract[Int]
     val expectedVocabSize = (metadata \ "vocabSize").extract[Int]
+    val docConcentration =
+      Vectors.dense((metadata \ "docConcentration").extract[Seq[Double]].toArray)
+    val topicConcentration = (metadata \ "topicConcentration").extract[Double]
+    val gammaShape = (metadata \ "gammaShape").extract[Double]
     val classNameV1_0 = SaveLoadV1_0.thisClassName
 
     val model = (loadedClassName, loadedVersion) match {
       case (className, "1.0") if className == classNameV1_0 =>
-        SaveLoadV1_0.load(sc, path)
+        SaveLoadV1_0.load(sc, path, docConcentration, topicConcentration, gammaShape)
       case _ => throw new Exception(
         s"LocalLDAModel.load did not recognize model with (className, format version):" +
           s"($loadedClassName, $loadedVersion).  Supported:\n" +
@@ -565,7 +581,7 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] {
 
     val thisFormatVersion = "1.0"
 
-    val classNameV1_0 = "org.apache.spark.mllib.clustering.DistributedLDAModel"
+    val thisClassName = "org.apache.spark.mllib.clustering.DistributedLDAModel"
 
     // Store globalTopicTotals as a Vector.
     case class Data(globalTopicTotals: Vector)
@@ -591,7 +607,7 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] {
       import sqlContext.implicits._
 
       val metadata = compact(render
-        (("class" -> classNameV1_0) ~ ("version" -> thisFormatVersion) ~
+        (("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
           ("k" -> k) ~ ("vocabSize" -> vocabSize) ~
           ("docConcentration" -> docConcentration.toArray.toSeq) ~
           ("topicConcentration" -> topicConcentration) ~
@@ -660,7 +676,7 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] {
     val topicConcentration = (metadata \ "topicConcentration").extract[Double]
     val iterationTimes = (metadata \ "iterationTimes").extract[Seq[Double]]
     val gammaShape = (metadata \ "gammaShape").extract[Double]
-    val classNameV1_0 = SaveLoadV1_0.classNameV1_0
+    val classNameV1_0 = SaveLoadV1_0.thisClassName
 
     val model = (loadedClassName, loadedVersion) match {
       case (className, "1.0") if className == classNameV1_0 => {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index aa36336ebbee6..b91c7cefed22e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -334,7 +334,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
   test("model save/load") {
     // Test for LocalLDAModel.
     val localModel = new LocalLDAModel(tinyTopics,
-      Vectors.dense(Array.fill(tinyTopics.numRows)(1.0 / tinyTopics.numRows)), 1D, 100D)
+      Vectors.dense(Array.fill(tinyTopics.numRows)(0.01)), 0.5D, 10D)
     val tempDir1 = Utils.createTempDir()
     val path1 = tempDir1.toURI.toString
 
@@ -360,6 +360,9 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(samelocalModel.topicsMatrix === localModel.topicsMatrix)
       assert(samelocalModel.k === localModel.k)
       assert(samelocalModel.vocabSize === localModel.vocabSize)
+      assert(samelocalModel.docConcentration === localModel.docConcentration)
+      assert(samelocalModel.topicConcentration === localModel.topicConcentration)
+      assert(samelocalModel.gammaShape === localModel.gammaShape)
 
       val sameDistributedModel = DistributedLDAModel.load(sc, path2)
       assert(distributedModel.topicsMatrix === sameDistributedModel.topicsMatrix)
@@ -368,6 +371,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(distributedModel.iterationTimes === sameDistributedModel.iterationTimes)
       assert(distributedModel.docConcentration === sameDistributedModel.docConcentration)
       assert(distributedModel.topicConcentration === sameDistributedModel.topicConcentration)
+      assert(distributedModel.gammaShape === sameDistributedModel.gammaShape)
       assert(distributedModel.globalTopicTotals === sameDistributedModel.globalTopicTotals)
 
       val graph = distributedModel.graph

From 9514d874f0cf61f1eb4ec4f5f66e053119f769c9 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 29 Jul 2015 20:46:03 -0700
Subject: [PATCH 456/549] [SPARK-9458] Avoid object allocation in prefix
 generation.

In our existing sort prefix generation code, we use expression's eval method to generate the prefix, which results in object allocation for every prefix. We can use the specialized getters available on InternalRow directly to avoid the object allocation.

I also removed the FLOAT prefix, opting for converting float directly to double.

Author: Reynold Xin <rxin@databricks.com>

Closes #7763 from rxin/sort-prefix and squashes the following commits:

5dc2f06 [Reynold Xin] [SPARK-9458] Avoid object allocation in prefix generation.
---
 .../unsafe/sort/PrefixComparators.java        | 16 ------
 .../unsafe/sort/PrefixComparatorsSuite.scala  | 12 -----
 .../execution/UnsafeExternalRowSorter.java    |  2 +-
 .../spark/sql/execution/SortPrefixUtils.scala | 51 +++++++++----------
 .../spark/sql/execution/SparkStrategies.scala |  4 +-
 .../org/apache/spark/sql/execution/sort.scala |  5 +-
 .../execution/RowFormatConvertersSuite.scala  |  2 +-
 .../execution/UnsafeExternalSortSuite.scala   | 10 ++--
 8 files changed, 35 insertions(+), 67 deletions(-)

diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
index bf1bc5dffba78..5624e067da2cc 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
@@ -31,7 +31,6 @@ private PrefixComparators() {}
 
   public static final StringPrefixComparator STRING = new StringPrefixComparator();
   public static final IntegralPrefixComparator INTEGRAL = new IntegralPrefixComparator();
-  public static final FloatPrefixComparator FLOAT = new FloatPrefixComparator();
   public static final DoublePrefixComparator DOUBLE = new DoublePrefixComparator();
 
   public static final class StringPrefixComparator extends PrefixComparator {
@@ -78,21 +77,6 @@ public int compare(long a, long b) {
     public final long NULL_PREFIX = Long.MIN_VALUE;
   }
 
-  public static final class FloatPrefixComparator extends PrefixComparator {
-    @Override
-    public int compare(long aPrefix, long bPrefix) {
-      float a = Float.intBitsToFloat((int) aPrefix);
-      float b = Float.intBitsToFloat((int) bPrefix);
-      return Utils.nanSafeCompareFloats(a, b);
-    }
-
-    public long computePrefix(float value) {
-      return Float.floatToIntBits(value) & 0xffffffffL;
-    }
-
-    public final long NULL_PREFIX = computePrefix(Float.NEGATIVE_INFINITY);
-  }
-
   public static final class DoublePrefixComparator extends PrefixComparator {
     @Override
     public int compare(long aPrefix, long bPrefix) {
diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
index dc03e374b51db..28fe9259453a6 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
@@ -48,18 +48,6 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
     forAll { (s1: String, s2: String) => testPrefixComparison(s1, s2) }
   }
 
-  test("float prefix comparator handles NaN properly") {
-    val nan1: Float = java.lang.Float.intBitsToFloat(0x7f800001)
-    val nan2: Float = java.lang.Float.intBitsToFloat(0x7fffffff)
-    assert(nan1.isNaN)
-    assert(nan2.isNaN)
-    val nan1Prefix = PrefixComparators.FLOAT.computePrefix(nan1)
-    val nan2Prefix = PrefixComparators.FLOAT.computePrefix(nan2)
-    assert(nan1Prefix === nan2Prefix)
-    val floatMaxPrefix = PrefixComparators.FLOAT.computePrefix(Float.MaxValue)
-    assert(PrefixComparators.FLOAT.compare(nan1Prefix, floatMaxPrefix) === 1)
-  }
-
   test("double prefix comparator handles NaNs properly") {
     val nan1: Double = java.lang.Double.longBitsToDouble(0x7ff0000000000001L)
     val nan2: Double = java.lang.Double.longBitsToDouble(0x7fffffffffffffffL)
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
index 4c3f2c6557140..8342833246f7d 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
@@ -121,7 +121,7 @@ Iterator<InternalRow> sort() throws IOException {
         // here in order to prevent memory leaks.
         cleanupResources();
       }
-      return new AbstractScalaRowIterator() {
+      return new AbstractScalaRowIterator<InternalRow>() {
 
         private final int numFields = schema.length();
         private UnsafeRow row = new UnsafeRow();
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
index 2dee3542d6101..050d27f1460fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
@@ -19,7 +19,7 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.SortOrder
+import org.apache.spark.sql.catalyst.expressions.{BoundReference, SortOrder}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, PrefixComparator}
@@ -39,57 +39,54 @@ object SortPrefixUtils {
     sortOrder.dataType match {
       case StringType => PrefixComparators.STRING
       case BooleanType | ByteType | ShortType | IntegerType | LongType => PrefixComparators.INTEGRAL
-      case FloatType => PrefixComparators.FLOAT
-      case DoubleType => PrefixComparators.DOUBLE
+      case FloatType | DoubleType => PrefixComparators.DOUBLE
       case _ => NoOpPrefixComparator
     }
   }
 
   def getPrefixComputer(sortOrder: SortOrder): InternalRow => Long = {
+    val bound = sortOrder.child.asInstanceOf[BoundReference]
+    val pos = bound.ordinal
     sortOrder.dataType match {
-      case StringType => (row: InternalRow) => {
-        PrefixComparators.STRING.computePrefix(sortOrder.child.eval(row).asInstanceOf[UTF8String])
-      }
+      case StringType =>
+        (row: InternalRow) => {
+          PrefixComparators.STRING.computePrefix(row.getUTF8String(pos))
+        }
       case BooleanType =>
         (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else if (sortOrder.child.eval(row).asInstanceOf[Boolean]) 1
+          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else if (row.getBoolean(pos)) 1
           else 0
         }
       case ByteType =>
         (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else sortOrder.child.eval(row).asInstanceOf[Byte]
+          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX else row.getByte(pos)
         }
       case ShortType =>
         (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else sortOrder.child.eval(row).asInstanceOf[Short]
+          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX else row.getShort(pos)
         }
       case IntegerType =>
         (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else sortOrder.child.eval(row).asInstanceOf[Int]
+          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX else row.getInt(pos)
         }
       case LongType =>
         (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else sortOrder.child.eval(row).asInstanceOf[Long]
+          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX else row.getLong(pos)
         }
       case FloatType => (row: InternalRow) => {
-        val exprVal = sortOrder.child.eval(row)
-        if (exprVal == null) PrefixComparators.FLOAT.NULL_PREFIX
-        else PrefixComparators.FLOAT.computePrefix(sortOrder.child.eval(row).asInstanceOf[Float])
+        if (row.isNullAt(pos)) {
+          PrefixComparators.DOUBLE.NULL_PREFIX
+        } else {
+          PrefixComparators.DOUBLE.computePrefix(row.getFloat(pos).toDouble)
+        }
       }
       case DoubleType => (row: InternalRow) => {
-        val exprVal = sortOrder.child.eval(row)
-        if (exprVal == null) PrefixComparators.DOUBLE.NULL_PREFIX
-        else PrefixComparators.DOUBLE.computePrefix(sortOrder.child.eval(row).asInstanceOf[Double])
+        if (row.isNullAt(pos)) {
+          PrefixComparators.DOUBLE.NULL_PREFIX
+        } else {
+          PrefixComparators.DOUBLE.computePrefix(row.getDouble(pos))
+        }
       }
       case _ => (row: InternalRow) => 0L
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index f3ef066528ff8..4ab2c41f1b339 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -340,8 +340,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
      */
     def getSortOperator(sortExprs: Seq[SortOrder], global: Boolean, child: SparkPlan): SparkPlan = {
       if (sqlContext.conf.unsafeEnabled && sqlContext.conf.codegenEnabled &&
-        UnsafeExternalSort.supportsSchema(child.schema)) {
-        execution.UnsafeExternalSort(sortExprs, global, child)
+        TungstenSort.supportsSchema(child.schema)) {
+        execution.TungstenSort(sortExprs, global, child)
       } else if (sqlContext.conf.externalSortEnabled) {
         execution.ExternalSort(sortExprs, global, child)
       } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
index f82208868c3e3..d0ad310062853 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
@@ -97,7 +97,7 @@ case class ExternalSort(
  * @param testSpillFrequency Method for configuring periodic spilling in unit tests. If set, will
  *                           spill every `frequency` records.
  */
-case class UnsafeExternalSort(
+case class TungstenSort(
     sortOrder: Seq[SortOrder],
     global: Boolean,
     child: SparkPlan,
@@ -110,7 +110,6 @@ case class UnsafeExternalSort(
     if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
 
   protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
-    assert(codegenEnabled, "UnsafeExternalSort requires code generation to be enabled")
     def doSort(iterator: Iterator[InternalRow]): Iterator[InternalRow] = {
       val ordering = newOrdering(sortOrder, child.output)
       val boundSortExpression = BindReferences.bindReference(sortOrder.head, child.output)
@@ -149,7 +148,7 @@ case class UnsafeExternalSort(
 }
 
 @DeveloperApi
-object UnsafeExternalSort {
+object TungstenSort {
   /**
    * Return true if UnsafeExternalSort can sort rows with the given schema, false otherwise.
    */
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
index 7b75f755918c1..c458f95ca1ab3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
@@ -31,7 +31,7 @@ class RowFormatConvertersSuite extends SparkPlanTest {
 
   private val outputsSafe = ExternalSort(Nil, false, PhysicalRDD(Seq.empty, null))
   assert(!outputsSafe.outputsUnsafeRows)
-  private val outputsUnsafe = UnsafeExternalSort(Nil, false, PhysicalRDD(Seq.empty, null))
+  private val outputsUnsafe = TungstenSort(Nil, false, PhysicalRDD(Seq.empty, null))
   assert(outputsUnsafe.outputsUnsafeRows)
 
   test("planner should insert unsafe->safe conversions when required") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
index 7a4baa9e4a49d..9cabc4b90bf8e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
@@ -42,7 +42,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "false")
     checkThatPlansAgree(
       (1 to 100).map(v => Tuple1(v)).toDF("a"),
-      (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
+      (child: SparkPlan) => Limit(10, TungstenSort('a.asc :: Nil, true, child)),
       (child: SparkPlan) => Limit(10, Sort('a.asc :: Nil, global = true, child)),
       sortAnswers = false
     )
@@ -53,7 +53,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     try {
       checkThatPlansAgree(
         (1 to 100).map(v => Tuple1(v)).toDF("a"),
-        (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
+        (child: SparkPlan) => Limit(10, TungstenSort('a.asc :: Nil, true, child)),
         (child: SparkPlan) => Limit(10, Sort('a.asc :: Nil, global = true, child)),
         sortAnswers = false
       )
@@ -68,7 +68,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     val stringLength = 1024 * 1024 * 2
     checkThatPlansAgree(
       Seq(Tuple1("a" * stringLength), Tuple1("b" * stringLength)).toDF("a").repartition(1),
-      UnsafeExternalSort(sortOrder, global = true, _: SparkPlan, testSpillFrequency = 1),
+      TungstenSort(sortOrder, global = true, _: SparkPlan, testSpillFrequency = 1),
       Sort(sortOrder, global = true, _: SparkPlan),
       sortAnswers = false
     )
@@ -88,11 +88,11 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
         TestSQLContext.sparkContext.parallelize(Random.shuffle(inputData).map(v => Row(v))),
         StructType(StructField("a", dataType, nullable = true) :: Nil)
       )
-      assert(UnsafeExternalSort.supportsSchema(inputDf.schema))
+      assert(TungstenSort.supportsSchema(inputDf.schema))
       checkThatPlansAgree(
         inputDf,
         plan => ConvertToSafe(
-          UnsafeExternalSort(sortOrder, global = true, plan: SparkPlan, testSpillFrequency = 23)),
+          TungstenSort(sortOrder, global = true, plan: SparkPlan, testSpillFrequency = 23)),
         Sort(sortOrder, global = true, _: SparkPlan),
         sortAnswers = false
       )

From 07fd7d36471dfb823c1ce3e3a18464043affde18 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 29 Jul 2015 21:18:43 -0700
Subject: [PATCH 457/549] [SPARK-9460] Avoid byte array allocation in
 StringPrefixComparator.

As of today, StringPrefixComparator converts the long values back to byte arrays in order to compare them. This patch optimizes this to compare the longs directly, rather than turning the longs into byte arrays and comparing them byte by byte (unsigned).

This only works on little-endian architecture right now.

Author: Reynold Xin <rxin@databricks.com>

Closes #7765 from rxin/SPARK-9460 and squashes the following commits:

e4908cc [Reynold Xin] Stricter randomized tests.
4c8d094 [Reynold Xin] [SPARK-9460] Avoid byte array allocation in StringPrefixComparator.
---
 .../unsafe/sort/PrefixComparators.java        | 29 ++-----------------
 .../unsafe/sort/PrefixComparatorsSuite.scala  | 19 ++++++++----
 .../apache/spark/unsafe/types/UTF8String.java |  9 ++++++
 .../spark/unsafe/types/UTF8StringSuite.java   | 11 +++++++
 4 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
index 5624e067da2cc..a9ee6042fec74 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
@@ -17,9 +17,7 @@
 
 package org.apache.spark.util.collection.unsafe.sort;
 
-import com.google.common.base.Charsets;
-import com.google.common.primitives.Longs;
-import com.google.common.primitives.UnsignedBytes;
+import com.google.common.primitives.UnsignedLongs;
 
 import org.apache.spark.annotation.Private;
 import org.apache.spark.unsafe.types.UTF8String;
@@ -36,32 +34,11 @@ private PrefixComparators() {}
   public static final class StringPrefixComparator extends PrefixComparator {
     @Override
     public int compare(long aPrefix, long bPrefix) {
-      // TODO: can done more efficiently
-      byte[] a = Longs.toByteArray(aPrefix);
-      byte[] b = Longs.toByteArray(bPrefix);
-      for (int i = 0; i < 8; i++) {
-        int c = UnsignedBytes.compare(a[i], b[i]);
-        if (c != 0) return c;
-      }
-      return 0;
-    }
-
-    public long computePrefix(byte[] bytes) {
-      if (bytes == null) {
-        return 0L;
-      } else {
-        byte[] padded = new byte[8];
-        System.arraycopy(bytes, 0, padded, 0, Math.min(bytes.length, 8));
-        return Longs.fromByteArray(padded);
-      }
-    }
-
-    public long computePrefix(String value) {
-      return value == null ? 0L : computePrefix(value.getBytes(Charsets.UTF_8));
+      return UnsignedLongs.compare(aPrefix, bPrefix);
     }
 
     public long computePrefix(UTF8String value) {
-      return value == null ? 0L : computePrefix(value.getBytes());
+      return value == null ? 0L : value.getPrefix();
     }
   }
 
diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
index 28fe9259453a6..26b7a9e816d1e 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
@@ -17,22 +17,29 @@
 
 package org.apache.spark.util.collection.unsafe.sort
 
+import com.google.common.primitives.UnsignedBytes
 import org.scalatest.prop.PropertyChecks
-
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.unsafe.types.UTF8String
 
 class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
 
   test("String prefix comparator") {
 
     def testPrefixComparison(s1: String, s2: String): Unit = {
-      val s1Prefix = PrefixComparators.STRING.computePrefix(s1)
-      val s2Prefix = PrefixComparators.STRING.computePrefix(s2)
+      val utf8string1 = UTF8String.fromString(s1)
+      val utf8string2 = UTF8String.fromString(s2)
+      val s1Prefix = PrefixComparators.STRING.computePrefix(utf8string1)
+      val s2Prefix = PrefixComparators.STRING.computePrefix(utf8string2)
       val prefixComparisonResult = PrefixComparators.STRING.compare(s1Prefix, s2Prefix)
+
+      val cmp = UnsignedBytes.lexicographicalComparator().compare(
+        utf8string1.getBytes.take(8), utf8string2.getBytes.take(8))
+
       assert(
-        (prefixComparisonResult == 0) ||
-        (prefixComparisonResult < 0 && s1 < s2) ||
-        (prefixComparisonResult > 0 && s1 > s2))
+        (prefixComparisonResult == 0 && cmp == 0) ||
+        (prefixComparisonResult < 0 && s1.compareTo(s2) < 0) ||
+        (prefixComparisonResult > 0 && s1.compareTo(s2) > 0))
     }
 
     // scalastyle:off
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 3e1cc67dbf337..57522003ba2ba 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -137,6 +137,15 @@ public int numChars() {
     return len;
   }
 
+  /**
+   * Returns a 64-bit integer that can be used as the prefix used in sorting.
+   */
+  public long getPrefix() {
+    long p = PlatformDependent.UNSAFE.getLong(base, offset);
+    p = java.lang.Long.reverseBytes(p);
+    return p;
+  }
+
   /**
    * Returns the underline bytes, will be a copy of it if it's part of another array.
    */
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index e2a5628ff4d93..42e09e435a412 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -63,8 +63,19 @@ public void emptyStringTest() {
     assertEquals(0, EMPTY_UTF8.numBytes());
   }
 
+  @Test
+  public void prefix() {
+    assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
+    assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
+    assertTrue(
+      fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
+    assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
+    assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
+  }
+
   @Test
   public void compareTo() {
+    assertTrue(fromString("").compareTo(fromString("a")) < 0);
     assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
     assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
     assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);

From 27850af5255352cebd933ed3cc3d82c9ff6e9b62 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 29 Jul 2015 21:24:47 -0700
Subject: [PATCH 458/549] [SPARK-9462][SQL] Initialize nondeterministic
 expressions in code gen fallback mode.

Author: Reynold Xin <rxin@databricks.com>

Closes #7767 from rxin/SPARK-9462 and squashes the following commits:

ef3e2d9 [Reynold Xin] Removed println
713ac3a [Reynold Xin] More unit tests.
bb5c334 [Reynold Xin] [SPARK-9462][SQL] Initialize nondeterministic expressions in code gen fallback mode.
---
 .../expressions/codegen/CodegenFallback.scala |  7 ++-
 .../CodegenExpressionCachingSuite.scala       | 46 +++++++++++++++++--
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala
index 6b187f05604fd..3492d2c6189ed 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions.codegen
 
-import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.expressions.{Nondeterministic, Expression}
 
 /**
  * A trait that can be used to provide a fallback mode for expression code generation.
@@ -25,6 +25,11 @@ import org.apache.spark.sql.catalyst.expressions.Expression
 trait CodegenFallback extends Expression {
 
   protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    foreach {
+      case n: Nondeterministic => n.setInitialValues()
+      case _ =>
+    }
+
     ctx.references += this
     val objectTerm = ctx.freshName("obj")
     s"""
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala
index 866bf904e4a4c..2d3f98dbbd3d1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions.codegen
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, LeafExpression}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types.{BooleanType, DataType}
 
 /**
@@ -27,7 +27,32 @@ import org.apache.spark.sql.types.{BooleanType, DataType}
  */
 class CodegenExpressionCachingSuite extends SparkFunSuite {
 
-  test("GenerateUnsafeProjection") {
+  test("GenerateUnsafeProjection should initialize expressions") {
+    // Use an Add to wrap two of them together in case we only initialize the top level expressions.
+    val expr = And(NondeterministicExpression(), NondeterministicExpression())
+    val instance = UnsafeProjection.create(Seq(expr))
+    assert(instance.apply(null).getBoolean(0) === false)
+  }
+
+  test("GenerateProjection should initialize expressions") {
+    val expr = And(NondeterministicExpression(), NondeterministicExpression())
+    val instance = GenerateProjection.generate(Seq(expr))
+    assert(instance.apply(null).getBoolean(0) === false)
+  }
+
+  test("GenerateMutableProjection should initialize expressions") {
+    val expr = And(NondeterministicExpression(), NondeterministicExpression())
+    val instance = GenerateMutableProjection.generate(Seq(expr))()
+    assert(instance.apply(null).getBoolean(0) === false)
+  }
+
+  test("GeneratePredicate should initialize expressions") {
+    val expr = And(NondeterministicExpression(), NondeterministicExpression())
+    val instance = GeneratePredicate.generate(expr)
+    assert(instance.apply(null) === false)
+  }
+
+  test("GenerateUnsafeProjection should not share expression instances") {
     val expr1 = MutableExpression()
     val instance1 = UnsafeProjection.create(Seq(expr1))
     assert(instance1.apply(null).getBoolean(0) === false)
@@ -39,7 +64,7 @@ class CodegenExpressionCachingSuite extends SparkFunSuite {
     assert(instance2.apply(null).getBoolean(0) === true)
   }
 
-  test("GenerateProjection") {
+  test("GenerateProjection should not share expression instances") {
     val expr1 = MutableExpression()
     val instance1 = GenerateProjection.generate(Seq(expr1))
     assert(instance1.apply(null).getBoolean(0) === false)
@@ -51,7 +76,7 @@ class CodegenExpressionCachingSuite extends SparkFunSuite {
     assert(instance2.apply(null).getBoolean(0) === true)
   }
 
-  test("GenerateMutableProjection") {
+  test("GenerateMutableProjection should not share expression instances") {
     val expr1 = MutableExpression()
     val instance1 = GenerateMutableProjection.generate(Seq(expr1))()
     assert(instance1.apply(null).getBoolean(0) === false)
@@ -63,7 +88,7 @@ class CodegenExpressionCachingSuite extends SparkFunSuite {
     assert(instance2.apply(null).getBoolean(0) === true)
   }
 
-  test("GeneratePredicate") {
+  test("GeneratePredicate should not share expression instances") {
     val expr1 = MutableExpression()
     val instance1 = GeneratePredicate.generate(expr1)
     assert(instance1.apply(null) === false)
@@ -77,6 +102,17 @@ class CodegenExpressionCachingSuite extends SparkFunSuite {
 
 }
 
+/**
+ * An expression that's non-deterministic and doesn't support codegen.
+ */
+case class NondeterministicExpression()
+  extends LeafExpression with Nondeterministic with CodegenFallback {
+  override protected def initInternal(): Unit = { }
+  override protected def evalInternal(input: InternalRow): Any = false
+  override def nullable: Boolean = false
+  override def dataType: DataType = BooleanType
+}
+
 
 /**
  * An expression with mutable state so we can change it freely in our test suite.

From f5dd11339fc9a6d11350f63beeca7c14aec169b1 Mon Sep 17 00:00:00 2001
From: Alex Angelini <alex.louis.angelini@gmail.com>
Date: Wed, 29 Jul 2015 22:25:38 -0700
Subject: [PATCH 459/549] Fix reference to self.names in StructType

`names` is not defined in this context, I think you meant `self.names`.

davies

Author: Alex Angelini <alex.louis.angelini@gmail.com>

Closes #7766 from angelini/fix_struct_type_names and squashes the following commits:

01543a1 [Alex Angelini] Fix reference to self.names in StructType
---
 python/pyspark/sql/types.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index b97d50c945f24..8859308d66027 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -531,7 +531,7 @@ def toInternal(self, obj):
 
         if self._needSerializeFields:
             if isinstance(obj, dict):
-                return tuple(f.toInternal(obj.get(n)) for n, f in zip(names, self.fields))
+                return tuple(f.toInternal(obj.get(n)) for n, f in zip(self.names, self.fields))
             elif isinstance(obj, (tuple, list)):
                 return tuple(f.toInternal(v) for f, v in zip(self.fields, obj))
             else:

From e044705b4402f86d0557ecd146f3565388c7eeb4 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Wed, 29 Jul 2015 22:30:49 -0700
Subject: [PATCH 460/549] [SPARK-9116] [SQL] [PYSPARK] support Python only UDT
 in __main__

Also we could create a Python UDT without having a Scala one, it's important for Python users.

cc mengxr JoshRosen

Author: Davies Liu <davies@databricks.com>

Closes #7453 from davies/class_in_main and squashes the following commits:

4dfd5e1 [Davies Liu] add tests for Python and Scala UDT
793d9b2 [Davies Liu] Merge branch 'master' of github.com:apache/spark into class_in_main
dc65f19 [Davies Liu] address comment
a9a3c40 [Davies Liu] Merge branch 'master' of github.com:apache/spark into class_in_main
a86e1fc [Davies Liu] fix serialization
ad528ba [Davies Liu] Merge branch 'master' of github.com:apache/spark into class_in_main
63f52ef [Davies Liu] fix pylint check
655b8a9 [Davies Liu] Merge branch 'master' of github.com:apache/spark into class_in_main
316a394 [Davies Liu] support Python UDT with UTF
0bcb3ef [Davies Liu] fix bug in mllib
de986d6 [Davies Liu] fix test
83d65ac [Davies Liu] fix bug in StructType
55bb86e [Davies Liu] support Python UDT in __main__ (without Scala one)
---
 pylintrc                                      |   2 +-
 python/pyspark/cloudpickle.py                 |  38 +++++-
 python/pyspark/shuffle.py                     |   2 +-
 python/pyspark/sql/context.py                 | 108 ++++++++++-------
 python/pyspark/sql/tests.py                   | 112 ++++++++++++++++--
 python/pyspark/sql/types.py                   |  78 ++++++------
 .../org/apache/spark/sql/types/DataType.scala |   9 ++
 .../spark/sql/types/UserDefinedType.scala     |  29 +++++
 .../spark/sql/execution/pythonUDFs.scala      |   1 -
 9 files changed, 286 insertions(+), 93 deletions(-)

diff --git a/pylintrc b/pylintrc
index 061775960393b..6a675770da69a 100644
--- a/pylintrc
+++ b/pylintrc
@@ -84,7 +84,7 @@ enable=
 # If you would like to improve the code quality of pyspark, remove any of these disabled errors
 # run ./dev/lint-python and see if the errors raised by pylint can be fixed.
 
-disable=invalid-name,missing-docstring,protected-access,unused-argument,no-member,unused-wildcard-import,redefined-builtin,too-many-arguments,unused-variable,too-few-public-methods,bad-continuation,duplicate-code,redefined-outer-name,too-many-ancestors,import-error,superfluous-parens,unused-import,line-too-long,no-name-in-module,unnecessary-lambda,import-self,no-self-use,unidiomatic-typecheck,fixme,too-many-locals,cyclic-import,too-many-branches,bare-except,wildcard-import,dangerous-default-value,broad-except,too-many-public-methods,deprecated-lambda,anomalous-backslash-in-string,too-many-lines,reimported,too-many-statements,bad-whitespace,unpacking-non-sequence,too-many-instance-attributes,abstract-method,old-style-class,global-statement,attribute-defined-outside-init,arguments-differ,undefined-all-variable,no-init,useless-else-on-loop,super-init-not-called,notimplemented-raised,too-many-return-statements,pointless-string-statement,global-variable-undefined,bad-classmethod-argument,too-many-format-args,parse-error,no-self-argument,pointless-statement,undefined-variable
+disable=invalid-name,missing-docstring,protected-access,unused-argument,no-member,unused-wildcard-import,redefined-builtin,too-many-arguments,unused-variable,too-few-public-methods,bad-continuation,duplicate-code,redefined-outer-name,too-many-ancestors,import-error,superfluous-parens,unused-import,line-too-long,no-name-in-module,unnecessary-lambda,import-self,no-self-use,unidiomatic-typecheck,fixme,too-many-locals,cyclic-import,too-many-branches,bare-except,wildcard-import,dangerous-default-value,broad-except,too-many-public-methods,deprecated-lambda,anomalous-backslash-in-string,too-many-lines,reimported,too-many-statements,bad-whitespace,unpacking-non-sequence,too-many-instance-attributes,abstract-method,old-style-class,global-statement,attribute-defined-outside-init,arguments-differ,undefined-all-variable,no-init,useless-else-on-loop,super-init-not-called,notimplemented-raised,too-many-return-statements,pointless-string-statement,global-variable-undefined,bad-classmethod-argument,too-many-format-args,parse-error,no-self-argument,pointless-statement,undefined-variable,undefined-loop-variable
 
 
 [REPORTS]
diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py
index 9ef93071d2e77..3b647985801b7 100644
--- a/python/pyspark/cloudpickle.py
+++ b/python/pyspark/cloudpickle.py
@@ -350,7 +350,26 @@ def save_global(self, obj, name=None, pack=struct.pack):
             if new_override:
                 d['__new__'] = obj.__new__
 
-            self.save_reduce(typ, (obj.__name__, obj.__bases__, d), obj=obj)
+            self.save(_load_class)
+            self.save_reduce(typ, (obj.__name__, obj.__bases__, {"__doc__": obj.__doc__}), obj=obj)
+            d.pop('__doc__', None)
+            # handle property and staticmethod
+            dd = {}
+            for k, v in d.items():
+                if isinstance(v, property):
+                    k = ('property', k)
+                    v = (v.fget, v.fset, v.fdel, v.__doc__)
+                elif isinstance(v, staticmethod) and hasattr(v, '__func__'):
+                    k = ('staticmethod', k)
+                    v = v.__func__
+                elif isinstance(v, classmethod) and hasattr(v, '__func__'):
+                    k = ('classmethod', k)
+                    v = v.__func__
+                dd[k] = v
+            self.save(dd)
+            self.write(pickle.TUPLE2)
+            self.write(pickle.REDUCE)
+
         else:
             raise pickle.PicklingError("Can't pickle %r" % obj)
 
@@ -708,6 +727,23 @@ def _make_skel_func(code, closures, base_globals = None):
                               None, None, closure)
 
 
+def _load_class(cls, d):
+    """
+    Loads additional properties into class `cls`.
+    """
+    for k, v in d.items():
+        if isinstance(k, tuple):
+            typ, k = k
+            if typ == 'property':
+                v = property(*v)
+            elif typ == 'staticmethod':
+                v = staticmethod(v)
+            elif typ == 'classmethod':
+                v = classmethod(v)
+        setattr(cls, k, v)
+    return cls
+
+
 """Constructors for 3rd party libraries
 Note: These can never be renamed due to client compatibility issues"""
 
diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py
index 8fb71bac64a5e..b8118bdb7ca76 100644
--- a/python/pyspark/shuffle.py
+++ b/python/pyspark/shuffle.py
@@ -606,7 +606,7 @@ def _open_file(self):
         if not os.path.exists(d):
             os.makedirs(d)
         p = os.path.join(d, str(id(self)))
-        self._file = open(p, "wb+", 65536)
+        self._file = open(p, "w+b", 65536)
         self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024)
         os.unlink(p)
 
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index abb6522dde7b0..917de24f3536b 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -277,6 +277,66 @@ def applySchema(self, rdd, schema):
 
         return self.createDataFrame(rdd, schema)
 
+    def _createFromRDD(self, rdd, schema, samplingRatio):
+        """
+        Create an RDD for DataFrame from an existing RDD, returns the RDD and schema.
+        """
+        if schema is None or isinstance(schema, (list, tuple)):
+            struct = self._inferSchema(rdd, samplingRatio)
+            converter = _create_converter(struct)
+            rdd = rdd.map(converter)
+            if isinstance(schema, (list, tuple)):
+                for i, name in enumerate(schema):
+                    struct.fields[i].name = name
+                    struct.names[i] = name
+            schema = struct
+
+        elif isinstance(schema, StructType):
+            # take the first few rows to verify schema
+            rows = rdd.take(10)
+            for row in rows:
+                _verify_type(row, schema)
+
+        else:
+            raise TypeError("schema should be StructType or list or None, but got: %s" % schema)
+
+        # convert python objects to sql data
+        rdd = rdd.map(schema.toInternal)
+        return rdd, schema
+
+    def _createFromLocal(self, data, schema):
+        """
+        Create an RDD for DataFrame from an list or pandas.DataFrame, returns
+        the RDD and schema.
+        """
+        if has_pandas and isinstance(data, pandas.DataFrame):
+            if schema is None:
+                schema = [str(x) for x in data.columns]
+            data = [r.tolist() for r in data.to_records(index=False)]
+
+        # make sure data could consumed multiple times
+        if not isinstance(data, list):
+            data = list(data)
+
+        if schema is None or isinstance(schema, (list, tuple)):
+            struct = self._inferSchemaFromList(data)
+            if isinstance(schema, (list, tuple)):
+                for i, name in enumerate(schema):
+                    struct.fields[i].name = name
+                    struct.names[i] = name
+            schema = struct
+
+        elif isinstance(schema, StructType):
+            for row in data:
+                _verify_type(row, schema)
+
+        else:
+            raise TypeError("schema should be StructType or list or None, but got: %s" % schema)
+
+        # convert python objects to sql data
+        data = [schema.toInternal(row) for row in data]
+        return self._sc.parallelize(data), schema
+
     @since(1.3)
     @ignore_unicode_prefix
     def createDataFrame(self, data, schema=None, samplingRatio=None):
@@ -340,49 +400,15 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
         if isinstance(data, DataFrame):
             raise TypeError("data is already a DataFrame")
 
-        if has_pandas and isinstance(data, pandas.DataFrame):
-            if schema is None:
-                schema = [str(x) for x in data.columns]
-            data = [r.tolist() for r in data.to_records(index=False)]
-
-        if not isinstance(data, RDD):
-            if not isinstance(data, list):
-                data = list(data)
-            try:
-                # data could be list, tuple, generator ...
-                rdd = self._sc.parallelize(data)
-            except Exception:
-                raise TypeError("cannot create an RDD from type: %s" % type(data))
+        if isinstance(data, RDD):
+            rdd, schema = self._createFromRDD(data, schema, samplingRatio)
         else:
-            rdd = data
-
-        if schema is None or isinstance(schema, (list, tuple)):
-            if isinstance(data, RDD):
-                struct = self._inferSchema(rdd, samplingRatio)
-            else:
-                struct = self._inferSchemaFromList(data)
-            if isinstance(schema, (list, tuple)):
-                for i, name in enumerate(schema):
-                    struct.fields[i].name = name
-            schema = struct
-            converter = _create_converter(schema)
-            rdd = rdd.map(converter)
-
-        elif isinstance(schema, StructType):
-            # take the first few rows to verify schema
-            rows = rdd.take(10)
-            for row in rows:
-                _verify_type(row, schema)
-
-        else:
-            raise TypeError("schema should be StructType or list or None")
-
-        # convert python objects to sql data
-        rdd = rdd.map(schema.toInternal)
-
+            rdd, schema = self._createFromLocal(data, schema)
         jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
-        df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
-        return DataFrame(df, self)
+        jdf = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
+        df = DataFrame(jdf, self)
+        df._schema = schema
+        return df
 
     @since(1.3)
     def registerDataFrameAsTable(self, df, tableName):
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 5aa6135dc1ee7..ebd3ea8db6a43 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -75,7 +75,7 @@ def sqlType(self):
 
     @classmethod
     def module(cls):
-        return 'pyspark.tests'
+        return 'pyspark.sql.tests'
 
     @classmethod
     def scalaUDT(cls):
@@ -106,10 +106,45 @@ def __str__(self):
         return "(%s,%s)" % (self.x, self.y)
 
     def __eq__(self, other):
-        return isinstance(other, ExamplePoint) and \
+        return isinstance(other, self.__class__) and \
             other.x == self.x and other.y == self.y
 
 
+class PythonOnlyUDT(UserDefinedType):
+    """
+    User-defined type (UDT) for ExamplePoint.
+    """
+
+    @classmethod
+    def sqlType(self):
+        return ArrayType(DoubleType(), False)
+
+    @classmethod
+    def module(cls):
+        return '__main__'
+
+    def serialize(self, obj):
+        return [obj.x, obj.y]
+
+    def deserialize(self, datum):
+        return PythonOnlyPoint(datum[0], datum[1])
+
+    @staticmethod
+    def foo():
+        pass
+
+    @property
+    def props(self):
+        return {}
+
+
+class PythonOnlyPoint(ExamplePoint):
+    """
+    An example class to demonstrate UDT in only Python
+    """
+    __UDT__ = PythonOnlyUDT()
+
+
 class DataTypeTests(unittest.TestCase):
     # regression test for SPARK-6055
     def test_data_type_eq(self):
@@ -395,10 +430,39 @@ def test_convert_row_to_dict(self):
         self.assertEqual(1, row.asDict()["l"][0].a)
         self.assertEqual(1.0, row.asDict()['d']['key'].c)
 
+    def test_udt(self):
+        from pyspark.sql.types import _parse_datatype_json_string, _infer_type, _verify_type
+        from pyspark.sql.tests import ExamplePointUDT, ExamplePoint
+
+        def check_datatype(datatype):
+            pickled = pickle.loads(pickle.dumps(datatype))
+            assert datatype == pickled
+            scala_datatype = self.sqlCtx._ssql_ctx.parseDataType(datatype.json())
+            python_datatype = _parse_datatype_json_string(scala_datatype.json())
+            assert datatype == python_datatype
+
+        check_datatype(ExamplePointUDT())
+        structtype_with_udt = StructType([StructField("label", DoubleType(), False),
+                                          StructField("point", ExamplePointUDT(), False)])
+        check_datatype(structtype_with_udt)
+        p = ExamplePoint(1.0, 2.0)
+        self.assertEqual(_infer_type(p), ExamplePointUDT())
+        _verify_type(ExamplePoint(1.0, 2.0), ExamplePointUDT())
+        self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], ExamplePointUDT()))
+
+        check_datatype(PythonOnlyUDT())
+        structtype_with_udt = StructType([StructField("label", DoubleType(), False),
+                                          StructField("point", PythonOnlyUDT(), False)])
+        check_datatype(structtype_with_udt)
+        p = PythonOnlyPoint(1.0, 2.0)
+        self.assertEqual(_infer_type(p), PythonOnlyUDT())
+        _verify_type(PythonOnlyPoint(1.0, 2.0), PythonOnlyUDT())
+        self.assertRaises(ValueError, lambda: _verify_type([1.0, 2.0], PythonOnlyUDT()))
+
     def test_infer_schema_with_udt(self):
         from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
         row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
-        df = self.sc.parallelize([row]).toDF()
+        df = self.sqlCtx.createDataFrame([row])
         schema = df.schema
         field = [f for f in schema.fields if f.name == "point"][0]
         self.assertEqual(type(field.dataType), ExamplePointUDT)
@@ -406,36 +470,66 @@ def test_infer_schema_with_udt(self):
         point = self.sqlCtx.sql("SELECT point FROM labeled_point").head().point
         self.assertEqual(point, ExamplePoint(1.0, 2.0))
 
+        row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0))
+        df = self.sqlCtx.createDataFrame([row])
+        schema = df.schema
+        field = [f for f in schema.fields if f.name == "point"][0]
+        self.assertEqual(type(field.dataType), PythonOnlyUDT)
+        df.registerTempTable("labeled_point")
+        point = self.sqlCtx.sql("SELECT point FROM labeled_point").head().point
+        self.assertEqual(point, PythonOnlyPoint(1.0, 2.0))
+
     def test_apply_schema_with_udt(self):
         from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
         row = (1.0, ExamplePoint(1.0, 2.0))
-        rdd = self.sc.parallelize([row])
         schema = StructType([StructField("label", DoubleType(), False),
                              StructField("point", ExamplePointUDT(), False)])
-        df = rdd.toDF(schema)
+        df = self.sqlCtx.createDataFrame([row], schema)
         point = df.head().point
         self.assertEquals(point, ExamplePoint(1.0, 2.0))
 
+        row = (1.0, PythonOnlyPoint(1.0, 2.0))
+        schema = StructType([StructField("label", DoubleType(), False),
+                             StructField("point", PythonOnlyUDT(), False)])
+        df = self.sqlCtx.createDataFrame([row], schema)
+        point = df.head().point
+        self.assertEquals(point, PythonOnlyPoint(1.0, 2.0))
+
     def test_udf_with_udt(self):
         from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
         row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
-        df = self.sc.parallelize([row]).toDF()
+        df = self.sqlCtx.createDataFrame([row])
         self.assertEqual(1.0, df.map(lambda r: r.point.x).first())
         udf = UserDefinedFunction(lambda p: p.y, DoubleType())
         self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
         udf2 = UserDefinedFunction(lambda p: ExamplePoint(p.x + 1, p.y + 1), ExamplePointUDT())
         self.assertEqual(ExamplePoint(2.0, 3.0), df.select(udf2(df.point)).first()[0])
 
+        row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0))
+        df = self.sqlCtx.createDataFrame([row])
+        self.assertEqual(1.0, df.map(lambda r: r.point.x).first())
+        udf = UserDefinedFunction(lambda p: p.y, DoubleType())
+        self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
+        udf2 = UserDefinedFunction(lambda p: PythonOnlyPoint(p.x + 1, p.y + 1), PythonOnlyUDT())
+        self.assertEqual(PythonOnlyPoint(2.0, 3.0), df.select(udf2(df.point)).first()[0])
+
     def test_parquet_with_udt(self):
-        from pyspark.sql.tests import ExamplePoint
+        from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
         row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
-        df0 = self.sc.parallelize([row]).toDF()
+        df0 = self.sqlCtx.createDataFrame([row])
         output_dir = os.path.join(self.tempdir.name, "labeled_point")
-        df0.saveAsParquetFile(output_dir)
+        df0.write.parquet(output_dir)
         df1 = self.sqlCtx.parquetFile(output_dir)
         point = df1.head().point
         self.assertEquals(point, ExamplePoint(1.0, 2.0))
 
+        row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0))
+        df0 = self.sqlCtx.createDataFrame([row])
+        df0.write.parquet(output_dir, mode='overwrite')
+        df1 = self.sqlCtx.parquetFile(output_dir)
+        point = df1.head().point
+        self.assertEquals(point, PythonOnlyPoint(1.0, 2.0))
+
     def test_column_operators(self):
         ci = self.df.key
         cs = self.df.value
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 8859308d66027..0976aea72c034 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -22,6 +22,7 @@
 import calendar
 import json
 import re
+import base64
 from array import array
 
 if sys.version >= "3":
@@ -31,6 +32,8 @@
 from py4j.protocol import register_input_converter
 from py4j.java_gateway import JavaClass
 
+from pyspark.serializers import CloudPickleSerializer
+
 __all__ = [
     "DataType", "NullType", "StringType", "BinaryType", "BooleanType", "DateType",
     "TimestampType", "DecimalType", "DoubleType", "FloatType", "ByteType", "IntegerType",
@@ -458,7 +461,7 @@ def __init__(self, fields=None):
             self.names = [f.name for f in fields]
             assert all(isinstance(f, StructField) for f in fields),\
                 "fields should be a list of StructField"
-        self._needSerializeFields = None
+        self._needSerializeAnyField = any(f.needConversion() for f in self.fields)
 
     def add(self, field, data_type=None, nullable=True, metadata=None):
         """
@@ -501,6 +504,7 @@ def add(self, field, data_type=None, nullable=True, metadata=None):
                 data_type_f = data_type
             self.fields.append(StructField(field, data_type_f, nullable, metadata))
             self.names.append(field)
+        self._needSerializeAnyField = any(f.needConversion() for f in self.fields)
         return self
 
     def simpleString(self):
@@ -526,10 +530,7 @@ def toInternal(self, obj):
         if obj is None:
             return
 
-        if self._needSerializeFields is None:
-            self._needSerializeFields = any(f.needConversion() for f in self.fields)
-
-        if self._needSerializeFields:
+        if self._needSerializeAnyField:
             if isinstance(obj, dict):
                 return tuple(f.toInternal(obj.get(n)) for n, f in zip(self.names, self.fields))
             elif isinstance(obj, (tuple, list)):
@@ -550,7 +551,10 @@ def fromInternal(self, obj):
         if isinstance(obj, Row):
             # it's already converted by pickler
             return obj
-        values = [f.dataType.fromInternal(v) for f, v in zip(self.fields, obj)]
+        if self._needSerializeAnyField:
+            values = [f.fromInternal(v) for f, v in zip(self.fields, obj)]
+        else:
+            values = obj
         return _create_row(self.names, values)
 
 
@@ -581,9 +585,10 @@ def module(cls):
     @classmethod
     def scalaUDT(cls):
         """
-        The class name of the paired Scala UDT.
+        The class name of the paired Scala UDT (could be '', if there
+        is no corresponding one).
         """
-        raise NotImplementedError("UDT must have a paired Scala UDT.")
+        return ''
 
     def needConversion(self):
         return True
@@ -622,22 +627,37 @@ def json(self):
         return json.dumps(self.jsonValue(), separators=(',', ':'), sort_keys=True)
 
     def jsonValue(self):
-        schema = {
-            "type": "udt",
-            "class": self.scalaUDT(),
-            "pyClass": "%s.%s" % (self.module(), type(self).__name__),
-            "sqlType": self.sqlType().jsonValue()
-        }
+        if self.scalaUDT():
+            assert self.module() != '__main__', 'UDT in __main__ cannot work with ScalaUDT'
+            schema = {
+                "type": "udt",
+                "class": self.scalaUDT(),
+                "pyClass": "%s.%s" % (self.module(), type(self).__name__),
+                "sqlType": self.sqlType().jsonValue()
+            }
+        else:
+            ser = CloudPickleSerializer()
+            b = ser.dumps(type(self))
+            schema = {
+                "type": "udt",
+                "pyClass": "%s.%s" % (self.module(), type(self).__name__),
+                "serializedClass": base64.b64encode(b).decode('utf8'),
+                "sqlType": self.sqlType().jsonValue()
+            }
         return schema
 
     @classmethod
     def fromJson(cls, json):
-        pyUDT = json["pyClass"]
+        pyUDT = str(json["pyClass"])
         split = pyUDT.rfind(".")
         pyModule = pyUDT[:split]
         pyClass = pyUDT[split+1:]
         m = __import__(pyModule, globals(), locals(), [pyClass])
-        UDT = getattr(m, pyClass)
+        if not hasattr(m, pyClass):
+            s = base64.b64decode(json['serializedClass'].encode('utf-8'))
+            UDT = CloudPickleSerializer().loads(s)
+        else:
+            UDT = getattr(m, pyClass)
         return UDT()
 
     def __eq__(self, other):
@@ -696,11 +716,6 @@ def _parse_datatype_json_string(json_string):
     >>> complex_maptype = MapType(complex_structtype,
     ...                           complex_arraytype, False)
     >>> check_datatype(complex_maptype)
-
-    >>> check_datatype(ExamplePointUDT())
-    >>> structtype_with_udt = StructType([StructField("label", DoubleType(), False),
-    ...                                   StructField("point", ExamplePointUDT(), False)])
-    >>> check_datatype(structtype_with_udt)
     """
     return _parse_datatype_json_value(json.loads(json_string))
 
@@ -752,10 +767,6 @@ def _parse_datatype_json_value(json_value):
 
 def _infer_type(obj):
     """Infer the DataType from obj
-
-    >>> p = ExamplePoint(1.0, 2.0)
-    >>> _infer_type(p)
-    ExamplePointUDT
     """
     if obj is None:
         return NullType()
@@ -1090,11 +1101,6 @@ def _verify_type(obj, dataType):
     Traceback (most recent call last):
         ...
     ValueError:...
-    >>> _verify_type(ExamplePoint(1.0, 2.0), ExamplePointUDT())
-    >>> _verify_type([1.0, 2.0], ExamplePointUDT()) # doctest: +IGNORE_EXCEPTION_DETAIL
-    Traceback (most recent call last):
-        ...
-    ValueError:...
     """
     # all objects are nullable
     if obj is None:
@@ -1259,18 +1265,12 @@ def convert(self, obj, gateway_client):
 def _test():
     import doctest
     from pyspark.context import SparkContext
-    # let doctest run in pyspark.sql.types, so DataTypes can be picklable
-    import pyspark.sql.types
-    from pyspark.sql import Row, SQLContext
-    from pyspark.sql.tests import ExamplePoint, ExamplePointUDT
-    globs = pyspark.sql.types.__dict__.copy()
+    from pyspark.sql import SQLContext
+    globs = globals()
     sc = SparkContext('local[4]', 'PythonTest')
     globs['sc'] = sc
     globs['sqlContext'] = SQLContext(sc)
-    globs['ExamplePoint'] = ExamplePoint
-    globs['ExamplePointUDT'] = ExamplePointUDT
-    (failure_count, test_count) = doctest.testmod(
-        pyspark.sql.types, globs=globs, optionflags=doctest.ELLIPSIS)
+    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
     globs['sc'].stop()
     if failure_count:
         exit(-1)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 591fb26e67c4a..f4428c2e8b202 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -142,12 +142,21 @@ object DataType {
     ("type", JString("struct"))) =>
       StructType(fields.map(parseStructField))
 
+    // Scala/Java UDT
     case JSortedObject(
     ("class", JString(udtClass)),
     ("pyClass", _),
     ("sqlType", _),
     ("type", JString("udt"))) =>
       Utils.classForName(udtClass).newInstance().asInstanceOf[UserDefinedType[_]]
+
+    // Python UDT
+    case JSortedObject(
+    ("pyClass", JString(pyClass)),
+    ("serializedClass", JString(serialized)),
+    ("sqlType", v: JValue),
+    ("type", JString("udt"))) =>
+        new PythonUserDefinedType(parseDataType(v), pyClass, serialized)
   }
 
   private def parseStructField(json: JValue): StructField = json match {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala
index e47cfb4833bd8..4305903616bd9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala
@@ -45,6 +45,9 @@ abstract class UserDefinedType[UserType] extends DataType with Serializable {
   /** Paired Python UDT class, if exists. */
   def pyUDT: String = null
 
+  /** Serialized Python UDT class, if exists. */
+  def serializedPyClass: String = null
+
   /**
    * Convert the user type to a SQL datum
    *
@@ -82,3 +85,29 @@ abstract class UserDefinedType[UserType] extends DataType with Serializable {
   override private[sql] def acceptsType(dataType: DataType) =
     this.getClass == dataType.getClass
 }
+
+/**
+ * ::DeveloperApi::
+ * The user defined type in Python.
+ *
+ * Note: This can only be accessed via Python UDF, or accessed as serialized object.
+ */
+private[sql] class PythonUserDefinedType(
+    val sqlType: DataType,
+    override val pyUDT: String,
+    override val serializedPyClass: String) extends UserDefinedType[Any] {
+
+  /* The serialization is handled by UDT class in Python */
+  override def serialize(obj: Any): Any = obj
+  override def deserialize(datam: Any): Any = datam
+
+  /* There is no Java class for Python UDT */
+  override def userClass: java.lang.Class[Any] = null
+
+  override private[sql] def jsonValue: JValue = {
+    ("type" -> "udt") ~
+      ("pyClass" -> pyUDT) ~
+      ("serializedClass" -> serializedPyClass) ~
+      ("sqlType" -> sqlType.jsonValue)
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index ec084a299649e..3c38916fd7504 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -267,7 +267,6 @@ object EvaluatePython {
           pickler.save(row.values(i))
           i += 1
         }
-        row.values.foreach(pickler.save)
         out.write(Opcodes.TUPLE)
         out.write(Opcodes.REDUCE)
       }

From 712465b68e50df7a2050b27528acda9f0d95ba1f Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Wed, 29 Jul 2015 22:51:06 -0700
Subject: [PATCH 461/549] HOTFIX: disable HashedRelationSuite.

---
 .../spark/sql/execution/joins/HashedRelationSuite.scala     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
index 8b1a9b21a96b9..941f6d4f6a450 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
@@ -33,7 +33,7 @@ class HashedRelationSuite extends SparkFunSuite {
     override def apply(row: InternalRow): InternalRow = row
   }
 
-  test("GeneralHashedRelation") {
+  ignore("GeneralHashedRelation") {
     val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2))
     val hashed = HashedRelation(data.iterator, keyProjection)
     assert(hashed.isInstanceOf[GeneralHashedRelation])
@@ -47,7 +47,7 @@ class HashedRelationSuite extends SparkFunSuite {
     assert(hashed.get(data(2)) === data2)
   }
 
-  test("UniqueKeyHashedRelation") {
+  ignore("UniqueKeyHashedRelation") {
     val data = Array(InternalRow(0), InternalRow(1), InternalRow(2))
     val hashed = HashedRelation(data.iterator, keyProjection)
     assert(hashed.isInstanceOf[UniqueKeyHashedRelation])
@@ -64,7 +64,7 @@ class HashedRelationSuite extends SparkFunSuite {
     assert(uniqHashed.getValue(InternalRow(10)) === null)
   }
 
-  test("UnsafeHashedRelation") {
+  ignore("UnsafeHashedRelation") {
     val schema = StructType(StructField("a", IntegerType, true) :: Nil)
     val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2))
     val toUnsafe = UnsafeProjection.create(schema)

From e127ec34d58ceb0a9d45748c2f2918786ba0a83d Mon Sep 17 00:00:00 2001
From: Yijie Shen <henry.yijieshen@gmail.com>
Date: Wed, 29 Jul 2015 23:24:20 -0700
Subject: [PATCH 462/549] [SPARK-9428] [SQL] Add test cases for null inputs for
 expression unit tests

JIRA: https://issues.apache.org/jira/browse/SPARK-9428

Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7748 from yjshen/string_cleanup and squashes the following commits:

e0c2b3d [Yijie Shen] update codegen in RegExpExtract and RegExpReplace
26614d2 [Yijie Shen] MathFunctionSuite
a402859 [Yijie Shen] complex_create, conditional and cast
6e4e608 [Yijie Shen] arithmetic and cast
52593c1 [Yijie Shen] null input test cases for StringExpressionSuite
---
 .../spark/sql/catalyst/expressions/Cast.scala | 12 ++--
 .../expressions/complexTypeCreator.scala      | 16 +++--
 .../catalyst/expressions/conditionals.scala   | 10 +--
 .../spark/sql/catalyst/expressions/math.scala | 14 ++---
 .../expressions/stringOperations.scala        | 11 ++--
 .../ExpressionTypeCheckingSuite.scala         |  7 ++-
 .../ArithmeticExpressionSuite.scala           |  3 +
 .../sql/catalyst/expressions/CastSuite.scala  | 52 ++++++++++++++-
 .../expressions/ComplexTypeSuite.scala        | 23 +++----
 .../ConditionalExpressionSuite.scala          |  4 ++
 .../expressions/MathFunctionsSuite.scala      | 63 ++++++++++---------
 .../catalyst/expressions/RandomSuite.scala    |  1 -
 .../expressions/StringExpressionsSuite.scala  | 26 ++++++++
 .../org/apache/spark/sql/functions.scala      |  6 +-
 14 files changed, 167 insertions(+), 81 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index c6e8af27667ee..8c01c13c9ccd5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -599,7 +599,7 @@ case class Cast(child: Expression, dataType: DataType)
           }
          """
     case BooleanType =>
-      (c, evPrim, evNull) => s"$evPrim = $c ? 1L : 0;"
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1L : 0L;"
     case _: IntegralType =>
       (c, evPrim, evNull) => s"$evPrim = ${longToTimeStampCode(c)};"
     case DateType =>
@@ -665,7 +665,7 @@ case class Cast(child: Expression, dataType: DataType)
           }
         """
     case BooleanType =>
-      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+      (c, evPrim, evNull) => s"$evPrim = $c ? (byte) 1 : (byte) 0;"
     case DateType =>
       (c, evPrim, evNull) => s"$evNull = true;"
     case TimestampType =>
@@ -687,7 +687,7 @@ case class Cast(child: Expression, dataType: DataType)
           }
         """
     case BooleanType =>
-      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+      (c, evPrim, evNull) => s"$evPrim = $c ? (short) 1 : (short) 0;"
     case DateType =>
       (c, evPrim, evNull) => s"$evNull = true;"
     case TimestampType =>
@@ -731,7 +731,7 @@ case class Cast(child: Expression, dataType: DataType)
           }
         """
     case BooleanType =>
-      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1L : 0L;"
     case DateType =>
       (c, evPrim, evNull) => s"$evNull = true;"
     case TimestampType =>
@@ -753,7 +753,7 @@ case class Cast(child: Expression, dataType: DataType)
           }
         """
     case BooleanType =>
-      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1.0f : 0.0f;"
     case DateType =>
       (c, evPrim, evNull) => s"$evNull = true;"
     case TimestampType =>
@@ -775,7 +775,7 @@ case class Cast(child: Expression, dataType: DataType)
           }
         """
     case BooleanType =>
-      (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;"
+      (c, evPrim, evNull) => s"$evPrim = $c ? 1.0d : 0.0d;"
     case DateType =>
       (c, evPrim, evNull) => s"$evNull = true;"
     case TimestampType =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index d8c9087ff5380..0517050a45109 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.unsafe.types.UTF8String
+
 import scala.collection.mutable
 
 import org.apache.spark.sql.catalyst.InternalRow
@@ -127,11 +129,12 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression {
   private lazy val (nameExprs, valExprs) =
     children.grouped(2).map { case Seq(name, value) => (name, value) }.toList.unzip
 
-  private lazy val names = nameExprs.map(_.eval(EmptyRow).toString)
+  private lazy val names = nameExprs.map(_.eval(EmptyRow))
 
   override lazy val dataType: StructType = {
     val fields = names.zip(valExprs).map { case (name, valExpr) =>
-      StructField(name, valExpr.dataType, valExpr.nullable, Metadata.empty)
+      StructField(name.asInstanceOf[UTF8String].toString,
+        valExpr.dataType, valExpr.nullable, Metadata.empty)
     }
     StructType(fields)
   }
@@ -144,14 +147,15 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression {
     if (children.size % 2 != 0) {
       TypeCheckResult.TypeCheckFailure(s"$prettyName expects an even number of arguments.")
     } else {
-      val invalidNames =
-        nameExprs.filterNot(e => e.foldable && e.dataType == StringType && !nullable)
+      val invalidNames = nameExprs.filterNot(e => e.foldable && e.dataType == StringType)
       if (invalidNames.nonEmpty) {
         TypeCheckResult.TypeCheckFailure(
-          s"Odd position only allow foldable and not-null StringType expressions, got :" +
+          s"Only foldable StringType expressions are allowed to appear at odd position , got :" +
             s" ${invalidNames.mkString(",")}")
-      } else {
+      } else if (names.forall(_ != null)){
         TypeCheckResult.TypeCheckSuccess
+      } else {
+        TypeCheckResult.TypeCheckFailure("Field name should not be null")
       }
     }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
index 15b33da884dcb..961b1d8616801 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionals.scala
@@ -315,7 +315,6 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW
  * It takes at least 2 parameters, and returns null iff all parameters are null.
  */
 case class Least(children: Seq[Expression]) extends Expression {
-  require(children.length > 1, "LEAST requires at least 2 arguments, got " + children.length)
 
   override def nullable: Boolean = children.forall(_.nullable)
   override def foldable: Boolean = children.forall(_.foldable)
@@ -323,7 +322,9 @@ case class Least(children: Seq[Expression]) extends Expression {
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
   override def checkInputDataTypes(): TypeCheckResult = {
-    if (children.map(_.dataType).distinct.count(_ != NullType) > 1) {
+    if (children.length <= 1) {
+      TypeCheckResult.TypeCheckFailure(s"LEAST requires at least 2 arguments")
+    } else if (children.map(_.dataType).distinct.count(_ != NullType) > 1) {
       TypeCheckResult.TypeCheckFailure(
         s"The expressions should all have the same type," +
           s" got LEAST (${children.map(_.dataType)}).")
@@ -369,7 +370,6 @@ case class Least(children: Seq[Expression]) extends Expression {
  * It takes at least 2 parameters, and returns null iff all parameters are null.
  */
 case class Greatest(children: Seq[Expression]) extends Expression {
-  require(children.length > 1, "GREATEST requires at least 2 arguments, got " + children.length)
 
   override def nullable: Boolean = children.forall(_.nullable)
   override def foldable: Boolean = children.forall(_.foldable)
@@ -377,7 +377,9 @@ case class Greatest(children: Seq[Expression]) extends Expression {
   private lazy val ordering = TypeUtils.getOrdering(dataType)
 
   override def checkInputDataTypes(): TypeCheckResult = {
-    if (children.map(_.dataType).distinct.count(_ != NullType) > 1) {
+    if (children.length <= 1) {
+      TypeCheckResult.TypeCheckFailure(s"GREATEST requires at least 2 arguments")
+    } else if (children.map(_.dataType).distinct.count(_ != NullType) > 1) {
       TypeCheckResult.TypeCheckFailure(
         s"The expressions should all have the same type," +
           s" got GREATEST (${children.map(_.dataType)}).")
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index 68cca0ad3d067..e6d807f6d897b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -646,19 +646,19 @@ case class Logarithm(left: Expression, right: Expression)
 /**
  * Round the `child`'s result to `scale` decimal place when `scale` >= 0
  * or round at integral part when `scale` < 0.
- * For example, round(31.415, 2) would eval to 31.42 and round(31.415, -1) would eval to 30.
+ * For example, round(31.415, 2) = 31.42 and round(31.415, -1) = 30.
  *
- * Child of IntegralType would eval to itself when `scale` >= 0.
- * Child of FractionalType whose value is NaN or Infinite would always eval to itself.
+ * Child of IntegralType would round to itself when `scale` >= 0.
+ * Child of FractionalType whose value is NaN or Infinite would always round to itself.
  *
- * Round's dataType would always equal to `child`'s dataType except for [[DecimalType.Fixed]],
- * which leads to scale update in DecimalType's [[PrecisionInfo]]
+ * Round's dataType would always equal to `child`'s dataType except for DecimalType,
+ * which would lead scale decrease from the origin DecimalType.
  *
  * @param child expr to be round, all [[NumericType]] is allowed as Input
  * @param scale new scale to be round to, this should be a constant int at runtime
  */
 case class Round(child: Expression, scale: Expression)
-  extends BinaryExpression with ExpectsInputTypes {
+  extends BinaryExpression with ImplicitCastInputTypes {
 
   import BigDecimal.RoundingMode.HALF_UP
 
@@ -838,6 +838,4 @@ case class Round(child: Expression, scale: Expression)
       """
     }
   }
-
-  override def prettyName: String = "round"
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 6db4e19c24ed5..5b3a64a09679c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -22,7 +22,6 @@ import java.util.Locale
 import java.util.regex.{MatchResult, Pattern}
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.analysis.UnresolvedException
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -52,7 +51,7 @@ case class Concat(children: Seq[Expression]) extends Expression with ImplicitCas
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val evals = children.map(_.gen(ctx))
     val inputs = evals.map { eval =>
-      s"${eval.isNull} ? (UTF8String)null : ${eval.primitive}"
+      s"${eval.isNull} ? null : ${eval.primitive}"
     }.mkString(", ")
     evals.map(_.code).mkString("\n") + s"""
       boolean ${ev.isNull} = false;
@@ -1008,7 +1007,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
 
     s"""
       ${evalSubject.code}
-      boolean ${ev.isNull} = ${evalSubject.isNull};
+      boolean ${ev.isNull} = true;
       ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
       if (!${evalSubject.isNull}) {
         ${evalRegexp.code}
@@ -1103,9 +1102,9 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
     val evalIdx = idx.gen(ctx)
 
     s"""
-      ${ctx.javaType(dataType)} ${ev.primitive} = null;
-      boolean ${ev.isNull} = true;
       ${evalSubject.code}
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      boolean ${ev.isNull} = true;
       if (!${evalSubject.isNull}) {
         ${evalRegexp.code}
         if (!${evalRegexp.isNull}) {
@@ -1117,7 +1116,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
               ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString());
             }
             ${classOf[java.util.regex.Matcher].getCanonicalName} m =
-                                   ${termPattern}.matcher(${evalSubject.primitive}.toString());
+              ${termPattern}.matcher(${evalSubject.primitive}.toString());
             if (m.find()) {
               ${classOf[java.util.regex.MatchResult].getCanonicalName} mr = m.toMatchResult();
               ${ev.primitive} = ${classNameUTF8String}.fromString(mr.group(${evalIdx.primitive}));
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
index 8acd4c685e2bc..a52e4cb4dfd9f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
@@ -167,10 +167,13 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite {
       CreateNamedStruct(Seq("a", "b", 2.0)), "even number of arguments")
     assertError(
       CreateNamedStruct(Seq(1, "a", "b", 2.0)),
-        "Odd position only allow foldable and not-null StringType expressions")
+        "Only foldable StringType expressions are allowed to appear at odd position")
     assertError(
       CreateNamedStruct(Seq('a.string.at(0), "a", "b", 2.0)),
-        "Odd position only allow foldable and not-null StringType expressions")
+        "Only foldable StringType expressions are allowed to appear at odd position")
+    assertError(
+      CreateNamedStruct(Seq(Literal.create(null, StringType), "a")),
+        "Field name should not be null")
   }
 
   test("check types for ROUND") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
index 7773e098e0caa..d03b0fbbfb2b2 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala
@@ -116,9 +116,12 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
 
   test("Abs") {
     testNumericDataTypes { convert =>
+      val input = Literal(convert(1))
+      val dataType = input.dataType
       checkEvaluation(Abs(Literal(convert(0))), convert(0))
       checkEvaluation(Abs(Literal(convert(1))), convert(1))
       checkEvaluation(Abs(Literal(convert(-1))), convert(1))
+      checkEvaluation(Abs(Literal.create(null, dataType)), null)
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 0e0213be0f57b..a517da9872852 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -43,6 +43,42 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(v, Literal(expected).dataType), expected)
   }
 
+  private def checkNullCast(from: DataType, to: DataType): Unit = {
+    checkEvaluation(Cast(Literal.create(null, from), to), null)
+  }
+
+  test("null cast") {
+    import DataTypeTestUtils._
+
+    // follow [[org.apache.spark.sql.catalyst.expressions.Cast.canCast]] logic
+    // to ensure we test every possible cast situation here
+    atomicTypes.zip(atomicTypes).foreach { case (from, to) =>
+      checkNullCast(from, to)
+    }
+
+    atomicTypes.foreach(dt => checkNullCast(NullType, dt))
+    atomicTypes.foreach(dt => checkNullCast(dt, StringType))
+    checkNullCast(StringType, BinaryType)
+    checkNullCast(StringType, BooleanType)
+    checkNullCast(DateType, BooleanType)
+    checkNullCast(TimestampType, BooleanType)
+    numericTypes.foreach(dt => checkNullCast(dt, BooleanType))
+
+    checkNullCast(StringType, TimestampType)
+    checkNullCast(BooleanType, TimestampType)
+    checkNullCast(DateType, TimestampType)
+    numericTypes.foreach(dt => checkNullCast(dt, TimestampType))
+
+    atomicTypes.foreach(dt => checkNullCast(dt, DateType))
+
+    checkNullCast(StringType, CalendarIntervalType)
+    numericTypes.foreach(dt => checkNullCast(StringType, dt))
+    numericTypes.foreach(dt => checkNullCast(BooleanType, dt))
+    numericTypes.foreach(dt => checkNullCast(DateType, dt))
+    numericTypes.foreach(dt => checkNullCast(TimestampType, dt))
+    for (from <- numericTypes; to <- numericTypes) checkNullCast(from, to)
+  }
+
   test("cast string to date") {
     var c = Calendar.getInstance()
     c.set(2015, 0, 1, 0, 0, 0)
@@ -69,8 +105,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("cast string to timestamp") {
-    checkEvaluation(Cast(Literal("123"), TimestampType),
-      null)
+    checkEvaluation(Cast(Literal("123"), TimestampType), null)
 
     var c = Calendar.getInstance()
     c.set(2015, 0, 1, 0, 0, 0)
@@ -473,6 +508,8 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
     val array_notNull = Literal.create(Seq("123", "abc", ""),
       ArrayType(StringType, containsNull = false))
 
+    checkNullCast(ArrayType(StringType), ArrayType(IntegerType))
+
     {
       val ret = cast(array, ArrayType(IntegerType, containsNull = true))
       assert(ret.resolved === true)
@@ -526,6 +563,8 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
       Map("a" -> "123", "b" -> "abc", "c" -> ""),
       MapType(StringType, StringType, valueContainsNull = false))
 
+    checkNullCast(MapType(StringType, IntegerType), MapType(StringType, StringType))
+
     {
       val ret = cast(map, MapType(StringType, IntegerType, valueContainsNull = true))
       assert(ret.resolved === true)
@@ -580,6 +619,14 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("cast from struct") {
+    checkNullCast(
+      StructType(Seq(
+        StructField("a", StringType),
+        StructField("b", IntegerType))),
+      StructType(Seq(
+        StructField("a", StringType),
+        StructField("b", StringType))))
+
     val struct = Literal.create(
       InternalRow(
         UTF8String.fromString("123"),
@@ -728,5 +775,4 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
       StringType),
       "interval 1 years 3 months -3 days")
   }
-
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index fc842772f3480..5de5ddce975d8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -132,6 +132,7 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(CreateArray(intWithNull), intSeq :+ null, EmptyRow)
     checkEvaluation(CreateArray(longWithNull), longSeq :+ null, EmptyRow)
     checkEvaluation(CreateArray(strWithNull), strSeq :+ null, EmptyRow)
+    checkEvaluation(CreateArray(Literal.create(null, IntegerType) :: Nil), null :: Nil)
   }
 
   test("CreateStruct") {
@@ -139,26 +140,20 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
     val c1 = 'a.int.at(0)
     val c3 = 'c.int.at(2)
     checkEvaluation(CreateStruct(Seq(c1, c3)), create_row(1, 3), row)
+    checkEvaluation(CreateStruct(Literal.create(null, LongType) :: Nil), create_row(null))
   }
 
   test("CreateNamedStruct") {
-    val row = InternalRow(1, 2, 3)
+    val row = create_row(1, 2, 3)
     val c1 = 'a.int.at(0)
     val c3 = 'c.int.at(2)
-    checkEvaluation(CreateNamedStruct(Seq("a", c1, "b", c3)), InternalRow(1, 3), row)
-  }
-
-  test("CreateNamedStruct with literal field") {
-    val row = InternalRow(1, 2, 3)
-    val c1 = 'a.int.at(0)
+    checkEvaluation(CreateNamedStruct(Seq("a", c1, "b", c3)), create_row(1, 3), row)
     checkEvaluation(CreateNamedStruct(Seq("a", c1, "b", "y")),
-      InternalRow(1, UTF8String.fromString("y")), row)
-  }
-
-  test("CreateNamedStruct from all literal fields") {
-    checkEvaluation(
-      CreateNamedStruct(Seq("a", "x", "b", 2.0)),
-      InternalRow(UTF8String.fromString("x"), 2.0), InternalRow.empty)
+      create_row(1, UTF8String.fromString("y")), row)
+    checkEvaluation(CreateNamedStruct(Seq("a", "x", "b", 2.0)),
+      create_row(UTF8String.fromString("x"), 2.0))
+    checkEvaluation(CreateNamedStruct(Seq("a", Literal.create(null, IntegerType))),
+      create_row(null))
   }
 
   test("test dsl for complex type") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
index b31d6661c8c1c..d26bcdb2902ab 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala
@@ -149,6 +149,8 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     checkEvaluation(Least(Seq(c1, c2, Literal(-1))), -1, row)
     checkEvaluation(Least(Seq(c4, c5, c3, c3, Literal("a"))), "a", row)
 
+    val nullLiteral = Literal.create(null, IntegerType)
+    checkEvaluation(Least(Seq(nullLiteral, nullLiteral)), null)
     checkEvaluation(Least(Seq(Literal(null), Literal(null))), null, InternalRow.empty)
     checkEvaluation(Least(Seq(Literal(-1.0), Literal(2.5))), -1.0, InternalRow.empty)
     checkEvaluation(Least(Seq(Literal(-1), Literal(2))), -1, InternalRow.empty)
@@ -188,6 +190,8 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper
     checkEvaluation(Greatest(Seq(c1, c2, Literal(2))), 2, row)
     checkEvaluation(Greatest(Seq(c4, c5, c3, Literal("ccc"))), "ccc", row)
 
+    val nullLiteral = Literal.create(null, IntegerType)
+    checkEvaluation(Greatest(Seq(nullLiteral, nullLiteral)), null)
     checkEvaluation(Greatest(Seq(Literal(null), Literal(null))), null, InternalRow.empty)
     checkEvaluation(Greatest(Seq(Literal(-1.0), Literal(2.5))), 2.5, InternalRow.empty)
     checkEvaluation(Greatest(Seq(Literal(-1), Literal(2))), 2, InternalRow.empty)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
index 21459a7c69838..9fcb548af6bbb 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala
@@ -110,35 +110,17 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(c(Literal(1.0), Literal.create(null, DoubleType)), null, create_row(null))
   }
 
-  test("conv") {
-    checkEvaluation(Conv(Literal("3"), Literal(10), Literal(2)), "11")
-    checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(-16)), "-F")
-    checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(16)), "FFFFFFFFFFFFFFF1")
-    checkEvaluation(Conv(Literal("big"), Literal(36), Literal(16)), "3A48")
-    checkEvaluation(Conv(Literal.create(null, StringType), Literal(36), Literal(16)), null)
-    checkEvaluation(Conv(Literal("3"), Literal.create(null, IntegerType), Literal(16)), null)
-    checkEvaluation(
-      Conv(Literal("1234"), Literal(10), Literal(37)), null)
-    checkEvaluation(
-      Conv(Literal(""), Literal(10), Literal(16)), null)
-    checkEvaluation(
-      Conv(Literal("9223372036854775807"), Literal(36), Literal(16)), "FFFFFFFFFFFFFFFF")
-    // If there is an invalid digit in the number, the longest valid prefix should be converted.
-    checkEvaluation(
-      Conv(Literal("11abc"), Literal(10), Literal(16)), "B")
-  }
-
   private def checkNaN(
-      expression: Expression, inputRow: InternalRow = EmptyRow): Unit = {
+    expression: Expression, inputRow: InternalRow = EmptyRow): Unit = {
     checkNaNWithoutCodegen(expression, inputRow)
     checkNaNWithGeneratedProjection(expression, inputRow)
     checkNaNWithOptimization(expression, inputRow)
   }
 
   private def checkNaNWithoutCodegen(
-      expression: Expression,
-      expected: Any,
-      inputRow: InternalRow = EmptyRow): Unit = {
+    expression: Expression,
+    expected: Any,
+    inputRow: InternalRow = EmptyRow): Unit = {
     val actual = try evaluate(expression, inputRow) catch {
       case e: Exception => fail(s"Exception evaluating $expression", e)
     }
@@ -149,7 +131,6 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     }
   }
 
-
   private def checkNaNWithGeneratedProjection(
     expression: Expression,
     inputRow: InternalRow = EmptyRow): Unit = {
@@ -172,6 +153,25 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkNaNWithoutCodegen(optimizedPlan.expressions.head, inputRow)
   }
 
+  test("conv") {
+    checkEvaluation(Conv(Literal("3"), Literal(10), Literal(2)), "11")
+    checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(-16)), "-F")
+    checkEvaluation(Conv(Literal("-15"), Literal(10), Literal(16)), "FFFFFFFFFFFFFFF1")
+    checkEvaluation(Conv(Literal("big"), Literal(36), Literal(16)), "3A48")
+    checkEvaluation(Conv(Literal.create(null, StringType), Literal(36), Literal(16)), null)
+    checkEvaluation(Conv(Literal("3"), Literal.create(null, IntegerType), Literal(16)), null)
+    checkEvaluation(Conv(Literal("3"), Literal(16), Literal.create(null, IntegerType)), null)
+    checkEvaluation(
+      Conv(Literal("1234"), Literal(10), Literal(37)), null)
+    checkEvaluation(
+      Conv(Literal(""), Literal(10), Literal(16)), null)
+    checkEvaluation(
+      Conv(Literal("9223372036854775807"), Literal(36), Literal(16)), "FFFFFFFFFFFFFFFF")
+    // If there is an invalid digit in the number, the longest valid prefix should be converted.
+    checkEvaluation(
+      Conv(Literal("11abc"), Literal(10), Literal(16)), "B")
+  }
+
   test("e") {
     testLeaf(EulerNumber, math.E)
   }
@@ -417,7 +417,7 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
   }
 
   test("round") {
-    val domain = -6 to 6
+    val scales = -6 to 6
     val doublePi: Double = math.Pi
     val shortPi: Short = 31415
     val intPi: Int = 314159265
@@ -437,17 +437,16 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       31415926535900000L, 31415926535898000L, 31415926535897900L, 31415926535897930L) ++
       Seq.fill(7)(31415926535897932L)
 
-    val bdResults: Seq[BigDecimal] = Seq(BigDecimal(3.0), BigDecimal(3.1), BigDecimal(3.14),
-      BigDecimal(3.142), BigDecimal(3.1416), BigDecimal(3.14159),
-      BigDecimal(3.141593), BigDecimal(3.1415927))
-
-    domain.zipWithIndex.foreach { case (scale, i) =>
+    scales.zipWithIndex.foreach { case (scale, i) =>
       checkEvaluation(Round(doublePi, scale), doubleResults(i), EmptyRow)
       checkEvaluation(Round(shortPi, scale), shortResults(i), EmptyRow)
       checkEvaluation(Round(intPi, scale), intResults(i), EmptyRow)
       checkEvaluation(Round(longPi, scale), longResults(i), EmptyRow)
     }
 
+    val bdResults: Seq[BigDecimal] = Seq(BigDecimal(3.0), BigDecimal(3.1), BigDecimal(3.14),
+      BigDecimal(3.142), BigDecimal(3.1416), BigDecimal(3.14159),
+      BigDecimal(3.141593), BigDecimal(3.1415927))
     // round_scale > current_scale would result in precision increase
     // and not allowed by o.a.s.s.types.Decimal.changePrecision, therefore null
     (0 to 7).foreach { i =>
@@ -456,5 +455,11 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     (8 to 10).foreach { scale =>
       checkEvaluation(Round(bdPi, scale), null, EmptyRow)
     }
+
+    DataTypeTestUtils.numericTypes.foreach { dataType =>
+      checkEvaluation(Round(Literal.create(null, dataType), Literal(2)), null)
+      checkEvaluation(Round(Literal.create(null, dataType),
+        Literal.create(null, IntegerType)), null)
+    }
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
index 5db992654811a..4a644d136f09c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala
@@ -21,7 +21,6 @@ import org.scalatest.Matchers._
 
 import org.apache.spark.SparkFunSuite
 
-
 class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("random") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 3d294fda5d103..07b952531ec2e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -348,6 +348,9 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(StringTrimLeft(s), "花花世界 ", create_row("  花花世界 "))
     checkEvaluation(StringTrim(s), "花花世界", create_row("  花花世界 "))
     // scalastyle:on
+    checkEvaluation(StringTrim(Literal.create(null, StringType)), null)
+    checkEvaluation(StringTrimLeft(Literal.create(null, StringType)), null)
+    checkEvaluation(StringTrimRight(Literal.create(null, StringType)), null)
   }
 
   test("FORMAT") {
@@ -391,6 +394,9 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val s3 = 'c.string.at(2)
     val s4 = 'd.int.at(3)
     val row1 = create_row("aaads", "aa", "zz", 1)
+    val row2 = create_row(null, "aa", "zz", 0)
+    val row3 = create_row("aaads", null, "zz", 0)
+    val row4 = create_row(null, null, null, 0)
 
     checkEvaluation(new StringLocate(Literal("aa"), Literal("aaads")), 1, row1)
     checkEvaluation(StringLocate(Literal("aa"), Literal("aaads"), Literal(1)), 2, row1)
@@ -402,6 +408,9 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(StringLocate(s2, s1, s4), 2, row1)
     checkEvaluation(new StringLocate(s3, s1), 0, row1)
     checkEvaluation(StringLocate(s3, s1, Literal.create(null, IntegerType)), 0, row1)
+    checkEvaluation(new StringLocate(s2, s1), null, row2)
+    checkEvaluation(new StringLocate(s2, s1), null, row3)
+    checkEvaluation(new StringLocate(s2, s1, Literal.create(null, IntegerType)), 0, row4)
   }
 
   test("LPAD/RPAD") {
@@ -448,6 +457,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val row1 = create_row("abccc")
     checkEvaluation(StringReverse(Literal("abccc")), "cccba", row1)
     checkEvaluation(StringReverse(s), "cccba", row1)
+    checkEvaluation(StringReverse(Literal.create(null, StringType)), null, row1)
   }
 
   test("SPACE") {
@@ -466,6 +476,9 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val row1 = create_row("100-200", "(\\d+)", "num")
     val row2 = create_row("100-200", "(\\d+)", "###")
     val row3 = create_row("100-200", "(-)", "###")
+    val row4 = create_row(null, "(\\d+)", "###")
+    val row5 = create_row("100-200", null, "###")
+    val row6 = create_row("100-200", "(-)", null)
 
     val s = 's.string.at(0)
     val p = 'p.string.at(1)
@@ -475,6 +488,9 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(expr, "num-num", row1)
     checkEvaluation(expr, "###-###", row2)
     checkEvaluation(expr, "100###200", row3)
+    checkEvaluation(expr, null, row4)
+    checkEvaluation(expr, null, row5)
+    checkEvaluation(expr, null, row6)
   }
 
   test("RegexExtract") {
@@ -482,6 +498,9 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val row2 = create_row("100-200", "(\\d+)-(\\d+)", 2)
     val row3 = create_row("100-200", "(\\d+).*", 1)
     val row4 = create_row("100-200", "([a-z])", 1)
+    val row5 = create_row(null, "([a-z])", 1)
+    val row6 = create_row("100-200", null, 1)
+    val row7 = create_row("100-200", "([a-z])", null)
 
     val s = 's.string.at(0)
     val p = 'p.string.at(1)
@@ -492,6 +511,9 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(expr, "200", row2)
     checkEvaluation(expr, "100", row3)
     checkEvaluation(expr, "", row4) // will not match anything, empty string get
+    checkEvaluation(expr, null, row5)
+    checkEvaluation(expr, null, row6)
+    checkEvaluation(expr, null, row7)
 
     val expr1 = new RegExpExtract(s, p)
     checkEvaluation(expr1, "100", row1)
@@ -501,11 +523,15 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val s1 = 'a.string.at(0)
     val s2 = 'b.string.at(1)
     val row1 = create_row("aa2bb3cc", "[1-9]+")
+    val row2 = create_row(null, "[1-9]+")
+    val row3 = create_row("aa2bb3cc", null)
 
     checkEvaluation(
       StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1)
     checkEvaluation(
       StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1)
+    checkEvaluation(StringSplit(s1, s2), null, row2)
+    checkEvaluation(StringSplit(s1, s2), null, row3)
   }
 
   test("length for string / binary") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 4261a5e7cbeb5..4e68a88e7cda6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1423,7 +1423,8 @@ object functions {
   def round(columnName: String): Column = round(Column(columnName), 0)
 
   /**
-   * Returns the value of `e` rounded to `scale` decimal places.
+   * Round the value of `e` to `scale` decimal places if `scale` >= 0
+   * or at integral part when `scale` < 0.
    *
    * @group math_funcs
    * @since 1.5.0
@@ -1431,7 +1432,8 @@ object functions {
   def round(e: Column, scale: Int): Column = Round(e.expr, Literal(scale))
 
   /**
-   * Returns the value of the given column rounded to `scale` decimal places.
+   * Round the value of the given column to `scale` decimal places if `scale` >= 0
+   * or at integral part when `scale` < 0.
    *
    * @group math_funcs
    * @since 1.5.0

From 1221849f91739454b8e495889cba7498ba8beea7 Mon Sep 17 00:00:00 2001
From: Joseph Batchik <josephbatchik@gmail.com>
Date: Wed, 29 Jul 2015 23:35:55 -0700
Subject: [PATCH 463/549] [SPARK-8005][SQL] Input file name

Users can now get the file name of the partition being read in. A thread local variable is in `SQLNewHadoopRDD` and is set when the partition is computed. `SQLNewHadoopRDD` is moved to core so that the catalyst package can reach it.

This supports:

`df.select(inputFileName())`

and

`sqlContext.sql("select input_file_name() from table")`

Author: Joseph Batchik <josephbatchik@gmail.com>

Closes #7743 from JDrit/input_file_name and squashes the following commits:

abb8609 [Joseph Batchik] fixed failing test and changed the default value to be an empty string
d2f323d [Joseph Batchik] updates per review
102061f [Joseph Batchik] updates per review
75313f5 [Joseph Batchik] small fixes
c7f7b5a [Joseph Batchik] addeding input file name to Spark SQL
---
 .../apache/spark/rdd}/SqlNewHadoopRDD.scala   | 34 +++++++++++--
 .../catalyst/analysis/FunctionRegistry.scala  |  3 +-
 .../catalyst/expressions/InputFileName.scala  | 49 +++++++++++++++++++
 .../expressions/SparkPartitionID.scala        |  2 +
 .../expressions/NondeterministicSuite.scala   |  4 ++
 .../org/apache/spark/sql/functions.scala      |  9 ++++
 .../spark/sql/parquet/ParquetRelation.scala   |  3 +-
 .../spark/sql/ColumnExpressionSuite.scala     | 17 ++++++-
 .../scala/org/apache/spark/sql/UDFSuite.scala | 17 ++++++-
 .../org/apache/spark/sql/hive/UDFSuite.scala  |  6 ---
 10 files changed, 128 insertions(+), 16 deletions(-)
 rename {sql/core/src/main/scala/org/apache/spark/sql/execution => core/src/main/scala/org/apache/spark/rdd}/SqlNewHadoopRDD.scala (91%)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InputFileName.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala
similarity index 91%
rename from sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala
rename to core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala
index 3d75b6a91def6..35e44cb59c1be 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/SqlNewHadoopRDD.scala
@@ -15,12 +15,13 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.execution
+package org.apache.spark.rdd
 
 import java.text.SimpleDateFormat
 import java.util.Date
 
-import org.apache.spark.{Partition => SparkPartition, _}
+import scala.reflect.ClassTag
+
 import org.apache.hadoop.conf.{Configurable, Configuration}
 import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapreduce._
@@ -30,12 +31,12 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.executor.DataReadMethod
 import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
+import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.{Partition => SparkPartition, _}
 import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD
-import org.apache.spark.rdd.{HadoopRDD, RDD}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.{SerializableConfiguration, Utils}
 
-import scala.reflect.ClassTag
 
 private[spark] class SqlNewHadoopPartition(
     rddId: Int,
@@ -62,7 +63,7 @@ private[spark] class SqlNewHadoopPartition(
  * changes based on [[org.apache.spark.rdd.HadoopRDD]]. In future, this functionality will be
  * folded into core.
  */
-private[sql] class SqlNewHadoopRDD[K, V](
+private[spark] class SqlNewHadoopRDD[K, V](
     @transient sc : SparkContext,
     broadcastedConf: Broadcast[SerializableConfiguration],
     @transient initDriverSideJobFuncOpt: Option[Job => Unit],
@@ -128,6 +129,12 @@ private[sql] class SqlNewHadoopRDD[K, V](
       val inputMetrics = context.taskMetrics
         .getInputMetricsForReadMethod(DataReadMethod.Hadoop)
 
+      // Sets the thread local variable for the file's name
+      split.serializableHadoopSplit.value match {
+        case fs: FileSplit => SqlNewHadoopRDD.setInputFileName(fs.getPath.toString)
+        case _ => SqlNewHadoopRDD.unsetInputFileName()
+      }
+
       // Find a function that will return the FileSystem bytes read by this thread. Do this before
       // creating RecordReader, because RecordReader's constructor might read some bytes
       val bytesReadCallback = inputMetrics.bytesReadCallback.orElse {
@@ -188,6 +195,8 @@ private[sql] class SqlNewHadoopRDD[K, V](
             reader.close()
             reader = null
 
+            SqlNewHadoopRDD.unsetInputFileName()
+
             if (bytesReadCallback.isDefined) {
               inputMetrics.updateBytesRead()
             } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] ||
@@ -250,6 +259,21 @@ private[sql] class SqlNewHadoopRDD[K, V](
 }
 
 private[spark] object SqlNewHadoopRDD {
+
+  /**
+   * The thread variable for the name of the current file being read. This is used by
+   * the InputFileName function in Spark SQL.
+   */
+  private[this] val inputFileName: ThreadLocal[UTF8String] = new ThreadLocal[UTF8String] {
+    override protected def initialValue(): UTF8String = UTF8String.fromString("")
+  }
+
+  def getInputFileName(): UTF8String = inputFileName.get()
+
+  private[spark] def setInputFileName(file: String) = inputFileName.set(UTF8String.fromString(file))
+
+  private[spark] def unsetInputFileName(): Unit = inputFileName.remove()
+
   /**
    * Analogous to [[org.apache.spark.rdd.MapPartitionsRDD]], but passes in an InputSplit to
    * the given function rather than the index of the partition.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 372f80d4a8b16..378df4f57d9e2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -230,7 +230,8 @@ object FunctionRegistry {
     expression[Sha1]("sha"),
     expression[Sha1]("sha1"),
     expression[Sha2]("sha2"),
-    expression[SparkPartitionID]("spark_partition_id")
+    expression[SparkPartitionID]("spark_partition_id"),
+    expression[InputFileName]("input_file_name")
   )
 
   val builtin: FunctionRegistry = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InputFileName.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InputFileName.scala
new file mode 100644
index 0000000000000..1e74f716955e3
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InputFileName.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.rdd.SqlNewHadoopRDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.types.{DataType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
+
+/**
+ * Expression that returns the name of the current file being read in using [[SqlNewHadoopRDD]]
+ */
+case class InputFileName() extends LeafExpression with Nondeterministic {
+
+  override def nullable: Boolean = true
+
+  override def dataType: DataType = StringType
+
+  override val prettyName = "INPUT_FILE_NAME"
+
+  override protected def initInternal(): Unit = {}
+
+  override protected def evalInternal(input: InternalRow): UTF8String = {
+    SqlNewHadoopRDD.getInputFileName()
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    ev.isNull = "false"
+    s"final ${ctx.javaType(dataType)} ${ev.primitive} = " +
+      "org.apache.spark.rdd.SqlNewHadoopRDD.getInputFileName();"
+  }
+
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala
index 3f6480bbf0114..4b1772a2deed5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala
@@ -34,6 +34,8 @@ private[sql] case class SparkPartitionID() extends LeafExpression with Nondeterm
 
   @transient private[this] var partitionId: Int = _
 
+  override val prettyName = "SPARK_PARTITION_ID"
+
   override protected def initInternal(): Unit = {
     partitionId = TaskContext.getPartitionId()
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NondeterministicSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NondeterministicSuite.scala
index 82894822ab0f4..bf1c930c0bd0b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NondeterministicSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NondeterministicSuite.scala
@@ -27,4 +27,8 @@ class NondeterministicSuite extends SparkFunSuite with ExpressionEvalHelper {
   test("SparkPartitionID") {
     checkEvaluation(SparkPartitionID(), 0)
   }
+
+  test("InputFileName") {
+    checkEvaluation(InputFileName(), "")
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 4e68a88e7cda6..a2fece62f61f9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -743,6 +743,15 @@ object functions {
    */
   def sparkPartitionId(): Column = SparkPartitionID()
 
+  /**
+   * The file name of the current Spark task
+   *
+   * Note that this is indeterministic becuase it depends on what is currently being read in.
+   *
+   * @group normal_funcs
+   */
+  def inputFileName(): Column = InputFileName()
+
   /**
    * Computes the square root of the specified float value.
    *
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
index cc6fa2b88663f..1a8176d8a80ab 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -39,11 +39,10 @@ import org.apache.parquet.{Log => ParquetLog}
 
 import org.apache.spark.{Logging, Partition => SparkPartition, SparkException}
 import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.{SqlNewHadoopPartition, SqlNewHadoopRDD, RDD}
 import org.apache.spark.rdd.RDD._
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.execution.{SqlNewHadoopPartition, SqlNewHadoopRDD}
 import org.apache.spark.sql.execution.datasources.PartitionSpec
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{DataType, StructType}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index 1f9f7118c3f04..5c1102410879a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -22,13 +22,16 @@ import org.scalatest.Matchers._
 import org.apache.spark.sql.execution.Project
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.test.SQLTestUtils
 
-class ColumnExpressionSuite extends QueryTest {
+class ColumnExpressionSuite extends QueryTest with SQLTestUtils {
   import org.apache.spark.sql.TestData._
 
   private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
   import ctx.implicits._
 
+  override def sqlContext(): SQLContext = ctx
+
   test("alias") {
     val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList")
     assert(df.select(df("a").as("b")).columns.head === "b")
@@ -489,6 +492,18 @@ class ColumnExpressionSuite extends QueryTest {
     )
   }
 
+  test("InputFileName") {
+    withTempPath { dir =>
+      val data = sqlContext.sparkContext.parallelize(0 to 10).toDF("id")
+      data.write.parquet(dir.getCanonicalPath)
+      val answer = sqlContext.read.parquet(dir.getCanonicalPath).select(inputFileName())
+        .head.getString(0)
+      assert(answer.contains(dir.getCanonicalPath))
+
+      checkAnswer(data.select(inputFileName()).limit(1), Row(""))
+    }
+  }
+
   test("lift alias out of cast") {
     compareExpressions(
       col("1234").as("name").cast("int").expr,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
index d9c8b380ef146..183dc3407b3ab 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -17,14 +17,17 @@
 
 package org.apache.spark.sql
 
+import org.apache.spark.sql.test.SQLTestUtils
 
 case class FunctionResult(f1: String, f2: String)
 
-class UDFSuite extends QueryTest {
+class UDFSuite extends QueryTest with SQLTestUtils {
 
   private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
   import ctx.implicits._
 
+  override def sqlContext(): SQLContext = ctx
+
   test("built-in fixed arity expressions") {
     val df = ctx.emptyDataFrame
     df.selectExpr("rand()", "randn()", "rand(5)", "randn(50)")
@@ -58,6 +61,18 @@ class UDFSuite extends QueryTest {
     ctx.dropTempTable("tmp_table")
   }
 
+  test("SPARK-8005 input_file_name") {
+    withTempPath { dir =>
+      val data = ctx.sparkContext.parallelize(0 to 10, 2).toDF("id")
+      data.write.parquet(dir.getCanonicalPath)
+      ctx.read.parquet(dir.getCanonicalPath).registerTempTable("test_table")
+      val answer = ctx.sql("select input_file_name() from test_table").head().getString(0)
+      assert(answer.contains(dir.getCanonicalPath))
+      assert(ctx.sql("select input_file_name() from test_table").distinct().collect().length >= 2)
+      ctx.dropTempTable("test_table")
+    }
+  }
+
   test("error reporting for incorrect number of arguments") {
     val df = ctx.emptyDataFrame
     val e = intercept[AnalysisException] {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
index 37afc2142abf7..9b3ede43ee2d1 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala
@@ -34,10 +34,4 @@ class UDFSuite extends QueryTest {
     assert(ctx.sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0) >= 0.0)
     assert(ctx.sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0) === 5)
   }
-
-  test("SPARK-8003 spark_partition_id") {
-    val df = Seq((1, "Two Fiiiiive")).toDF("id", "saying")
-    ctx.registerDataFrameAsTable(df, "test_table")
-    checkAnswer(ctx.sql("select spark_partition_id() from test_table LIMIT 1").toDF(), Row(0))
-  }
 }

From 76f2e393a5fad0db8b56c4b8dad5ef686bf140a4 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Thu, 30 Jul 2015 00:46:36 -0700
Subject: [PATCH 464/549] [SPARK-9335] [TESTS] Enable Kinesis tests only when
 files in extras/kinesis-asl are changed

Author: zsxwing <zsxwing@gmail.com>

Closes #7711 from zsxwing/SPARK-9335-test and squashes the following commits:

c13ec2f [zsxwing] environs -> environ
69c2865 [zsxwing] Merge remote-tracking branch 'origin/master' into SPARK-9335-test
ef84a08 [zsxwing] Revert "Modify the Kinesis project to trigger ENABLE_KINESIS_TESTS"
f691028 [zsxwing] Modify the Kinesis project to trigger ENABLE_KINESIS_TESTS
7618205 [zsxwing] Enable Kinesis tests only when files in extras/kinesis-asl are changed
---
 dev/run-tests.py                | 16 ++++++++++++++++
 dev/sparktestsupport/modules.py | 14 ++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index 1f0d218514f92..29420da9aa956 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -85,6 +85,13 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
     return [f for f in raw_output.split('\n') if f]
 
 
+def setup_test_environ(environ):
+    print("[info] Setup the following environment variables for tests: ")
+    for (k, v) in environ.items():
+        print("%s=%s" % (k, v))
+        os.environ[k] = v
+
+
 def determine_modules_to_test(changed_modules):
     """
     Given a set of modules that have changed, compute the transitive closure of those modules'
@@ -455,6 +462,15 @@ def main():
     print("[info] Found the following changed modules:",
           ", ".join(x.name for x in changed_modules))
 
+    # setup environment variables
+    # note - the 'root' module doesn't collect environment variables for all modules. Because the
+    # environment variables should not be set if a module is not changed, even if running the 'root'
+    # module. So here we should use changed_modules rather than test_modules.
+    test_environ = {}
+    for m in changed_modules:
+        test_environ.update(m.environ)
+    setup_test_environ(test_environ)
+
     test_modules = determine_modules_to_test(changed_modules)
 
     # license checks
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 3073d489bad4a..030d982e99106 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -29,7 +29,7 @@ class Module(object):
     changed.
     """
 
-    def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
+    def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={},
                  sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(),
                  should_run_r_tests=False):
         """
@@ -43,6 +43,8 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
             filename strings.
         :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
             order to build and test this module (e.g. '-PprofileName').
+        :param environ: A dict of environment variables that should be set when files in this
+            module are changed.
         :param sbt_test_goals: A set of SBT test goals for testing this module.
         :param python_test_goals: A set of Python test goals for testing this module.
         :param blacklisted_python_implementations: A set of Python implementations that are not
@@ -55,6 +57,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
         self.source_file_prefixes = source_file_regexes
         self.sbt_test_goals = sbt_test_goals
         self.build_profile_flags = build_profile_flags
+        self.environ = environ
         self.python_test_goals = python_test_goals
         self.blacklisted_python_implementations = blacklisted_python_implementations
         self.should_run_r_tests = should_run_r_tests
@@ -126,15 +129,22 @@ def contains_file(self, filename):
 )
 
 
+# Don't set the dependencies because changes in other modules should not trigger Kinesis tests.
+# Kinesis tests depends on external Amazon kinesis service. We should run these tests only when
+# files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't
+# fail other PRs.
 streaming_kinesis_asl = Module(
     name="kinesis-asl",
-    dependencies=[streaming],
+    dependencies=[],
     source_file_regexes=[
         "extras/kinesis-asl/",
     ],
     build_profile_flags=[
         "-Pkinesis-asl",
     ],
+    environ={
+        "ENABLE_KINESIS_TESTS": "1"
+    },
     sbt_test_goals=[
         "kinesis-asl/test",
     ]

From 4a8bb9d00d8181aff5f5183194d9aa2a65deacdf Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 30 Jul 2015 01:04:24 -0700
Subject: [PATCH 465/549] Revert "[SPARK-9458] Avoid object allocation in
 prefix generation."

This reverts commit 9514d874f0cf61f1eb4ec4f5f66e053119f769c9.
---
 .../unsafe/sort/PrefixComparators.java        | 16 ++++++
 .../unsafe/sort/PrefixComparatorsSuite.scala  | 12 +++++
 .../execution/UnsafeExternalRowSorter.java    |  2 +-
 .../spark/sql/execution/SortPrefixUtils.scala | 51 ++++++++++---------
 .../spark/sql/execution/SparkStrategies.scala |  4 +-
 .../org/apache/spark/sql/execution/sort.scala |  5 +-
 .../execution/RowFormatConvertersSuite.scala  |  2 +-
 .../execution/UnsafeExternalSortSuite.scala   | 10 ++--
 8 files changed, 67 insertions(+), 35 deletions(-)

diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
index a9ee6042fec74..600aff7d15d8a 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
@@ -29,6 +29,7 @@ private PrefixComparators() {}
 
   public static final StringPrefixComparator STRING = new StringPrefixComparator();
   public static final IntegralPrefixComparator INTEGRAL = new IntegralPrefixComparator();
+  public static final FloatPrefixComparator FLOAT = new FloatPrefixComparator();
   public static final DoublePrefixComparator DOUBLE = new DoublePrefixComparator();
 
   public static final class StringPrefixComparator extends PrefixComparator {
@@ -54,6 +55,21 @@ public int compare(long a, long b) {
     public final long NULL_PREFIX = Long.MIN_VALUE;
   }
 
+  public static final class FloatPrefixComparator extends PrefixComparator {
+    @Override
+    public int compare(long aPrefix, long bPrefix) {
+      float a = Float.intBitsToFloat((int) aPrefix);
+      float b = Float.intBitsToFloat((int) bPrefix);
+      return Utils.nanSafeCompareFloats(a, b);
+    }
+
+    public long computePrefix(float value) {
+      return Float.floatToIntBits(value) & 0xffffffffL;
+    }
+
+    public final long NULL_PREFIX = computePrefix(Float.NEGATIVE_INFINITY);
+  }
+
   public static final class DoublePrefixComparator extends PrefixComparator {
     @Override
     public int compare(long aPrefix, long bPrefix) {
diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
index 26b7a9e816d1e..cf53a8ad21c60 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
@@ -55,6 +55,18 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
     forAll { (s1: String, s2: String) => testPrefixComparison(s1, s2) }
   }
 
+  test("float prefix comparator handles NaN properly") {
+    val nan1: Float = java.lang.Float.intBitsToFloat(0x7f800001)
+    val nan2: Float = java.lang.Float.intBitsToFloat(0x7fffffff)
+    assert(nan1.isNaN)
+    assert(nan2.isNaN)
+    val nan1Prefix = PrefixComparators.FLOAT.computePrefix(nan1)
+    val nan2Prefix = PrefixComparators.FLOAT.computePrefix(nan2)
+    assert(nan1Prefix === nan2Prefix)
+    val floatMaxPrefix = PrefixComparators.FLOAT.computePrefix(Float.MaxValue)
+    assert(PrefixComparators.FLOAT.compare(nan1Prefix, floatMaxPrefix) === 1)
+  }
+
   test("double prefix comparator handles NaNs properly") {
     val nan1: Double = java.lang.Double.longBitsToDouble(0x7ff0000000000001L)
     val nan2: Double = java.lang.Double.longBitsToDouble(0x7fffffffffffffffL)
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
index 8342833246f7d..4c3f2c6557140 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
@@ -121,7 +121,7 @@ Iterator<InternalRow> sort() throws IOException {
         // here in order to prevent memory leaks.
         cleanupResources();
       }
-      return new AbstractScalaRowIterator<InternalRow>() {
+      return new AbstractScalaRowIterator() {
 
         private final int numFields = schema.length();
         private UnsafeRow row = new UnsafeRow();
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
index 050d27f1460fb..2dee3542d6101 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
@@ -19,7 +19,7 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{BoundReference, SortOrder}
+import org.apache.spark.sql.catalyst.expressions.SortOrder
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, PrefixComparator}
@@ -39,54 +39,57 @@ object SortPrefixUtils {
     sortOrder.dataType match {
       case StringType => PrefixComparators.STRING
       case BooleanType | ByteType | ShortType | IntegerType | LongType => PrefixComparators.INTEGRAL
-      case FloatType | DoubleType => PrefixComparators.DOUBLE
+      case FloatType => PrefixComparators.FLOAT
+      case DoubleType => PrefixComparators.DOUBLE
       case _ => NoOpPrefixComparator
     }
   }
 
   def getPrefixComputer(sortOrder: SortOrder): InternalRow => Long = {
-    val bound = sortOrder.child.asInstanceOf[BoundReference]
-    val pos = bound.ordinal
     sortOrder.dataType match {
-      case StringType =>
-        (row: InternalRow) => {
-          PrefixComparators.STRING.computePrefix(row.getUTF8String(pos))
-        }
+      case StringType => (row: InternalRow) => {
+        PrefixComparators.STRING.computePrefix(sortOrder.child.eval(row).asInstanceOf[UTF8String])
+      }
       case BooleanType =>
         (row: InternalRow) => {
-          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else if (row.getBoolean(pos)) 1
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else if (sortOrder.child.eval(row).asInstanceOf[Boolean]) 1
           else 0
         }
       case ByteType =>
         (row: InternalRow) => {
-          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX else row.getByte(pos)
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else sortOrder.child.eval(row).asInstanceOf[Byte]
         }
       case ShortType =>
         (row: InternalRow) => {
-          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX else row.getShort(pos)
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else sortOrder.child.eval(row).asInstanceOf[Short]
         }
       case IntegerType =>
         (row: InternalRow) => {
-          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX else row.getInt(pos)
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else sortOrder.child.eval(row).asInstanceOf[Int]
         }
       case LongType =>
         (row: InternalRow) => {
-          if (row.isNullAt(pos)) PrefixComparators.INTEGRAL.NULL_PREFIX else row.getLong(pos)
+          val exprVal = sortOrder.child.eval(row)
+          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
+          else sortOrder.child.eval(row).asInstanceOf[Long]
         }
       case FloatType => (row: InternalRow) => {
-        if (row.isNullAt(pos)) {
-          PrefixComparators.DOUBLE.NULL_PREFIX
-        } else {
-          PrefixComparators.DOUBLE.computePrefix(row.getFloat(pos).toDouble)
-        }
+        val exprVal = sortOrder.child.eval(row)
+        if (exprVal == null) PrefixComparators.FLOAT.NULL_PREFIX
+        else PrefixComparators.FLOAT.computePrefix(sortOrder.child.eval(row).asInstanceOf[Float])
       }
       case DoubleType => (row: InternalRow) => {
-        if (row.isNullAt(pos)) {
-          PrefixComparators.DOUBLE.NULL_PREFIX
-        } else {
-          PrefixComparators.DOUBLE.computePrefix(row.getDouble(pos))
-        }
+        val exprVal = sortOrder.child.eval(row)
+        if (exprVal == null) PrefixComparators.DOUBLE.NULL_PREFIX
+        else PrefixComparators.DOUBLE.computePrefix(sortOrder.child.eval(row).asInstanceOf[Double])
       }
       case _ => (row: InternalRow) => 0L
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 4ab2c41f1b339..f3ef066528ff8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -340,8 +340,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
      */
     def getSortOperator(sortExprs: Seq[SortOrder], global: Boolean, child: SparkPlan): SparkPlan = {
       if (sqlContext.conf.unsafeEnabled && sqlContext.conf.codegenEnabled &&
-        TungstenSort.supportsSchema(child.schema)) {
-        execution.TungstenSort(sortExprs, global, child)
+        UnsafeExternalSort.supportsSchema(child.schema)) {
+        execution.UnsafeExternalSort(sortExprs, global, child)
       } else if (sqlContext.conf.externalSortEnabled) {
         execution.ExternalSort(sortExprs, global, child)
       } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
index d0ad310062853..f82208868c3e3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
@@ -97,7 +97,7 @@ case class ExternalSort(
  * @param testSpillFrequency Method for configuring periodic spilling in unit tests. If set, will
  *                           spill every `frequency` records.
  */
-case class TungstenSort(
+case class UnsafeExternalSort(
     sortOrder: Seq[SortOrder],
     global: Boolean,
     child: SparkPlan,
@@ -110,6 +110,7 @@ case class TungstenSort(
     if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
 
   protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
+    assert(codegenEnabled, "UnsafeExternalSort requires code generation to be enabled")
     def doSort(iterator: Iterator[InternalRow]): Iterator[InternalRow] = {
       val ordering = newOrdering(sortOrder, child.output)
       val boundSortExpression = BindReferences.bindReference(sortOrder.head, child.output)
@@ -148,7 +149,7 @@ case class TungstenSort(
 }
 
 @DeveloperApi
-object TungstenSort {
+object UnsafeExternalSort {
   /**
    * Return true if UnsafeExternalSort can sort rows with the given schema, false otherwise.
    */
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
index c458f95ca1ab3..7b75f755918c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
@@ -31,7 +31,7 @@ class RowFormatConvertersSuite extends SparkPlanTest {
 
   private val outputsSafe = ExternalSort(Nil, false, PhysicalRDD(Seq.empty, null))
   assert(!outputsSafe.outputsUnsafeRows)
-  private val outputsUnsafe = TungstenSort(Nil, false, PhysicalRDD(Seq.empty, null))
+  private val outputsUnsafe = UnsafeExternalSort(Nil, false, PhysicalRDD(Seq.empty, null))
   assert(outputsUnsafe.outputsUnsafeRows)
 
   test("planner should insert unsafe->safe conversions when required") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
index 9cabc4b90bf8e..7a4baa9e4a49d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
@@ -42,7 +42,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "false")
     checkThatPlansAgree(
       (1 to 100).map(v => Tuple1(v)).toDF("a"),
-      (child: SparkPlan) => Limit(10, TungstenSort('a.asc :: Nil, true, child)),
+      (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
       (child: SparkPlan) => Limit(10, Sort('a.asc :: Nil, global = true, child)),
       sortAnswers = false
     )
@@ -53,7 +53,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     try {
       checkThatPlansAgree(
         (1 to 100).map(v => Tuple1(v)).toDF("a"),
-        (child: SparkPlan) => Limit(10, TungstenSort('a.asc :: Nil, true, child)),
+        (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
         (child: SparkPlan) => Limit(10, Sort('a.asc :: Nil, global = true, child)),
         sortAnswers = false
       )
@@ -68,7 +68,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     val stringLength = 1024 * 1024 * 2
     checkThatPlansAgree(
       Seq(Tuple1("a" * stringLength), Tuple1("b" * stringLength)).toDF("a").repartition(1),
-      TungstenSort(sortOrder, global = true, _: SparkPlan, testSpillFrequency = 1),
+      UnsafeExternalSort(sortOrder, global = true, _: SparkPlan, testSpillFrequency = 1),
       Sort(sortOrder, global = true, _: SparkPlan),
       sortAnswers = false
     )
@@ -88,11 +88,11 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
         TestSQLContext.sparkContext.parallelize(Random.shuffle(inputData).map(v => Row(v))),
         StructType(StructField("a", dataType, nullable = true) :: Nil)
       )
-      assert(TungstenSort.supportsSchema(inputDf.schema))
+      assert(UnsafeExternalSort.supportsSchema(inputDf.schema))
       checkThatPlansAgree(
         inputDf,
         plan => ConvertToSafe(
-          TungstenSort(sortOrder, global = true, plan: SparkPlan, testSpillFrequency = 23)),
+          UnsafeExternalSort(sortOrder, global = true, plan: SparkPlan, testSpillFrequency = 23)),
         Sort(sortOrder, global = true, _: SparkPlan),
         sortAnswers = false
       )

From 5ba2d44068b89fd8e81cfd24f49bf20d373f81b9 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 30 Jul 2015 01:21:39 -0700
Subject: [PATCH 466/549] Fix flaky HashedRelationSuite

SparkEnv might not have been set in local unit tests.

Author: Reynold Xin <rxin@databricks.com>

Closes #7784 from rxin/HashedRelationSuite and squashes the following commits:

435d64b [Reynold Xin] Fix flaky HashedRelationSuite
---
 .../apache/spark/sql/execution/joins/HashedRelation.scala  | 7 +++++--
 .../spark/sql/execution/joins/HashedRelationSuite.scala    | 6 +++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index 7a507391316a9..26dbc911e9521 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -21,7 +21,7 @@ import java.io.{Externalizable, ObjectInput, ObjectOutput}
 import java.nio.ByteOrder
 import java.util.{HashMap => JavaHashMap}
 
-import org.apache.spark.{SparkEnv, TaskContext}
+import org.apache.spark.{SparkConf, SparkEnv, TaskContext}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.SparkSqlSerializer
@@ -260,7 +260,10 @@ private[joins] final class UnsafeHashedRelation(
     val nKeys = in.readInt()
     // This is used in Broadcast, shared by multiple tasks, so we use on-heap memory
     val memoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP))
-    val pageSizeBytes = SparkEnv.get.conf.getSizeAsBytes("spark.buffer.pageSize", "64m")
+
+    val pageSizeBytes = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
+      .getSizeAsBytes("spark.buffer.pageSize", "64m")
+
     binaryMap = new BytesToBytesMap(
       memoryManager,
       nKeys * 2, // reduce hash collision
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
index 941f6d4f6a450..8b1a9b21a96b9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala
@@ -33,7 +33,7 @@ class HashedRelationSuite extends SparkFunSuite {
     override def apply(row: InternalRow): InternalRow = row
   }
 
-  ignore("GeneralHashedRelation") {
+  test("GeneralHashedRelation") {
     val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2))
     val hashed = HashedRelation(data.iterator, keyProjection)
     assert(hashed.isInstanceOf[GeneralHashedRelation])
@@ -47,7 +47,7 @@ class HashedRelationSuite extends SparkFunSuite {
     assert(hashed.get(data(2)) === data2)
   }
 
-  ignore("UniqueKeyHashedRelation") {
+  test("UniqueKeyHashedRelation") {
     val data = Array(InternalRow(0), InternalRow(1), InternalRow(2))
     val hashed = HashedRelation(data.iterator, keyProjection)
     assert(hashed.isInstanceOf[UniqueKeyHashedRelation])
@@ -64,7 +64,7 @@ class HashedRelationSuite extends SparkFunSuite {
     assert(uniqHashed.getValue(InternalRow(10)) === null)
   }
 
-  ignore("UnsafeHashedRelation") {
+  test("UnsafeHashedRelation") {
     val schema = StructType(StructField("a", IntegerType, true) :: Nil)
     val data = Array(InternalRow(0), InternalRow(1), InternalRow(2), InternalRow(2))
     val toUnsafe = UnsafeProjection.create(schema)

From 6175d6cfe795fbd88e3ee713fac375038a3993a8 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Thu, 30 Jul 2015 17:45:30 +0800
Subject: [PATCH 467/549] [SPARK-8838] [SQL] Add config to enable/disable
 merging part-files when merging parquet schema

JIRA: https://issues.apache.org/jira/browse/SPARK-8838

Currently all part-files are merged when merging parquet schema. However, in case there are many part-files and we can make sure that all the part-files have the same schema as their summary file. If so, we provide a configuration to disable merging part-files when merging parquet schema.

In short, we need to merge parquet schema because different summary files may contain different schema. But the part-files are confirmed to have the same schema with summary files.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7238 from viirya/option_partfile_merge and squashes the following commits:

71d5b5f [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into option_partfile_merge
8816f44 [Liang-Chi Hsieh] For comments.
dbc8e6b [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into option_partfile_merge
afc2fa1 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into option_partfile_merge
d4ed7e6 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into option_partfile_merge
df43027 [Liang-Chi Hsieh] Get dataStatuses' partitions based on all paths.
4eb2f00 [Liang-Chi Hsieh] Use given parameter.
ea8f6e5 [Liang-Chi Hsieh] Correct the code comments.
a57be0e [Liang-Chi Hsieh] Merge part-files if there are no summary files.
47df981 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into option_partfile_merge
4caf293 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into option_partfile_merge
0e734e0 [Liang-Chi Hsieh] Use correct API.
3b6be5b [Liang-Chi Hsieh] Fix key not found.
4bdd7e0 [Liang-Chi Hsieh] Don't read footer files if we can skip them.
8bbebcb [Liang-Chi Hsieh] Figure out how to test the config.
bbd4ce7 [Liang-Chi Hsieh] Add config to enable/disable merging part-files when merging parquet schema.
---
 .../scala/org/apache/spark/sql/SQLConf.scala  |  7 +++++
 .../spark/sql/parquet/ParquetRelation.scala   | 19 ++++++++++++-
 .../spark/sql/parquet/ParquetQuerySuite.scala | 27 +++++++++++++++++++
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index cdb0c7a1c07a7..2564bbd2077bf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -247,6 +247,13 @@ private[spark] object SQLConf {
           "otherwise the schema is picked from the summary file or a random data file " +
           "if no summary file is available.")
 
+  val PARQUET_SCHEMA_RESPECT_SUMMARIES = booleanConf("spark.sql.parquet.respectSummaryFiles",
+    defaultValue = Some(false),
+    doc = "When true, we make assumption that all part-files of Parquet are consistent with " +
+          "summary files and we will ignore them when merging schema. Otherwise, if this is " +
+          "false, which is the default, we will merge all part-files. This should be considered " +
+          "as expert-only option, and shouldn't be enabled before knowing what it means exactly.")
+
   val PARQUET_BINARY_AS_STRING = booleanConf("spark.sql.parquet.binaryAsString",
     defaultValue = Some(false),
     doc = "Some other Parquet-producing systems, in particular Impala and older versions of " +
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
index 1a8176d8a80ab..b4337a48dbd80 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -124,6 +124,9 @@ private[sql] class ParquetRelation(
       .map(_.toBoolean)
       .getOrElse(sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED))
 
+  private val mergeRespectSummaries =
+    sqlContext.conf.getConf(SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES)
+
   private val maybeMetastoreSchema = parameters
     .get(ParquetRelation.METASTORE_SCHEMA)
     .map(DataType.fromJson(_).asInstanceOf[StructType])
@@ -421,7 +424,21 @@ private[sql] class ParquetRelation(
       val filesToTouch =
         if (shouldMergeSchemas) {
           // Also includes summary files, 'cause there might be empty partition directories.
-          (metadataStatuses ++ commonMetadataStatuses ++ dataStatuses).toSeq
+
+          // If mergeRespectSummaries config is true, we assume that all part-files are the same for
+          // their schema with summary files, so we ignore them when merging schema.
+          // If the config is disabled, which is the default setting, we merge all part-files.
+          // In this mode, we only need to merge schemas contained in all those summary files.
+          // You should enable this configuration only if you are very sure that for the parquet
+          // part-files to read there are corresponding summary files containing correct schema.
+
+          val needMerged: Seq[FileStatus] =
+            if (mergeRespectSummaries) {
+              Seq()
+            } else {
+              dataStatuses
+            }
+          (metadataStatuses ++ commonMetadataStatuses ++ needMerged).toSeq
         } else {
           // Tries any "_common_metadata" first. Parquet files written by old versions or Parquet
           // don't have this.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
index c037faf4cfd92..a95f70f2bba69 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
@@ -17,10 +17,13 @@
 
 package org.apache.spark.sql.parquet
 
+import java.io.File
+
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.{QueryTest, Row, SQLConf}
+import org.apache.spark.util.Utils
 
 /**
  * A test suite that tests various Parquet queries.
@@ -123,6 +126,30 @@ class ParquetQuerySuite extends QueryTest with ParquetTest {
     }
   }
 
+  test("Enabling/disabling merging partfiles when merging parquet schema") {
+    def testSchemaMerging(expectedColumnNumber: Int): Unit = {
+      withTempDir { dir =>
+        val basePath = dir.getCanonicalPath
+        sqlContext.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString)
+        sqlContext.range(0, 10).toDF("b").write.parquet(new Path(basePath, "foo=2").toString)
+        // delete summary files, so if we don't merge part-files, one column will not be included.
+        Utils.deleteRecursively(new File(basePath + "/foo=1/_metadata"))
+        Utils.deleteRecursively(new File(basePath + "/foo=1/_common_metadata"))
+        assert(sqlContext.read.parquet(basePath).columns.length === expectedColumnNumber)
+      }
+    }
+
+    withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true",
+      SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES.key -> "true") {
+      testSchemaMerging(2)
+    }
+
+    withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true",
+      SQLConf.PARQUET_SCHEMA_RESPECT_SUMMARIES.key -> "false") {
+      testSchemaMerging(3)
+    }
+  }
+
   test("Enabling/disabling schema merging") {
     def testSchemaMerging(expectedColumnNumber: Int): Unit = {
       withTempDir { dir =>

From d31c618e3c8838f8198556876b9dcbbbf835f7b2 Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Thu, 30 Jul 2015 07:49:10 -0700
Subject: [PATCH 468/549] [SPARK-7368] [MLLIB] Add QR decomposition for
 RowMatrix

jira: https://issues.apache.org/jira/browse/SPARK-7368
Add QR decomposition for RowMatrix.

I'm not sure what's the blueprint about the distributed Matrix from community and whether this will be a desirable feature , so I sent a prototype for discussion. I'll go on polish the code and provide ut and performance statistics if it's acceptable.

The implementation refers to the [paper: https://www.cs.purdue.edu/homes/dgleich/publications/Benson%202013%20-%20direct-tsqr.pdf]
Austin R. Benson, David F. Gleich, James Demmel. "Direct QR factorizations for tall-and-skinny matrices in MapReduce architectures", 2013 IEEE International Conference on Big Data, which is a stable algorithm with good scalability.

Currently I tried it on a 400000 * 500 rowMatrix (16 partitions) and it can bring down the computation time from 8.8 mins (using breeze.linalg.qr.reduced)  to 2.6 mins on a 4 worker cluster. I think there will still be some room for performance improvement.

Any trial and suggestion is welcome.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #5909 from hhbyyh/qrDecomposition and squashes the following commits:

cec797b [Yuhao Yang] remove unnecessary qr
0fb1012 [Yuhao Yang] hierarchy R computing
3fbdb61 [Yuhao Yang] update qr to indirect and add ut
0d913d3 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition
39213c3 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition
c0fc0c7 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition
39b0b22 [Yuhao Yang] initial draft for discussion
---
 .../linalg/SingularValueDecomposition.scala   |  8 ++++
 .../mllib/linalg/distributed/RowMatrix.scala  | 46 ++++++++++++++++++-
 .../linalg/distributed/RowMatrixSuite.scala   | 17 +++++++
 3 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
index 9669c364bad8f..b416d50a5631e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
@@ -25,3 +25,11 @@ import org.apache.spark.annotation.Experimental
  */
 @Experimental
 case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VType)
+
+/**
+ * :: Experimental ::
+ * Represents QR factors.
+ */
+@Experimental
+case class QRDecomposition[UType, VType](Q: UType, R: VType)
+
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index 1626da9c3d2ee..bfc90c9ef8527 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -22,7 +22,7 @@ import java.util.Arrays
 import scala.collection.mutable.ListBuffer
 
 import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy,
-  svd => brzSvd}
+  svd => brzSvd, MatrixSingularException, inv}
 import breeze.numerics.{sqrt => brzSqrt}
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
@@ -497,6 +497,50 @@ class RowMatrix(
     columnSimilaritiesDIMSUM(computeColumnSummaryStatistics().normL2.toArray, gamma)
   }
 
+  /**
+   * Compute QR decomposition for [[RowMatrix]]. The implementation is designed to optimize the QR
+   * decomposition (factorization) for the [[RowMatrix]] of a tall and skinny shape.
+   * Reference:
+   *  Paul G. Constantine, David F. Gleich. "Tall and skinny QR factorizations in MapReduce
+   *  architectures"  ([[http://dx.doi.org/10.1145/1996092.1996103]])
+   *
+   * @param computeQ whether to computeQ
+   * @return QRDecomposition(Q, R), Q = null if computeQ = false.
+   */
+  def tallSkinnyQR(computeQ: Boolean = false): QRDecomposition[RowMatrix, Matrix] = {
+    val col = numCols().toInt
+    // split rows horizontally into smaller matrices, and compute QR for each of them
+    val blockQRs = rows.glom().map { partRows =>
+      val bdm = BDM.zeros[Double](partRows.length, col)
+      var i = 0
+      partRows.foreach { row =>
+        bdm(i, ::) := row.toBreeze.t
+        i += 1
+      }
+      breeze.linalg.qr.reduced(bdm).r
+    }
+
+    // combine the R part from previous results vertically into a tall matrix
+    val combinedR = blockQRs.treeReduce{ (r1, r2) =>
+      val stackedR = BDM.vertcat(r1, r2)
+      breeze.linalg.qr.reduced(stackedR).r
+    }
+    val finalR = Matrices.fromBreeze(combinedR.toDenseMatrix)
+    val finalQ = if (computeQ) {
+      try {
+        val invR = inv(combinedR)
+        this.multiply(Matrices.fromBreeze(invR))
+      } catch {
+        case err: MatrixSingularException =>
+          logWarning("R is not invertible and return Q as null")
+          null
+      }
+    } else {
+      null
+    }
+    QRDecomposition(finalQ, finalR)
+  }
+
   /**
    * Find all similar columns using the DIMSUM sampling algorithm, described in two papers
    *
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
index b6cb53d0c743e..283ffec1d49d7 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.linalg.distributed
 
 import scala.util.Random
 
+import breeze.numerics.abs
 import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, norm => brzNorm, svd => brzSvd}
 
 import org.apache.spark.SparkFunSuite
@@ -238,6 +239,22 @@ class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
       }
     }
   }
+
+  test("QR Decomposition") {
+    for (mat <- Seq(denseMat, sparseMat)) {
+      val result = mat.tallSkinnyQR(true)
+      val expected = breeze.linalg.qr.reduced(mat.toBreeze())
+      val calcQ = result.Q
+      val calcR = result.R
+      assert(closeToZero(abs(expected.q) - abs(calcQ.toBreeze())))
+      assert(closeToZero(abs(expected.r) - abs(calcR.toBreeze.asInstanceOf[BDM[Double]])))
+      assert(closeToZero(calcQ.multiply(calcR).toBreeze - mat.toBreeze()))
+      // Decomposition without computing Q
+      val rOnly = mat.tallSkinnyQR(computeQ = false)
+      assert(rOnly.Q == null)
+      assert(closeToZero(abs(expected.r) - abs(rOnly.R.toBreeze.asInstanceOf[BDM[Double]])))
+    }
+  }
 }
 
 class RowMatrixClusterSuite extends SparkFunSuite with LocalClusterSparkContext {

From c5815930be46a89469440b7c61b59764fb67a54c Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Thu, 30 Jul 2015 07:56:15 -0700
Subject: [PATCH 469/549] [SPARK-5561] [MLLIB] Generalized PeriodicCheckpointer
 for RDDs and Graphs

PeriodicGraphCheckpointer was introduced for Latent Dirichlet Allocation (LDA), but it was meant to be generalized to work with Graphs, RDDs, and other data structures based on RDDs.  This PR generalizes it.

For those who are not familiar with the periodic checkpointer, it tries to automatically handle persisting/unpersisting and checkpointing/removing checkpoint files in a lineage of RDD-based objects.

I need it generalized to use with GradientBoostedTrees [https://issues.apache.org/jira/browse/SPARK-6684].  It should be useful for other iterative algorithms as well.

Changes I made:
* Copied PeriodicGraphCheckpointer to PeriodicCheckpointer.
* Within PeriodicCheckpointer, I created abstract methods for the basic operations (checkpoint, persist, etc.).
* The subclasses for Graphs and RDDs implement those abstract methods.
* I copied the test suite for the graph checkpointer and made tiny modifications to make it work for RDDs.

To review this PR, I recommend doing 2 diffs:
(1) diff between the old PeriodicGraphCheckpointer.scala and the new PeriodicCheckpointer.scala
(2) diff between the 2 test suites

CCing andrewor14 in case there are relevant changes to checkpointing.
CCing feynmanliang in case you're interested in learning about checkpointing.
CCing mengxr for final OK.
Thanks all!

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #7728 from jkbradley/gbt-checkpoint and squashes the following commits:

d41902c [Joseph K. Bradley] Oops, forgot to update an extra time in the checkpointer tests, after the last commit. I'll fix that. I'll also make some of the checkpointer methods protected, which I should have done before.
32b23b8 [Joseph K. Bradley] fixed usage of checkpointer in lda
0b3dbc0 [Joseph K. Bradley] Changed checkpointer constructor not to take initial data.
568918c [Joseph K. Bradley] Generalized PeriodicGraphCheckpointer to PeriodicCheckpointer, with subclasses for RDDs and Graphs.
---
 .../spark/mllib/clustering/LDAOptimizer.scala |   6 +-
 .../mllib/impl/PeriodicCheckpointer.scala     | 154 ++++++++++++++++
 .../impl/PeriodicGraphCheckpointer.scala      | 105 ++---------
 .../mllib/impl/PeriodicRDDCheckpointer.scala  |  97 ++++++++++
 .../impl/PeriodicGraphCheckpointerSuite.scala |  16 +-
 .../impl/PeriodicRDDCheckpointerSuite.scala   | 173 ++++++++++++++++++
 6 files changed, 452 insertions(+), 99 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicCheckpointer.scala
 create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicRDDCheckpointer.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicRDDCheckpointerSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 7e75e7083acb5..4b90fbdf0ce7e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -142,8 +142,8 @@ final class EMLDAOptimizer extends LDAOptimizer {
     this.k = k
     this.vocabSize = docs.take(1).head._2.size
     this.checkpointInterval = lda.getCheckpointInterval
-    this.graphCheckpointer = new
-      PeriodicGraphCheckpointer[TopicCounts, TokenCount](graph, checkpointInterval)
+    this.graphCheckpointer = new PeriodicGraphCheckpointer[TopicCounts, TokenCount](
+      checkpointInterval, graph.vertices.sparkContext)
     this.globalTopicTotals = computeGlobalTopicTotals()
     this
   }
@@ -188,7 +188,7 @@ final class EMLDAOptimizer extends LDAOptimizer {
     // Update the vertex descriptors with the new counts.
     val newGraph = GraphImpl.fromExistingRDDs(docTopicDistributions, graph.edges)
     graph = newGraph
-    graphCheckpointer.updateGraph(newGraph)
+    graphCheckpointer.update(newGraph)
     globalTopicTotals = computeGlobalTopicTotals()
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicCheckpointer.scala b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicCheckpointer.scala
new file mode 100644
index 0000000000000..72d3aabc9b1f4
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicCheckpointer.scala
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.impl
+
+import scala.collection.mutable
+
+import org.apache.hadoop.fs.{Path, FileSystem}
+
+import org.apache.spark.{SparkContext, Logging}
+import org.apache.spark.storage.StorageLevel
+
+
+/**
+ * This abstraction helps with persisting and checkpointing RDDs and types derived from RDDs
+ * (such as Graphs and DataFrames).  In documentation, we use the phrase "Dataset" to refer to
+ * the distributed data type (RDD, Graph, etc.).
+ *
+ * Specifically, this abstraction automatically handles persisting and (optionally) checkpointing,
+ * as well as unpersisting and removing checkpoint files.
+ *
+ * Users should call update() when a new Dataset has been created,
+ * before the Dataset has been materialized.  After updating [[PeriodicCheckpointer]], users are
+ * responsible for materializing the Dataset to ensure that persisting and checkpointing actually
+ * occur.
+ *
+ * When update() is called, this does the following:
+ *  - Persist new Dataset (if not yet persisted), and put in queue of persisted Datasets.
+ *  - Unpersist Datasets from queue until there are at most 3 persisted Datasets.
+ *  - If using checkpointing and the checkpoint interval has been reached,
+ *     - Checkpoint the new Dataset, and put in a queue of checkpointed Datasets.
+ *     - Remove older checkpoints.
+ *
+ * WARNINGS:
+ *  - This class should NOT be copied (since copies may conflict on which Datasets should be
+ *    checkpointed).
+ *  - This class removes checkpoint files once later Datasets have been checkpointed.
+ *    However, references to the older Datasets will still return isCheckpointed = true.
+ *
+ * @param checkpointInterval  Datasets will be checkpointed at this interval
+ * @param sc  SparkContext for the Datasets given to this checkpointer
+ * @tparam T  Dataset type, such as RDD[Double]
+ */
+private[mllib] abstract class PeriodicCheckpointer[T](
+    val checkpointInterval: Int,
+    val sc: SparkContext) extends Logging {
+
+  /** FIFO queue of past checkpointed Datasets */
+  private val checkpointQueue = mutable.Queue[T]()
+
+  /** FIFO queue of past persisted Datasets */
+  private val persistedQueue = mutable.Queue[T]()
+
+  /** Number of times [[update()]] has been called */
+  private var updateCount = 0
+
+  /**
+   * Update with a new Dataset. Handle persistence and checkpointing as needed.
+   * Since this handles persistence and checkpointing, this should be called before the Dataset
+   * has been materialized.
+   *
+   * @param newData  New Dataset created from previous Datasets in the lineage.
+   */
+  def update(newData: T): Unit = {
+    persist(newData)
+    persistedQueue.enqueue(newData)
+    // We try to maintain 2 Datasets in persistedQueue to support the semantics of this class:
+    // Users should call [[update()]] when a new Dataset has been created,
+    // before the Dataset has been materialized.
+    while (persistedQueue.size > 3) {
+      val dataToUnpersist = persistedQueue.dequeue()
+      unpersist(dataToUnpersist)
+    }
+    updateCount += 1
+
+    // Handle checkpointing (after persisting)
+    if ((updateCount % checkpointInterval) == 0 && sc.getCheckpointDir.nonEmpty) {
+      // Add new checkpoint before removing old checkpoints.
+      checkpoint(newData)
+      checkpointQueue.enqueue(newData)
+      // Remove checkpoints before the latest one.
+      var canDelete = true
+      while (checkpointQueue.size > 1 && canDelete) {
+        // Delete the oldest checkpoint only if the next checkpoint exists.
+        if (isCheckpointed(checkpointQueue.head)) {
+          removeCheckpointFile()
+        } else {
+          canDelete = false
+        }
+      }
+    }
+  }
+
+  /** Checkpoint the Dataset */
+  protected def checkpoint(data: T): Unit
+
+  /** Return true iff the Dataset is checkpointed */
+  protected def isCheckpointed(data: T): Boolean
+
+  /**
+   * Persist the Dataset.
+   * Note: This should handle checking the current [[StorageLevel]] of the Dataset.
+   */
+  protected def persist(data: T): Unit
+
+  /** Unpersist the Dataset */
+  protected def unpersist(data: T): Unit
+
+  /** Get list of checkpoint files for this given Dataset */
+  protected def getCheckpointFiles(data: T): Iterable[String]
+
+  /**
+   * Call this at the end to delete any remaining checkpoint files.
+   */
+  def deleteAllCheckpoints(): Unit = {
+    while (checkpointQueue.nonEmpty) {
+      removeCheckpointFile()
+    }
+  }
+
+  /**
+   * Dequeue the oldest checkpointed Dataset, and remove its checkpoint files.
+   * This prints a warning but does not fail if the files cannot be removed.
+   */
+  private def removeCheckpointFile(): Unit = {
+    val old = checkpointQueue.dequeue()
+    // Since the old checkpoint is not deleted by Spark, we manually delete it.
+    val fs = FileSystem.get(sc.hadoopConfiguration)
+    getCheckpointFiles(old).foreach { checkpointFile =>
+      try {
+        fs.delete(new Path(checkpointFile), true)
+      } catch {
+        case e: Exception =>
+          logWarning("PeriodicCheckpointer could not remove old checkpoint file: " +
+            checkpointFile)
+      }
+    }
+  }
+
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointer.scala b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointer.scala
index 6e5dd119dd653..11a059536c50c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointer.scala
@@ -17,11 +17,7 @@
 
 package org.apache.spark.mllib.impl
 
-import scala.collection.mutable
-
-import org.apache.hadoop.fs.{Path, FileSystem}
-
-import org.apache.spark.Logging
+import org.apache.spark.SparkContext
 import org.apache.spark.graphx.Graph
 import org.apache.spark.storage.StorageLevel
 
@@ -31,12 +27,12 @@ import org.apache.spark.storage.StorageLevel
  * Specifically, it automatically handles persisting and (optionally) checkpointing, as well as
  * unpersisting and removing checkpoint files.
  *
- * Users should call [[PeriodicGraphCheckpointer.updateGraph()]] when a new graph has been created,
+ * Users should call update() when a new graph has been created,
  * before the graph has been materialized.  After updating [[PeriodicGraphCheckpointer]], users are
  * responsible for materializing the graph to ensure that persisting and checkpointing actually
  * occur.
  *
- * When [[PeriodicGraphCheckpointer.updateGraph()]] is called, this does the following:
+ * When update() is called, this does the following:
  *  - Persist new graph (if not yet persisted), and put in queue of persisted graphs.
  *  - Unpersist graphs from queue until there are at most 3 persisted graphs.
  *  - If using checkpointing and the checkpoint interval has been reached,
@@ -52,7 +48,7 @@ import org.apache.spark.storage.StorageLevel
  * Example usage:
  * {{{
  *  val (graph1, graph2, graph3, ...) = ...
- *  val cp = new PeriodicGraphCheckpointer(graph1, dir, 2)
+ *  val cp = new PeriodicGraphCheckpointer(2, sc)
  *  graph1.vertices.count(); graph1.edges.count()
  *  // persisted: graph1
  *  cp.updateGraph(graph2)
@@ -73,99 +69,30 @@ import org.apache.spark.storage.StorageLevel
  *  // checkpointed: graph4
  * }}}
  *
- * @param currentGraph  Initial graph
  * @param checkpointInterval Graphs will be checkpointed at this interval
  * @tparam VD  Vertex descriptor type
  * @tparam ED  Edge descriptor type
  *
- * TODO: Generalize this for Graphs and RDDs, and move it out of MLlib.
+ * TODO: Move this out of MLlib?
  */
 private[mllib] class PeriodicGraphCheckpointer[VD, ED](
-    var currentGraph: Graph[VD, ED],
-    val checkpointInterval: Int) extends Logging {
-
-  /** FIFO queue of past checkpointed RDDs */
-  private val checkpointQueue = mutable.Queue[Graph[VD, ED]]()
-
-  /** FIFO queue of past persisted RDDs */
-  private val persistedQueue = mutable.Queue[Graph[VD, ED]]()
-
-  /** Number of times [[updateGraph()]] has been called */
-  private var updateCount = 0
-
-  /**
-   * Spark Context for the Graphs given to this checkpointer.
-   * NOTE: This code assumes that only one SparkContext is used for the given graphs.
-   */
-  private val sc = currentGraph.vertices.sparkContext
+    checkpointInterval: Int,
+    sc: SparkContext)
+  extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) {
 
-  updateGraph(currentGraph)
+  override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint()
 
-  /**
-   * Update [[currentGraph]] with a new graph. Handle persistence and checkpointing as needed.
-   * Since this handles persistence and checkpointing, this should be called before the graph
-   * has been materialized.
-   *
-   * @param newGraph  New graph created from previous graphs in the lineage.
-   */
-  def updateGraph(newGraph: Graph[VD, ED]): Unit = {
-    if (newGraph.vertices.getStorageLevel == StorageLevel.NONE) {
-      newGraph.persist()
-    }
-    persistedQueue.enqueue(newGraph)
-    // We try to maintain 2 Graphs in persistedQueue to support the semantics of this class:
-    // Users should call [[updateGraph()]] when a new graph has been created,
-    // before the graph has been materialized.
-    while (persistedQueue.size > 3) {
-      val graphToUnpersist = persistedQueue.dequeue()
-      graphToUnpersist.unpersist(blocking = false)
-    }
-    updateCount += 1
+  override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed
 
-    // Handle checkpointing (after persisting)
-    if ((updateCount % checkpointInterval) == 0 && sc.getCheckpointDir.nonEmpty) {
-      // Add new checkpoint before removing old checkpoints.
-      newGraph.checkpoint()
-      checkpointQueue.enqueue(newGraph)
-      // Remove checkpoints before the latest one.
-      var canDelete = true
-      while (checkpointQueue.size > 1 && canDelete) {
-        // Delete the oldest checkpoint only if the next checkpoint exists.
-        if (checkpointQueue.get(1).get.isCheckpointed) {
-          removeCheckpointFile()
-        } else {
-          canDelete = false
-        }
-      }
+  override protected def persist(data: Graph[VD, ED]): Unit = {
+    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
+      data.persist()
     }
   }
 
-  /**
-   * Call this at the end to delete any remaining checkpoint files.
-   */
-  def deleteAllCheckpoints(): Unit = {
-    while (checkpointQueue.size > 0) {
-      removeCheckpointFile()
-    }
-  }
+  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)
 
-  /**
-   * Dequeue the oldest checkpointed Graph, and remove its checkpoint files.
-   * This prints a warning but does not fail if the files cannot be removed.
-   */
-  private def removeCheckpointFile(): Unit = {
-    val old = checkpointQueue.dequeue()
-    // Since the old checkpoint is not deleted by Spark, we manually delete it.
-    val fs = FileSystem.get(sc.hadoopConfiguration)
-    old.getCheckpointFiles.foreach { checkpointFile =>
-      try {
-        fs.delete(new Path(checkpointFile), true)
-      } catch {
-        case e: Exception =>
-          logWarning("PeriodicGraphCheckpointer could not remove old checkpoint file: " +
-            checkpointFile)
-      }
-    }
+  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
+    data.getCheckpointFiles
   }
-
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicRDDCheckpointer.scala b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicRDDCheckpointer.scala
new file mode 100644
index 0000000000000..f31ed2aa90a64
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicRDDCheckpointer.scala
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.impl
+
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
+
+
+/**
+ * This class helps with persisting and checkpointing RDDs.
+ * Specifically, it automatically handles persisting and (optionally) checkpointing, as well as
+ * unpersisting and removing checkpoint files.
+ *
+ * Users should call update() when a new RDD has been created,
+ * before the RDD has been materialized.  After updating [[PeriodicRDDCheckpointer]], users are
+ * responsible for materializing the RDD to ensure that persisting and checkpointing actually
+ * occur.
+ *
+ * When update() is called, this does the following:
+ *  - Persist new RDD (if not yet persisted), and put in queue of persisted RDDs.
+ *  - Unpersist RDDs from queue until there are at most 3 persisted RDDs.
+ *  - If using checkpointing and the checkpoint interval has been reached,
+ *     - Checkpoint the new RDD, and put in a queue of checkpointed RDDs.
+ *     - Remove older checkpoints.
+ *
+ * WARNINGS:
+ *  - This class should NOT be copied (since copies may conflict on which RDDs should be
+ *    checkpointed).
+ *  - This class removes checkpoint files once later RDDs have been checkpointed.
+ *    However, references to the older RDDs will still return isCheckpointed = true.
+ *
+ * Example usage:
+ * {{{
+ *  val (rdd1, rdd2, rdd3, ...) = ...
+ *  val cp = new PeriodicRDDCheckpointer(2, sc)
+ *  rdd1.count();
+ *  // persisted: rdd1
+ *  cp.update(rdd2)
+ *  rdd2.count();
+ *  // persisted: rdd1, rdd2
+ *  // checkpointed: rdd2
+ *  cp.update(rdd3)
+ *  rdd3.count();
+ *  // persisted: rdd1, rdd2, rdd3
+ *  // checkpointed: rdd2
+ *  cp.update(rdd4)
+ *  rdd4.count();
+ *  // persisted: rdd2, rdd3, rdd4
+ *  // checkpointed: rdd4
+ *  cp.update(rdd5)
+ *  rdd5.count();
+ *  // persisted: rdd3, rdd4, rdd5
+ *  // checkpointed: rdd4
+ * }}}
+ *
+ * @param checkpointInterval  RDDs will be checkpointed at this interval
+ * @tparam T  RDD element type
+ *
+ * TODO: Move this out of MLlib?
+ */
+private[mllib] class PeriodicRDDCheckpointer[T](
+    checkpointInterval: Int,
+    sc: SparkContext)
+  extends PeriodicCheckpointer[RDD[T]](checkpointInterval, sc) {
+
+  override protected def checkpoint(data: RDD[T]): Unit = data.checkpoint()
+
+  override protected def isCheckpointed(data: RDD[T]): Boolean = data.isCheckpointed
+
+  override protected def persist(data: RDD[T]): Unit = {
+    if (data.getStorageLevel == StorageLevel.NONE) {
+      data.persist()
+    }
+  }
+
+  override protected def unpersist(data: RDD[T]): Unit = data.unpersist(blocking = false)
+
+  override protected def getCheckpointFiles(data: RDD[T]): Iterable[String] = {
+    data.getCheckpointFile.map(x => x)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointerSuite.scala
index d34888af2d73b..e331c75989187 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointerSuite.scala
@@ -30,20 +30,20 @@ class PeriodicGraphCheckpointerSuite extends SparkFunSuite with MLlibTestSparkCo
 
   import PeriodicGraphCheckpointerSuite._
 
-  // TODO: Do I need to call count() on the graphs' RDDs?
-
   test("Persisting") {
     var graphsToCheck = Seq.empty[GraphToCheck]
 
     val graph1 = createGraph(sc)
-    val checkpointer = new PeriodicGraphCheckpointer(graph1, 10)
+    val checkpointer =
+      new PeriodicGraphCheckpointer[Double, Double](10, graph1.vertices.sparkContext)
+    checkpointer.update(graph1)
     graphsToCheck = graphsToCheck :+ GraphToCheck(graph1, 1)
     checkPersistence(graphsToCheck, 1)
 
     var iteration = 2
     while (iteration < 9) {
       val graph = createGraph(sc)
-      checkpointer.updateGraph(graph)
+      checkpointer.update(graph)
       graphsToCheck = graphsToCheck :+ GraphToCheck(graph, iteration)
       checkPersistence(graphsToCheck, iteration)
       iteration += 1
@@ -57,7 +57,9 @@ class PeriodicGraphCheckpointerSuite extends SparkFunSuite with MLlibTestSparkCo
     var graphsToCheck = Seq.empty[GraphToCheck]
     sc.setCheckpointDir(path)
     val graph1 = createGraph(sc)
-    val checkpointer = new PeriodicGraphCheckpointer(graph1, checkpointInterval)
+    val checkpointer = new PeriodicGraphCheckpointer[Double, Double](
+      checkpointInterval, graph1.vertices.sparkContext)
+    checkpointer.update(graph1)
     graph1.edges.count()
     graph1.vertices.count()
     graphsToCheck = graphsToCheck :+ GraphToCheck(graph1, 1)
@@ -66,7 +68,7 @@ class PeriodicGraphCheckpointerSuite extends SparkFunSuite with MLlibTestSparkCo
     var iteration = 2
     while (iteration < 9) {
       val graph = createGraph(sc)
-      checkpointer.updateGraph(graph)
+      checkpointer.update(graph)
       graph.vertices.count()
       graph.edges.count()
       graphsToCheck = graphsToCheck :+ GraphToCheck(graph, iteration)
@@ -168,7 +170,7 @@ private object PeriodicGraphCheckpointerSuite {
       } else {
         // Graph should never be checkpointed
         assert(!graph.isCheckpointed, "Graph should never have been checkpointed")
-        assert(graph.getCheckpointFiles.length == 0, "Graph should not have any checkpoint files")
+        assert(graph.getCheckpointFiles.isEmpty, "Graph should not have any checkpoint files")
       }
     } catch {
       case e: AssertionError =>
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicRDDCheckpointerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicRDDCheckpointerSuite.scala
new file mode 100644
index 0000000000000..b2a459a68b5fa
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/impl/PeriodicRDDCheckpointerSuite.scala
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.impl
+
+import org.apache.hadoop.fs.{FileSystem, Path}
+
+import org.apache.spark.{SparkContext, SparkFunSuite}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.util.Utils
+
+
+class PeriodicRDDCheckpointerSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  import PeriodicRDDCheckpointerSuite._
+
+  test("Persisting") {
+    var rddsToCheck = Seq.empty[RDDToCheck]
+
+    val rdd1 = createRDD(sc)
+    val checkpointer = new PeriodicRDDCheckpointer[Double](10, rdd1.sparkContext)
+    checkpointer.update(rdd1)
+    rddsToCheck = rddsToCheck :+ RDDToCheck(rdd1, 1)
+    checkPersistence(rddsToCheck, 1)
+
+    var iteration = 2
+    while (iteration < 9) {
+      val rdd = createRDD(sc)
+      checkpointer.update(rdd)
+      rddsToCheck = rddsToCheck :+ RDDToCheck(rdd, iteration)
+      checkPersistence(rddsToCheck, iteration)
+      iteration += 1
+    }
+  }
+
+  test("Checkpointing") {
+    val tempDir = Utils.createTempDir()
+    val path = tempDir.toURI.toString
+    val checkpointInterval = 2
+    var rddsToCheck = Seq.empty[RDDToCheck]
+    sc.setCheckpointDir(path)
+    val rdd1 = createRDD(sc)
+    val checkpointer = new PeriodicRDDCheckpointer[Double](checkpointInterval, rdd1.sparkContext)
+    checkpointer.update(rdd1)
+    rdd1.count()
+    rddsToCheck = rddsToCheck :+ RDDToCheck(rdd1, 1)
+    checkCheckpoint(rddsToCheck, 1, checkpointInterval)
+
+    var iteration = 2
+    while (iteration < 9) {
+      val rdd = createRDD(sc)
+      checkpointer.update(rdd)
+      rdd.count()
+      rddsToCheck = rddsToCheck :+ RDDToCheck(rdd, iteration)
+      checkCheckpoint(rddsToCheck, iteration, checkpointInterval)
+      iteration += 1
+    }
+
+    checkpointer.deleteAllCheckpoints()
+    rddsToCheck.foreach { rdd =>
+      confirmCheckpointRemoved(rdd.rdd)
+    }
+
+    Utils.deleteRecursively(tempDir)
+  }
+}
+
+private object PeriodicRDDCheckpointerSuite {
+
+  case class RDDToCheck(rdd: RDD[Double], gIndex: Int)
+
+  def createRDD(sc: SparkContext): RDD[Double] = {
+    sc.parallelize(Seq(0.0, 1.0, 2.0, 3.0))
+  }
+
+  def checkPersistence(rdds: Seq[RDDToCheck], iteration: Int): Unit = {
+    rdds.foreach { g =>
+      checkPersistence(g.rdd, g.gIndex, iteration)
+    }
+  }
+
+  /**
+   * Check storage level of rdd.
+   * @param gIndex  Index of rdd in order inserted into checkpointer (from 1).
+   * @param iteration  Total number of rdds inserted into checkpointer.
+   */
+  def checkPersistence(rdd: RDD[_], gIndex: Int, iteration: Int): Unit = {
+    try {
+      if (gIndex + 2 < iteration) {
+        assert(rdd.getStorageLevel == StorageLevel.NONE)
+      } else {
+        assert(rdd.getStorageLevel != StorageLevel.NONE)
+      }
+    } catch {
+      case _: AssertionError =>
+        throw new Exception(s"PeriodicRDDCheckpointerSuite.checkPersistence failed with:\n" +
+          s"\t gIndex = $gIndex\n" +
+          s"\t iteration = $iteration\n" +
+          s"\t rdd.getStorageLevel = ${rdd.getStorageLevel}\n")
+    }
+  }
+
+  def checkCheckpoint(rdds: Seq[RDDToCheck], iteration: Int, checkpointInterval: Int): Unit = {
+    rdds.reverse.foreach { g =>
+      checkCheckpoint(g.rdd, g.gIndex, iteration, checkpointInterval)
+    }
+  }
+
+  def confirmCheckpointRemoved(rdd: RDD[_]): Unit = {
+    // Note: We cannot check rdd.isCheckpointed since that value is never updated.
+    //       Instead, we check for the presence of the checkpoint files.
+    //       This test should continue to work even after this rdd.isCheckpointed issue
+    //       is fixed (though it can then be simplified and not look for the files).
+    val fs = FileSystem.get(rdd.sparkContext.hadoopConfiguration)
+    rdd.getCheckpointFile.foreach { checkpointFile =>
+      assert(!fs.exists(new Path(checkpointFile)), "RDD checkpoint file should have been removed")
+    }
+  }
+
+  /**
+   * Check checkpointed status of rdd.
+   * @param gIndex  Index of rdd in order inserted into checkpointer (from 1).
+   * @param iteration  Total number of rdds inserted into checkpointer.
+   */
+  def checkCheckpoint(
+      rdd: RDD[_],
+      gIndex: Int,
+      iteration: Int,
+      checkpointInterval: Int): Unit = {
+    try {
+      if (gIndex % checkpointInterval == 0) {
+        // We allow 2 checkpoint intervals since we perform an action (checkpointing a second rdd)
+        // only AFTER PeriodicRDDCheckpointer decides whether to remove the previous checkpoint.
+        if (iteration - 2 * checkpointInterval < gIndex && gIndex <= iteration) {
+          assert(rdd.isCheckpointed, "RDD should be checkpointed")
+          assert(rdd.getCheckpointFile.nonEmpty, "RDD should have 2 checkpoint files")
+        } else {
+          confirmCheckpointRemoved(rdd)
+        }
+      } else {
+        // RDD should never be checkpointed
+        assert(!rdd.isCheckpointed, "RDD should never have been checkpointed")
+        assert(rdd.getCheckpointFile.isEmpty, "RDD should not have any checkpoint files")
+      }
+    } catch {
+      case e: AssertionError =>
+        throw new Exception(s"PeriodicRDDCheckpointerSuite.checkCheckpoint failed with:\n" +
+          s"\t gIndex = $gIndex\n" +
+          s"\t iteration = $iteration\n" +
+          s"\t checkpointInterval = $checkpointInterval\n" +
+          s"\t rdd.isCheckpointed = ${rdd.isCheckpointed}\n" +
+          s"\t rdd.getCheckpointFile = ${rdd.getCheckpointFile.mkString(", ")}\n" +
+          s"  AssertionError message: ${e.getMessage}")
+    }
+  }
+
+}

From d212a314227dec26c0dbec8ed3422d0ec8f818f9 Mon Sep 17 00:00:00 2001
From: zhangjiajin <zhangjiajin@huawei.com>
Date: Thu, 30 Jul 2015 08:14:09 -0700
Subject: [PATCH 470/549] [SPARK-8998] [MLLIB] Distribute PrefixSpan
 computation for large projected databases

Continuation of work by zhangjiajin

Closes #7412

Author: zhangjiajin <zhangjiajin@huawei.com>
Author: Feynman Liang <fliang@databricks.com>
Author: zhang jiajin <zhangjiajin@huawei.com>

Closes #7783 from feynmanliang/SPARK-8998-improve-distributed and squashes the following commits:

a61943d [Feynman Liang] Collect small patterns to local
4ddf479 [Feynman Liang] Parallelize freqItemCounts
ad23aa9 [zhang jiajin] Merge pull request #1 from feynmanliang/SPARK-8998-collectBeforeLocal
87fa021 [Feynman Liang] Improve extend prefix readability
c2caa5c [Feynman Liang] Readability improvements and comments
1235cfc [Feynman Liang] Use Iterable[Array[_]] over Array[Array[_]] for database
da0091b [Feynman Liang] Use lists for prefixes to reuse data
cb2a4fc [Feynman Liang] Inline code for readability
01c9ae9 [Feynman Liang] Add getters
6e149fa [Feynman Liang] Fix splitPrefixSuffixPairs
64271b3 [zhangjiajin] Modified codes according to comments.
d2250b7 [zhangjiajin] remove minPatternsBeforeLocalProcessing, add maxSuffixesBeforeLocalProcessing.
b07e20c [zhangjiajin] Merge branch 'master' of https://github.com/apache/spark into CollectEnoughPrefixes
095aa3a [zhangjiajin] Modified the code according to the review comments.
baa2885 [zhangjiajin] Modified the code according to the review comments.
6560c69 [zhangjiajin] Add feature: Collect enough frequent prefixes before projection in PrefixeSpan
a8fde87 [zhangjiajin] Merge branch 'master' of https://github.com/apache/spark
4dd1c8a [zhangjiajin] initialize file before rebase.
078d410 [zhangjiajin] fix a scala style error.
22b0ef4 [zhangjiajin] Add feature: Collect enough frequent prefixes before projection in PrefixSpan.
ca9c4c8 [zhangjiajin] Modified the code according to the review comments.
574e56c [zhangjiajin] Add new object LocalPrefixSpan, and do some optimization.
ba5df34 [zhangjiajin] Fix a Scala style error.
4c60fb3 [zhangjiajin] Fix some Scala style errors.
1dd33ad [zhangjiajin] Modified the code according to the review comments.
89bc368 [zhangjiajin] Fixed a Scala style error.
a2eb14c [zhang jiajin] Delete PrefixspanSuite.scala
951fd42 [zhang jiajin] Delete Prefixspan.scala
575995f [zhangjiajin] Modified the code according to the review comments.
91fd7e6 [zhangjiajin] Add new algorithm PrefixSpan and test file.
---
 .../spark/mllib/fpm/LocalPrefixSpan.scala     |   6 +-
 .../apache/spark/mllib/fpm/PrefixSpan.scala   | 203 +++++++++++++-----
 .../spark/mllib/fpm/PrefixSpanSuite.scala     |  21 +-
 3 files changed, 161 insertions(+), 69 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
index 7ead6327486cc..0ea792081086d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
@@ -40,7 +40,7 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
       minCount: Long,
       maxPatternLength: Int,
       prefixes: List[Int],
-      database: Array[Array[Int]]): Iterator[(List[Int], Long)] = {
+      database: Iterable[Array[Int]]): Iterator[(List[Int], Long)] = {
     if (prefixes.length == maxPatternLength || database.isEmpty) return Iterator.empty
     val frequentItemAndCounts = getFreqItemAndCounts(minCount, database)
     val filteredDatabase = database.map(x => x.filter(frequentItemAndCounts.contains))
@@ -67,7 +67,7 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
     }
   }
 
-  def project(database: Array[Array[Int]], prefix: Int): Array[Array[Int]] = {
+  def project(database: Iterable[Array[Int]], prefix: Int): Iterable[Array[Int]] = {
     database
       .map(getSuffix(prefix, _))
       .filter(_.nonEmpty)
@@ -81,7 +81,7 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
    */
   private def getFreqItemAndCounts(
       minCount: Long,
-      database: Array[Array[Int]]): mutable.Map[Int, Long] = {
+      database: Iterable[Array[Int]]): mutable.Map[Int, Long] = {
     // TODO: use PrimitiveKeyOpenHashMap
     val counts = mutable.Map[Int, Long]().withDefaultValue(0L)
     database.foreach { sequence =>
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
index 6f52db7b073ae..e6752332cdeeb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.mllib.fpm
 
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.rdd.RDD
@@ -43,28 +45,45 @@ class PrefixSpan private (
     private var minSupport: Double,
     private var maxPatternLength: Int) extends Logging with Serializable {
 
+  /**
+   * The maximum number of items allowed in a projected database before local processing. If a
+   * projected database exceeds this size, another iteration of distributed PrefixSpan is run.
+   */
+  // TODO: make configurable with a better default value, 10000 may be too small
+  private val maxLocalProjDBSize: Long = 10000
+
   /**
    * Constructs a default instance with default parameters
    * {minSupport: `0.1`, maxPatternLength: `10`}.
    */
   def this() = this(0.1, 10)
 
+  /**
+   * Get the minimal support (i.e. the frequency of occurrence before a pattern is considered
+   * frequent).
+   */
+  def getMinSupport: Double = this.minSupport
+
   /**
    * Sets the minimal support level (default: `0.1`).
    */
   def setMinSupport(minSupport: Double): this.type = {
-    require(minSupport >= 0 && minSupport <= 1,
-      "The minimum support value must be between 0 and 1, including 0 and 1.")
+    require(minSupport >= 0 && minSupport <= 1, "The minimum support value must be in [0, 1].")
     this.minSupport = minSupport
     this
   }
 
+  /**
+   * Gets the maximal pattern length (i.e. the length of the longest sequential pattern to consider.
+   */
+  def getMaxPatternLength: Double = this.maxPatternLength
+
   /**
    * Sets maximal pattern length (default: `10`).
    */
   def setMaxPatternLength(maxPatternLength: Int): this.type = {
-    require(maxPatternLength >= 1,
-      "The maximum pattern length value must be greater than 0.")
+    // TODO: support unbounded pattern length when maxPatternLength = 0
+    require(maxPatternLength >= 1, "The maximum pattern length value must be greater than 0.")
     this.maxPatternLength = maxPatternLength
     this
   }
@@ -78,81 +97,153 @@ class PrefixSpan private (
    *         the value of pair is the pattern's count.
    */
   def run(sequences: RDD[Array[Int]]): RDD[(Array[Int], Long)] = {
+    val sc = sequences.sparkContext
+
     if (sequences.getStorageLevel == StorageLevel.NONE) {
       logWarning("Input data is not cached.")
     }
-    val minCount = getMinCount(sequences)
-    val lengthOnePatternsAndCounts =
-      getFreqItemAndCounts(minCount, sequences).collect()
-    val prefixAndProjectedDatabase = getPrefixAndProjectedDatabase(
-      lengthOnePatternsAndCounts.map(_._1), sequences)
-    val groupedProjectedDatabase = prefixAndProjectedDatabase
-      .map(x => (x._1.toSeq, x._2))
-      .groupByKey()
-      .map(x => (x._1.toArray, x._2.toArray))
-    val nextPatterns = getPatternsInLocal(minCount, groupedProjectedDatabase)
-    val lengthOnePatternsAndCountsRdd =
-      sequences.sparkContext.parallelize(
-        lengthOnePatternsAndCounts.map(x => (Array(x._1), x._2)))
-    val allPatterns = lengthOnePatternsAndCountsRdd ++ nextPatterns
-    allPatterns
+
+    // Convert min support to a min number of transactions for this dataset
+    val minCount = if (minSupport == 0) 0L else math.ceil(sequences.count() * minSupport).toLong
+
+    // (Frequent items -> number of occurrences, all items here satisfy the `minSupport` threshold
+    val freqItemCounts = sequences
+      .flatMap(seq => seq.distinct.map(item => (item, 1L)))
+      .reduceByKey(_ + _)
+      .filter(_._2 >= minCount)
+      .collect()
+
+    // Pairs of (length 1 prefix, suffix consisting of frequent items)
+    val itemSuffixPairs = {
+      val freqItems = freqItemCounts.map(_._1).toSet
+      sequences.flatMap { seq =>
+        val filteredSeq = seq.filter(freqItems.contains(_))
+        freqItems.flatMap { item =>
+          val candidateSuffix = LocalPrefixSpan.getSuffix(item, filteredSeq)
+          candidateSuffix match {
+            case suffix if !suffix.isEmpty => Some((List(item), suffix))
+            case _ => None
+          }
+        }
+      }
+    }
+
+    // Accumulator for the computed results to be returned, initialized to the frequent items (i.e.
+    // frequent length-one prefixes)
+    var resultsAccumulator = freqItemCounts.map(x => (List(x._1), x._2))
+
+    // Remaining work to be locally and distributively processed respectfully
+    var (pairsForLocal, pairsForDistributed) = partitionByProjDBSize(itemSuffixPairs)
+
+    // Continue processing until no pairs for distributed processing remain (i.e. all prefixes have
+    // projected database sizes <= `maxLocalProjDBSize`)
+    while (pairsForDistributed.count() != 0) {
+      val (nextPatternAndCounts, nextPrefixSuffixPairs) =
+        extendPrefixes(minCount, pairsForDistributed)
+      pairsForDistributed.unpersist()
+      val (smallerPairsPart, largerPairsPart) = partitionByProjDBSize(nextPrefixSuffixPairs)
+      pairsForDistributed = largerPairsPart
+      pairsForDistributed.persist(StorageLevel.MEMORY_AND_DISK)
+      pairsForLocal ++= smallerPairsPart
+      resultsAccumulator ++= nextPatternAndCounts.collect()
+    }
+
+    // Process the small projected databases locally
+    val remainingResults = getPatternsInLocal(
+      minCount, sc.parallelize(pairsForLocal, 1).groupByKey())
+
+    (sc.parallelize(resultsAccumulator, 1) ++ remainingResults)
+      .map { case (pattern, count) => (pattern.toArray, count) }
   }
 
+
   /**
-   * Get the minimum count (sequences count * minSupport).
-   * @param sequences input data set, contains a set of sequences,
-   * @return minimum count,
+   * Partitions the prefix-suffix pairs by projected database size.
+   * @param prefixSuffixPairs prefix (length n) and suffix pairs,
+   * @return prefix-suffix pairs partitioned by whether their projected database size is <= or
+   *         greater than [[maxLocalProjDBSize]]
    */
-  private def getMinCount(sequences: RDD[Array[Int]]): Long = {
-    if (minSupport == 0) 0L else math.ceil(sequences.count() * minSupport).toLong
+  private def partitionByProjDBSize(prefixSuffixPairs: RDD[(List[Int], Array[Int])])
+    : (Array[(List[Int], Array[Int])], RDD[(List[Int], Array[Int])]) = {
+    val prefixToSuffixSize = prefixSuffixPairs
+      .aggregateByKey(0)(
+        seqOp = { case (count, suffix) => count + suffix.length },
+        combOp = { _ + _ })
+    val smallPrefixes = prefixToSuffixSize
+      .filter(_._2 <= maxLocalProjDBSize)
+      .keys
+      .collect()
+      .toSet
+    val small = prefixSuffixPairs.filter { case (prefix, _) => smallPrefixes.contains(prefix) }
+    val large = prefixSuffixPairs.filter { case (prefix, _) => !smallPrefixes.contains(prefix) }
+    (small.collect(), large)
   }
 
   /**
-   * Generates frequent items by filtering the input data using minimal count level.
-   * @param minCount the absolute minimum count
-   * @param sequences original sequences data
-   * @return array of item and count pair
+   * Extends all prefixes by one item from their suffix and computes the resulting frequent prefixes
+   * and remaining work.
+   * @param minCount minimum count
+   * @param prefixSuffixPairs prefix (length N) and suffix pairs,
+   * @return (frequent length N+1 extended prefix, count) pairs and (frequent length N+1 extended
+   *         prefix, corresponding suffix) pairs.
    */
-  private def getFreqItemAndCounts(
+  private def extendPrefixes(
       minCount: Long,
-      sequences: RDD[Array[Int]]): RDD[(Int, Long)] = {
-    sequences.flatMap(_.distinct.map((_, 1L)))
+      prefixSuffixPairs: RDD[(List[Int], Array[Int])])
+    : (RDD[(List[Int], Long)], RDD[(List[Int], Array[Int])]) = {
+
+    // (length N prefix, item from suffix) pairs and their corresponding number of occurrences
+    // Every (prefix :+ suffix) is guaranteed to have support exceeding `minSupport`
+    val prefixItemPairAndCounts = prefixSuffixPairs
+      .flatMap { case (prefix, suffix) => suffix.distinct.map(y => ((prefix, y), 1L)) }
       .reduceByKey(_ + _)
       .filter(_._2 >= minCount)
-  }
 
-  /**
-   * Get the frequent prefixes' projected database.
-   * @param frequentPrefixes frequent prefixes
-   * @param sequences sequences data
-   * @return prefixes and projected database
-   */
-  private def getPrefixAndProjectedDatabase(
-      frequentPrefixes: Array[Int],
-      sequences: RDD[Array[Int]]): RDD[(Array[Int], Array[Int])] = {
-    val filteredSequences = sequences.map { p =>
-      p.filter (frequentPrefixes.contains(_) )
-    }
-    filteredSequences.flatMap { x =>
-      frequentPrefixes.map { y =>
-        val sub = LocalPrefixSpan.getSuffix(y, x)
-        (Array(y), sub)
-      }.filter(_._2.nonEmpty)
-    }
+    // Map from prefix to set of possible next items from suffix
+    val prefixToNextItems = prefixItemPairAndCounts
+      .keys
+      .groupByKey()
+      .mapValues(_.toSet)
+      .collect()
+      .toMap
+
+
+    // Frequent patterns with length N+1 and their corresponding counts
+    val extendedPrefixAndCounts = prefixItemPairAndCounts
+      .map { case ((prefix, item), count) => (item :: prefix, count) }
+
+    // Remaining work, all prefixes will have length N+1
+    val extendedPrefixAndSuffix = prefixSuffixPairs
+      .filter(x => prefixToNextItems.contains(x._1))
+      .flatMap { case (prefix, suffix) =>
+        val frequentNextItems = prefixToNextItems(prefix)
+        val filteredSuffix = suffix.filter(frequentNextItems.contains(_))
+        frequentNextItems.flatMap { item =>
+          LocalPrefixSpan.getSuffix(item, filteredSuffix) match {
+            case suffix if !suffix.isEmpty => Some(item :: prefix, suffix)
+            case _ => None
+          }
+        }
+      }
+
+    (extendedPrefixAndCounts, extendedPrefixAndSuffix)
   }
 
   /**
-   * calculate the patterns in local.
+   * Calculate the patterns in local.
    * @param minCount the absolute minimum count
-   * @param data patterns and projected sequences data data
+   * @param data prefixes and projected sequences data data
    * @return patterns
    */
   private def getPatternsInLocal(
       minCount: Long,
-      data: RDD[(Array[Int], Array[Array[Int]])]): RDD[(Array[Int], Long)] = {
-    data.flatMap { case (prefix, projDB) =>
-      LocalPrefixSpan.run(minCount, maxPatternLength, prefix.toList, projDB)
-        .map { case (pattern: List[Int], count: Long) => (pattern.toArray.reverse, count) }
+      data: RDD[(List[Int], Iterable[Array[Int]])]): RDD[(List[Int], Long)] = {
+    data.flatMap {
+      case (prefix, projDB) =>
+        LocalPrefixSpan.run(minCount, maxPatternLength, prefix.toList.reverse, projDB)
+          .map { case (pattern: List[Int], count: Long) =>
+          (pattern.reverse, count)
+        }
     }
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
index 9f107c89f6d80..6dd2dc926acc5 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
@@ -44,13 +44,6 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     val rdd = sc.parallelize(sequences, 2).cache()
 
-    def compareResult(
-        expectedValue: Array[(Array[Int], Long)],
-        actualValue: Array[(Array[Int], Long)]): Boolean = {
-      expectedValue.map(x => (x._1.toSeq, x._2)).toSet ==
-        actualValue.map(x => (x._1.toSeq, x._2)).toSet
-    }
-
     val prefixspan = new PrefixSpan()
       .setMinSupport(0.33)
       .setMaxPatternLength(50)
@@ -76,7 +69,7 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
       (Array(4, 5), 2L),
       (Array(5), 3L)
     )
-    assert(compareResult(expectedValue1, result1.collect()))
+    assert(compareResults(expectedValue1, result1.collect()))
 
     prefixspan.setMinSupport(0.5).setMaxPatternLength(50)
     val result2 = prefixspan.run(rdd)
@@ -87,7 +80,7 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
       (Array(4), 4L),
       (Array(5), 3L)
     )
-    assert(compareResult(expectedValue2, result2.collect()))
+    assert(compareResults(expectedValue2, result2.collect()))
 
     prefixspan.setMinSupport(0.33).setMaxPatternLength(2)
     val result3 = prefixspan.run(rdd)
@@ -107,6 +100,14 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
       (Array(4, 5), 2L),
       (Array(5), 3L)
     )
-    assert(compareResult(expectedValue3, result3.collect()))
+    assert(compareResults(expectedValue3, result3.collect()))
+  }
+
+  private def compareResults(
+    expectedValue: Array[(Array[Int], Long)],
+    actualValue: Array[(Array[Int], Long)]): Boolean = {
+    expectedValue.map(x => (x._1.toSeq, x._2)).toSet ==
+      actualValue.map(x => (x._1.toSeq, x._2)).toSet
   }
+
 }

From 9c0501c5d04d83ca25ce433138bf64df6a14dc58 Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Thu, 30 Jul 2015 08:20:52 -0700
Subject: [PATCH 471/549] [SPARK-] [MLLIB] minor fix on tokenizer doc

A trivial fix for the comments of RegexTokenizer.

Maybe this is too small, yet I just noticed it and think it can be quite misleading. I can create a jira if necessary.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #7791 from hhbyyh/docFix and squashes the following commits:

cdf2542 [Yuhao Yang] minor fix on tokenizer doc
---
 .../src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 0b3af4747e693..248288ca73e99 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -50,7 +50,7 @@ class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[S
 /**
  * :: Experimental ::
  * A regex based tokenizer that extracts tokens either by using the provided regex pattern to split
- * the text (default) or repeatedly matching the regex (if `gaps` is true).
+ * the text (default) or repeatedly matching the regex (if `gaps` is false).
  * Optional parameters also allow filtering tokens using a minimal length.
  * It returns an array of strings that can be empty.
  */

From a6e53a9c8b24326d1b6dca7a0e36ce6c643daa77 Mon Sep 17 00:00:00 2001
From: Meihua Wu <meihuawu@umich.edu>
Date: Thu, 30 Jul 2015 08:52:01 -0700
Subject: [PATCH 472/549] [SPARK-9225] [MLLIB] LDASuite needs unit tests for
 empty documents

Add unit tests for running LDA with empty documents.
Both EMLDAOptimizer and OnlineLDAOptimizer are tested.

feynmanliang

Author: Meihua Wu <meihuawu@umich.edu>

Closes #7620 from rotationsymmetry/SPARK-9225 and squashes the following commits:

3ed7c88 [Meihua Wu] Incorporate reviewer's further comments
f9432e8 [Meihua Wu] Incorporate reviewer's comments
8e1b9ec [Meihua Wu] Merge remote-tracking branch 'upstream/master' into SPARK-9225
ad55665 [Meihua Wu] Add unit tests for running LDA with empty documents
---
 .../spark/mllib/clustering/LDASuite.scala     | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index b91c7cefed22e..61d2edfd9fb5f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -390,6 +390,46 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("EMLDAOptimizer with empty docs") {
+    val vocabSize = 6
+    val emptyDocsArray = Array.fill(6)(Vectors.sparse(vocabSize, Array.empty, Array.empty))
+    val emptyDocs = emptyDocsArray
+      .zipWithIndex.map { case (wordCounts, docId) =>
+        (docId.toLong, wordCounts)
+    }
+    val distributedEmptyDocs = sc.parallelize(emptyDocs, 2)
+
+    val op = new EMLDAOptimizer()
+    val lda = new LDA()
+      .setK(3)
+      .setMaxIterations(5)
+      .setSeed(12345)
+      .setOptimizer(op)
+
+    val model = lda.run(distributedEmptyDocs)
+    assert(model.vocabSize === vocabSize)
+  }
+
+  test("OnlineLDAOptimizer with empty docs") {
+    val vocabSize = 6
+    val emptyDocsArray = Array.fill(6)(Vectors.sparse(vocabSize, Array.empty, Array.empty))
+    val emptyDocs = emptyDocsArray
+      .zipWithIndex.map { case (wordCounts, docId) =>
+        (docId.toLong, wordCounts)
+    }
+    val distributedEmptyDocs = sc.parallelize(emptyDocs, 2)
+
+    val op = new OnlineLDAOptimizer()
+    val lda = new LDA()
+      .setK(3)
+      .setMaxIterations(5)
+      .setSeed(12345)
+      .setOptimizer(op)
+
+    val model = lda.run(distributedEmptyDocs)
+    assert(model.vocabSize === vocabSize)
+  }
+
 }
 
 private[clustering] object LDASuite {

From ed3cb1d21c73645c8f6e6ee08181f876fc192e41 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Thu, 30 Jul 2015 09:19:55 -0700
Subject: [PATCH 473/549] [SPARK-9277] [MLLIB] SparseVector constructor must
 throw an error when declared number of elements less than array length

Check that SparseVector size is at least as big as the number of indices/values provided. And add tests for constructor checks.

CC MechCoder jkbradley -- I am not sure if a change needs to also happen in the Python API? I didn't see it had any similar checks to begin with, but I don't know it well.

Author: Sean Owen <sowen@cloudera.com>

Closes #7794 from srowen/SPARK-9277 and squashes the following commits:

e8dc31e [Sean Owen] Fix scalastyle
6ffe34a [Sean Owen] Check that SparseVector size is at least as big as the number of indices/values provided. And add tests for constructor checks.
---
 .../org/apache/spark/mllib/linalg/Vectors.scala   |  2 ++
 .../apache/spark/mllib/linalg/VectorsSuite.scala  | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 0cb28d78bec05..23c2c16d68d9a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -637,6 +637,8 @@ class SparseVector(
   require(indices.length == values.length, "Sparse vectors require that the dimension of the" +
     s" indices match the dimension of the values. You provided ${indices.length} indices and " +
     s" ${values.length} values.")
+  require(indices.length <= size, s"You provided ${indices.length} indices and values, " +
+    s"which exceeds the specified vector size ${size}.")
 
   override def toString: String =
     s"($size,${indices.mkString("[", ",", "]")},${values.mkString("[", ",", "]")})"
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
index 03be4119bdaca..1c37ea5123e82 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala
@@ -57,6 +57,21 @@ class VectorsSuite extends SparkFunSuite with Logging {
     assert(vec.values === values)
   }
 
+  test("sparse vector construction with mismatched indices/values array") {
+    intercept[IllegalArgumentException] {
+      Vectors.sparse(4, Array(1, 2, 3), Array(3.0, 5.0, 7.0, 9.0))
+    }
+    intercept[IllegalArgumentException] {
+      Vectors.sparse(4, Array(1, 2, 3), Array(3.0, 5.0))
+    }
+  }
+
+  test("sparse vector construction with too many indices vs size") {
+    intercept[IllegalArgumentException] {
+      Vectors.sparse(3, Array(1, 2, 3, 4), Array(3.0, 5.0, 7.0, 9.0))
+    }
+  }
+
   test("dense to array") {
     val vec = Vectors.dense(arr).asInstanceOf[DenseVector]
     assert(vec.toArray.eq(arr))

From 81464f2a8243c6ae2a39bac7ebdc50d4f60af451 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 30 Jul 2015 09:45:17 -0700
Subject: [PATCH 474/549] [MINOR] [MLLIB] fix doc for RegexTokenizer

This is #7791 for Python. hhbyyh

Author: Xiangrui Meng <meng@databricks.com>

Closes #7798 from mengxr/regex-tok-py and squashes the following commits:

baa2dcd [Xiangrui Meng] fix doc for RegexTokenizer
---
 python/pyspark/ml/feature.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 86e654dd0779f..015e7a9d4900a 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -525,7 +525,7 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     """
     A regex based tokenizer that extracts tokens either by using the
     provided regex pattern (in Java dialect) to split the text
-    (default) or repeatedly matching the regex (if gaps is true).
+    (default) or repeatedly matching the regex (if gaps is false).
     Optional parameters also allow filtering tokens using a minimal
     length.
     It returns an array of strings that can be empty.

From 7492a33fdd074446c30c657d771a69932a00246d Mon Sep 17 00:00:00 2001
From: Yuu ISHIKAWA <yuu.ishikawa@gmail.com>
Date: Thu, 30 Jul 2015 10:00:27 -0700
Subject: [PATCH 475/549] [SPARK-9248] [SPARKR] Closing curly-braces should
 always be on their own line

### JIRA
[[SPARK-9248] Closing curly-braces should always be on their own line - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9248)

## The result of `dev/lint-r`
[The result of `dev/lint-r` for SPARK-9248 at the revistion:6175d6cfe795fbd88e3ee713fac375038a3993a8](https://gist.github.com/yu-iskw/96cadcea4ce664c41f81)

Author: Yuu ISHIKAWA <yuu.ishikawa@gmail.com>

Closes #7795 from yu-iskw/SPARK-9248 and squashes the following commits:

c8eccd3 [Yuu ISHIKAWA] [SPARK-9248][SparkR] Closing curly-braces should always be on their own line
---
 R/pkg/R/generics.R               | 14 +++++++-------
 R/pkg/R/pairRDD.R                |  4 ++--
 R/pkg/R/sparkR.R                 |  9 ++++++---
 R/pkg/inst/tests/test_sparkSQL.R |  6 ++++--
 4 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 836e0175c391f..a3a121058e165 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -254,8 +254,10 @@ setGeneric("flatMapValues", function(X, FUN) { standardGeneric("flatMapValues")
 
 # @rdname intersection
 # @export
-setGeneric("intersection", function(x, other, numPartitions = 1) {
-  standardGeneric("intersection") })
+setGeneric("intersection",
+           function(x, other, numPartitions = 1) {
+             standardGeneric("intersection")
+           })
 
 # @rdname keys
 # @export
@@ -489,9 +491,7 @@ setGeneric("sample",
 #' @rdname sample
 #' @export
 setGeneric("sample_frac",
-           function(x, withReplacement, fraction, seed) {
-             standardGeneric("sample_frac")
-           })
+           function(x, withReplacement, fraction, seed) { standardGeneric("sample_frac") })
 
 #' @rdname saveAsParquetFile
 #' @export
@@ -553,8 +553,8 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn
 
 #' @rdname withColumnRenamed
 #' @export
-setGeneric("withColumnRenamed", function(x, existingCol, newCol) {
-  standardGeneric("withColumnRenamed") })
+setGeneric("withColumnRenamed",
+           function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") })
 
 
 ###################### Column Methods ##########################
diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
index ebc6ff65e9d0f..83801d3209700 100644
--- a/R/pkg/R/pairRDD.R
+++ b/R/pkg/R/pairRDD.R
@@ -202,8 +202,8 @@ setMethod("partitionBy",
 
             packageNamesArr <- serialize(.sparkREnv$.packages,
                                          connection = NULL)
-            broadcastArr <- lapply(ls(.broadcastNames), function(name) {
-                                   get(name, .broadcastNames) })
+            broadcastArr <- lapply(ls(.broadcastNames),
+                                   function(name) { get(name, .broadcastNames) })
             jrdd <- getJRDD(x)
 
             # We create a PairwiseRRDD that extends RDD[(Int, Array[Byte])],
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 76c15875b50d5..e83104f116422 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -22,7 +22,8 @@
 connExists <- function(env) {
   tryCatch({
     exists(".sparkRCon", envir = env) && isOpen(env[[".sparkRCon"]])
-  }, error = function(err) {
+  },
+  error = function(err) {
     return(FALSE)
   })
 }
@@ -153,7 +154,8 @@ sparkR.init <- function(
   .sparkREnv$backendPort <- backendPort
   tryCatch({
     connectBackend("localhost", backendPort)
-  }, error = function(err) {
+  },
+  error = function(err) {
     stop("Failed to connect JVM\n")
   })
 
@@ -264,7 +266,8 @@ sparkRHive.init <- function(jsc = NULL) {
   ssc <- callJMethod(sc, "sc")
   hiveCtx <- tryCatch({
     newJObject("org.apache.spark.sql.hive.HiveContext", ssc)
-  }, error = function(err) {
+  },
+  error = function(err) {
     stop("Spark SQL is not built with Hive support")
   })
 
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 62fe48a5d6c7b..d5db97248c770 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -112,7 +112,8 @@ test_that("create DataFrame from RDD", {
   df <- jsonFile(sqlContext, jsonPathNa)
   hiveCtx <- tryCatch({
     newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc)
-  }, error = function(err) {
+  },
+  error = function(err) {
     skip("Hive is not build with SparkSQL, skipped")
   })
   sql(hiveCtx, "CREATE TABLE people (name string, age double, height float)")
@@ -602,7 +603,8 @@ test_that("write.df() as parquet file", {
 test_that("test HiveContext", {
   hiveCtx <- tryCatch({
     newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc)
-  }, error = function(err) {
+  },
+  error = function(err) {
     skip("Hive is not build with SparkSQL, skipped")
   })
   df <- createExternalTable(hiveCtx, "json", jsonPath, "json")

From c0cc0eaec67208c087a30c1b1f50c00b2c1ebf08 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Thu, 30 Jul 2015 10:04:30 -0700
Subject: [PATCH 476/549] [SPARK-9390][SQL] create a wrapper for array type

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7724 from cloud-fan/array-data and squashes the following commits:

d0408a1 [Wenchen Fan] fix python
661e608 [Wenchen Fan] rebase
f39256c [Wenchen Fan] fix hive...
6dbfa6f [Wenchen Fan] fix hive again...
8cb8842 [Wenchen Fan] remove element type parameter from getArray
43e9816 [Wenchen Fan] fix mllib
e719afc [Wenchen Fan] fix hive
4346290 [Wenchen Fan] address comment
d4a38da [Wenchen Fan] remove sizeInBytes and add license
7e283e2 [Wenchen Fan] create a wrapper for array type
---
 .../apache/spark/mllib/linalg/Matrices.scala  |  16 +--
 .../apache/spark/mllib/linalg/Vectors.scala   |  15 +--
 .../expressions/SpecializedGetters.java       |   2 +
 .../sql/catalyst/CatalystTypeConverters.scala |  29 +++--
 .../spark/sql/catalyst/InternalRow.scala      |   2 +
 .../catalyst/expressions/BoundAttribute.scala |   2 +-
 .../spark/sql/catalyst/expressions/Cast.scala |  39 ++++--
 .../expressions/codegen/CodeGenerator.scala   |  28 ++--
 .../codegen/GenerateUnsafeProjection.scala    |   4 +-
 .../expressions/collectionOperations.scala    |  10 +-
 .../expressions/complexTypeCreator.scala      |  20 ++-
 .../expressions/complexTypeExtractors.scala   |  59 ++++++---
 .../sql/catalyst/expressions/generators.scala |   4 +-
 .../expressions/stringOperations.scala        |  12 +-
 .../sql/catalyst/optimizer/Optimizer.scala    |   3 +-
 .../apache/spark/sql/types/ArrayData.scala    | 121 ++++++++++++++++++
 .../spark/sql/types/GenericArrayData.scala    |  59 +++++++++
 .../sql/catalyst/expressions/CastSuite.scala  |  21 ++-
 .../expressions/ComplexTypeSuite.scala        |   2 +-
 .../spark/sql/execution/debug/package.scala   |   4 +-
 .../spark/sql/execution/pythonUDFs.scala      |  19 ++-
 .../sql/execution/stat/FrequentItems.scala    |   4 +-
 .../apache/spark/sql/json/InferSchema.scala   |   2 +-
 .../apache/spark/sql/json/JacksonParser.scala |  30 +++--
 .../sql/parquet/CatalystRowConverter.scala    |   2 +-
 .../spark/sql/parquet/ParquetConverter.scala  |   3 +-
 .../sql/parquet/ParquetTableSupport.scala     |  12 +-
 .../apache/spark/sql/JavaDataFrameSuite.java  |   5 +-
 .../spark/sql/UserDefinedTypeSuite.scala      |   8 +-
 .../spark/sql/sources/TableScanSuite.scala    |  30 ++---
 .../spark/sql/hive/HiveInspectors.scala       |  28 ++--
 .../hive/execution/ScriptTransformation.scala |  12 +-
 .../org/apache/spark/sql/hive/hiveUDFs.scala  |   2 +-
 .../spark/sql/hive/HiveInspectorSuite.scala   |   2 +-
 34 files changed, 430 insertions(+), 181 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index d82ba2456df1a..88914fa875990 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -154,9 +154,9 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
         row.setByte(0, 0)
         row.setInt(1, sm.numRows)
         row.setInt(2, sm.numCols)
-        row.update(3, sm.colPtrs.toSeq)
-        row.update(4, sm.rowIndices.toSeq)
-        row.update(5, sm.values.toSeq)
+        row.update(3, new GenericArrayData(sm.colPtrs.map(_.asInstanceOf[Any])))
+        row.update(4, new GenericArrayData(sm.rowIndices.map(_.asInstanceOf[Any])))
+        row.update(5, new GenericArrayData(sm.values.map(_.asInstanceOf[Any])))
         row.setBoolean(6, sm.isTransposed)
 
       case dm: DenseMatrix =>
@@ -165,7 +165,7 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
         row.setInt(2, dm.numCols)
         row.setNullAt(3)
         row.setNullAt(4)
-        row.update(5, dm.values.toSeq)
+        row.update(5, new GenericArrayData(dm.values.map(_.asInstanceOf[Any])))
         row.setBoolean(6, dm.isTransposed)
     }
     row
@@ -179,14 +179,12 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
         val tpe = row.getByte(0)
         val numRows = row.getInt(1)
         val numCols = row.getInt(2)
-        val values = row.getAs[Seq[Double]](5, ArrayType(DoubleType, containsNull = false)).toArray
+        val values = row.getArray(5).toArray.map(_.asInstanceOf[Double])
         val isTransposed = row.getBoolean(6)
         tpe match {
           case 0 =>
-            val colPtrs =
-              row.getAs[Seq[Int]](3, ArrayType(IntegerType, containsNull = false)).toArray
-            val rowIndices =
-              row.getAs[Seq[Int]](4, ArrayType(IntegerType, containsNull = false)).toArray
+            val colPtrs = row.getArray(3).toArray.map(_.asInstanceOf[Int])
+            val rowIndices = row.getArray(4).toArray.map(_.asInstanceOf[Int])
             new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed)
           case 1 =>
             new DenseMatrix(numRows, numCols, values, isTransposed)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 23c2c16d68d9a..89a1818db0d1d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -187,15 +187,15 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
         val row = new GenericMutableRow(4)
         row.setByte(0, 0)
         row.setInt(1, size)
-        row.update(2, indices.toSeq)
-        row.update(3, values.toSeq)
+        row.update(2, new GenericArrayData(indices.map(_.asInstanceOf[Any])))
+        row.update(3, new GenericArrayData(values.map(_.asInstanceOf[Any])))
         row
       case DenseVector(values) =>
         val row = new GenericMutableRow(4)
         row.setByte(0, 1)
         row.setNullAt(1)
         row.setNullAt(2)
-        row.update(3, values.toSeq)
+        row.update(3, new GenericArrayData(values.map(_.asInstanceOf[Any])))
         row
     }
   }
@@ -209,14 +209,11 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
         tpe match {
           case 0 =>
             val size = row.getInt(1)
-            val indices =
-              row.getAs[Seq[Int]](2, ArrayType(IntegerType, containsNull = false)).toArray
-            val values =
-              row.getAs[Seq[Double]](3, ArrayType(DoubleType, containsNull = false)).toArray
+            val indices = row.getArray(2).toArray().map(_.asInstanceOf[Int])
+            val values = row.getArray(3).toArray().map(_.asInstanceOf[Double])
             new SparseVector(size, indices, values)
           case 1 =>
-            val values =
-              row.getAs[Seq[Double]](3, ArrayType(DoubleType, containsNull = false)).toArray
+            val values = row.getArray(3).toArray().map(_.asInstanceOf[Double])
             new DenseVector(values)
         }
     }
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
index bc345dcd00e49..f7cea13688876 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions;
 
 import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.ArrayData;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.unsafe.types.CalendarInterval;
 import org.apache.spark.unsafe.types.UTF8String;
@@ -50,4 +51,5 @@ public interface SpecializedGetters {
 
   InternalRow getStruct(int ordinal, int numFields);
 
+  ArrayData getArray(int ordinal);
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index d1d89a1f48329..22452c0f201ef 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -55,7 +55,6 @@ object CatalystTypeConverters {
 
   private def isWholePrimitive(dt: DataType): Boolean = dt match {
     case dt if isPrimitive(dt) => true
-    case ArrayType(elementType, _) => isWholePrimitive(elementType)
     case MapType(keyType, valueType, _) => isWholePrimitive(keyType) && isWholePrimitive(valueType)
     case _ => false
   }
@@ -154,39 +153,41 @@ object CatalystTypeConverters {
 
   /** Converter for arrays, sequences, and Java iterables. */
   private case class ArrayConverter(
-      elementType: DataType) extends CatalystTypeConverter[Any, Seq[Any], Seq[Any]] {
+      elementType: DataType) extends CatalystTypeConverter[Any, Seq[Any], ArrayData] {
 
     private[this] val elementConverter = getConverterForType(elementType)
 
     private[this] val isNoChange = isWholePrimitive(elementType)
 
-    override def toCatalystImpl(scalaValue: Any): Seq[Any] = {
+    override def toCatalystImpl(scalaValue: Any): ArrayData = {
       scalaValue match {
-        case a: Array[_] => a.toSeq.map(elementConverter.toCatalyst)
-        case s: Seq[_] => s.map(elementConverter.toCatalyst)
+        case a: Array[_] =>
+          new GenericArrayData(a.map(elementConverter.toCatalyst))
+        case s: Seq[_] =>
+          new GenericArrayData(s.map(elementConverter.toCatalyst).toArray)
         case i: JavaIterable[_] =>
           val iter = i.iterator
-          var convertedIterable: List[Any] = List()
+          val convertedIterable = scala.collection.mutable.ArrayBuffer.empty[Any]
           while (iter.hasNext) {
             val item = iter.next()
-            convertedIterable :+= elementConverter.toCatalyst(item)
+            convertedIterable += elementConverter.toCatalyst(item)
           }
-          convertedIterable
+          new GenericArrayData(convertedIterable.toArray)
       }
     }
 
-    override def toScala(catalystValue: Seq[Any]): Seq[Any] = {
+    override def toScala(catalystValue: ArrayData): Seq[Any] = {
       if (catalystValue == null) {
         null
       } else if (isNoChange) {
-        catalystValue
+        catalystValue.toArray()
       } else {
-        catalystValue.map(elementConverter.toScala)
+        catalystValue.toArray().map(elementConverter.toScala)
       }
     }
 
     override def toScalaImpl(row: InternalRow, column: Int): Seq[Any] =
-      toScala(row.get(column, ArrayType(elementType)).asInstanceOf[Seq[Any]])
+      toScala(row.getArray(column))
   }
 
   private case class MapConverter(
@@ -402,9 +403,9 @@ object CatalystTypeConverters {
     case t: Timestamp => TimestampConverter.toCatalyst(t)
     case d: BigDecimal => BigDecimalConverter.toCatalyst(d)
     case d: JavaBigDecimal => BigDecimalConverter.toCatalyst(d)
-    case seq: Seq[Any] => seq.map(convertToCatalyst)
+    case seq: Seq[Any] => new GenericArrayData(seq.map(convertToCatalyst).toArray)
     case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*)
-    case arr: Array[Any] => arr.map(convertToCatalyst)
+    case arr: Array[Any] => new GenericArrayData(arr.map(convertToCatalyst))
     case m: Map[_, _] =>
       m.map { case (k, v) => (convertToCatalyst(k), convertToCatalyst(v)) }.toMap
     case other => other
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index a5999e64ec554..486ba036548c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -76,6 +76,8 @@ abstract class InternalRow extends Serializable with SpecializedGetters {
   override def getStruct(ordinal: Int, numFields: Int): InternalRow =
     getAs[InternalRow](ordinal, null)
 
+  override def getArray(ordinal: Int): ArrayData = getAs(ordinal, null)
+
   override def toString: String = s"[${this.mkString(",")}]"
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 371681b5d494f..45709c1c8f554 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -65,7 +65,7 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val javaType = ctx.javaType(dataType)
-    val value = ctx.getColumn("i", dataType, ordinal)
+    val value = ctx.getValue("i", dataType, ordinal.toString)
     s"""
       boolean ${ev.isNull} = i.isNullAt($ordinal);
       $javaType ${ev.primitive} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value);
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 8c01c13c9ccd5..43be11c48ae7c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -363,7 +363,21 @@ case class Cast(child: Expression, dataType: DataType)
 
   private[this] def castArray(from: ArrayType, to: ArrayType): Any => Any = {
     val elementCast = cast(from.elementType, to.elementType)
-    buildCast[Seq[Any]](_, _.map(v => if (v == null) null else elementCast(v)))
+    // TODO: Could be faster?
+    buildCast[ArrayData](_, array => {
+      val length = array.numElements()
+      val values = new Array[Any](length)
+      var i = 0
+      while (i < length) {
+        if (array.isNullAt(i)) {
+          values(i) = null
+        } else {
+          values(i) = elementCast(array.get(i))
+        }
+        i += 1
+      }
+      new GenericArrayData(values)
+    })
   }
 
   private[this] def castMap(from: MapType, to: MapType): Any => Any = {
@@ -789,37 +803,36 @@ case class Cast(child: Expression, dataType: DataType)
   private[this] def castArrayCode(
       from: ArrayType, to: ArrayType, ctx: CodeGenContext): CastFunction = {
     val elementCast = nullSafeCastFunction(from.elementType, to.elementType, ctx)
-
-    val arraySeqClass = classOf[mutable.ArraySeq[Any]].getName
+    val arrayClass = classOf[GenericArrayData].getName
     val fromElementNull = ctx.freshName("feNull")
     val fromElementPrim = ctx.freshName("fePrim")
     val toElementNull = ctx.freshName("teNull")
     val toElementPrim = ctx.freshName("tePrim")
     val size = ctx.freshName("n")
     val j = ctx.freshName("j")
-    val result = ctx.freshName("result")
+    val values = ctx.freshName("values")
 
     (c, evPrim, evNull) =>
       s"""
-        final int $size = $c.size();
-        final $arraySeqClass<Object> $result = new $arraySeqClass<Object>($size);
+        final int $size = $c.numElements();
+        final Object[] $values = new Object[$size];
         for (int $j = 0; $j < $size; $j ++) {
-          if ($c.apply($j) == null) {
-            $result.update($j, null);
+          if ($c.isNullAt($j)) {
+            $values[$j] = null;
           } else {
             boolean $fromElementNull = false;
             ${ctx.javaType(from.elementType)} $fromElementPrim =
-              (${ctx.boxedType(from.elementType)}) $c.apply($j);
+              ${ctx.getValue(c, from.elementType, j)};
             ${castCode(ctx, fromElementPrim,
               fromElementNull, toElementPrim, toElementNull, to.elementType, elementCast)}
             if ($toElementNull) {
-              $result.update($j, null);
+              $values[$j] = null;
             } else {
-              $result.update($j, $toElementPrim);
+              $values[$j] = $toElementPrim;
             }
           }
         }
-        $evPrim = $result;
+        $evPrim = new $arrayClass($values);
       """
   }
 
@@ -891,7 +904,7 @@ case class Cast(child: Expression, dataType: DataType)
           $result.setNullAt($i);
         } else {
           $fromType $fromFieldPrim =
-            ${ctx.getColumn(tmpRow, from.fields(i).dataType, i)};
+            ${ctx.getValue(tmpRow, from.fields(i).dataType, i.toString)};
           ${castCode(ctx, fromFieldPrim,
             fromFieldNull, toFieldPrim, toFieldNull, to.fields(i).dataType, cast)}
           if ($toFieldNull) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 092f4c9fb0bd2..c39e0df6fae2a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -100,17 +100,18 @@ class CodeGenContext {
   }
 
   /**
-   * Returns the code to access a column in Row for a given DataType.
+   * Returns the code to access a value in `SpecializedGetters` for a given DataType.
    */
-  def getColumn(row: String, dataType: DataType, ordinal: Int): String = {
+  def getValue(getter: String, dataType: DataType, ordinal: String): String = {
     val jt = javaType(dataType)
     dataType match {
-      case _ if isPrimitiveType(jt) => s"$row.get${primitiveTypeName(jt)}($ordinal)"
-      case StringType => s"$row.getUTF8String($ordinal)"
-      case BinaryType => s"$row.getBinary($ordinal)"
-      case CalendarIntervalType => s"$row.getInterval($ordinal)"
-      case t: StructType => s"$row.getStruct($ordinal, ${t.size})"
-      case _ => s"($jt)$row.get($ordinal)"
+      case _ if isPrimitiveType(jt) => s"$getter.get${primitiveTypeName(jt)}($ordinal)"
+      case StringType => s"$getter.getUTF8String($ordinal)"
+      case BinaryType => s"$getter.getBinary($ordinal)"
+      case CalendarIntervalType => s"$getter.getInterval($ordinal)"
+      case t: StructType => s"$getter.getStruct($ordinal, ${t.size})"
+      case a: ArrayType => s"$getter.getArray($ordinal)"
+      case _ => s"($jt)$getter.get($ordinal)" // todo: remove generic getter.
     }
   }
 
@@ -152,8 +153,8 @@ class CodeGenContext {
     case StringType => "UTF8String"
     case CalendarIntervalType => "CalendarInterval"
     case _: StructType => "InternalRow"
-    case _: ArrayType => s"scala.collection.Seq"
-    case _: MapType => s"scala.collection.Map"
+    case _: ArrayType => "ArrayData"
+    case _: MapType => "scala.collection.Map"
     case dt: OpenHashSetUDT if dt.elementType == IntegerType => classOf[IntegerHashSet].getName
     case dt: OpenHashSetUDT if dt.elementType == LongType => classOf[LongHashSet].getName
     case _ => "Object"
@@ -214,7 +215,9 @@ class CodeGenContext {
     case dt: DataType if isPrimitiveType(dt) => s"($c1 > $c2 ? 1 : $c1 < $c2 ? -1 : 0)"
     case BinaryType => s"org.apache.spark.sql.catalyst.util.TypeUtils.compareBinary($c1, $c2)"
     case NullType => "0"
-    case other => s"$c1.compare($c2)"
+    case other if other.isInstanceOf[AtomicType] => s"$c1.compare($c2)"
+    case _ => throw new IllegalArgumentException(
+      "cannot generate compare code for un-comparable type")
   }
 
   /**
@@ -293,7 +296,8 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
       classOf[UnsafeRow].getName,
       classOf[UTF8String].getName,
       classOf[Decimal].getName,
-      classOf[CalendarInterval].getName
+      classOf[CalendarInterval].getName,
+      classOf[ArrayData].getName
     ))
     evaluator.setExtendedClass(classOf[GeneratedClass])
     try {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index 7be60114ce674..a662357fb6cf9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -153,14 +153,14 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
           val nestedStructEv = GeneratedExpressionCode(
             code = "",
             isNull = s"${input.primitive}.isNullAt($i)",
-            primitive = s"${ctx.getColumn(input.primitive, dt, i)}"
+            primitive = s"${ctx.getValue(input.primitive, dt, i.toString)}"
           )
           createCodeForStruct(ctx, nestedStructEv, st)
         case _ =>
           GeneratedExpressionCode(
             code = "",
             isNull = s"${input.primitive}.isNullAt($i)",
-            primitive = s"${ctx.getColumn(input.primitive, dt, i)}"
+            primitive = s"${ctx.getValue(input.primitive, dt, i.toString)}"
           )
         }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 2d92dcf23a86e..1a00dbc254de1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -27,11 +27,15 @@ case class Size(child: Expression) extends UnaryExpression with ExpectsInputType
   override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(ArrayType, MapType))
 
   override def nullSafeEval(value: Any): Int = child.dataType match {
-    case ArrayType(_, _) => value.asInstanceOf[Seq[Any]].size
-    case MapType(_, _, _) => value.asInstanceOf[Map[Any, Any]].size
+    case _: ArrayType => value.asInstanceOf[ArrayData].numElements()
+    case _: MapType => value.asInstanceOf[Map[Any, Any]].size
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    nullSafeCodeGen(ctx, ev, c => s"${ev.primitive} = ($c).size();")
+    val sizeCall = child.dataType match {
+      case _: ArrayType => "numElements()"
+      case _: MapType => "size()"
+    }
+    nullSafeCodeGen(ctx, ev, c => s"${ev.primitive} = ($c).$sizeCall;")
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 0517050a45109..a145dfb4bbf08 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -18,12 +18,9 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.unsafe.types.UTF8String
-
-import scala.collection.mutable
-
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateUnsafeProjection, GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.types._
 
@@ -46,25 +43,26 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
   override def nullable: Boolean = false
 
   override def eval(input: InternalRow): Any = {
-    children.map(_.eval(input))
+    new GenericArrayData(children.map(_.eval(input)).toArray)
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val arraySeqClass = classOf[mutable.ArraySeq[Any]].getName
+    val arrayClass = classOf[GenericArrayData].getName
     s"""
-      boolean ${ev.isNull} = false;
-      $arraySeqClass<Object> ${ev.primitive} = new $arraySeqClass<Object>(${children.size});
+      final boolean ${ev.isNull} = false;
+      final Object[] values = new Object[${children.size}];
     """ +
       children.zipWithIndex.map { case (e, i) =>
         val eval = e.gen(ctx)
         eval.code + s"""
           if (${eval.isNull}) {
-            ${ev.primitive}.update($i, null);
+            values[$i] = null;
           } else {
-            ${ev.primitive}.update($i, ${eval.primitive});
+            values[$i] = ${eval.primitive};
           }
          """
-      }.mkString("\n")
+      }.mkString("\n") +
+      s"final ${ctx.javaType(dataType)} ${ev.primitive} = new $arrayClass(values);"
   }
 
   override def prettyName: String = "array"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
index 6331a9eb603ca..99393c9c76ab6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
@@ -57,7 +57,8 @@ object ExtractValue {
       case (ArrayType(StructType(fields), containsNull), NonNullLiteral(v, StringType)) =>
         val fieldName = v.toString
         val ordinal = findField(fields, fieldName, resolver)
-        GetArrayStructFields(child, fields(ordinal).copy(name = fieldName), ordinal, containsNull)
+        GetArrayStructFields(child, fields(ordinal).copy(name = fieldName),
+          ordinal, fields.length, containsNull)
 
       case (_: ArrayType, _) if extraction.dataType.isInstanceOf[IntegralType] =>
         GetArrayItem(child, extraction)
@@ -118,7 +119,7 @@ case class GetStructField(child: Expression, field: StructField, ordinal: Int)
         if ($eval.isNullAt($ordinal)) {
           ${ev.isNull} = true;
         } else {
-          ${ev.primitive} = ${ctx.getColumn(eval, dataType, ordinal)};
+          ${ev.primitive} = ${ctx.getValue(eval, dataType, ordinal.toString)};
         }
       """
     })
@@ -134,6 +135,7 @@ case class GetArrayStructFields(
     child: Expression,
     field: StructField,
     ordinal: Int,
+    numFields: Int,
     containsNull: Boolean) extends UnaryExpression {
 
   override def dataType: DataType = ArrayType(field.dataType, containsNull)
@@ -141,26 +143,45 @@ case class GetArrayStructFields(
   override def toString: String = s"$child.${field.name}"
 
   protected override def nullSafeEval(input: Any): Any = {
-    input.asInstanceOf[Seq[InternalRow]].map { row =>
-      if (row == null) null else row.get(ordinal, field.dataType)
+    val array = input.asInstanceOf[ArrayData]
+    val length = array.numElements()
+    val result = new Array[Any](length)
+    var i = 0
+    while (i < length) {
+      if (array.isNullAt(i)) {
+        result(i) = null
+      } else {
+        val row = array.getStruct(i, numFields)
+        if (row.isNullAt(ordinal)) {
+          result(i) = null
+        } else {
+          result(i) = row.get(ordinal, field.dataType)
+        }
+      }
+      i += 1
     }
+    new GenericArrayData(result)
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val arraySeqClass = "scala.collection.mutable.ArraySeq"
-    // TODO: consider using Array[_] for ArrayType child to avoid
-    // boxing of primitives
+    val arrayClass = classOf[GenericArrayData].getName
     nullSafeCodeGen(ctx, ev, eval => {
       s"""
-        final int n = $eval.size();
-        final $arraySeqClass<Object> values = new $arraySeqClass<Object>(n);
+        final int n = $eval.numElements();
+        final Object[] values = new Object[n];
         for (int j = 0; j < n; j++) {
-          InternalRow row = (InternalRow) $eval.apply(j);
-          if (row != null && !row.isNullAt($ordinal)) {
-            values.update(j, ${ctx.getColumn("row", field.dataType, ordinal)});
+          if ($eval.isNullAt(j)) {
+            values[j] = null;
+          } else {
+            final InternalRow row = $eval.getStruct(j, $numFields);
+            if (row.isNullAt($ordinal)) {
+              values[j] = null;
+            } else {
+              values[j] = ${ctx.getValue("row", field.dataType, ordinal.toString)};
+            }
           }
         }
-        ${ev.primitive} = (${ctx.javaType(dataType)}) values;
+        ${ev.primitive} = new $arrayClass(values);
       """
     })
   }
@@ -186,23 +207,23 @@ case class GetArrayItem(child: Expression, ordinal: Expression) extends BinaryEx
   protected override def nullSafeEval(value: Any, ordinal: Any): Any = {
     // TODO: consider using Array[_] for ArrayType child to avoid
     // boxing of primitives
-    val baseValue = value.asInstanceOf[Seq[_]]
+    val baseValue = value.asInstanceOf[ArrayData]
     val index = ordinal.asInstanceOf[Number].intValue()
-    if (index >= baseValue.size || index < 0) {
+    if (index >= baseValue.numElements() || index < 0) {
       null
     } else {
-      baseValue(index)
+      baseValue.get(index)
     }
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
       s"""
-        final int index = (int)$eval2;
-        if (index >= $eval1.size() || index < 0) {
+        final int index = (int) $eval2;
+        if (index >= $eval1.numElements() || index < 0) {
           ${ev.isNull} = true;
         } else {
-          ${ev.primitive} = (${ctx.boxedType(dataType)})$eval1.apply(index);
+          ${ev.primitive} = ${ctx.getValue(eval1, dataType, "index")};
         }
       """
     })
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index 2dbcf2830f876..8064235c64ef9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -121,8 +121,8 @@ case class Explode(child: Expression) extends UnaryExpression with Generator wit
   override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
     child.dataType match {
       case ArrayType(_, _) =>
-        val inputArray = child.eval(input).asInstanceOf[Seq[Any]]
-        if (inputArray == null) Nil else inputArray.map(v => InternalRow(v))
+        val inputArray = child.eval(input).asInstanceOf[ArrayData]
+        if (inputArray == null) Nil else inputArray.toArray().map(v => InternalRow(v))
       case MapType(_, _, _) =>
         val inputMap = child.eval(input).asInstanceOf[Map[Any, Any]]
         if (inputMap == null) Nil
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 5b3a64a09679c..79c0ca56a8e79 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -92,7 +92,7 @@ case class ConcatWs(children: Seq[Expression])
     val flatInputs = children.flatMap { child =>
       child.eval(input) match {
         case s: UTF8String => Iterator(s)
-        case arr: Seq[_] => arr.asInstanceOf[Seq[UTF8String]]
+        case arr: ArrayData => arr.toArray().map(_.asInstanceOf[UTF8String])
         case null => Iterator(null.asInstanceOf[UTF8String])
       }
     }
@@ -105,7 +105,7 @@ case class ConcatWs(children: Seq[Expression])
       val evals = children.map(_.gen(ctx))
 
       val inputs = evals.map { eval =>
-        s"${eval.isNull} ? (UTF8String)null : ${eval.primitive}"
+        s"${eval.isNull} ? (UTF8String) null : ${eval.primitive}"
       }.mkString(", ")
 
       evals.map(_.code).mkString("\n") + s"""
@@ -665,13 +665,15 @@ case class StringSplit(str: Expression, pattern: Expression)
   override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
 
   override def nullSafeEval(string: Any, regex: Any): Any = {
-    string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1).toSeq
+    val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1)
+    new GenericArrayData(strings.asInstanceOf[Array[Any]])
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val arrayClass = classOf[GenericArrayData].getName
     nullSafeCodeGen(ctx, ev, (str, pattern) =>
-      s"""${ev.primitive} = scala.collection.JavaConversions.asScalaBuffer(
-            java.util.Arrays.asList($str.split($pattern, -1)));""")
+      // Array in java is covariant, so we don't need to cast UTF8String[] to Object[].
+      s"""${ev.primitive} = new $arrayClass($str.split($pattern, -1));""")
   }
 
   override def prettyName: String = "split"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 813c62009666c..29d706dcb39a7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -312,7 +312,8 @@ object NullPropagation extends Rule[LogicalPlan] {
       case e @ GetMapValue(Literal(null, _), _) => Literal.create(null, e.dataType)
       case e @ GetMapValue(_, Literal(null, _)) => Literal.create(null, e.dataType)
       case e @ GetStructField(Literal(null, _), _, _) => Literal.create(null, e.dataType)
-      case e @ GetArrayStructFields(Literal(null, _), _, _, _) => Literal.create(null, e.dataType)
+      case e @ GetArrayStructFields(Literal(null, _), _, _, _, _) =>
+        Literal.create(null, e.dataType)
       case e @ EqualNullSafe(Literal(null, _), r) => IsNull(r)
       case e @ EqualNullSafe(l, Literal(null, _)) => IsNull(l)
       case e @ Count(expr) if !expr.nullable => Count(Literal(1))
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala
new file mode 100644
index 0000000000000..14a7285877622
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.types
+
+import org.apache.spark.sql.catalyst.expressions.SpecializedGetters
+
+abstract class ArrayData extends SpecializedGetters with Serializable {
+  // todo: remove this after we handle all types.(map type need special getter)
+  def get(ordinal: Int): Any
+
+  def numElements(): Int
+
+  // todo: need a more efficient way to iterate array type.
+  def toArray(): Array[Any] = {
+    val n = numElements()
+    val values = new Array[Any](n)
+    var i = 0
+    while (i < n) {
+      if (isNullAt(i)) {
+        values(i) = null
+      } else {
+        values(i) = get(i)
+      }
+      i += 1
+    }
+    values
+  }
+
+  override def toString(): String = toArray.mkString("[", ",", "]")
+
+  override def equals(o: Any): Boolean = {
+    if (!o.isInstanceOf[ArrayData]) {
+      return false
+    }
+
+    val other = o.asInstanceOf[ArrayData]
+    if (other eq null) {
+      return false
+    }
+
+    val len = numElements()
+    if (len != other.numElements()) {
+      return false
+    }
+
+    var i = 0
+    while (i < len) {
+      if (isNullAt(i) != other.isNullAt(i)) {
+        return false
+      }
+      if (!isNullAt(i)) {
+        val o1 = get(i)
+        val o2 = other.get(i)
+        o1 match {
+          case b1: Array[Byte] =>
+            if (!o2.isInstanceOf[Array[Byte]] ||
+              !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
+              return false
+            }
+          case f1: Float if java.lang.Float.isNaN(f1) =>
+            if (!o2.isInstanceOf[Float] || ! java.lang.Float.isNaN(o2.asInstanceOf[Float])) {
+              return false
+            }
+          case d1: Double if java.lang.Double.isNaN(d1) =>
+            if (!o2.isInstanceOf[Double] || ! java.lang.Double.isNaN(o2.asInstanceOf[Double])) {
+              return false
+            }
+          case _ => if (o1 != o2) {
+            return false
+          }
+        }
+      }
+      i += 1
+    }
+    true
+  }
+
+  override def hashCode: Int = {
+    var result: Int = 37
+    var i = 0
+    val len = numElements()
+    while (i < len) {
+      val update: Int =
+        if (isNullAt(i)) {
+          0
+        } else {
+          get(i) match {
+            case b: Boolean => if (b) 0 else 1
+            case b: Byte => b.toInt
+            case s: Short => s.toInt
+            case i: Int => i
+            case l: Long => (l ^ (l >>> 32)).toInt
+            case f: Float => java.lang.Float.floatToIntBits(f)
+            case d: Double =>
+              val b = java.lang.Double.doubleToLongBits(d)
+              (b ^ (b >>> 32)).toInt
+            case a: Array[Byte] => java.util.Arrays.hashCode(a)
+            case other => other.hashCode()
+          }
+        }
+      result = 37 * result + update
+      i += 1
+    }
+    result
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala
new file mode 100644
index 0000000000000..7992ba947c069
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.types
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.unsafe.types.{UTF8String, CalendarInterval}
+
+class GenericArrayData(array: Array[Any]) extends ArrayData {
+  private def getAs[T](ordinal: Int) = get(ordinal).asInstanceOf[T]
+
+  override def toArray(): Array[Any] = array
+
+  override def get(ordinal: Int): Any = array(ordinal)
+
+  override def isNullAt(ordinal: Int): Boolean = get(ordinal) == null
+
+  override def getBoolean(ordinal: Int): Boolean = getAs(ordinal)
+
+  override def getByte(ordinal: Int): Byte = getAs(ordinal)
+
+  override def getShort(ordinal: Int): Short = getAs(ordinal)
+
+  override def getInt(ordinal: Int): Int = getAs(ordinal)
+
+  override def getLong(ordinal: Int): Long = getAs(ordinal)
+
+  override def getFloat(ordinal: Int): Float = getAs(ordinal)
+
+  override def getDouble(ordinal: Int): Double = getAs(ordinal)
+
+  override def getDecimal(ordinal: Int): Decimal = getAs(ordinal)
+
+  override def getUTF8String(ordinal: Int): UTF8String = getAs(ordinal)
+
+  override def getBinary(ordinal: Int): Array[Byte] = getAs(ordinal)
+
+  override def getInterval(ordinal: Int): CalendarInterval = getAs(ordinal)
+
+  override def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs(ordinal)
+
+  override def getArray(ordinal: Int): ArrayData = getAs(ordinal)
+
+  override def numElements(): Int = array.length
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index a517da9872852..4f35b653d73c0 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -21,6 +21,7 @@ import java.sql.{Timestamp, Date}
 import java.util.{TimeZone, Calendar}
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
@@ -730,13 +731,10 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("complex casting") {
     val complex = Literal.create(
-      InternalRow(
-        Seq(UTF8String.fromString("123"), UTF8String.fromString("abc"), UTF8String.fromString("")),
-        Map(
-          UTF8String.fromString("a") -> UTF8String.fromString("123"),
-          UTF8String.fromString("b") -> UTF8String.fromString("abc"),
-          UTF8String.fromString("c") -> UTF8String.fromString("")),
-        InternalRow(0)),
+      Row(
+        Seq("123", "abc", ""),
+        Map("a" ->"123", "b" -> "abc", "c" -> ""),
+        Row(0)),
       StructType(Seq(
         StructField("a",
           ArrayType(StringType, containsNull = false), nullable = true),
@@ -756,13 +754,10 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
           StructField("l", LongType, nullable = true)))))))
 
     assert(ret.resolved === true)
-    checkEvaluation(ret, InternalRow(
+    checkEvaluation(ret, Row(
       Seq(123, null, null),
-      Map(
-        UTF8String.fromString("a") -> true,
-        UTF8String.fromString("b") -> true,
-        UTF8String.fromString("c") -> false),
-      InternalRow(0L)))
+      Map("a" -> true, "b" -> true, "c" -> false),
+      Row(0L)))
   }
 
   test("case between string and interval") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index 5de5ddce975d8..3fa246b69d1f1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -110,7 +110,7 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
       expr.dataType match {
         case ArrayType(StructType(fields), containsNull) =>
           val field = fields.find(_.name == fieldName).get
-          GetArrayStructFields(expr, field, fields.indexOf(field), containsNull)
+          GetArrayStructFields(expr, field, fields.indexOf(field), fields.length, containsNull)
       }
     }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index aeeb0e45270dd..f26f41fb75d57 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -158,8 +158,8 @@ package object debug {
 
       case (row: InternalRow, StructType(fields)) =>
         row.toSeq.zip(fields.map(_.dataType)).foreach { case(d, t) => typeCheck(d, t) }
-      case (s: Seq[_], ArrayType(elemType, _)) =>
-        s.foreach(typeCheck(_, elemType))
+      case (a: ArrayData, ArrayType(elemType, _)) =>
+        a.toArray().foreach(typeCheck(_, elemType))
       case (m: Map[_, _], MapType(keyType, valueType, _)) =>
         m.keys.foreach(typeCheck(_, keyType))
         m.values.foreach(typeCheck(_, valueType))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index 3c38916fd7504..ef1c6e57dc08a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -134,8 +134,19 @@ object EvaluatePython {
       }
       new GenericInternalRowWithSchema(values, struct)
 
-    case (seq: Seq[Any], array: ArrayType) =>
-      seq.map(x => toJava(x, array.elementType)).asJava
+    case (a: ArrayData, array: ArrayType) =>
+      val length = a.numElements()
+      val values = new java.util.ArrayList[Any](length)
+      var i = 0
+      while (i < length) {
+        if (a.isNullAt(i)) {
+          values.add(null)
+        } else {
+          values.add(toJava(a.get(i), array.elementType))
+        }
+        i += 1
+      }
+      values
 
     case (obj: Map[_, _], mt: MapType) => obj.map {
       case (k, v) => (toJava(k, mt.keyType), toJava(v, mt.valueType))
@@ -190,10 +201,10 @@ object EvaluatePython {
     case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c
 
     case (c: java.util.List[_], ArrayType(elementType, _)) =>
-      c.map { e => fromJava(e, elementType)}.toSeq
+      new GenericArrayData(c.map { e => fromJava(e, elementType)}.toArray)
 
     case (c, ArrayType(elementType, _)) if c.getClass.isArray =>
-      c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType)).toSeq
+      new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType)))
 
     case (c: java.util.Map[_, _], MapType(keyType, valueType, _)) => c.map {
       case (key, value) => (fromJava(key, keyType), fromJava(value, valueType))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
index 78da2840dad69..9329148aa233c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
@@ -22,7 +22,7 @@ import scala.collection.mutable.{Map => MutableMap}
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
-import org.apache.spark.sql.types.{DataType, ArrayType, StructField, StructType}
+import org.apache.spark.sql.types._
 import org.apache.spark.sql.{Column, DataFrame}
 
 private[sql] object FrequentItems extends Logging {
@@ -110,7 +110,7 @@ private[sql] object FrequentItems extends Logging {
         baseCounts
       }
     )
-    val justItems = freqItems.map(m => m.baseMap.keys.toSeq)
+    val justItems = freqItems.map(m => m.baseMap.keys.toArray).map(new GenericArrayData(_))
     val resultRow = InternalRow(justItems : _*)
     // append frequent Items to the column name for easy debugging
     val outputCols = colInfo.map { v =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
index 0eb3b04007f8d..04ab5e2217882 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/InferSchema.scala
@@ -125,7 +125,7 @@ private[sql] object InferSchema {
    * Convert NullType to StringType and remove StructTypes with no fields
    */
   private def canonicalizeType: DataType => Option[DataType] = {
-    case at@ArrayType(elementType, _) =>
+    case at @ ArrayType(elementType, _) =>
       for {
         canonicalType <- canonicalizeType(elementType)
       } yield {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
index 381e7ed54428f..1c309f8794ef3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
@@ -110,8 +110,13 @@ private[sql] object JacksonParser {
       case (START_OBJECT, st: StructType) =>
         convertObject(factory, parser, st)
 
+      case (START_ARRAY, st: StructType) =>
+        // SPARK-3308: support reading top level JSON arrays and take every element
+        // in such an array as a row
+        convertArray(factory, parser, st)
+
       case (START_ARRAY, ArrayType(st, _)) =>
-        convertList(factory, parser, st)
+        convertArray(factory, parser, st)
 
       case (START_OBJECT, ArrayType(st, _)) =>
         // the business end of SPARK-3308:
@@ -165,16 +170,16 @@ private[sql] object JacksonParser {
     builder.result()
   }
 
-  private def convertList(
+  private def convertArray(
       factory: JsonFactory,
       parser: JsonParser,
-      schema: DataType): Seq[Any] = {
-    val builder = Seq.newBuilder[Any]
+      elementType: DataType): ArrayData = {
+    val values = scala.collection.mutable.ArrayBuffer.empty[Any]
     while (nextUntil(parser, JsonToken.END_ARRAY)) {
-      builder += convertField(factory, parser, schema)
+      values += convertField(factory, parser, elementType)
     }
 
-    builder.result()
+    new GenericArrayData(values.toArray)
   }
 
   private def parseJson(
@@ -201,12 +206,15 @@ private[sql] object JacksonParser {
           val parser = factory.createParser(record)
           parser.nextToken()
 
-          // to support both object and arrays (see SPARK-3308) we'll start
-          // by converting the StructType schema to an ArrayType and let
-          // convertField wrap an object into a single value array when necessary.
-          convertField(factory, parser, ArrayType(schema)) match {
+          convertField(factory, parser, schema) match {
             case null => failedRecord(record)
-            case list: Seq[InternalRow @unchecked] => list
+            case row: InternalRow => row :: Nil
+            case array: ArrayData =>
+              if (array.numElements() == 0) {
+                Nil
+              } else {
+                array.toArray().map(_.asInstanceOf[InternalRow])
+              }
             case _ =>
               sys.error(
                 s"Failed to parse record $record. Please make sure that each line of the file " +
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
index e00bd90edb3dd..172db8362afb6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
@@ -325,7 +325,7 @@ private[parquet] class CatalystRowConverter(
 
     override def getConverter(fieldIndex: Int): Converter = elementConverter
 
-    override def end(): Unit = updater.set(currentArray)
+    override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray))
 
     // NOTE: We can't reuse the mutable `ArrayBuffer` here and must instantiate a new buffer for the
     // next value.  `Row.copy()` only copies row cells, it doesn't do deep copy to objects stored
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index ea51650fe9039..2332a36468dbc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.parquet
 
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.types.ArrayData
 
 // TODO Removes this while fixing SPARK-8848
 private[sql] object CatalystConverter {
@@ -32,7 +33,7 @@ private[sql] object CatalystConverter {
   val MAP_SCHEMA_NAME = "map"
 
   // TODO: consider using Array[T] for arrays to avoid boxing of primitive types
-  type ArrayScalaType[T] = Seq[T]
+  type ArrayScalaType[T] = ArrayData
   type StructScalaType[T] = InternalRow
   type MapScalaType[K, V] = Map[K, V]
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 78ecfad1d57c6..79dd16b7b0c39 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -146,15 +146,15 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
       array: CatalystConverter.ArrayScalaType[_]): Unit = {
     val elementType = schema.elementType
     writer.startGroup()
-    if (array.size > 0) {
+    if (array.numElements() > 0) {
       if (schema.containsNull) {
         writer.startField(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME, 0)
         var i = 0
-        while (i < array.size) {
+        while (i < array.numElements()) {
           writer.startGroup()
-          if (array(i) != null) {
+          if (!array.isNullAt(i)) {
             writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0)
-            writeValue(elementType, array(i))
+            writeValue(elementType, array.get(i))
             writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0)
           }
           writer.endGroup()
@@ -164,8 +164,8 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
       } else {
         writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0)
         var i = 0
-        while (i < array.size) {
-          writeValue(elementType, array(i))
+        while (i < array.numElements()) {
+          writeValue(elementType, array.get(i))
           i = i + 1
         }
         writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0)
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
index 72c42f4fe376b..9e61d06f4036e 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
@@ -30,7 +30,6 @@
 
 import scala.collection.JavaConversions;
 import scala.collection.Seq;
-import scala.collection.mutable.Buffer;
 
 import java.io.Serializable;
 import java.util.Arrays;
@@ -168,10 +167,10 @@ public void testCreateDataFrameFromJavaBeans() {
     for (int i = 0; i < result.length(); i++) {
       Assert.assertEquals(bean.getB()[i], result.apply(i));
     }
-    Buffer<Integer> outputBuffer = (Buffer<Integer>) first.getJavaMap(2).get("hello");
+    Seq<Integer> outputBuffer = (Seq<Integer>) first.getJavaMap(2).get("hello");
     Assert.assertArrayEquals(
       bean.getC().get("hello"),
-      Ints.toArray(JavaConversions.bufferAsJavaList(outputBuffer)));
+      Ints.toArray(JavaConversions.seqAsJavaList(outputBuffer)));
     Seq<String> d = first.getAs(3);
     Assert.assertEquals(bean.getD().size(), d.length());
     for (int i = 0; i < d.length(); i++) {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
index 45c9f06941c10..77ed4a9c0d5ae 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
@@ -47,17 +47,17 @@ private[sql] class MyDenseVectorUDT extends UserDefinedType[MyDenseVector] {
 
   override def sqlType: DataType = ArrayType(DoubleType, containsNull = false)
 
-  override def serialize(obj: Any): Seq[Double] = {
+  override def serialize(obj: Any): ArrayData = {
     obj match {
       case features: MyDenseVector =>
-        features.data.toSeq
+        new GenericArrayData(features.data.map(_.asInstanceOf[Any]))
     }
   }
 
   override def deserialize(datum: Any): MyDenseVector = {
     datum match {
-      case data: Seq[_] =>
-        new MyDenseVector(data.asInstanceOf[Seq[Double]].toArray)
+      case data: ArrayData =>
+        new MyDenseVector(data.toArray.map(_.asInstanceOf[Double]))
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
index 5e189c3563ca8..cfb03ff485b7c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala
@@ -67,12 +67,12 @@ case class AllDataTypesScan(
 
   override def schema: StructType = userSpecifiedSchema
 
-  override def needConversion: Boolean = false
+  override def needConversion: Boolean = true
 
   override def buildScan(): RDD[Row] = {
     sqlContext.sparkContext.parallelize(from to to).map { i =>
-      InternalRow(
-        UTF8String.fromString(s"str_$i"),
+      Row(
+        s"str_$i",
         s"str_$i".getBytes(),
         i % 2 == 0,
         i.toByte,
@@ -81,19 +81,19 @@ case class AllDataTypesScan(
         i.toLong,
         i.toFloat,
         i.toDouble,
-        Decimal(new java.math.BigDecimal(i)),
-        Decimal(new java.math.BigDecimal(i)),
-        DateTimeUtils.fromJavaDate(new Date(1970, 1, 1)),
-        DateTimeUtils.fromJavaTimestamp(new Timestamp(20000 + i)),
-        UTF8String.fromString(s"varchar_$i"),
+        new java.math.BigDecimal(i),
+        new java.math.BigDecimal(i),
+        new Date(1970, 1, 1),
+        new Timestamp(20000 + i),
+        s"varchar_$i",
         Seq(i, i + 1),
-        Seq(Map(UTF8String.fromString(s"str_$i") -> InternalRow(i.toLong))),
-        Map(i -> UTF8String.fromString(i.toString)),
-        Map(Map(UTF8String.fromString(s"str_$i") -> i.toFloat) -> InternalRow(i.toLong)),
-        InternalRow(i, UTF8String.fromString(i.toString)),
-        InternalRow(Seq(UTF8String.fromString(s"str_$i"), UTF8String.fromString(s"str_${i + 1}")),
-          InternalRow(Seq(DateTimeUtils.fromJavaDate(new Date(1970, 1, i + 1))))))
-    }.asInstanceOf[RDD[Row]]
+        Seq(Map(s"str_$i" -> Row(i.toLong))),
+        Map(i -> i.toString),
+        Map(Map(s"str_$i" -> i.toFloat) -> Row(i.toLong)),
+        Row(i, i.toString),
+          Row(Seq(s"str_$i", s"str_${i + 1}"),
+            Row(Seq(new Date(1970, 1, i + 1)))))
+    }
   }
 }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index f467500259c91..5926ef9aa388b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -52,9 +52,8 @@ import scala.collection.JavaConversions._
  *     java.sql.Timestamp
  *  Complex Types =>
  *    Map: scala.collection.immutable.Map
- *    List: scala.collection.immutable.Seq
- *    Struct:
- *           [[org.apache.spark.sql.catalyst.InternalRow]]
+ *    List: [[org.apache.spark.sql.types.ArrayData]]
+ *    Struct: [[org.apache.spark.sql.catalyst.InternalRow]]
  *    Union: NOT SUPPORTED YET
  *  The Complex types plays as a container, which can hold arbitrary data types.
  *
@@ -297,7 +296,10 @@ private[hive] trait HiveInspectors {
       }.toMap
     case li: StandardConstantListObjectInspector =>
       // take the value from the list inspector object, rather than the input data
-      li.getWritableConstantValue.map(unwrap(_, li.getListElementObjectInspector)).toSeq
+      val values = li.getWritableConstantValue
+        .map(unwrap(_, li.getListElementObjectInspector))
+        .toArray
+      new GenericArrayData(values)
     // if the value is null, we don't care about the object inspector type
     case _ if data == null => null
     case poi: VoidObjectInspector => null // always be null for void object inspector
@@ -339,7 +341,10 @@ private[hive] trait HiveInspectors {
     }
     case li: ListObjectInspector =>
       Option(li.getList(data))
-        .map(_.map(unwrap(_, li.getListElementObjectInspector)).toSeq)
+        .map { l =>
+          val values = l.map(unwrap(_, li.getListElementObjectInspector)).toArray
+          new GenericArrayData(values)
+        }
         .orNull
     case mi: MapObjectInspector =>
       Option(mi.getMap(data)).map(
@@ -391,7 +396,13 @@ private[hive] trait HiveInspectors {
 
     case loi: ListObjectInspector =>
       val wrapper = wrapperFor(loi.getListElementObjectInspector)
-      (o: Any) => if (o != null) seqAsJavaList(o.asInstanceOf[Seq[_]].map(wrapper)) else null
+      (o: Any) => {
+        if (o != null) {
+          seqAsJavaList(o.asInstanceOf[ArrayData].toArray().map(wrapper))
+        } else {
+          null
+        }
+      }
 
     case moi: MapObjectInspector =>
       // The Predef.Map is scala.collection.immutable.Map.
@@ -520,7 +531,7 @@ private[hive] trait HiveInspectors {
     case x: ListObjectInspector =>
       val list = new java.util.ArrayList[Object]
       val tpe = dataType.asInstanceOf[ArrayType].elementType
-      a.asInstanceOf[Seq[_]].foreach {
+      a.asInstanceOf[ArrayData].toArray().foreach {
         v => list.add(wrap(v, x.getListElementObjectInspector, tpe))
       }
       list
@@ -634,7 +645,8 @@ private[hive] trait HiveInspectors {
         ObjectInspectorFactory.getStandardConstantListObjectInspector(listObjectInspector, null)
       } else {
         val list = new java.util.ArrayList[Object]()
-        value.asInstanceOf[Seq[_]].foreach(v => list.add(wrap(v, listObjectInspector, dt)))
+        value.asInstanceOf[ArrayData].toArray()
+          .foreach(v => list.add(wrap(v, listObjectInspector, dt)))
         ObjectInspectorFactory.getStandardConstantListObjectInspector(listObjectInspector, list)
       }
     case Literal(value, MapType(keyType, valueType, _)) =>
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
index 741c705e2a253..7e3342cc84c0e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -176,13 +176,13 @@ case class ScriptTransformation(
             val prevLine = curLine
             curLine = reader.readLine()
             if (!ioschema.schemaLess) {
-              new GenericInternalRow(CatalystTypeConverters.convertToCatalyst(
-                prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD")))
-                .asInstanceOf[Array[Any]])
+              new GenericInternalRow(
+                prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD"))
+                  .map(CatalystTypeConverters.convertToCatalyst))
             } else {
-              new GenericInternalRow(CatalystTypeConverters.convertToCatalyst(
-                prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD"), 2))
-                .asInstanceOf[Array[Any]])
+              new GenericInternalRow(
+                prevLine.split(ioschema.outputRowFormatMap("TOK_TABLEROWFORMATFIELD"), 2)
+                  .map(CatalystTypeConverters.convertToCatalyst))
             }
           } else {
             val ret = deserialize()
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
index 8732e9abf8d31..4a13022eddf60 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
@@ -431,7 +431,7 @@ private[hive] case class HiveWindowFunction(
       // if pivotResult is true, we will get a Seq having the same size with the size
       // of the window frame. At here, we will return the result at the position of
       // index in the output buffer.
-      outputBuffer.asInstanceOf[Seq[Any]].get(index)
+      outputBuffer.asInstanceOf[ArrayData].get(index)
     }
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
index 0330013f5325e..f719f2e06ab63 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
@@ -217,7 +217,7 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
   test("wrap / unwrap Array Type") {
     val dt = ArrayType(dataTypes(0))
 
-    val d = row(0) :: row(0) :: Nil
+    val d = new GenericArrayData(Array(row(0), row(0)))
     checkValue(d, unwrap(wrap(d, toInspector(dt), dt), toInspector(dt)))
     checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt)))
     checkValue(d,

From 7bbf02f0bddefd19985372af79e906a38bc528b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Garillot?= <francois@garillot.net>
Date: Thu, 30 Jul 2015 18:14:08 +0100
Subject: [PATCH 477/549] [SPARK-9267] [CORE] Retire stringify(Partial)?Value
 from Accumulators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cc srowen

Author: François Garillot <francois@garillot.net>

Closes #7678 from huitseeker/master and squashes the following commits:

5e99f57 [François Garillot] [SPARK-9267][Core] Retire stringify(Partial)?Value from Accumulators
---
 core/src/main/scala/org/apache/spark/Accumulators.scala     | 3 ---
 .../scala/org/apache/spark/scheduler/DAGScheduler.scala     | 6 ++----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala
index 2f4fcac890eef..eb75f26718e19 100644
--- a/core/src/main/scala/org/apache/spark/Accumulators.scala
+++ b/core/src/main/scala/org/apache/spark/Accumulators.scala
@@ -341,7 +341,4 @@ private[spark] object Accumulators extends Logging {
     }
   }
 
-  def stringifyPartialValue(partialValue: Any): String = "%s".format(partialValue)
-
-  def stringifyValue(value: Any): String = "%s".format(value)
 }
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index cdf6078421123..c4fa277c21254 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -916,11 +916,9 @@ class DAGScheduler(
           // To avoid UI cruft, ignore cases where value wasn't updated
           if (acc.name.isDefined && partialValue != acc.zero) {
             val name = acc.name.get
-            val stringPartialValue = Accumulators.stringifyPartialValue(partialValue)
-            val stringValue = Accumulators.stringifyValue(acc.value)
-            stage.latestInfo.accumulables(id) = AccumulableInfo(id, name, stringValue)
+            stage.latestInfo.accumulables(id) = AccumulableInfo(id, name, s"${acc.value}")
             event.taskInfo.accumulables +=
-              AccumulableInfo(id, name, Some(stringPartialValue), stringValue)
+              AccumulableInfo(id, name, Some(s"$partialValue"), s"${acc.value}")
           }
         }
       } catch {

From 5363ed71568c3e7c082146d654a9c669d692d894 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Thu, 30 Jul 2015 10:30:37 -0700
Subject: [PATCH 478/549] [SPARK-9361] [SQL] Refactor new aggregation code to
 reduce the times of checking compatibility

JIRA: https://issues.apache.org/jira/browse/SPARK-9361

Currently, we call `aggregate.Utils.tryConvert` in many places to check it the logical.Aggregate can be run with new aggregation. But looks like `aggregate.Utils.tryConvert` will cost considerable time to run. We should only call `tryConvert` once and keep it value in `logical.Aggregate` and reuse it.

In `org.apache.spark.sql.execution.aggregate.Utils`, the codes involving with `tryConvert` should be moved to catalyst because it actually doesn't deal with execution details.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7677 from viirya/refactor_aggregate and squashes the following commits:

babea30 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into refactor_aggregate
9a589d7 [Liang-Chi Hsieh] Fix scala style.
0a91329 [Liang-Chi Hsieh] Refactor new aggregation code to reduce the times to call tryConvert.
---
 .../expressions/aggregate/interfaces.scala    |   4 +-
 .../expressions/aggregate/utils.scala         | 167 ++++++++++++++++++
 .../plans/logical/basicOperators.scala        |   3 +
 .../spark/sql/execution/SparkStrategies.scala |  34 ++--
 .../spark/sql/execution/aggregate/utils.scala | 144 ---------------
 5 files changed, 188 insertions(+), 164 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
index 9fb7623172e78..d08f553cefe8c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
@@ -42,7 +42,7 @@ private[sql] case object Partial extends AggregateMode
 private[sql] case object PartialMerge extends AggregateMode
 
 /**
- * An [[AggregateFunction2]] with [[PartialMerge]] mode is used to merge aggregation buffers
+ * An [[AggregateFunction2]] with [[Final]] mode is used to merge aggregation buffers
  * containing intermediate results for this function and then generate final result.
  * This function updates the given aggregation buffer by merging multiple aggregation buffers.
  * When it has processed all input rows, the final result of this function is returned.
@@ -50,7 +50,7 @@ private[sql] case object PartialMerge extends AggregateMode
 private[sql] case object Final extends AggregateMode
 
 /**
- * An [[AggregateFunction2]] with [[Partial]] mode is used to evaluate this function directly
+ * An [[AggregateFunction2]] with [[Complete]] mode is used to evaluate this function directly
  * from original input rows without any partial aggregation.
  * This function updates the given aggregation buffer with the original input of this
  * function. When it has processed all input rows, the final result of this function is returned.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala
new file mode 100644
index 0000000000000..4a43318a95490
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/utils.scala
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.aggregate
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
+import org.apache.spark.sql.types.{StructType, MapType, ArrayType}
+
+/**
+ * Utility functions used by the query planner to convert our plan to new aggregation code path.
+ */
+object Utils {
+  // Right now, we do not support complex types in the grouping key schema.
+  private def supportsGroupingKeySchema(aggregate: Aggregate): Boolean = {
+    val hasComplexTypes = aggregate.groupingExpressions.map(_.dataType).exists {
+      case array: ArrayType => true
+      case map: MapType => true
+      case struct: StructType => true
+      case _ => false
+    }
+
+    !hasComplexTypes
+  }
+
+  private def doConvert(plan: LogicalPlan): Option[Aggregate] = plan match {
+    case p: Aggregate if supportsGroupingKeySchema(p) =>
+      val converted = p.transformExpressionsDown {
+        case expressions.Average(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Average(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Count(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Count(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        // We do not support multiple COUNT DISTINCT columns for now.
+        case expressions.CountDistinct(children) if children.length == 1 =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Count(children.head),
+            mode = aggregate.Complete,
+            isDistinct = true)
+
+        case expressions.First(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.First(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Last(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Last(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Max(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Max(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Min(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Min(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.Sum(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Sum(child),
+            mode = aggregate.Complete,
+            isDistinct = false)
+
+        case expressions.SumDistinct(child) =>
+          aggregate.AggregateExpression2(
+            aggregateFunction = aggregate.Sum(child),
+            mode = aggregate.Complete,
+            isDistinct = true)
+      }
+      // Check if there is any expressions.AggregateExpression1 left.
+      // If so, we cannot convert this plan.
+      val hasAggregateExpression1 = converted.aggregateExpressions.exists { expr =>
+        // For every expressions, check if it contains AggregateExpression1.
+        expr.find {
+          case agg: expressions.AggregateExpression1 => true
+          case other => false
+        }.isDefined
+      }
+
+      // Check if there are multiple distinct columns.
+      val aggregateExpressions = converted.aggregateExpressions.flatMap { expr =>
+        expr.collect {
+          case agg: AggregateExpression2 => agg
+        }
+      }.toSet.toSeq
+      val functionsWithDistinct = aggregateExpressions.filter(_.isDistinct)
+      val hasMultipleDistinctColumnSets =
+        if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) {
+          true
+        } else {
+          false
+        }
+
+      if (!hasAggregateExpression1 && !hasMultipleDistinctColumnSets) Some(converted) else None
+
+    case other => None
+  }
+
+  def checkInvalidAggregateFunction2(aggregate: Aggregate): Unit = {
+    // If the plan cannot be converted, we will do a final round check to see if the original
+    // logical.Aggregate contains both AggregateExpression1 and AggregateExpression2. If so,
+    // we need to throw an exception.
+    val aggregateFunction2s = aggregate.aggregateExpressions.flatMap { expr =>
+      expr.collect {
+        case agg: AggregateExpression2 => agg.aggregateFunction
+      }
+    }.distinct
+    if (aggregateFunction2s.nonEmpty) {
+      // For functions implemented based on the new interface, prepare a list of function names.
+      val invalidFunctions = {
+        if (aggregateFunction2s.length > 1) {
+          s"${aggregateFunction2s.tail.map(_.nodeName).mkString(",")} " +
+            s"and ${aggregateFunction2s.head.nodeName} are"
+        } else {
+          s"${aggregateFunction2s.head.nodeName} is"
+        }
+      }
+      val errorMessage =
+        s"${invalidFunctions} implemented based on the new Aggregate Function " +
+          s"interface and it cannot be used with functions implemented based on " +
+          s"the old Aggregate Function interface."
+      throw new AnalysisException(errorMessage)
+    }
+  }
+
+  def tryConvert(plan: LogicalPlan): Option[Aggregate] = plan match {
+    case p: Aggregate =>
+      val converted = doConvert(p)
+      if (converted.isDefined) {
+        converted
+      } else {
+        checkInvalidAggregateFunction2(p)
+        None
+      }
+    case other => None
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index ad5af19578f33..a67f8de6b733a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.plans.logical
 
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.aggregate.Utils
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.collection.OpenHashSet
@@ -219,6 +220,8 @@ case class Aggregate(
     expressions.forall(_.resolved) && childrenResolved && !hasWindowExpressions
   }
 
+  lazy val newAggregation: Option[Aggregate] = Utils.tryConvert(this)
+
   override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute)
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index f3ef066528ff8..52a9b02d373c7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression2
+import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression2, Utils}
 import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical.{BroadcastHint, LogicalPlan}
@@ -193,11 +193,15 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case _ => Nil
     }
 
-    def canBeConvertedToNewAggregation(plan: LogicalPlan): Boolean = {
-      aggregate.Utils.tryConvert(
-        plan,
-        sqlContext.conf.useSqlAggregate2,
-        sqlContext.conf.codegenEnabled).isDefined
+    def canBeConvertedToNewAggregation(plan: LogicalPlan): Boolean = plan match {
+      case a: logical.Aggregate =>
+        if (sqlContext.conf.useSqlAggregate2 && sqlContext.conf.codegenEnabled) {
+          a.newAggregation.isDefined
+        } else {
+          Utils.checkInvalidAggregateFunction2(a)
+          false
+        }
+      case _ => false
     }
 
     def canBeCodeGened(aggs: Seq[AggregateExpression1]): Boolean = aggs.forall {
@@ -217,12 +221,9 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
    */
   object Aggregation extends Strategy {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
-      case p: logical.Aggregate =>
-        val converted =
-          aggregate.Utils.tryConvert(
-            p,
-            sqlContext.conf.useSqlAggregate2,
-            sqlContext.conf.codegenEnabled)
+      case p: logical.Aggregate if sqlContext.conf.useSqlAggregate2 &&
+          sqlContext.conf.codegenEnabled =>
+        val converted = p.newAggregation
         converted match {
           case None => Nil // Cannot convert to new aggregation code path.
           case Some(logical.Aggregate(groupingExpressions, resultExpressions, child)) =>
@@ -377,17 +378,14 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case e @ logical.Expand(_, _, _, child) =>
         execution.Expand(e.projections, e.output, planLater(child)) :: Nil
       case a @ logical.Aggregate(group, agg, child) => {
-        val useNewAggregation =
-          aggregate.Utils.tryConvert(
-            a,
-            sqlContext.conf.useSqlAggregate2,
-            sqlContext.conf.codegenEnabled).isDefined
-        if (useNewAggregation) {
+        val useNewAggregation = sqlContext.conf.useSqlAggregate2 && sqlContext.conf.codegenEnabled
+        if (useNewAggregation && a.newAggregation.isDefined) {
           // If this logical.Aggregate can be planned to use new aggregation code path
           // (i.e. it can be planned by the Strategy Aggregation), we will not use the old
           // aggregation code path.
           Nil
         } else {
+          Utils.checkInvalidAggregateFunction2(a)
           execution.Aggregate(partial = false, group, agg, planLater(child)) :: Nil
         }
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
index 6549c87752a7d..03635baae4a5f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala
@@ -29,150 +29,6 @@ import org.apache.spark.sql.types.{StructType, MapType, ArrayType}
  * Utility functions used by the query planner to convert our plan to new aggregation code path.
  */
 object Utils {
-  // Right now, we do not support complex types in the grouping key schema.
-  private def supportsGroupingKeySchema(aggregate: Aggregate): Boolean = {
-    val hasComplexTypes = aggregate.groupingExpressions.map(_.dataType).exists {
-      case array: ArrayType => true
-      case map: MapType => true
-      case struct: StructType => true
-      case _ => false
-    }
-
-    !hasComplexTypes
-  }
-
-  private def tryConvert(plan: LogicalPlan): Option[Aggregate] = plan match {
-    case p: Aggregate if supportsGroupingKeySchema(p) =>
-      val converted = p.transformExpressionsDown {
-        case expressions.Average(child) =>
-          aggregate.AggregateExpression2(
-            aggregateFunction = aggregate.Average(child),
-            mode = aggregate.Complete,
-            isDistinct = false)
-
-        case expressions.Count(child) =>
-          aggregate.AggregateExpression2(
-            aggregateFunction = aggregate.Count(child),
-            mode = aggregate.Complete,
-            isDistinct = false)
-
-        // We do not support multiple COUNT DISTINCT columns for now.
-        case expressions.CountDistinct(children) if children.length == 1 =>
-          aggregate.AggregateExpression2(
-            aggregateFunction = aggregate.Count(children.head),
-            mode = aggregate.Complete,
-            isDistinct = true)
-
-        case expressions.First(child) =>
-          aggregate.AggregateExpression2(
-            aggregateFunction = aggregate.First(child),
-            mode = aggregate.Complete,
-            isDistinct = false)
-
-        case expressions.Last(child) =>
-          aggregate.AggregateExpression2(
-            aggregateFunction = aggregate.Last(child),
-            mode = aggregate.Complete,
-            isDistinct = false)
-
-        case expressions.Max(child) =>
-          aggregate.AggregateExpression2(
-            aggregateFunction = aggregate.Max(child),
-            mode = aggregate.Complete,
-            isDistinct = false)
-
-        case expressions.Min(child) =>
-          aggregate.AggregateExpression2(
-            aggregateFunction = aggregate.Min(child),
-            mode = aggregate.Complete,
-            isDistinct = false)
-
-        case expressions.Sum(child) =>
-          aggregate.AggregateExpression2(
-            aggregateFunction = aggregate.Sum(child),
-            mode = aggregate.Complete,
-            isDistinct = false)
-
-        case expressions.SumDistinct(child) =>
-          aggregate.AggregateExpression2(
-            aggregateFunction = aggregate.Sum(child),
-            mode = aggregate.Complete,
-            isDistinct = true)
-      }
-      // Check if there is any expressions.AggregateExpression1 left.
-      // If so, we cannot convert this plan.
-      val hasAggregateExpression1 = converted.aggregateExpressions.exists { expr =>
-        // For every expressions, check if it contains AggregateExpression1.
-        expr.find {
-          case agg: expressions.AggregateExpression1 => true
-          case other => false
-        }.isDefined
-      }
-
-      // Check if there are multiple distinct columns.
-      val aggregateExpressions = converted.aggregateExpressions.flatMap { expr =>
-        expr.collect {
-          case agg: AggregateExpression2 => agg
-        }
-      }.toSet.toSeq
-      val functionsWithDistinct = aggregateExpressions.filter(_.isDistinct)
-      val hasMultipleDistinctColumnSets =
-        if (functionsWithDistinct.map(_.aggregateFunction.children).distinct.length > 1) {
-          true
-        } else {
-          false
-        }
-
-      if (!hasAggregateExpression1 && !hasMultipleDistinctColumnSets) Some(converted) else None
-
-    case other => None
-  }
-
-  private def checkInvalidAggregateFunction2(aggregate: Aggregate): Unit = {
-    // If the plan cannot be converted, we will do a final round check to if the original
-    // logical.Aggregate contains both AggregateExpression1 and AggregateExpression2. If so,
-    // we need to throw an exception.
-    val aggregateFunction2s = aggregate.aggregateExpressions.flatMap { expr =>
-      expr.collect {
-        case agg: AggregateExpression2 => agg.aggregateFunction
-      }
-    }.distinct
-    if (aggregateFunction2s.nonEmpty) {
-      // For functions implemented based on the new interface, prepare a list of function names.
-      val invalidFunctions = {
-        if (aggregateFunction2s.length > 1) {
-          s"${aggregateFunction2s.tail.map(_.nodeName).mkString(",")} " +
-            s"and ${aggregateFunction2s.head.nodeName} are"
-        } else {
-          s"${aggregateFunction2s.head.nodeName} is"
-        }
-      }
-      val errorMessage =
-        s"${invalidFunctions} implemented based on the new Aggregate Function " +
-          s"interface and it cannot be used with functions implemented based on " +
-          s"the old Aggregate Function interface."
-      throw new AnalysisException(errorMessage)
-    }
-  }
-
-  def tryConvert(
-      plan: LogicalPlan,
-      useNewAggregation: Boolean,
-      codeGenEnabled: Boolean): Option[Aggregate] = plan match {
-    case p: Aggregate if useNewAggregation && codeGenEnabled =>
-      val converted = tryConvert(p)
-      if (converted.isDefined) {
-        converted
-      } else {
-        checkInvalidAggregateFunction2(p)
-        None
-      }
-    case p: Aggregate =>
-      checkInvalidAggregateFunction2(p)
-      None
-    case other => None
-  }
-
   def planAggregateWithoutDistinct(
       groupingExpressions: Seq[Expression],
       aggregateExpressions: Seq[AggregateExpression2],

From e53534655d6198e5b8a507010d26c7b4c4e7f1fd Mon Sep 17 00:00:00 2001
From: Mridul Muralidharan <mridulm@yahoo-inc.com>
Date: Thu, 30 Jul 2015 10:37:53 -0700
Subject: [PATCH 479/549] [SPARK-8297] [YARN] Scheduler backend is not notified
 in case node fails in YARN

This change adds code to notify the scheduler backend when a container dies in YARN.

Author: Mridul Muralidharan <mridulm@yahoo-inc.com>
Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #7431 from vanzin/SPARK-8297 and squashes the following commits:

471e4a0 [Marcelo Vanzin] Fix unit test after merge.
d4adf4e [Marcelo Vanzin] Merge branch 'master' into SPARK-8297
3b262e8 [Marcelo Vanzin] Merge branch 'master' into SPARK-8297
537da6f [Marcelo Vanzin] Make an expected log less scary.
04dc112 [Marcelo Vanzin] Use driver <-> AM communication to send "remove executor" request.
8855b97 [Marcelo Vanzin] Merge remote-tracking branch 'mridul/fix_yarn_scheduler_bug' into SPARK-8297
687790f [Mridul Muralidharan] Merge branch 'fix_yarn_scheduler_bug' of github.com:mridulm/spark into fix_yarn_scheduler_bug
e1b0067 [Mridul Muralidharan] Fix failing testcase, fix merge issue from our 1.3 -> master
9218fcc [Mridul Muralidharan] Fix failing testcase
362d64a [Mridul Muralidharan] Merge branch 'fix_yarn_scheduler_bug' of github.com:mridulm/spark into fix_yarn_scheduler_bug
62ad0cc [Mridul Muralidharan] Merge branch 'fix_yarn_scheduler_bug' of github.com:mridulm/spark into fix_yarn_scheduler_bug
bbf8811 [Mridul Muralidharan] Merge branch 'fix_yarn_scheduler_bug' of github.com:mridulm/spark into fix_yarn_scheduler_bug
9ee1307 [Mridul Muralidharan] Fix SPARK-8297
a3a0f01 [Mridul Muralidharan] Fix SPARK-8297
---
 .../CoarseGrainedSchedulerBackend.scala       |  2 +-
 .../cluster/YarnSchedulerBackend.scala        |  2 ++
 .../spark/deploy/yarn/ApplicationMaster.scala | 22 +++++++++----
 .../spark/deploy/yarn/YarnAllocator.scala     | 32 +++++++++++++++----
 .../spark/deploy/yarn/YarnRMClient.scala      |  5 ++-
 .../deploy/yarn/YarnAllocatorSuite.scala      | 29 +++++++++++++++++
 6 files changed, 77 insertions(+), 15 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
index 660702f6e6fd0..bd89160af4ffa 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -241,7 +241,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
           scheduler.executorLost(executorId, SlaveLost(reason))
           listenerBus.post(
             SparkListenerExecutorRemoved(System.currentTimeMillis(), executorId, reason))
-        case None => logError(s"Asked to remove non-existent executor $executorId")
+        case None => logInfo(s"Asked to remove non-existent executor $executorId")
       }
     }
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
index 074282d1be37d..044f6288fabdd 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
@@ -109,6 +109,8 @@ private[spark] abstract class YarnSchedulerBackend(
       case AddWebUIFilter(filterName, filterParams, proxyBase) =>
         addWebUIFilter(filterName, filterParams, proxyBase)
 
+      case RemoveExecutor(executorId, reason) =>
+        removeExecutor(executorId, reason)
     }
 
     override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 44acc7374d024..1d67b3ebb51b7 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -229,7 +229,11 @@ private[spark] class ApplicationMaster(
     sparkContextRef.compareAndSet(sc, null)
   }
 
-  private def registerAM(_rpcEnv: RpcEnv, uiAddress: String, securityMgr: SecurityManager) = {
+  private def registerAM(
+      _rpcEnv: RpcEnv,
+      driverRef: RpcEndpointRef,
+      uiAddress: String,
+      securityMgr: SecurityManager) = {
     val sc = sparkContextRef.get()
 
     val appId = client.getAttemptId().getApplicationId().toString()
@@ -246,6 +250,7 @@ private[spark] class ApplicationMaster(
         RpcAddress(_sparkConf.get("spark.driver.host"), _sparkConf.get("spark.driver.port").toInt),
         CoarseGrainedSchedulerBackend.ENDPOINT_NAME)
     allocator = client.register(driverUrl,
+      driverRef,
       yarnConf,
       _sparkConf,
       if (sc != null) sc.preferredNodeLocationData else Map(),
@@ -262,17 +267,20 @@ private[spark] class ApplicationMaster(
    *
    * In cluster mode, the AM and the driver belong to same process
    * so the AMEndpoint need not monitor lifecycle of the driver.
+   *
+   * @return A reference to the driver's RPC endpoint.
    */
   private def runAMEndpoint(
       host: String,
       port: String,
-      isClusterMode: Boolean): Unit = {
+      isClusterMode: Boolean): RpcEndpointRef = {
     val driverEndpoint = rpcEnv.setupEndpointRef(
       SparkEnv.driverActorSystemName,
       RpcAddress(host, port.toInt),
       YarnSchedulerBackend.ENDPOINT_NAME)
     amEndpoint =
       rpcEnv.setupEndpoint("YarnAM", new AMEndpoint(rpcEnv, driverEndpoint, isClusterMode))
+    driverEndpoint
   }
 
   private def runDriver(securityMgr: SecurityManager): Unit = {
@@ -290,11 +298,11 @@ private[spark] class ApplicationMaster(
         "Timed out waiting for SparkContext.")
     } else {
       rpcEnv = sc.env.rpcEnv
-      runAMEndpoint(
+      val driverRef = runAMEndpoint(
         sc.getConf.get("spark.driver.host"),
         sc.getConf.get("spark.driver.port"),
         isClusterMode = true)
-      registerAM(rpcEnv, sc.ui.map(_.appUIAddress).getOrElse(""), securityMgr)
+      registerAM(rpcEnv, driverRef, sc.ui.map(_.appUIAddress).getOrElse(""), securityMgr)
       userClassThread.join()
     }
   }
@@ -302,9 +310,9 @@ private[spark] class ApplicationMaster(
   private def runExecutorLauncher(securityMgr: SecurityManager): Unit = {
     val port = sparkConf.getInt("spark.yarn.am.port", 0)
     rpcEnv = RpcEnv.create("sparkYarnAM", Utils.localHostName, port, sparkConf, securityMgr)
-    waitForSparkDriver()
+    val driverRef = waitForSparkDriver()
     addAmIpFilter()
-    registerAM(rpcEnv, sparkConf.get("spark.driver.appUIAddress", ""), securityMgr)
+    registerAM(rpcEnv, driverRef, sparkConf.get("spark.driver.appUIAddress", ""), securityMgr)
 
     // In client mode the actor will stop the reporter thread.
     reporterThread.join()
@@ -428,7 +436,7 @@ private[spark] class ApplicationMaster(
     }
   }
 
-  private def waitForSparkDriver(): Unit = {
+  private def waitForSparkDriver(): RpcEndpointRef = {
     logInfo("Waiting for Spark driver to be reachable.")
     var driverUp = false
     val hostport = args.userArgs(0)
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
index 6c103394af098..59caa787b6e20 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
@@ -36,6 +36,9 @@ import org.apache.log4j.{Level, Logger}
 
 import org.apache.spark.{Logging, SecurityManager, SparkConf}
 import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
+import org.apache.spark.rpc.RpcEndpointRef
+import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
+import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
 
 /**
  * YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding
@@ -52,6 +55,7 @@ import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
  */
 private[yarn] class YarnAllocator(
     driverUrl: String,
+    driverRef: RpcEndpointRef,
     conf: Configuration,
     sparkConf: SparkConf,
     amClient: AMRMClient[ContainerRequest],
@@ -88,6 +92,9 @@ private[yarn] class YarnAllocator(
   // Visible for testing.
   private[yarn] val executorIdToContainer = new HashMap[String, Container]
 
+  private var numUnexpectedContainerRelease = 0L
+  private val containerIdToExecutorId = new HashMap[ContainerId, String]
+
   // Executor memory in MB.
   protected val executorMemory = args.executorMemory
   // Additional memory overhead.
@@ -184,6 +191,7 @@ private[yarn] class YarnAllocator(
   def killExecutor(executorId: String): Unit = synchronized {
     if (executorIdToContainer.contains(executorId)) {
       val container = executorIdToContainer.remove(executorId).get
+      containerIdToExecutorId.remove(container.getId)
       internalReleaseContainer(container)
       numExecutorsRunning -= 1
     } else {
@@ -383,6 +391,7 @@ private[yarn] class YarnAllocator(
 
       logInfo("Launching container %s for on host %s".format(containerId, executorHostname))
       executorIdToContainer(executorId) = container
+      containerIdToExecutorId(container.getId) = executorId
 
       val containerSet = allocatedHostToContainersMap.getOrElseUpdate(executorHostname,
         new HashSet[ContainerId])
@@ -413,12 +422,8 @@ private[yarn] class YarnAllocator(
   private[yarn] def processCompletedContainers(completedContainers: Seq[ContainerStatus]): Unit = {
     for (completedContainer <- completedContainers) {
       val containerId = completedContainer.getContainerId
-
-      if (releasedContainers.contains(containerId)) {
-        // Already marked the container for release, so remove it from
-        // `releasedContainers`.
-        releasedContainers.remove(containerId)
-      } else {
+      val alreadyReleased = releasedContainers.remove(containerId)
+      if (!alreadyReleased) {
         // Decrement the number of executors running. The next iteration of
         // the ApplicationMaster's reporting thread will take care of allocating.
         numExecutorsRunning -= 1
@@ -460,6 +465,18 @@ private[yarn] class YarnAllocator(
 
         allocatedContainerToHostMap.remove(containerId)
       }
+
+      containerIdToExecutorId.remove(containerId).foreach { eid =>
+        executorIdToContainer.remove(eid)
+
+        if (!alreadyReleased) {
+          // The executor could have gone away (like no route to host, node failure, etc)
+          // Notify backend about the failure of the executor
+          numUnexpectedContainerRelease += 1
+          driverRef.send(RemoveExecutor(eid,
+            s"Yarn deallocated the executor $eid (container $containerId)"))
+        }
+      }
     }
   }
 
@@ -467,6 +484,9 @@ private[yarn] class YarnAllocator(
     releasedContainers.add(container.getId())
     amClient.releaseAssignedContainer(container.getId())
   }
+
+  private[yarn] def getNumUnexpectedContainerRelease = numUnexpectedContainerRelease
+
 }
 
 private object YarnAllocator {
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
index 7f533ee55e8bb..4999f9c06210a 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
@@ -33,6 +33,7 @@ import org.apache.hadoop.yarn.util.ConverterUtils
 import org.apache.hadoop.yarn.webapp.util.WebAppUtils
 
 import org.apache.spark.{Logging, SecurityManager, SparkConf}
+import org.apache.spark.rpc.RpcEndpointRef
 import org.apache.spark.scheduler.SplitInfo
 import org.apache.spark.util.Utils
 
@@ -56,6 +57,7 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg
    */
   def register(
       driverUrl: String,
+      driverRef: RpcEndpointRef,
       conf: YarnConfiguration,
       sparkConf: SparkConf,
       preferredNodeLocations: Map[String, Set[SplitInfo]],
@@ -73,7 +75,8 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg
       amClient.registerApplicationMaster(Utils.localHostName(), 0, uiAddress)
       registered = true
     }
-    new YarnAllocator(driverUrl, conf, sparkConf, amClient, getAttemptId(), args, securityMgr)
+    new YarnAllocator(driverUrl, driverRef, conf, sparkConf, amClient, getAttemptId(), args,
+      securityMgr)
   }
 
   /**
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
index 37a789fcd375b..58318bf9bcc08 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
@@ -27,10 +27,14 @@ import org.apache.hadoop.yarn.client.api.AMRMClient
 import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
 import org.scalatest.{BeforeAndAfterEach, Matchers}
 
+import org.scalatest.{BeforeAndAfterEach, Matchers}
+import org.mockito.Mockito._
+
 import org.apache.spark.{SecurityManager, SparkFunSuite}
 import org.apache.spark.SparkConf
 import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
 import org.apache.spark.deploy.yarn.YarnAllocator._
+import org.apache.spark.rpc.RpcEndpointRef
 import org.apache.spark.scheduler.SplitInfo
 
 class MockResolver extends DNSToSwitchMapping {
@@ -90,6 +94,7 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter
       "--class", "SomeClass")
     new YarnAllocator(
       "not used",
+      mock(classOf[RpcEndpointRef]),
       conf,
       sparkConf,
       rmClient,
@@ -230,6 +235,30 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter
     handler.getNumPendingAllocate should be (1)
   }
 
+  test("lost executor removed from backend") {
+    val handler = createAllocator(4)
+    handler.updateResourceRequests()
+    handler.getNumExecutorsRunning should be (0)
+    handler.getNumPendingAllocate should be (4)
+
+    val container1 = createContainer("host1")
+    val container2 = createContainer("host2")
+    handler.handleAllocatedContainers(Array(container1, container2))
+
+    handler.requestTotalExecutorsWithPreferredLocalities(2, 0, Map())
+
+    val statuses = Seq(container1, container2).map { c =>
+      ContainerStatus.newInstance(c.getId(), ContainerState.COMPLETE, "Failed", -1)
+    }
+    handler.updateResourceRequests()
+    handler.processCompletedContainers(statuses.toSeq)
+    handler.updateResourceRequests()
+    handler.getNumExecutorsRunning should be (0)
+    handler.getNumPendingAllocate should be (2)
+    handler.getNumExecutorsFailed should be (2)
+    handler.getNumUnexpectedContainerRelease should be (2)
+  }
+
   test("memory exceeded diagnostic regexes") {
     val diagnostics =
       "Container [pid=12465,containerID=container_1412887393566_0003_01_000002] is running " +

From ab78b1d2a6ce26833ea3878a63921efd805a3737 Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin <vanzin@cloudera.com>
Date: Thu, 30 Jul 2015 10:40:04 -0700
Subject: [PATCH 480/549] [SPARK-9388] [YARN] Make executor info log messages
 easier to read.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #7706 from vanzin/SPARK-9388 and squashes the following commits:

028b990 [Marcelo Vanzin] Single log statement.
3c5fb6a [Marcelo Vanzin] YARN not Yarn.
5bcd7a0 [Marcelo Vanzin] [SPARK-9388] [yarn] Make executor info log messages easier to read.
---
 .../scala/org/apache/spark/deploy/yarn/Client.scala |  2 +-
 .../apache/spark/deploy/yarn/ExecutorRunnable.scala | 13 ++++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index bc28ce5eeae72..4ac3397f1ad28 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -767,7 +767,7 @@ private[spark] class Client(
     amContainer.setCommands(printableCommands)
 
     logDebug("===============================================================================")
-    logDebug("Yarn AM launch context:")
+    logDebug("YARN AM launch context:")
     logDebug(s"    user class: ${Option(args.userClass).getOrElse("N/A")}")
     logDebug("    env:")
     launchEnv.foreach { case (k, v) => logDebug(s"        $k -> $v") }
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
index 78e27fb7f3337..52580deb372c2 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
@@ -86,10 +86,17 @@ class ExecutorRunnable(
     val commands = prepareCommand(masterAddress, slaveId, hostname, executorMemory, executorCores,
       appId, localResources)
 
-    logInfo(s"Setting up executor with environment: $env")
-    logInfo("Setting up executor with commands: " + commands)
-    ctx.setCommands(commands)
+    logInfo(s"""
+      |===============================================================================
+      |YARN executor launch context:
+      |  env:
+      |${env.map { case (k, v) => s"    $k -> $v\n" }.mkString}
+      |  command:
+      |    ${commands.mkString(" ")}
+      |===============================================================================
+      """.stripMargin)
 
+    ctx.setCommands(commands)
     ctx.setApplicationACLs(YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr))
 
     // If external shuffle service is enabled, register with the Yarn shuffle service already

From 520ec0ff9db75267f627dc4615b2316a1a3d44d7 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 30 Jul 2015 10:45:32 -0700
Subject: [PATCH 481/549] [SPARK-8850] [SQL] Enable Unsafe mode by default

This pull request enables Unsafe mode by default in Spark SQL. In order to do this, we had to fix a number of small issues:

**List of fixed blockers**:

- [x] Make some default buffer sizes configurable so that HiveCompatibilitySuite can run properly (#7741).
- [x] Memory leak on grouped aggregation of empty input (fixed by #7560 to fix this)
- [x] Update planner to also check whether codegen is enabled before planning unsafe operators.
- [x] Investigate failing HiveThriftBinaryServerSuite test.  This turns out to be caused by a ClassCastException that occurs when Exchange tries to apply an interpreted RowOrdering to an UnsafeRow when range partitioning an RDD.  This could be fixed by #7408, but a shorter-term fix is to just skip the Unsafe exchange path when RangePartitioner is used.
- [x] Memory leak exceptions masking exceptions that actually caused tasks to fail (will be fixed by #7603).
- [x]  ~~https://issues.apache.org/jira/browse/SPARK-9162, to implement code generation for ScalaUDF.  This is necessary for `UDFSuite` to pass.  For now, I've just ignored this test in order to try to find other problems while we wait for a fix.~~ This is no longer necessary as of #7682.
- [x] Memory leaks from Limit after UnsafeExternalSort cause the memory leak detector to fail tests. This is a huge problem in the HiveCompatibilitySuite (fixed by f4ac642a4e5b2a7931c5e04e086bb10e263b1db6).
- [x] Tests in `AggregationQuerySuite` are failing due to NaN-handling issues in UnsafeRow, which were fixed in #7736.
- [x] `org.apache.spark.sql.ColumnExpressionSuite.rand` needs to be updated so that the planner check also matches `TungstenProject`.
- [x] After having lowered the buffer sizes to 4MB so that most of HiveCompatibilitySuite runs:
  - [x] Wrong answer in `join_1to1` (fixed by #7680)
  - [x] Wrong answer in `join_nulls` (fixed by #7680)
  - [x] Managed memory OOM / leak in `lateral_view`
  - [x] Seems to hang indefinitely in `partcols1`.  This might be a deadlock in script transformation or a bug in error-handling code? The hang was fixed by #7710.
  - [x] Error while freeing memory in `partcols1`: will be fixed by #7734.
- [x] After fixing the `partcols1` hang, it appears that a number of later tests have issues as well.
- [x] Fix thread-safety bug in codegen fallback expression evaluation (#7759).

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7564 from JoshRosen/unsafe-by-default and squashes the following commits:

83c0c56 [Josh Rosen] Merge remote-tracking branch 'origin/master' into unsafe-by-default
f4cc859 [Josh Rosen] Merge remote-tracking branch 'origin/master' into unsafe-by-default
963f567 [Josh Rosen] Reduce buffer size for R tests
d6986de [Josh Rosen] Lower page size in PySpark tests
013b9da [Josh Rosen] Also match TungstenProject in checkNumProjects
5d0b2d3 [Josh Rosen] Add task completion callback to avoid leak in limit after sort
ea250da [Josh Rosen] Disable unsafe Exchange path when RangePartitioning is used
715517b [Josh Rosen] Enable Unsafe by default
---
 R/run-tests.sh                                |  2 +-
 .../unsafe/sort/UnsafeExternalSorter.java     | 14 +++++++++++++
 python/pyspark/java_gateway.py                |  6 +++++-
 .../scala/org/apache/spark/sql/SQLConf.scala  |  2 +-
 .../apache/spark/sql/execution/Exchange.scala |  7 ++++++-
 .../spark/sql/ColumnExpressionSuite.scala     |  3 ++-
 .../execution/UnsafeExternalSortSuite.scala   | 20 +------------------
 7 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/R/run-tests.sh b/R/run-tests.sh
index e82ad0ba2cd06..18a1e13bdc655 100755
--- a/R/run-tests.sh
+++ b/R/run-tests.sh
@@ -23,7 +23,7 @@ FAILED=0
 LOGFILE=$FWDIR/unit-tests.out
 rm -f $LOGFILE
 
-SPARK_TESTING=1 $FWDIR/../bin/sparkR --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+SPARK_TESTING=1 $FWDIR/../bin/sparkR --conf spark.buffer.pageSize=4m --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
 FAILED=$((PIPESTATUS[0]||$FAILED))
 
 if [[ $FAILED != 0 ]]; then
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
index c21990f4e4778..866e0b4151577 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
@@ -20,6 +20,9 @@
 import java.io.IOException;
 import java.util.LinkedList;
 
+import scala.runtime.AbstractFunction0;
+import scala.runtime.BoxedUnit;
+
 import com.google.common.annotations.VisibleForTesting;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -90,6 +93,17 @@ public UnsafeExternalSorter(
     this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
     this.pageSizeBytes = conf.getSizeAsBytes("spark.buffer.pageSize", "64m");
     initializeForWriting();
+
+    // Register a cleanup task with TaskContext to ensure that memory is guaranteed to be freed at
+    // the end of the task. This is necessary to avoid memory leaks in when the downstream operator
+    // does not fully consume the sorter's output (e.g. sort followed by limit).
+    taskContext.addOnCompleteCallback(new AbstractFunction0<BoxedUnit>() {
+      @Override
+      public BoxedUnit apply() {
+        freeMemory();
+        return null;
+      }
+    });
   }
 
   // TODO: metrics tracking + integration with shuffle write metrics
diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py
index 90cd342a6cf7f..60be85e53e2aa 100644
--- a/python/pyspark/java_gateway.py
+++ b/python/pyspark/java_gateway.py
@@ -52,7 +52,11 @@ def launch_gateway():
         script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
         submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
         if os.environ.get("SPARK_TESTING"):
-            submit_args = "--conf spark.ui.enabled=false " + submit_args
+            submit_args = ' '.join([
+                "--conf spark.ui.enabled=false",
+                "--conf spark.buffer.pageSize=4mb",
+                submit_args
+            ])
         command = [os.path.join(SPARK_HOME, script)] + shlex.split(submit_args)
 
         # Start a socket that will be used by PythonGatewayServer to communicate its port to us
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index 2564bbd2077bf..6644e85d4a037 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -229,7 +229,7 @@ private[spark] object SQLConf {
       " a specific query.")
 
   val UNSAFE_ENABLED = booleanConf("spark.sql.unsafe.enabled",
-    defaultValue = Some(false),
+    defaultValue = Some(true),
     doc = "When true, use the new optimized Tungsten physical execution backend.")
 
   val DIALECT = stringConf(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index 41a0c519ba527..70e5031fb63c0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -47,7 +47,12 @@ case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends Una
 
   override def canProcessSafeRows: Boolean = true
 
-  override def canProcessUnsafeRows: Boolean = true
+  override def canProcessUnsafeRows: Boolean = {
+    // Do not use the Unsafe path if we are using a RangePartitioning, since this may lead to
+    // an interpreted RowOrdering being applied to an UnsafeRow, which will lead to
+    // ClassCastExceptions at runtime. This check can be removed after SPARK-9054 is fixed.
+    !newPartitioning.isInstanceOf[RangePartitioning]
+  }
 
   /**
    * Determines whether records must be defensively copied before being sent to the shuffle.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index 5c1102410879a..eb64684ae0fd9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql
 
 import org.scalatest.Matchers._
 
-import org.apache.spark.sql.execution.Project
+import org.apache.spark.sql.execution.{Project, TungstenProject}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.test.SQLTestUtils
@@ -538,6 +538,7 @@ class ColumnExpressionSuite extends QueryTest with SQLTestUtils {
     def checkNumProjects(df: DataFrame, expectedNumProjects: Int): Unit = {
       val projects = df.queryExecution.executedPlan.collect {
         case project: Project => project
+        case tungstenProject: TungstenProject => tungstenProject
       }
       assert(projects.size === expectedNumProjects)
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
index 7a4baa9e4a49d..138636b0c65b8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
@@ -36,10 +36,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     TestSQLContext.conf.setConf(SQLConf.CODEGEN_ENABLED, SQLConf.CODEGEN_ENABLED.defaultValue.get)
   }
 
-  ignore("sort followed by limit should not leak memory") {
-    // TODO: this test is going to fail until we implement a proper iterator interface
-    // with a close() method.
-    TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "false")
+  test("sort followed by limit") {
     checkThatPlansAgree(
       (1 to 100).map(v => Tuple1(v)).toDF("a"),
       (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
@@ -48,21 +45,6 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     )
   }
 
-  test("sort followed by limit") {
-    TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "false")
-    try {
-      checkThatPlansAgree(
-        (1 to 100).map(v => Tuple1(v)).toDF("a"),
-        (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
-        (child: SparkPlan) => Limit(10, Sort('a.asc :: Nil, global = true, child)),
-        sortAnswers = false
-      )
-    } finally {
-      TestSQLContext.sparkContext.conf.set("spark.unsafe.exceptionOnMemoryLeak", "false")
-
-    }
-  }
-
   test("sorting does not crash for large inputs") {
     val sortOrder = 'a.asc :: Nil
     val stringLength = 1024 * 1024 * 2

From 06b6a074fb224b3fe23922bdc89fc5f7c2ffaaf6 Mon Sep 17 00:00:00 2001
From: Imran Rashid <irashid@cloudera.com>
Date: Thu, 30 Jul 2015 10:46:26 -0700
Subject: [PATCH 482/549] [SPARK-9437] [CORE] avoid overflow in SizeEstimator

https://issues.apache.org/jira/browse/SPARK-9437

Author: Imran Rashid <irashid@cloudera.com>

Closes #7750 from squito/SPARK-9437_size_estimator_overflow and squashes the following commits:

29493f1 [Imran Rashid] prevent another potential overflow
bc1cb82 [Imran Rashid] avoid overflow
---
 .../main/scala/org/apache/spark/util/SizeEstimator.scala    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
index 7d84468f62ab1..14b1f2a17e707 100644
--- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
+++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
@@ -217,10 +217,10 @@ object SizeEstimator extends Logging {
     var arrSize: Long = alignSize(objectSize + INT_SIZE)
 
     if (elementClass.isPrimitive) {
-      arrSize += alignSize(length * primitiveSize(elementClass))
+      arrSize += alignSize(length.toLong * primitiveSize(elementClass))
       state.size += arrSize
     } else {
-      arrSize += alignSize(length * pointerSize)
+      arrSize += alignSize(length.toLong * pointerSize)
       state.size += arrSize
 
       if (length <= ARRAY_SIZE_FOR_SAMPLING) {
@@ -336,7 +336,7 @@ object SizeEstimator extends Logging {
     // hg.openjdk.java.net/jdk8/jdk8/hotspot/file/tip/src/share/vm/classfile/classFileParser.cpp
     var alignedSize = shellSize
     for (size <- fieldSizes if sizeCount(size) > 0) {
-      val count = sizeCount(size)
+      val count = sizeCount(size).toLong
       // If there are internal gaps, smaller field can fit in.
       alignedSize = math.max(alignedSize, alignSizeUp(shellSize, size) + size * count)
       shellSize += size * count

From 6d94bf6ac10ac851636c62439f8f2737f3526a2a Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Thu, 30 Jul 2015 11:13:15 -0700
Subject: [PATCH 483/549] [SPARK-8174] [SPARK-8175] [SQL] function
 unix_timestamp, from_unixtime

unix_timestamp(): long
Gets current Unix timestamp in seconds.

unix_timestamp(string|date): long
Converts time string in format yyyy-MM-dd HH:mm:ss to Unix timestamp (in seconds), using the default timezone and the default locale, return null if fail: unix_timestamp('2009-03-20 11:30:01') = 1237573801

unix_timestamp(string date, string pattern): long
Convert time string with given pattern (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html]) to Unix time stamp (in seconds), return null if fail: unix_timestamp('2009-03-20', 'yyyy-MM-dd') = 1237532400.

from_unixtime(bigint unixtime[, string format]): string
Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string representing the timestamp of that moment in the current system time zone in the format of "1970-01-01 00:00:00".

Jira:
https://issues.apache.org/jira/browse/SPARK-8174
https://issues.apache.org/jira/browse/SPARK-8175

Author: Daoyuan Wang <daoyuan.wang@intel.com>

Closes #7644 from adrian-wang/udfunixtime and squashes the following commits:

2fe20c4 [Daoyuan Wang] util.Date
ea2ec16 [Daoyuan Wang] use util.Date for better performance
a2cf929 [Daoyuan Wang] doc return null instead of 0
f6f070a [Daoyuan Wang] address comments from davies
6a4cbb3 [Daoyuan Wang] temp
56ded53 [Daoyuan Wang] rebase and address comments
14a8b37 [Daoyuan Wang] function unix_timestamp, from_unixtime
---
 .../catalyst/analysis/FunctionRegistry.scala  |   2 +
 .../expressions/datetimeFunctions.scala       | 219 +++++++++++++++++-
 .../expressions/DateExpressionsSuite.scala    |  59 ++++-
 .../org/apache/spark/sql/functions.scala      |  42 ++++
 .../apache/spark/sql/DateFunctionsSuite.scala |  56 +++++
 5 files changed, 374 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 378df4f57d9e2..d663f12bc6d0d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -211,6 +211,7 @@ object FunctionRegistry {
     expression[DayOfMonth]("day"),
     expression[DayOfYear]("dayofyear"),
     expression[DayOfMonth]("dayofmonth"),
+    expression[FromUnixTime]("from_unixtime"),
     expression[Hour]("hour"),
     expression[LastDay]("last_day"),
     expression[Minute]("minute"),
@@ -218,6 +219,7 @@ object FunctionRegistry {
     expression[NextDay]("next_day"),
     expression[Quarter]("quarter"),
     expression[Second]("second"),
+    expression[UnixTimestamp]("unix_timestamp"),
     expression[WeekOfYear]("weekofyear"),
     expression[Year]("year"),
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index efecb771f2f5d..a5e6249e438d2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import java.sql.Date
 import java.text.SimpleDateFormat
 import java.util.{Calendar, TimeZone}
 
@@ -28,6 +27,8 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
+import scala.util.Try
+
 /**
  * Returns the current date at the start of query evaluation.
  * All calls of current_date within the same query return the same value.
@@ -236,20 +237,232 @@ case class DateFormatClass(left: Expression, right: Expression) extends BinaryEx
 
   override protected def nullSafeEval(timestamp: Any, format: Any): Any = {
     val sdf = new SimpleDateFormat(format.toString)
-    UTF8String.fromString(sdf.format(new Date(timestamp.asInstanceOf[Long] / 1000)))
+    UTF8String.fromString(sdf.format(new java.util.Date(timestamp.asInstanceOf[Long] / 1000)))
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val sdf = classOf[SimpleDateFormat].getName
     defineCodeGen(ctx, ev, (timestamp, format) => {
       s"""UTF8String.fromString((new $sdf($format.toString()))
-          .format(new java.sql.Date($timestamp / 1000)))"""
+          .format(new java.util.Date($timestamp / 1000)))"""
     })
   }
 
   override def prettyName: String = "date_format"
 }
 
+/**
+ * Converts time string with given pattern
+ * (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html])
+ * to Unix time stamp (in seconds), returns null if fail.
+ * Note that hive Language Manual says it returns 0 if fail, but in fact it returns null.
+ * If the second parameter is missing, use "yyyy-MM-dd HH:mm:ss".
+ * If no parameters provided, the first parameter will be current_timestamp.
+ * If the first parameter is a Date or Timestamp instead of String, we will ignore the
+ * second parameter.
+ */
+case class UnixTimestamp(timeExp: Expression, format: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  override def left: Expression = timeExp
+  override def right: Expression = format
+
+  def this(time: Expression) = {
+    this(time, Literal("yyyy-MM-dd HH:mm:ss"))
+  }
+
+  def this() = {
+    this(CurrentTimestamp())
+  }
+
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(TypeCollection(StringType, DateType, TimestampType), StringType)
+
+  override def dataType: DataType = LongType
+
+  private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String]
+
+  override def eval(input: InternalRow): Any = {
+    val t = left.eval(input)
+    if (t == null) {
+      null
+    } else {
+      left.dataType match {
+        case DateType =>
+          DateTimeUtils.daysToMillis(t.asInstanceOf[Int]) / 1000L
+        case TimestampType =>
+          t.asInstanceOf[Long] / 1000000L
+        case StringType if right.foldable =>
+          if (constFormat != null) {
+            Try(new SimpleDateFormat(constFormat.toString).parse(
+              t.asInstanceOf[UTF8String].toString).getTime / 1000L).getOrElse(null)
+          } else {
+            null
+          }
+        case StringType =>
+          val f = format.eval(input)
+          if (f == null) {
+            null
+          } else {
+            val formatString = f.asInstanceOf[UTF8String].toString
+            Try(new SimpleDateFormat(formatString).parse(
+              t.asInstanceOf[UTF8String].toString).getTime / 1000L).getOrElse(null)
+          }
+      }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    left.dataType match {
+      case StringType if right.foldable =>
+        val sdf = classOf[SimpleDateFormat].getName
+        val fString = if (constFormat == null) null else constFormat.toString
+        val formatter = ctx.freshName("formatter")
+        if (fString == null) {
+          s"""
+            boolean ${ev.isNull} = true;
+            ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          """
+        } else {
+          val eval1 = left.gen(ctx)
+          s"""
+            ${eval1.code}
+            boolean ${ev.isNull} = ${eval1.isNull};
+            ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+            if (!${ev.isNull}) {
+              try {
+                $sdf $formatter = new $sdf("$fString");
+                ${ev.primitive} =
+                  $formatter.parse(${eval1.primitive}.toString()).getTime() / 1000L;
+              } catch (java.lang.Throwable e) {
+                ${ev.isNull} = true;
+              }
+            }
+          """
+        }
+      case StringType =>
+        val sdf = classOf[SimpleDateFormat].getName
+        nullSafeCodeGen(ctx, ev, (string, format) => {
+          s"""
+            try {
+              ${ev.primitive} =
+                (new $sdf($format.toString())).parse($string.toString()).getTime() / 1000L;
+            } catch (java.lang.Throwable e) {
+              ${ev.isNull} = true;
+            }
+          """
+        })
+      case TimestampType =>
+        val eval1 = left.gen(ctx)
+        s"""
+          ${eval1.code}
+          boolean ${ev.isNull} = ${eval1.isNull};
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (!${ev.isNull}) {
+            ${ev.primitive} = ${eval1.primitive} / 1000000L;
+          }
+        """
+      case DateType =>
+        val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+        val eval1 = left.gen(ctx)
+        s"""
+          ${eval1.code}
+          boolean ${ev.isNull} = ${eval1.isNull};
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (!${ev.isNull}) {
+            ${ev.primitive} = $dtu.daysToMillis(${eval1.primitive}) / 1000L;
+          }
+        """
+    }
+  }
+}
+
+/**
+ * Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string
+ * representing the timestamp of that moment in the current system time zone in the given
+ * format. If the format is missing, using format like "1970-01-01 00:00:00".
+ * Note that hive Language Manual says it returns 0 if fail, but in fact it returns null.
+ */
+case class FromUnixTime(sec: Expression, format: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+
+  override def left: Expression = sec
+  override def right: Expression = format
+
+  def this(unix: Expression) = {
+    this(unix, Literal("yyyy-MM-dd HH:mm:ss"))
+  }
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(LongType, StringType)
+
+  private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String]
+
+  override def eval(input: InternalRow): Any = {
+    val time = left.eval(input)
+    if (time == null) {
+      null
+    } else {
+      if (format.foldable) {
+        if (constFormat == null) {
+          null
+        } else {
+          Try(UTF8String.fromString(new SimpleDateFormat(constFormat.toString).format(
+            new java.util.Date(time.asInstanceOf[Long] * 1000L)))).getOrElse(null)
+        }
+      } else {
+        val f = format.eval(input)
+        if (f == null) {
+          null
+        } else {
+          Try(UTF8String.fromString(new SimpleDateFormat(
+            f.asInstanceOf[UTF8String].toString).format(new java.util.Date(
+              time.asInstanceOf[Long] * 1000L)))).getOrElse(null)
+        }
+      }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val sdf = classOf[SimpleDateFormat].getName
+    if (format.foldable) {
+      if (constFormat == null) {
+        s"""
+          boolean ${ev.isNull} = true;
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+        """
+      } else {
+        val t = left.gen(ctx)
+        s"""
+          ${t.code}
+          boolean ${ev.isNull} = ${t.isNull};
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (!${ev.isNull}) {
+            try {
+              ${ev.primitive} = UTF8String.fromString(new $sdf("${constFormat.toString}").format(
+                new java.util.Date(${t.primitive} * 1000L)));
+            } catch (java.lang.Throwable e) {
+              ${ev.isNull} = true;
+            }
+          }
+        """
+      }
+    } else {
+      nullSafeCodeGen(ctx, ev, (seconds, f) => {
+        s"""
+        try {
+          ${ev.primitive} = UTF8String.fromString((new $sdf($f.toString())).format(
+            new java.util.Date($seconds * 1000L)));
+        } catch (java.lang.Throwable e) {
+          ${ev.isNull} = true;
+        }""".stripMargin
+      })
+    }
+  }
+
+}
+
 /**
  * Returns the last day of the month which the date belongs to.
  */
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index aca8d6eb3500c..e1387f945ffa4 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -22,8 +22,9 @@ import java.text.SimpleDateFormat
 import java.util.Calendar
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.types.{StringType, TimestampType, DateType}
+import org.apache.spark.sql.types._
 
 class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
@@ -303,4 +304,60 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       NextDay(Literal(Date.valueOf("2015-07-23")), Literal.create(null, StringType)), null)
   }
+
+  test("from_unixtime") {
+    val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
+    val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS"
+    val sdf2 = new SimpleDateFormat(fmt2)
+    checkEvaluation(
+      FromUnixTime(Literal(0L), Literal("yyyy-MM-dd HH:mm:ss")), sdf1.format(new Timestamp(0)))
+    checkEvaluation(FromUnixTime(
+      Literal(1000L), Literal("yyyy-MM-dd HH:mm:ss")), sdf1.format(new Timestamp(1000000)))
+    checkEvaluation(
+      FromUnixTime(Literal(-1000L), Literal(fmt2)), sdf2.format(new Timestamp(-1000000)))
+    checkEvaluation(
+      FromUnixTime(Literal.create(null, LongType), Literal.create(null, StringType)), null)
+    checkEvaluation(
+      FromUnixTime(Literal.create(null, LongType), Literal("yyyy-MM-dd HH:mm:ss")), null)
+    checkEvaluation(FromUnixTime(Literal(1000L), Literal.create(null, StringType)), null)
+    checkEvaluation(
+      FromUnixTime(Literal(0L), Literal("not a valid format")), null)
+  }
+
+  test("unix_timestamp") {
+    val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
+    val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS"
+    val sdf2 = new SimpleDateFormat(fmt2)
+    val fmt3 = "yy-MM-dd"
+    val sdf3 = new SimpleDateFormat(fmt3)
+    val date1 = Date.valueOf("2015-07-24")
+    checkEvaluation(
+      UnixTimestamp(Literal(sdf1.format(new Timestamp(0))), Literal("yyyy-MM-dd HH:mm:ss")), 0L)
+    checkEvaluation(UnixTimestamp(
+      Literal(sdf1.format(new Timestamp(1000000))), Literal("yyyy-MM-dd HH:mm:ss")), 1000L)
+    checkEvaluation(
+      UnixTimestamp(Literal(new Timestamp(1000000)), Literal("yyyy-MM-dd HH:mm:ss")), 1000L)
+    checkEvaluation(
+      UnixTimestamp(Literal(date1), Literal("yyyy-MM-dd HH:mm:ss")),
+      DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(date1)) / 1000L)
+    checkEvaluation(
+      UnixTimestamp(Literal(sdf2.format(new Timestamp(-1000000))), Literal(fmt2)), -1000L)
+    checkEvaluation(UnixTimestamp(
+      Literal(sdf3.format(Date.valueOf("2015-07-24"))), Literal(fmt3)),
+      DateTimeUtils.daysToMillis(DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-24"))) / 1000L)
+    val t1 = UnixTimestamp(
+      CurrentTimestamp(), Literal("yyyy-MM-dd HH:mm:ss")).eval().asInstanceOf[Long]
+    val t2 = UnixTimestamp(
+      CurrentTimestamp(), Literal("yyyy-MM-dd HH:mm:ss")).eval().asInstanceOf[Long]
+    assert(t2 - t1 <= 1)
+    checkEvaluation(
+      UnixTimestamp(Literal.create(null, DateType), Literal.create(null, StringType)), null)
+    checkEvaluation(
+      UnixTimestamp(Literal.create(null, DateType), Literal("yyyy-MM-dd HH:mm:ss")), null)
+    checkEvaluation(UnixTimestamp(
+      Literal(date1), Literal.create(null, StringType)), date1.getTime / 1000L)
+    checkEvaluation(
+      UnixTimestamp(Literal("2015-07-24"), Literal("not a valid format")), null)
+  }
+
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index a2fece62f61f9..3f440e062eb96 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2110,6 +2110,48 @@ object functions {
    */
   def weekofyear(columnName: String): Column = weekofyear(Column(columnName))
 
+  /**
+   * Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string
+   * representing the timestamp of that moment in the current system time zone in the given
+   * format.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def from_unixtime(ut: Column): Column = FromUnixTime(ut.expr, Literal("yyyy-MM-dd HH:mm:ss"))
+
+  /**
+   * Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string
+   * representing the timestamp of that moment in the current system time zone in the given
+   * format.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def from_unixtime(ut: Column, f: String): Column = FromUnixTime(ut.expr, Literal(f))
+
+  /**
+   * Gets current Unix timestamp in seconds.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def unix_timestamp(): Column = UnixTimestamp(CurrentTimestamp(), Literal("yyyy-MM-dd HH:mm:ss"))
+
+  /**
+   * Converts time string in format yyyy-MM-dd HH:mm:ss to Unix timestamp (in seconds),
+   * using the default timezone and the default locale, return null if fail.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def unix_timestamp(s: Column): Column = UnixTimestamp(s.expr, Literal("yyyy-MM-dd HH:mm:ss"))
+
+  /**
+   * Convert time string with given pattern
+   * (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html])
+   * to Unix time stamp (in seconds), return null if fail.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def unix_timestamp(s: Column, p: String): Column = UnixTimestamp(s.expr, Literal(p))
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   // Collection functions
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index 07eb6e4a8d8cd..df4cb57ac5b21 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -228,4 +228,60 @@ class DateFunctionsSuite extends QueryTest {
       Seq(Row(Date.valueOf("2015-07-30")), Row(Date.valueOf("2015-07-30"))))
   }
 
+  test("from_unixtime") {
+    val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
+    val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS"
+    val sdf2 = new SimpleDateFormat(fmt2)
+    val fmt3 = "yy-MM-dd HH-mm-ss"
+    val sdf3 = new SimpleDateFormat(fmt3)
+    val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd HH-mm-ss")).toDF("a", "b")
+    checkAnswer(
+      df.select(from_unixtime(col("a"))),
+      Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000)))))
+    checkAnswer(
+      df.select(from_unixtime(col("a"), fmt2)),
+      Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000)))))
+    checkAnswer(
+      df.select(from_unixtime(col("a"), fmt3)),
+      Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000)))))
+    checkAnswer(
+      df.selectExpr("from_unixtime(a)"),
+      Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000)))))
+    checkAnswer(
+      df.selectExpr(s"from_unixtime(a, '$fmt2')"),
+      Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000)))))
+    checkAnswer(
+      df.selectExpr(s"from_unixtime(a, '$fmt3')"),
+      Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000)))))
+  }
+
+  test("unix_timestamp") {
+    val date1 = Date.valueOf("2015-07-24")
+    val date2 = Date.valueOf("2015-07-25")
+    val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3")
+    val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2")
+    val s1 = "2015/07/24 10:00:00.5"
+    val s2 = "2015/07/25 02:02:02.6"
+    val ss1 = "2015-07-24 10:00:00"
+    val ss2 = "2015-07-25 02:02:02"
+    val fmt = "yyyy/MM/dd HH:mm:ss.S"
+    val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss")
+    checkAnswer(df.select(unix_timestamp(col("ts"))), Seq(
+      Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
+    checkAnswer(df.select(unix_timestamp(col("ss"))), Seq(
+      Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
+    checkAnswer(df.select(unix_timestamp(col("d"), fmt)), Seq(
+      Row(date1.getTime / 1000L), Row(date2.getTime / 1000L)))
+    checkAnswer(df.select(unix_timestamp(col("s"), fmt)), Seq(
+      Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
+    checkAnswer(df.selectExpr("unix_timestamp(ts)"), Seq(
+      Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
+    checkAnswer(df.selectExpr("unix_timestamp(ss)"), Seq(
+      Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
+    checkAnswer(df.selectExpr(s"unix_timestamp(d, '$fmt')"), Seq(
+      Row(date1.getTime / 1000L), Row(date2.getTime / 1000L)))
+    checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq(
+      Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
+  }
+
 }

From a20e743fb863de809863652931bc982aac2d1f86 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 30 Jul 2015 13:09:43 -0700
Subject: [PATCH 484/549] [SPARK-9460] Fix prefix generation for UTF8String.

Previously we could be getting garbage data if the number of bytes is 0, or on JVMs that are 4 byte aligned, or when compressedoops is on.

Author: Reynold Xin <rxin@databricks.com>

Closes #7789 from rxin/utf8string and squashes the following commits:

86ffa3e [Reynold Xin] Mask out data outside of valid range.
4d647ed [Reynold Xin] Mask out data.
c6e8794 [Reynold Xin] [SPARK-9460] Fix prefix generation for UTF8String.
---
 .../apache/spark/unsafe/types/UTF8String.java | 36 +++++++++++++++++--
 .../spark/unsafe/types/UTF8StringSuite.java   |  8 +++++
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 57522003ba2ba..c38953f65d7d7 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -65,6 +65,19 @@ public static UTF8String fromBytes(byte[] bytes) {
     }
   }
 
+  /**
+   * Creates an UTF8String from byte array, which should be encoded in UTF-8.
+   *
+   * Note: `bytes` will be hold by returned UTF8String.
+   */
+  public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) {
+    if (bytes != null) {
+      return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes);
+    } else {
+      return null;
+    }
+  }
+
   /**
    * Creates an UTF8String from String.
    */
@@ -89,10 +102,10 @@ public static UTF8String blankString(int length) {
     return fromBytes(spaces);
   }
 
-  protected UTF8String(Object base, long offset, int size) {
+  protected UTF8String(Object base, long offset, int numBytes) {
     this.base = base;
     this.offset = offset;
-    this.numBytes = size;
+    this.numBytes = numBytes;
   }
 
   /**
@@ -141,7 +154,24 @@ public int numChars() {
    * Returns a 64-bit integer that can be used as the prefix used in sorting.
    */
   public long getPrefix() {
-    long p = PlatformDependent.UNSAFE.getLong(base, offset);
+    // Since JVMs are either 4-byte aligned or 8-byte aligned, we check the size of the string.
+    // If size is 0, just return 0.
+    // If size is between 0 and 4 (inclusive), assume data is 4-byte aligned under the hood and
+    // use a getInt to fetch the prefix.
+    // If size is greater than 4, assume we have at least 8 bytes of data to fetch.
+    // After getting the data, we use a mask to mask out data that is not part of the string.
+    long p;
+    if (numBytes >= 8) {
+      p = PlatformDependent.UNSAFE.getLong(base, offset);
+    } else  if (numBytes > 4) {
+      p = PlatformDependent.UNSAFE.getLong(base, offset);
+      p = p & ((1L << numBytes * 8) - 1);
+    } else if (numBytes > 0) {
+      p = (long) PlatformDependent.UNSAFE.getInt(base, offset);
+      p = p & ((1L << numBytes * 8) - 1);
+    } else {
+      p = 0;
+    }
     p = java.lang.Long.reverseBytes(p);
     return p;
   }
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 42e09e435a412..f2cc19ca6b172 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -71,6 +71,14 @@ public void prefix() {
       fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
     assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
     assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
+
+    byte[] buf1 = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    byte[] buf2 = {1, 2, 3};
+    UTF8String str1 = UTF8String.fromBytes(buf1, 0, 3);
+    UTF8String str2 = UTF8String.fromBytes(buf1, 0, 8);
+    UTF8String str3 = UTF8String.fromBytes(buf2);
+    assertTrue(str1.getPrefix() - str2.getPrefix() < 0);
+    assertEquals(str1.getPrefix(), str3.getPrefix());
   }
 
   @Test

From d8cfd531c7c50c9b00ab546be458f44f84c386ae Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Thu, 30 Jul 2015 13:17:54 -0700
Subject: [PATCH 485/549] [SPARK-5567] [MLLIB] Add predict method to
 LocalLDAModel

jkbradley hhbyyh

Adds `topicDistributions` to LocalLDAModel. Please review after #7757 is merged.

Author: Feynman Liang <fliang@databricks.com>

Closes #7760 from feynmanliang/SPARK-5567-predict-in-LDA and squashes the following commits:

0ad1134 [Feynman Liang] Remove println
27b3877 [Feynman Liang] Code review fixes
6bfb87c [Feynman Liang] Remove extra newline
476f788 [Feynman Liang] Fix checks and doc for variationalInference
061780c [Feynman Liang] Code review cleanup
3be2947 [Feynman Liang] Rename topicDistribution -> topicDistributions
2a821a6 [Feynman Liang] Add predict methods to LocalLDAModel
---
 .../spark/mllib/clustering/LDAModel.scala     | 42 +++++++++++--
 .../spark/mllib/clustering/LDAOptimizer.scala |  5 +-
 .../spark/mllib/clustering/LDASuite.scala     | 63 +++++++++++++++++++
 3 files changed, 102 insertions(+), 8 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index ece28848aa02c..6cfad3fbbdb87 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -186,7 +186,6 @@ abstract class LDAModel private[clustering] extends Saveable {
  * This model stores only the inferred topics.
  * It may be used for computing topics for new documents, but it may give less accurate answers
  * than the [[DistributedLDAModel]].
- *
  * @param topics Inferred topics (vocabSize x k matrix).
  */
 @Experimental
@@ -221,9 +220,6 @@ class LocalLDAModel private[clustering] (
   // TODO
   // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
 
-  // TODO:
-  // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ???
-
   /**
    * Calculate the log variational bound on perplexity. See Equation (16) in original Online
    * LDA paper.
@@ -269,7 +265,7 @@ class LocalLDAModel private[clustering] (
     // by topic (columns of lambda)
     val Elogbeta = LDAUtils.dirichletExpectation(lambda.t).t
 
-    var score = documents.filter(_._2.numActives > 0).map { case (id: Long, termCounts: Vector) =>
+    var score = documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: Vector) =>
       var docScore = 0.0D
       val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference(
         termCounts, exp(Elogbeta), brzAlpha, gammaShape, k)
@@ -277,7 +273,7 @@ class LocalLDAModel private[clustering] (
 
       // E[log p(doc | theta, beta)]
       termCounts.foreachActive { case (idx, count) =>
-        docScore += LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
+        docScore += count * LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
       }
       // E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
       docScore += sum((brzAlpha - gammad) :* Elogthetad)
@@ -297,6 +293,40 @@ class LocalLDAModel private[clustering] (
     score
   }
 
+  /**
+   * Predicts the topic mixture distribution for each document (often called "theta" in the
+   * literature).  Returns a vector of zeros for an empty document.
+   *
+   * This uses a variational approximation following Hoffman et al. (2010), where the approximate
+   * distribution is called "gamma."  Technically, this method returns this approximation "gamma"
+   * for each document.
+   * @param documents documents to predict topic mixture distributions for
+   * @return An RDD of (document ID, topic mixture distribution for document)
+   */
+  // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
+  def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = {
+    // Double transpose because dirichletExpectation normalizes by row and we need to normalize
+    // by topic (columns of lambda)
+    val expElogbeta = exp(LDAUtils.dirichletExpectation(topicsMatrix.toBreeze.toDenseMatrix.t).t)
+    val docConcentrationBrz = this.docConcentration.toBreeze
+    val gammaShape = this.gammaShape
+    val k = this.k
+
+    documents.map { case (id: Long, termCounts: Vector) =>
+      if (termCounts.numNonzeros == 0) {
+         (id, Vectors.zeros(k))
+      } else {
+        val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+          termCounts,
+          expElogbeta,
+          docConcentrationBrz,
+          gammaShape,
+          k)
+        (id, Vectors.dense(normalize(gamma, 1.0).toArray))
+      }
+    }
+  }
+
 }
 
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 4b90fbdf0ce7e..9dbec41efeada 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -394,7 +394,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     val gammaShape = this.gammaShape
 
     val stats: RDD[(BDM[Double], List[BDV[Double]])] = batch.mapPartitions { docs =>
-      val nonEmptyDocs = docs.filter(_._2.numActives > 0)
+      val nonEmptyDocs = docs.filter(_._2.numNonzeros > 0)
 
       val stat = BDM.zeros[Double](k, vocabSize)
       var gammaPart = List[BDV[Double]]()
@@ -461,7 +461,8 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
 private[clustering] object OnlineLDAOptimizer {
   /**
    * Uses variational inference to infer the topic distribution `gammad` given the term counts
-   * for a document. `termCounts` must be non-empty, otherwise Breeze will throw a BLAS error.
+   * for a document. `termCounts` must contain at least one non-zero entry, otherwise Breeze will
+   * throw a BLAS error.
    *
    * An optimization (Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001)
    * avoids explicit computation of variational parameter `phi`.
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 61d2edfd9fb5f..d74482d3a7598 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -242,6 +242,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     val alpha = 0.01
     val eta = 0.01
     val gammaShape = 100
+    // obtained from LDA model trained in gensim, see below
     val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
       1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
       0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
@@ -281,6 +282,68 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(ldaModel.logPerplexity(docs) ~== -3.690D relTol 1E-3D)
   }
 
+  test("LocalLDAModel predict") {
+    val k = 2
+    val vocabSize = 6
+    val alpha = 0.01
+    val eta = 0.01
+    val gammaShape = 100
+    // obtained from LDA model trained in gensim, see below
+    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
+      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
+      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+
+    def toydata: Array[(Long, Vector)] = Array(
+      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
+      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
+      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
+      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
+      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
+      Vectors.sparse(6, Array(4, 5), Array(1, 1))
+    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
+    val docs = sc.parallelize(toydata)
+
+    val ldaModel: LocalLDAModel = new LocalLDAModel(
+      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+
+    /* Verify results using gensim:
+       import numpy as np
+       from gensim import models
+       corpus = [
+          [(0, 1.0), (1, 1.0)],
+          [(1, 1.0), (2, 1.0)],
+          [(0, 1.0), (2, 1.0)],
+          [(3, 1.0), (4, 1.0)],
+          [(3, 1.0), (5, 1.0)],
+          [(4, 1.0), (5, 1.0)]]
+       np.random.seed(2345)
+       lda = models.ldamodel.LdaModel(
+          corpus=corpus, alpha=0.01, eta=0.01, num_topics=2, update_every=0, passes=100,
+          decay=0.51, offset=1024)
+       print(list(lda.get_document_topics(corpus)))
+       > [[(0, 0.99504950495049516)], [(0, 0.99504950495049516)],
+       > [(0, 0.99504950495049516)], [(1, 0.99504950495049516)],
+       > [(1, 0.99504950495049516)], [(1, 0.99504950495049516)]]
+     */
+
+    val expectedPredictions = List(
+      (0, 0.99504), (0, 0.99504),
+      (0, 0.99504), (1, 0.99504),
+      (1, 0.99504), (1, 0.99504))
+
+    val actualPredictions = ldaModel.topicDistributions(docs).map { case (id, topics) =>
+        // convert results to expectedPredictions format, which only has highest probability topic
+        val topicsBz = topics.toBreeze.toDenseVector
+        (id, (argmax(topicsBz), max(topicsBz)))
+      }.sortByKey()
+      .values
+      .collect()
+
+    expectedPredictions.zip(actualPredictions).forall { case (expected, actual) =>
+      expected._1 === actual._1 && (expected._2 ~== actual._2 relTol 1E-3D)
+    }
+  }
+
   test("OnlineLDAOptimizer with asymmetric prior") {
     def toydata: Array[(Long, Vector)] = Array(
       Vectors.sparse(6, Array(0, 1), Array(1, 1)),

From 1abf7dc16ca1ba1777fe874c8b81fe6f2b0a6de5 Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Thu, 30 Jul 2015 13:21:46 -0700
Subject: [PATCH 486/549] [SPARK-8186] [SPARK-8187] [SPARK-8194] [SPARK-8198]
 [SPARK-9133] [SPARK-9290] [SQL] functions: date_add, date_sub, add_months,
 months_between, time-interval calculation

This PR is based on #7589 , thanks to adrian-wang

Added SQL function date_add, date_sub, add_months, month_between, also add a rule for
add/subtract of date/timestamp and interval.

Closes #7589

cc rxin

Author: Daoyuan Wang <daoyuan.wang@intel.com>
Author: Davies Liu <davies@databricks.com>

Closes #7754 from davies/date_add and squashes the following commits:

e8c633a [Davies Liu] Merge branch 'master' of github.com:apache/spark into date_add
9e8e085 [Davies Liu] Merge branch 'master' of github.com:apache/spark into date_add
6224ce4 [Davies Liu] fix conclict
bd18cd4 [Davies Liu] Merge branch 'master' of github.com:apache/spark into date_add
e47ff2c [Davies Liu] add python api, fix date functions
01943d0 [Davies Liu] Merge branch 'master' into date_add
522e91a [Daoyuan Wang] fix
e8a639a [Daoyuan Wang] fix
42df486 [Daoyuan Wang] fix style
87c4b77 [Daoyuan Wang] function add_months, months_between and some fixes
1a68e03 [Daoyuan Wang] poc of time interval calculation
c506661 [Daoyuan Wang] function date_add , date_sub
---
 python/pyspark/sql/functions.py               |  76 ++++++-
 .../catalyst/analysis/FunctionRegistry.scala  |   4 +
 .../catalyst/analysis/HiveTypeCoercion.scala  |  22 ++
 .../expressions/datetimeFunctions.scala       | 155 ++++++++++++-
 .../sql/catalyst/util/DateTimeUtils.scala     | 139 ++++++++++++
 .../analysis/HiveTypeCoercionSuite.scala      |  30 +++
 .../expressions/DateExpressionsSuite.scala    | 176 +++++++++------
 .../catalyst/util/DateTimeUtilsSuite.scala    | 205 +++++++++++-------
 .../org/apache/spark/sql/functions.scala      |  29 +++
 .../apache/spark/sql/DateFunctionsSuite.scala | 117 ++++++++++
 10 files changed, 791 insertions(+), 162 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index d930f7db25d25..a7295e25f0aa5 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -59,7 +59,7 @@
 __all__ += ['lag', 'lead', 'ntile']
 
 __all__ += [
-    'date_format',
+    'date_format', 'date_add', 'date_sub', 'add_months', 'months_between',
     'year', 'quarter', 'month', 'hour', 'minute', 'second',
     'dayofmonth', 'dayofyear', 'weekofyear']
 
@@ -716,7 +716,7 @@ def date_format(dateCol, format):
     [Row(date=u'04/08/2015')]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.date_format(dateCol, format))
+    return Column(sc._jvm.functions.date_format(_to_java_column(dateCol), format))
 
 
 @since(1.5)
@@ -729,7 +729,7 @@ def year(col):
     [Row(year=2015)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.year(col))
+    return Column(sc._jvm.functions.year(_to_java_column(col)))
 
 
 @since(1.5)
@@ -742,7 +742,7 @@ def quarter(col):
     [Row(quarter=2)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.quarter(col))
+    return Column(sc._jvm.functions.quarter(_to_java_column(col)))
 
 
 @since(1.5)
@@ -755,7 +755,7 @@ def month(col):
     [Row(month=4)]
    """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.month(col))
+    return Column(sc._jvm.functions.month(_to_java_column(col)))
 
 
 @since(1.5)
@@ -768,7 +768,7 @@ def dayofmonth(col):
     [Row(day=8)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.dayofmonth(col))
+    return Column(sc._jvm.functions.dayofmonth(_to_java_column(col)))
 
 
 @since(1.5)
@@ -781,7 +781,7 @@ def dayofyear(col):
     [Row(day=98)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.dayofyear(col))
+    return Column(sc._jvm.functions.dayofyear(_to_java_column(col)))
 
 
 @since(1.5)
@@ -794,7 +794,7 @@ def hour(col):
     [Row(hour=13)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.hour(col))
+    return Column(sc._jvm.functions.hour(_to_java_column(col)))
 
 
 @since(1.5)
@@ -807,7 +807,7 @@ def minute(col):
     [Row(minute=8)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.minute(col))
+    return Column(sc._jvm.functions.minute(_to_java_column(col)))
 
 
 @since(1.5)
@@ -820,7 +820,7 @@ def second(col):
     [Row(second=15)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.second(col))
+    return Column(sc._jvm.functions.second(_to_java_column(col)))
 
 
 @since(1.5)
@@ -829,11 +829,63 @@ def weekofyear(col):
     Extract the week number of a given date as integer.
 
     >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
-    >>> df.select(weekofyear('a').alias('week')).collect()
+    >>> df.select(weekofyear(df.a).alias('week')).collect()
     [Row(week=15)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.weekofyear(col))
+    return Column(sc._jvm.functions.weekofyear(_to_java_column(col)))
+
+
+@since(1.5)
+def date_add(start, days):
+    """
+    Returns the date that is `days` days after `start`
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['d'])
+    >>> df.select(date_add(df.d, 1).alias('d')).collect()
+    [Row(d=datetime.date(2015, 4, 9))]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.date_add(_to_java_column(start), days))
+
+
+@since(1.5)
+def date_sub(start, days):
+    """
+    Returns the date that is `days` days before `start`
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['d'])
+    >>> df.select(date_sub(df.d, 1).alias('d')).collect()
+    [Row(d=datetime.date(2015, 4, 7))]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.date_sub(_to_java_column(start), days))
+
+
+@since(1.5)
+def add_months(start, months):
+    """
+    Returns the date that is `months` months after `start`
+
+    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['d'])
+    >>> df.select(add_months(df.d, 1).alias('d')).collect()
+    [Row(d=datetime.date(2015, 5, 8))]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.add_months(_to_java_column(start), months))
+
+
+@since(1.5)
+def months_between(date1, date2):
+    """
+    Returns the number of months between date1 and date2.
+
+    >>> df = sqlContext.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['t', 'd'])
+    >>> df.select(months_between(df.t, df.d).alias('months')).collect()
+    [Row(months=3.9495967...)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.months_between(_to_java_column(date1), _to_java_column(date2)))
 
 
 @since(1.5)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index d663f12bc6d0d..6c7c481fab8db 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -205,9 +205,12 @@ object FunctionRegistry {
     expression[Upper]("upper"),
 
     // datetime functions
+    expression[AddMonths]("add_months"),
     expression[CurrentDate]("current_date"),
     expression[CurrentTimestamp]("current_timestamp"),
+    expression[DateAdd]("date_add"),
     expression[DateFormatClass]("date_format"),
+    expression[DateSub]("date_sub"),
     expression[DayOfMonth]("day"),
     expression[DayOfYear]("dayofyear"),
     expression[DayOfMonth]("dayofmonth"),
@@ -216,6 +219,7 @@ object FunctionRegistry {
     expression[LastDay]("last_day"),
     expression[Minute]("minute"),
     expression[Month]("month"),
+    expression[MonthsBetween]("months_between"),
     expression[NextDay]("next_day"),
     expression[Quarter]("quarter"),
     expression[Second]("second"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index ecc48986e35d8..603afc4032a37 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -47,6 +47,7 @@ object HiveTypeCoercion {
       Division ::
       PropagateTypes ::
       ImplicitTypeCasts ::
+      DateTimeOperations ::
       Nil
 
   // See https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types.
@@ -638,6 +639,27 @@ object HiveTypeCoercion {
     }
   }
 
+  /**
+   * Turns Add/Subtract of DateType/TimestampType/StringType and CalendarIntervalType
+   * to TimeAdd/TimeSub
+   */
+  object DateTimeOperations extends Rule[LogicalPlan] {
+
+    private val acceptedTypes = Seq(DateType, TimestampType, StringType)
+
+    def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+      // Skip nodes who's children have not been resolved yet.
+      case e if !e.childrenResolved => e
+
+      case Add(l @ CalendarIntervalType(), r) if acceptedTypes.contains(r.dataType) =>
+        Cast(TimeAdd(r, l), r.dataType)
+      case Add(l, r @ CalendarIntervalType()) if acceptedTypes.contains(l.dataType) =>
+        Cast(TimeAdd(l, r), l.dataType)
+      case Subtract(l, r @ CalendarIntervalType()) if acceptedTypes.contains(l.dataType) =>
+        Cast(TimeSub(l, r), l.dataType)
+    }
+  }
+
   /**
    * Casts types according to the expected input types for [[Expression]]s.
    */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index a5e6249e438d2..9795673ee0664 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
 
 import scala.util.Try
 
@@ -63,6 +63,53 @@ case class CurrentTimestamp() extends LeafExpression with CodegenFallback {
   }
 }
 
+/**
+ * Adds a number of days to startdate.
+ */
+case class DateAdd(startDate: Expression, days: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+
+  override def left: Expression = startDate
+  override def right: Expression = days
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType, IntegerType)
+
+  override def dataType: DataType = DateType
+
+  override def nullSafeEval(start: Any, d: Any): Any = {
+    start.asInstanceOf[Int] + d.asInstanceOf[Int]
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (sd, d) => {
+      s"""${ev.primitive} = $sd + $d;"""
+    })
+  }
+}
+
+/**
+ * Subtracts a number of days to startdate.
+ */
+case class DateSub(startDate: Expression, days: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+  override def left: Expression = startDate
+  override def right: Expression = days
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType, IntegerType)
+
+  override def dataType: DataType = DateType
+
+  override def nullSafeEval(start: Any, d: Any): Any = {
+    start.asInstanceOf[Int] - d.asInstanceOf[Int]
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (sd, d) => {
+      s"""${ev.primitive} = $sd - $d;"""
+    })
+  }
+}
+
 case class Hour(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
@@ -543,3 +590,109 @@ case class NextDay(startDate: Expression, dayOfWeek: Expression)
 
   override def prettyName: String = "next_day"
 }
+
+/**
+ * Adds an interval to timestamp.
+ */
+case class TimeAdd(start: Expression, interval: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+
+  override def left: Expression = start
+  override def right: Expression = interval
+
+  override def toString: String = s"$left + $right"
+  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, CalendarIntervalType)
+
+  override def dataType: DataType = TimestampType
+
+  override def nullSafeEval(start: Any, interval: Any): Any = {
+    val itvl = interval.asInstanceOf[CalendarInterval]
+    DateTimeUtils.timestampAddInterval(
+      start.asInstanceOf[Long], itvl.months, itvl.microseconds)
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (sd, i) => {
+      s"""$dtu.timestampAddInterval($sd, $i.months, $i.microseconds)"""
+    })
+  }
+}
+
+/**
+ * Subtracts an interval from timestamp.
+ */
+case class TimeSub(start: Expression, interval: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+
+  override def left: Expression = start
+  override def right: Expression = interval
+
+  override def toString: String = s"$left - $right"
+  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, CalendarIntervalType)
+
+  override def dataType: DataType = TimestampType
+
+  override def nullSafeEval(start: Any, interval: Any): Any = {
+    val itvl = interval.asInstanceOf[CalendarInterval]
+    DateTimeUtils.timestampAddInterval(
+      start.asInstanceOf[Long], 0 - itvl.months, 0 - itvl.microseconds)
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (sd, i) => {
+      s"""$dtu.timestampAddInterval($sd, 0 - $i.months, 0 - $i.microseconds)"""
+    })
+  }
+}
+
+/**
+ * Returns the date that is num_months after start_date.
+ */
+case class AddMonths(startDate: Expression, numMonths: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+
+  override def left: Expression = startDate
+  override def right: Expression = numMonths
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType, IntegerType)
+
+  override def dataType: DataType = DateType
+
+  override def nullSafeEval(start: Any, months: Any): Any = {
+    DateTimeUtils.dateAddMonths(start.asInstanceOf[Int], months.asInstanceOf[Int])
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (sd, m) => {
+      s"""$dtu.dateAddMonths($sd, $m)"""
+    })
+  }
+}
+
+/**
+ * Returns number of months between dates date1 and date2.
+ */
+case class MonthsBetween(date1: Expression, date2: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+
+  override def left: Expression = date1
+  override def right: Expression = date2
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, TimestampType)
+
+  override def dataType: DataType = DoubleType
+
+  override def nullSafeEval(t1: Any, t2: Any): Any = {
+    DateTimeUtils.monthsBetween(t1.asInstanceOf[Long], t2.asInstanceOf[Long])
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+    defineCodeGen(ctx, ev, (l, r) => {
+      s"""$dtu.monthsBetween($l, $r)"""
+    })
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 93966a503c27c..53abdf6618eac 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -45,6 +45,7 @@ object DateTimeUtils {
   final val to2001 = -11323
 
   // this is year -17999, calculation: 50 * daysIn400Year
+  final val YearZero = -17999
   final val toYearZero = to2001 + 7304850
 
   @transient lazy val defaultTimeZone = TimeZone.getDefault
@@ -575,6 +576,144 @@ object DateTimeUtils {
   }
 
   /**
+   * The number of days for each month (not leap year)
+   */
+  private val monthDays = Array(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
+
+  /**
+   * Returns the date value for the first day of the given month.
+   * The month is expressed in months since year zero (17999 BC), starting from 0.
+   */
+  private def firstDayOfMonth(absoluteMonth: Int): Int = {
+    val absoluteYear = absoluteMonth / 12
+    var monthInYear = absoluteMonth - absoluteYear * 12
+    var date = getDateFromYear(absoluteYear)
+    if (monthInYear >= 2 && isLeapYear(absoluteYear + YearZero)) {
+      date += 1
+    }
+    while (monthInYear > 0) {
+      date += monthDays(monthInYear - 1)
+      monthInYear -= 1
+    }
+    date
+  }
+
+  /**
+   * Returns the date value for January 1 of the given year.
+   * The year is expressed in years since year zero (17999 BC), starting from 0.
+   */
+  private def getDateFromYear(absoluteYear: Int): Int = {
+    val absoluteDays = (absoluteYear * 365 + absoluteYear / 400 - absoluteYear / 100
+      + absoluteYear / 4)
+    absoluteDays - toYearZero
+  }
+
+  /**
+   * Add date and year-month interval.
+   * Returns a date value, expressed in days since 1.1.1970.
+   */
+  def dateAddMonths(days: Int, months: Int): Int = {
+    val absoluteMonth = (getYear(days) - YearZero) * 12 + getMonth(days) - 1 + months
+    val currentMonthInYear = absoluteMonth % 12
+    val currentYear = absoluteMonth / 12
+    val leapDay = if (currentMonthInYear == 1 && isLeapYear(currentYear + YearZero)) 1 else 0
+    val lastDayOfMonth = monthDays(currentMonthInYear) + leapDay
+
+    val dayOfMonth = getDayOfMonth(days)
+    val currentDayInMonth = if (getDayOfMonth(days + 1) == 1 || dayOfMonth >= lastDayOfMonth) {
+      // last day of the month
+      lastDayOfMonth
+    } else {
+      dayOfMonth
+    }
+    firstDayOfMonth(absoluteMonth) + currentDayInMonth - 1
+  }
+
+  /**
+   * Add timestamp and full interval.
+   * Returns a timestamp value, expressed in microseconds since 1.1.1970 00:00:00.
+   */
+  def timestampAddInterval(start: Long, months: Int, microseconds: Long): Long = {
+    val days = millisToDays(start / 1000L)
+    val newDays = dateAddMonths(days, months)
+    daysToMillis(newDays) * 1000L + start - daysToMillis(days) * 1000L + microseconds
+  }
+
+  /**
+   * Returns the last dayInMonth in the month it belongs to. The date is expressed
+   * in days since 1.1.1970. the return value starts from 1.
+   */
+  private def getLastDayInMonthOfMonth(date: Int): Int = {
+    var (year, dayInYear) = getYearAndDayInYear(date)
+    if (isLeapYear(year)) {
+      if (dayInYear > 31 && dayInYear <= 60) {
+        return 29
+      } else if (dayInYear > 60) {
+        dayInYear = dayInYear - 1
+      }
+    }
+    if (dayInYear <= 31) {
+      31
+    } else if (dayInYear <= 59) {
+      28
+    } else if (dayInYear <= 90) {
+      31
+    } else if (dayInYear <= 120) {
+      30
+    } else if (dayInYear <= 151) {
+      31
+    } else if (dayInYear <= 181) {
+      30
+    } else if (dayInYear <= 212) {
+      31
+    } else if (dayInYear <= 243) {
+      31
+    } else if (dayInYear <= 273) {
+      30
+    } else if (dayInYear <= 304) {
+      31
+    } else if (dayInYear <= 334) {
+      30
+    } else {
+      31
+    }
+  }
+
+  /**
+   * Returns number of months between time1 and time2. time1 and time2 are expressed in
+   * microseconds since 1.1.1970.
+   *
+   * If time1 and time2 having the same day of month, or both are the last day of month,
+   * it returns an integer (time under a day will be ignored).
+   *
+   * Otherwise, the difference is calculated based on 31 days per month, and rounding to
+   * 8 digits.
+   */
+  def monthsBetween(time1: Long, time2: Long): Double = {
+    val millis1 = time1 / 1000L
+    val millis2 = time2 / 1000L
+    val date1 = millisToDays(millis1)
+    val date2 = millisToDays(millis2)
+    // TODO(davies): get year, month, dayOfMonth from single function
+    val dayInMonth1 = getDayOfMonth(date1)
+    val dayInMonth2 = getDayOfMonth(date2)
+    val months1 = getYear(date1) * 12 + getMonth(date1)
+    val months2 = getYear(date2) * 12 + getMonth(date2)
+
+    if (dayInMonth1 == dayInMonth2 || (dayInMonth1 == getLastDayInMonthOfMonth(date1)
+      && dayInMonth2 == getLastDayInMonthOfMonth(date2))) {
+      return (months1 - months2).toDouble
+    }
+    // milliseconds is enough for 8 digits precision on the right side
+    val timeInDay1 = millis1 - daysToMillis(date1)
+    val timeInDay2 = millis2 - daysToMillis(date2)
+    val timesBetween = (timeInDay1 - timeInDay2).toDouble / MILLIS_PER_DAY
+    val diff = (months1 - months2).toDouble + (dayInMonth1 - dayInMonth2 + timesBetween) / 31.0
+    // rounding to 8 digits
+    math.round(diff * 1e8) / 1e8
+  }
+
+  /*
    * Returns day of week from String. Starting from Thursday, marked as 0.
    * (Because 1970-01-01 is Thursday).
    */
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
index 1d9ee5ddf3a5a..70608771dd110 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala
@@ -17,12 +17,15 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import java.sql.Timestamp
+
 import org.apache.spark.sql.catalyst.plans.PlanTest
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.CalendarInterval
 
 class HiveTypeCoercionSuite extends PlanTest {
 
@@ -400,6 +403,33 @@ class HiveTypeCoercionSuite extends PlanTest {
     }
   }
 
+  test("rule for date/timestamp operations") {
+    val dateTimeOperations = HiveTypeCoercion.DateTimeOperations
+    val date = Literal(new java.sql.Date(0L))
+    val timestamp = Literal(new Timestamp(0L))
+    val interval = Literal(new CalendarInterval(0, 0))
+    val str = Literal("2015-01-01")
+
+    ruleTest(dateTimeOperations, Add(date, interval), Cast(TimeAdd(date, interval), DateType))
+    ruleTest(dateTimeOperations, Add(interval, date), Cast(TimeAdd(date, interval), DateType))
+    ruleTest(dateTimeOperations, Add(timestamp, interval),
+      Cast(TimeAdd(timestamp, interval), TimestampType))
+    ruleTest(dateTimeOperations, Add(interval, timestamp),
+      Cast(TimeAdd(timestamp, interval), TimestampType))
+    ruleTest(dateTimeOperations, Add(str, interval), Cast(TimeAdd(str, interval), StringType))
+    ruleTest(dateTimeOperations, Add(interval, str), Cast(TimeAdd(str, interval), StringType))
+
+    ruleTest(dateTimeOperations, Subtract(date, interval), Cast(TimeSub(date, interval), DateType))
+    ruleTest(dateTimeOperations, Subtract(timestamp, interval),
+      Cast(TimeSub(timestamp, interval), TimestampType))
+    ruleTest(dateTimeOperations, Subtract(str, interval), Cast(TimeSub(str, interval), StringType))
+
+    // interval operations should not be effected
+    ruleTest(dateTimeOperations, Add(interval, interval), Add(interval, interval))
+    ruleTest(dateTimeOperations, Subtract(interval, interval), Subtract(interval, interval))
+  }
+
+
   /**
    * There are rules that need to not fire before child expressions get resolved.
    * We use this test to make sure those rules do not fire early.
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index e1387f945ffa4..fd1d6c1d25497 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -22,8 +22,8 @@ import java.text.SimpleDateFormat
 import java.util.Calendar
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.unsafe.types.CalendarInterval
 import org.apache.spark.sql.types._
 
 class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -48,56 +48,8 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("DayOfYear") {
     val sdfDay = new SimpleDateFormat("D")
-    (2002 to 2004).foreach { y =>
-      (0 to 11).foreach { m =>
-        (0 to 5).foreach { i =>
-          val c = Calendar.getInstance()
-          c.set(y, m, 28, 0, 0, 0)
-          c.add(Calendar.DATE, i)
-          checkEvaluation(DayOfYear(Literal(new Date(c.getTimeInMillis))),
-            sdfDay.format(c.getTime).toInt)
-        }
-      }
-    }
-
     (1998 to 2002).foreach { y =>
-      (0 to 11).foreach { m =>
-        (0 to 5).foreach { i =>
-          val c = Calendar.getInstance()
-          c.set(y, m, 28, 0, 0, 0)
-          c.add(Calendar.DATE, i)
-          checkEvaluation(DayOfYear(Literal(new Date(c.getTimeInMillis))),
-            sdfDay.format(c.getTime).toInt)
-        }
-      }
-    }
-
-    (1969 to 1970).foreach { y =>
-      (0 to 11).foreach { m =>
-        (0 to 5).foreach { i =>
-          val c = Calendar.getInstance()
-          c.set(y, m, 28, 0, 0, 0)
-          c.add(Calendar.DATE, i)
-          checkEvaluation(DayOfYear(Literal(new Date(c.getTimeInMillis))),
-            sdfDay.format(c.getTime).toInt)
-        }
-      }
-    }
-
-    (2402 to 2404).foreach { y =>
-      (0 to 11).foreach { m =>
-        (0 to 5).foreach { i =>
-          val c = Calendar.getInstance()
-          c.set(y, m, 28, 0, 0, 0)
-          c.add(Calendar.DATE, i)
-          checkEvaluation(DayOfYear(Literal(new Date(c.getTimeInMillis))),
-            sdfDay.format(c.getTime).toInt)
-        }
-      }
-    }
-
-    (2398 to 2402).foreach { y =>
-      (0 to 11).foreach { m =>
+      (0 to 3).foreach { m =>
         (0 to 5).foreach { i =>
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
@@ -117,7 +69,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Year(Cast(Literal(ts), DateType)), 2013)
 
     val c = Calendar.getInstance()
-    (2000 to 2010).foreach { y =>
+    (2000 to 2002).foreach { y =>
       (0 to 11 by 11).foreach { m =>
         c.set(y, m, 28)
         (0 to 5 * 24).foreach { i =>
@@ -155,20 +107,8 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Month(Cast(Literal(ts), DateType)), 11)
 
     (2003 to 2004).foreach { y =>
-      (0 to 11).foreach { m =>
-        (0 to 5 * 24).foreach { i =>
-          val c = Calendar.getInstance()
-          c.set(y, m, 28, 0, 0, 0)
-          c.add(Calendar.HOUR_OF_DAY, i)
-          checkEvaluation(Month(Literal(new Date(c.getTimeInMillis))),
-            c.get(Calendar.MONTH) + 1)
-        }
-      }
-    }
-
-    (1999 to 2000).foreach { y =>
-      (0 to 11).foreach { m =>
-        (0 to 5 * 24).foreach { i =>
+      (0 to 3).foreach { m =>
+        (0 to 2 * 24).foreach { i =>
           val c = Calendar.getInstance()
           c.set(y, m, 28, 0, 0, 0)
           c.add(Calendar.HOUR_OF_DAY, i)
@@ -262,6 +202,112 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     }
   }
 
+  test("date_add") {
+    checkEvaluation(
+      DateAdd(Literal(Date.valueOf("2016-02-28")), Literal(1)),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2016-02-29")))
+    checkEvaluation(
+      DateAdd(Literal(Date.valueOf("2016-02-28")), Literal(-365)),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2015-02-28")))
+    checkEvaluation(DateAdd(Literal.create(null, DateType), Literal(1)), null)
+    checkEvaluation(DateAdd(Literal(Date.valueOf("2016-02-28")), Literal.create(null, IntegerType)),
+      null)
+    checkEvaluation(DateAdd(Literal.create(null, DateType), Literal.create(null, IntegerType)),
+      null)
+  }
+
+  test("date_sub") {
+    checkEvaluation(
+      DateSub(Literal(Date.valueOf("2015-01-01")), Literal(1)),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2014-12-31")))
+    checkEvaluation(
+      DateSub(Literal(Date.valueOf("2015-01-01")), Literal(-1)),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2015-01-02")))
+    checkEvaluation(DateSub(Literal.create(null, DateType), Literal(1)), null)
+    checkEvaluation(DateSub(Literal(Date.valueOf("2016-02-28")), Literal.create(null, IntegerType)),
+      null)
+    checkEvaluation(DateSub(Literal.create(null, DateType), Literal.create(null, IntegerType)),
+      null)
+  }
+
+  test("time_add") {
+    checkEvaluation(
+      TimeAdd(Literal(Timestamp.valueOf("2016-01-29 10:00:00")),
+        Literal(new CalendarInterval(1, 123000L))),
+      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2016-02-29 10:00:00.123")))
+
+    checkEvaluation(
+      TimeAdd(Literal.create(null, TimestampType), Literal(new CalendarInterval(1, 123000L))),
+      null)
+    checkEvaluation(
+      TimeAdd(Literal(Timestamp.valueOf("2016-01-29 10:00:00")),
+        Literal.create(null, CalendarIntervalType)),
+      null)
+    checkEvaluation(
+      TimeAdd(Literal.create(null, TimestampType), Literal.create(null, CalendarIntervalType)),
+      null)
+  }
+
+  test("time_sub") {
+    checkEvaluation(
+      TimeSub(Literal(Timestamp.valueOf("2016-03-31 10:00:00")),
+        Literal(new CalendarInterval(1, 0))),
+      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2016-02-29 10:00:00")))
+    checkEvaluation(
+      TimeSub(
+        Literal(Timestamp.valueOf("2016-03-30 00:00:01")),
+        Literal(new CalendarInterval(1, 2000000.toLong))),
+      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2016-02-28 23:59:59")))
+
+    checkEvaluation(
+      TimeSub(Literal.create(null, TimestampType), Literal(new CalendarInterval(1, 123000L))),
+      null)
+    checkEvaluation(
+      TimeSub(Literal(Timestamp.valueOf("2016-01-29 10:00:00")),
+        Literal.create(null, CalendarIntervalType)),
+      null)
+    checkEvaluation(
+      TimeSub(Literal.create(null, TimestampType), Literal.create(null, CalendarIntervalType)),
+      null)
+  }
+
+  test("add_months") {
+    checkEvaluation(AddMonths(Literal(Date.valueOf("2015-01-30")), Literal(1)),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2015-02-28")))
+    checkEvaluation(AddMonths(Literal(Date.valueOf("2016-03-30")), Literal(-1)),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2016-02-29")))
+    checkEvaluation(
+      AddMonths(Literal(Date.valueOf("2015-01-30")), Literal.create(null, IntegerType)),
+      null)
+    checkEvaluation(AddMonths(Literal.create(null, DateType), Literal(1)), null)
+    checkEvaluation(AddMonths(Literal.create(null, DateType), Literal.create(null, IntegerType)),
+      null)
+  }
+
+  test("months_between") {
+    checkEvaluation(
+      MonthsBetween(Literal(Timestamp.valueOf("1997-02-28 10:30:00")),
+        Literal(Timestamp.valueOf("1996-10-30 00:00:00"))),
+      3.94959677)
+    checkEvaluation(
+      MonthsBetween(Literal(Timestamp.valueOf("2015-01-30 11:52:00")),
+        Literal(Timestamp.valueOf("2015-01-30 11:50:00"))),
+      0.0)
+    checkEvaluation(
+      MonthsBetween(Literal(Timestamp.valueOf("2015-01-31 00:00:00")),
+        Literal(Timestamp.valueOf("2015-03-31 22:00:00"))),
+      -2.0)
+    checkEvaluation(
+      MonthsBetween(Literal(Timestamp.valueOf("2015-03-31 22:00:00")),
+        Literal(Timestamp.valueOf("2015-02-28 00:00:00"))),
+      1.0)
+    val t = Literal(Timestamp.valueOf("2015-03-31 22:00:00"))
+    val tnull = Literal.create(null, TimestampType)
+    checkEvaluation(MonthsBetween(t, tnull), null)
+    checkEvaluation(MonthsBetween(tnull, t), null)
+    checkEvaluation(MonthsBetween(tnull, tnull), null)
+  }
+
   test("last_day") {
     checkEvaluation(LastDay(Literal(Date.valueOf("2015-02-28"))), Date.valueOf("2015-02-28"))
     checkEvaluation(LastDay(Literal(Date.valueOf("2015-03-27"))), Date.valueOf("2015-03-31"))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index fab9eb9cd4c9f..60d2bcfe13757 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -19,47 +19,48 @@ package org.apache.spark.sql.catalyst.util
 
 import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
-import java.util.{TimeZone, Calendar}
+import java.util.{Calendar, TimeZone}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.sql.catalyst.util.DateTimeUtils._
 
 class DateTimeUtilsSuite extends SparkFunSuite {
 
   private[this] def getInUTCDays(timestamp: Long): Int = {
     val tz = TimeZone.getDefault
-    ((timestamp + tz.getOffset(timestamp)) / DateTimeUtils.MILLIS_PER_DAY).toInt
+    ((timestamp + tz.getOffset(timestamp)) / MILLIS_PER_DAY).toInt
   }
 
   test("timestamp and us") {
     val now = new Timestamp(System.currentTimeMillis())
     now.setNanos(1000)
-    val ns = DateTimeUtils.fromJavaTimestamp(now)
+    val ns = fromJavaTimestamp(now)
     assert(ns % 1000000L === 1)
-    assert(DateTimeUtils.toJavaTimestamp(ns) === now)
+    assert(toJavaTimestamp(ns) === now)
 
     List(-111111111111L, -1L, 0, 1L, 111111111111L).foreach { t =>
-      val ts = DateTimeUtils.toJavaTimestamp(t)
-      assert(DateTimeUtils.fromJavaTimestamp(ts) === t)
-      assert(DateTimeUtils.toJavaTimestamp(DateTimeUtils.fromJavaTimestamp(ts)) === ts)
+      val ts = toJavaTimestamp(t)
+      assert(fromJavaTimestamp(ts) === t)
+      assert(toJavaTimestamp(fromJavaTimestamp(ts)) === ts)
     }
   }
 
   test("us and julian day") {
-    val (d, ns) = DateTimeUtils.toJulianDay(0)
-    assert(d === DateTimeUtils.JULIAN_DAY_OF_EPOCH)
-    assert(ns === DateTimeUtils.SECONDS_PER_DAY / 2 * DateTimeUtils.NANOS_PER_SECOND)
-    assert(DateTimeUtils.fromJulianDay(d, ns) == 0L)
+    val (d, ns) = toJulianDay(0)
+    assert(d === JULIAN_DAY_OF_EPOCH)
+    assert(ns === SECONDS_PER_DAY / 2 * NANOS_PER_SECOND)
+    assert(fromJulianDay(d, ns) == 0L)
 
     val t = new Timestamp(61394778610000L) // (2015, 6, 11, 10, 10, 10, 100)
-    val (d1, ns1) = DateTimeUtils.toJulianDay(DateTimeUtils.fromJavaTimestamp(t))
-    val t2 = DateTimeUtils.toJavaTimestamp(DateTimeUtils.fromJulianDay(d1, ns1))
+    val (d1, ns1) = toJulianDay(fromJavaTimestamp(t))
+    val t2 = toJavaTimestamp(fromJulianDay(d1, ns1))
     assert(t.equals(t2))
   }
 
   test("SPARK-6785: java date conversion before and after epoch") {
     def checkFromToJavaDate(d1: Date): Unit = {
-      val d2 = DateTimeUtils.toJavaDate(DateTimeUtils.fromJavaDate(d1))
+      val d2 = toJavaDate(fromJavaDate(d1))
       assert(d2.toString === d1.toString)
     }
 
@@ -95,157 +96,156 @@ class DateTimeUtilsSuite extends SparkFunSuite {
   }
 
   test("string to date") {
-    import DateTimeUtils.millisToDays
 
     var c = Calendar.getInstance()
     c.set(2015, 0, 28, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-01-28")).get ===
+    assert(stringToDate(UTF8String.fromString("2015-01-28")).get ===
       millisToDays(c.getTimeInMillis))
     c.set(2015, 0, 1, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015")).get ===
+    assert(stringToDate(UTF8String.fromString("2015")).get ===
       millisToDays(c.getTimeInMillis))
     c = Calendar.getInstance()
     c.set(2015, 2, 1, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03")).get ===
+    assert(stringToDate(UTF8String.fromString("2015-03")).get ===
       millisToDays(c.getTimeInMillis))
     c = Calendar.getInstance()
     c.set(2015, 2, 18, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18")).get ===
+    assert(stringToDate(UTF8String.fromString("2015-03-18")).get ===
       millisToDays(c.getTimeInMillis))
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18 ")).get ===
+    assert(stringToDate(UTF8String.fromString("2015-03-18 ")).get ===
       millisToDays(c.getTimeInMillis))
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18 123142")).get ===
+    assert(stringToDate(UTF8String.fromString("2015-03-18 123142")).get ===
       millisToDays(c.getTimeInMillis))
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18T123123")).get ===
+    assert(stringToDate(UTF8String.fromString("2015-03-18T123123")).get ===
       millisToDays(c.getTimeInMillis))
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18T")).get ===
+    assert(stringToDate(UTF8String.fromString("2015-03-18T")).get ===
       millisToDays(c.getTimeInMillis))
 
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-03-18X")).isEmpty)
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015/03/18")).isEmpty)
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015.03.18")).isEmpty)
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("20150318")).isEmpty)
-    assert(DateTimeUtils.stringToDate(UTF8String.fromString("2015-031-8")).isEmpty)
+    assert(stringToDate(UTF8String.fromString("2015-03-18X")).isEmpty)
+    assert(stringToDate(UTF8String.fromString("2015/03/18")).isEmpty)
+    assert(stringToDate(UTF8String.fromString("2015.03.18")).isEmpty)
+    assert(stringToDate(UTF8String.fromString("20150318")).isEmpty)
+    assert(stringToDate(UTF8String.fromString("2015-031-8")).isEmpty)
   }
 
   test("string to timestamp") {
     var c = Calendar.getInstance()
     c.set(1969, 11, 31, 16, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("1969-12-31 16:00:00")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("1969-12-31 16:00:00")).get ===
       c.getTimeInMillis * 1000)
     c.set(2015, 0, 1, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015")).get ===
       c.getTimeInMillis * 1000)
     c = Calendar.getInstance()
     c.set(2015, 2, 1, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015-03")).get ===
       c.getTimeInMillis * 1000)
     c = Calendar.getInstance()
     c.set(2015, 2, 18, 0, 0, 0)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18")).get ===
       c.getTimeInMillis * 1000)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18 ")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18 ")).get ===
       c.getTimeInMillis * 1000)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18T")).get ===
       c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance()
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18 12:03:17")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18 12:03:17")).get ===
       c.getTimeInMillis * 1000)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T12:03:17")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18T12:03:17")).get ===
       c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT-13:53"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17-13:53")).get === c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T12:03:17Z")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18T12:03:17Z")).get ===
       c.getTimeInMillis * 1000)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18 12:03:17Z")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18 12:03:17Z")).get ===
       c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT-01:00"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T12:03:17-1:0")).get ===
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18T12:03:17-1:0")).get ===
       c.getTimeInMillis * 1000)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17-01:00")).get === c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17+07:30")).get === c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:03"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17+07:03")).get === c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance()
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 123)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18 12:03:17.123")).get === c.getTimeInMillis * 1000)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17.123")).get === c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 456)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17.456Z")).get  === c.getTimeInMillis * 1000)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18 12:03:17.456Z")).get  === c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT-01:00"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 123)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17.123-1:0")).get  === c.getTimeInMillis * 1000)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17.123-01:00")).get ===  c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 123)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17.123+07:30")).get ===  c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 123)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17.123+07:30")).get === c.getTimeInMillis * 1000)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 123)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17.123121+7:30")).get ===
         c.getTimeInMillis * 1000 + 121)
 
     c = Calendar.getInstance(TimeZone.getTimeZone("GMT+07:30"))
     c.set(2015, 2, 18, 12, 3, 17)
     c.set(Calendar.MILLISECOND, 123)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03:17.12312+7:30")).get ===
         c.getTimeInMillis * 1000 + 120)
 
@@ -254,7 +254,7 @@ class DateTimeUtilsSuite extends SparkFunSuite {
     c.set(Calendar.MINUTE, 12)
     c.set(Calendar.SECOND, 15)
     c.set(Calendar.MILLISECOND, 0)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("18:12:15")).get ===
       c.getTimeInMillis * 1000)
 
@@ -263,7 +263,7 @@ class DateTimeUtilsSuite extends SparkFunSuite {
     c.set(Calendar.MINUTE, 12)
     c.set(Calendar.SECOND, 15)
     c.set(Calendar.MILLISECOND, 123)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("T18:12:15.12312+7:30")).get ===
       c.getTimeInMillis * 1000 + 120)
 
@@ -272,93 +272,130 @@ class DateTimeUtilsSuite extends SparkFunSuite {
     c.set(Calendar.MINUTE, 12)
     c.set(Calendar.SECOND, 15)
     c.set(Calendar.MILLISECOND, 123)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("18:12:15.12312+7:30")).get ===
       c.getTimeInMillis * 1000 + 120)
 
     c = Calendar.getInstance()
     c.set(2011, 4, 6, 7, 8, 9)
     c.set(Calendar.MILLISECOND, 100)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2011-05-06 07:08:09.1000")).get === c.getTimeInMillis * 1000)
 
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("238")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18 123142")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18T123123")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-18X")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015/03/18")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015.03.18")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("20150318")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-031-8")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(UTF8String.fromString("238")).isEmpty)
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18 123142")).isEmpty)
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18T123123")).isEmpty)
+    assert(stringToTimestamp(UTF8String.fromString("2015-03-18X")).isEmpty)
+    assert(stringToTimestamp(UTF8String.fromString("2015/03/18")).isEmpty)
+    assert(stringToTimestamp(UTF8String.fromString("2015.03.18")).isEmpty)
+    assert(stringToTimestamp(UTF8String.fromString("20150318")).isEmpty)
+    assert(stringToTimestamp(UTF8String.fromString("2015-031-8")).isEmpty)
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03.17-20:0")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03.17-0:70")).isEmpty)
-    assert(DateTimeUtils.stringToTimestamp(
+    assert(stringToTimestamp(
       UTF8String.fromString("2015-03-18T12:03.17-1:0:0")).isEmpty)
   }
 
   test("hours") {
     val c = Calendar.getInstance()
     c.set(2015, 2, 18, 13, 2, 11)
-    assert(DateTimeUtils.getHours(c.getTimeInMillis * 1000) === 13)
+    assert(getHours(c.getTimeInMillis * 1000) === 13)
     c.set(2015, 12, 8, 2, 7, 9)
-    assert(DateTimeUtils.getHours(c.getTimeInMillis * 1000) === 2)
+    assert(getHours(c.getTimeInMillis * 1000) === 2)
   }
 
   test("minutes") {
     val c = Calendar.getInstance()
     c.set(2015, 2, 18, 13, 2, 11)
-    assert(DateTimeUtils.getMinutes(c.getTimeInMillis * 1000) === 2)
+    assert(getMinutes(c.getTimeInMillis * 1000) === 2)
     c.set(2015, 2, 8, 2, 7, 9)
-    assert(DateTimeUtils.getMinutes(c.getTimeInMillis * 1000) === 7)
+    assert(getMinutes(c.getTimeInMillis * 1000) === 7)
   }
 
   test("seconds") {
     val c = Calendar.getInstance()
     c.set(2015, 2, 18, 13, 2, 11)
-    assert(DateTimeUtils.getSeconds(c.getTimeInMillis * 1000) === 11)
+    assert(getSeconds(c.getTimeInMillis * 1000) === 11)
     c.set(2015, 2, 8, 2, 7, 9)
-    assert(DateTimeUtils.getSeconds(c.getTimeInMillis * 1000) === 9)
+    assert(getSeconds(c.getTimeInMillis * 1000) === 9)
   }
 
   test("get day in year") {
     val c = Calendar.getInstance()
     c.set(2015, 2, 18, 0, 0, 0)
-    assert(DateTimeUtils.getDayInYear(getInUTCDays(c.getTimeInMillis)) === 77)
+    assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 77)
     c.set(2012, 2, 18, 0, 0, 0)
-    assert(DateTimeUtils.getDayInYear(getInUTCDays(c.getTimeInMillis)) === 78)
+    assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 78)
   }
 
   test("get year") {
     val c = Calendar.getInstance()
     c.set(2015, 2, 18, 0, 0, 0)
-    assert(DateTimeUtils.getYear(getInUTCDays(c.getTimeInMillis)) === 2015)
+    assert(getYear(getInUTCDays(c.getTimeInMillis)) === 2015)
     c.set(2012, 2, 18, 0, 0, 0)
-    assert(DateTimeUtils.getYear(getInUTCDays(c.getTimeInMillis)) === 2012)
+    assert(getYear(getInUTCDays(c.getTimeInMillis)) === 2012)
   }
 
   test("get quarter") {
     val c = Calendar.getInstance()
     c.set(2015, 2, 18, 0, 0, 0)
-    assert(DateTimeUtils.getQuarter(getInUTCDays(c.getTimeInMillis)) === 1)
+    assert(getQuarter(getInUTCDays(c.getTimeInMillis)) === 1)
     c.set(2012, 11, 18, 0, 0, 0)
-    assert(DateTimeUtils.getQuarter(getInUTCDays(c.getTimeInMillis)) === 4)
+    assert(getQuarter(getInUTCDays(c.getTimeInMillis)) === 4)
   }
 
   test("get month") {
     val c = Calendar.getInstance()
     c.set(2015, 2, 18, 0, 0, 0)
-    assert(DateTimeUtils.getMonth(getInUTCDays(c.getTimeInMillis)) === 3)
+    assert(getMonth(getInUTCDays(c.getTimeInMillis)) === 3)
     c.set(2012, 11, 18, 0, 0, 0)
-    assert(DateTimeUtils.getMonth(getInUTCDays(c.getTimeInMillis)) === 12)
+    assert(getMonth(getInUTCDays(c.getTimeInMillis)) === 12)
   }
 
   test("get day of month") {
     val c = Calendar.getInstance()
     c.set(2015, 2, 18, 0, 0, 0)
-    assert(DateTimeUtils.getDayOfMonth(getInUTCDays(c.getTimeInMillis)) === 18)
+    assert(getDayOfMonth(getInUTCDays(c.getTimeInMillis)) === 18)
     c.set(2012, 11, 24, 0, 0, 0)
-    assert(DateTimeUtils.getDayOfMonth(getInUTCDays(c.getTimeInMillis)) === 24)
+    assert(getDayOfMonth(getInUTCDays(c.getTimeInMillis)) === 24)
+  }
+
+  test("date add months") {
+    val c1 = Calendar.getInstance()
+    c1.set(1997, 1, 28, 10, 30, 0)
+    val days1 = millisToDays(c1.getTimeInMillis)
+    val c2 = Calendar.getInstance()
+    c2.set(2000, 1, 29)
+    assert(dateAddMonths(days1, 36) === millisToDays(c2.getTimeInMillis))
+    c2.set(1996, 0, 31)
+    assert(dateAddMonths(days1, -13) === millisToDays(c2.getTimeInMillis))
+  }
+
+  test("timestamp add months") {
+    val c1 = Calendar.getInstance()
+    c1.set(1997, 1, 28, 10, 30, 0)
+    c1.set(Calendar.MILLISECOND, 0)
+    val ts1 = c1.getTimeInMillis * 1000L
+    val c2 = Calendar.getInstance()
+    c2.set(2000, 1, 29, 10, 30, 0)
+    c2.set(Calendar.MILLISECOND, 123)
+    val ts2 = c2.getTimeInMillis * 1000L
+    assert(timestampAddInterval(ts1, 36, 123000) === ts2)
+  }
+
+  test("monthsBetween") {
+    val c1 = Calendar.getInstance()
+    c1.set(1997, 1, 28, 10, 30, 0)
+    val c2 = Calendar.getInstance()
+    c2.set(1996, 9, 30, 0, 0, 0)
+    assert(monthsBetween(c1.getTimeInMillis * 1000L, c2.getTimeInMillis * 1000L) === 3.94959677)
+    c2.set(2000, 1, 28, 0, 0, 0)
+    assert(monthsBetween(c1.getTimeInMillis * 1000L, c2.getTimeInMillis * 1000L) === -36)
+    c2.set(2000, 1, 29, 0, 0, 0)
+    assert(monthsBetween(c1.getTimeInMillis * 1000L, c2.getTimeInMillis * 1000L) === -36)
+    c2.set(1996, 2, 31, 0, 0, 0)
+    assert(monthsBetween(c1.getTimeInMillis * 1000L, c2.getTimeInMillis * 1000L) === 11)
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 3f440e062eb96..168894d66117d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1927,6 +1927,14 @@ object functions {
   // DateTime functions
   //////////////////////////////////////////////////////////////////////////////////////////////
 
+  /**
+   * Returns the date that is numMonths after startDate.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def add_months(startDate: Column, numMonths: Int): Column =
+    AddMonths(startDate.expr, Literal(numMonths))
+
   /**
    * Converts a date/timestamp/string to a value of string in the format specified by the date
    * format given by the second argument.
@@ -1959,6 +1967,20 @@ object functions {
   def date_format(dateColumnName: String, format: String): Column =
     date_format(Column(dateColumnName), format)
 
+  /**
+   * Returns the date that is `days` days after `start`
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def date_add(start: Column, days: Int): Column = DateAdd(start.expr, Literal(days))
+
+  /**
+   * Returns the date that is `days` days before `start`
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def date_sub(start: Column, days: Int): Column = DateSub(start.expr, Literal(days))
+
   /**
    * Extracts the year as an integer from a given date/timestamp/string.
    * @group datetime_funcs
@@ -2067,6 +2089,13 @@ object functions {
    */
   def minute(columnName: String): Column = minute(Column(columnName))
 
+  /*
+   * Returns number of months between dates `date1` and `date2`.
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def months_between(date1: Column, date2: Column): Column = MonthsBetween(date1.expr, date2.expr)
+
   /**
    * Given a date column, returns the first date which is later than the value of the date column
    * that is on the specified day of the week.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index df4cb57ac5b21..b7267c413165a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -22,6 +22,7 @@ import java.text.SimpleDateFormat
 
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.functions._
+import org.apache.spark.unsafe.types.CalendarInterval
 
 class DateFunctionsSuite extends QueryTest {
   private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
@@ -206,6 +207,122 @@ class DateFunctionsSuite extends QueryTest {
       Row(15, 15, 15))
   }
 
+  test("function date_add") {
+    val st1 = "2015-06-01 12:34:56"
+    val st2 = "2015-06-02 12:34:56"
+    val t1 = Timestamp.valueOf(st1)
+    val t2 = Timestamp.valueOf(st2)
+    val s1 = "2015-06-01"
+    val s2 = "2015-06-02"
+    val d1 = Date.valueOf(s1)
+    val d2 = Date.valueOf(s2)
+    val df = Seq((t1, d1, s1, st1), (t2, d2, s2, st2)).toDF("t", "d", "s", "ss")
+    checkAnswer(
+      df.select(date_add(col("d"), 1)),
+      Seq(Row(Date.valueOf("2015-06-02")), Row(Date.valueOf("2015-06-03"))))
+    checkAnswer(
+      df.select(date_add(col("t"), 3)),
+      Seq(Row(Date.valueOf("2015-06-04")), Row(Date.valueOf("2015-06-05"))))
+    checkAnswer(
+      df.select(date_add(col("s"), 5)),
+      Seq(Row(Date.valueOf("2015-06-06")), Row(Date.valueOf("2015-06-07"))))
+    checkAnswer(
+      df.select(date_add(col("ss"), 7)),
+      Seq(Row(Date.valueOf("2015-06-08")), Row(Date.valueOf("2015-06-09"))))
+
+    checkAnswer(df.selectExpr("DATE_ADD(null, 1)"), Seq(Row(null), Row(null)))
+    checkAnswer(
+      df.selectExpr("""DATE_ADD(d, 1)"""),
+      Seq(Row(Date.valueOf("2015-06-02")), Row(Date.valueOf("2015-06-03"))))
+  }
+
+  test("function date_sub") {
+    val st1 = "2015-06-01 12:34:56"
+    val st2 = "2015-06-02 12:34:56"
+    val t1 = Timestamp.valueOf(st1)
+    val t2 = Timestamp.valueOf(st2)
+    val s1 = "2015-06-01"
+    val s2 = "2015-06-02"
+    val d1 = Date.valueOf(s1)
+    val d2 = Date.valueOf(s2)
+    val df = Seq((t1, d1, s1, st1), (t2, d2, s2, st2)).toDF("t", "d", "s", "ss")
+    checkAnswer(
+      df.select(date_sub(col("d"), 1)),
+      Seq(Row(Date.valueOf("2015-05-31")), Row(Date.valueOf("2015-06-01"))))
+    checkAnswer(
+      df.select(date_sub(col("t"), 1)),
+      Seq(Row(Date.valueOf("2015-05-31")), Row(Date.valueOf("2015-06-01"))))
+    checkAnswer(
+      df.select(date_sub(col("s"), 1)),
+      Seq(Row(Date.valueOf("2015-05-31")), Row(Date.valueOf("2015-06-01"))))
+    checkAnswer(
+      df.select(date_sub(col("ss"), 1)),
+      Seq(Row(Date.valueOf("2015-05-31")), Row(Date.valueOf("2015-06-01"))))
+    checkAnswer(
+      df.select(date_sub(lit(null), 1)).limit(1), Row(null))
+
+    checkAnswer(df.selectExpr("""DATE_SUB(d, null)"""), Seq(Row(null), Row(null)))
+    checkAnswer(
+      df.selectExpr("""DATE_SUB(d, 1)"""),
+      Seq(Row(Date.valueOf("2015-05-31")), Row(Date.valueOf("2015-06-01"))))
+  }
+
+  test("time_add") {
+    val t1 = Timestamp.valueOf("2015-07-31 23:59:59")
+    val t2 = Timestamp.valueOf("2015-12-31 00:00:00")
+    val d1 = Date.valueOf("2015-07-31")
+    val d2 = Date.valueOf("2015-12-31")
+    val i = new CalendarInterval(2, 2000000L)
+    val df = Seq((1, t1, d1), (3, t2, d2)).toDF("n", "t", "d")
+    checkAnswer(
+      df.selectExpr(s"d + $i"),
+      Seq(Row(Date.valueOf("2015-09-30")), Row(Date.valueOf("2016-02-29"))))
+    checkAnswer(
+      df.selectExpr(s"t + $i"),
+      Seq(Row(Timestamp.valueOf("2015-10-01 00:00:01")),
+        Row(Timestamp.valueOf("2016-02-29 00:00:02"))))
+  }
+
+  test("time_sub") {
+    val t1 = Timestamp.valueOf("2015-10-01 00:00:01")
+    val t2 = Timestamp.valueOf("2016-02-29 00:00:02")
+    val d1 = Date.valueOf("2015-09-30")
+    val d2 = Date.valueOf("2016-02-29")
+    val i = new CalendarInterval(2, 2000000L)
+    val df = Seq((1, t1, d1), (3, t2, d2)).toDF("n", "t", "d")
+    checkAnswer(
+      df.selectExpr(s"d - $i"),
+      Seq(Row(Date.valueOf("2015-07-30")), Row(Date.valueOf("2015-12-30"))))
+    checkAnswer(
+      df.selectExpr(s"t - $i"),
+      Seq(Row(Timestamp.valueOf("2015-07-31 23:59:59")),
+        Row(Timestamp.valueOf("2015-12-31 00:00:00"))))
+  }
+
+  test("function add_months") {
+    val d1 = Date.valueOf("2015-08-31")
+    val d2 = Date.valueOf("2015-02-28")
+    val df = Seq((1, d1), (2, d2)).toDF("n", "d")
+    checkAnswer(
+      df.select(add_months(col("d"), 1)),
+      Seq(Row(Date.valueOf("2015-09-30")), Row(Date.valueOf("2015-03-31"))))
+    checkAnswer(
+      df.selectExpr("add_months(d, -1)"),
+      Seq(Row(Date.valueOf("2015-07-31")), Row(Date.valueOf("2015-01-31"))))
+  }
+
+  test("function months_between") {
+    val d1 = Date.valueOf("2015-07-31")
+    val d2 = Date.valueOf("2015-02-16")
+    val t1 = Timestamp.valueOf("2014-09-30 23:30:00")
+    val t2 = Timestamp.valueOf("2015-09-16 12:00:00")
+    val s1 = "2014-09-15 11:30:00"
+    val s2 = "2015-10-01 00:00:00"
+    val df = Seq((t1, d1, s1), (t2, d2, s2)).toDF("t", "d", "s")
+    checkAnswer(df.select(months_between(col("t"), col("d"))), Seq(Row(-10.0), Row(7.0)))
+    checkAnswer(df.selectExpr("months_between(t, s)"), Seq(Row(0.5), Row(-0.5)))
+  }
+
   test("function last_day") {
     val df1 = Seq((1, "2015-07-23"), (2, "2015-07-24")).toDF("i", "d")
     val df2 = Seq((1, "2015-07-23 00:11:22"), (2, "2015-07-24 11:22:33")).toDF("i", "t")

From 89cda69ecd5ef942a68ad13fc4e1f4184010f087 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Thu, 30 Jul 2015 14:08:59 -0700
Subject: [PATCH 487/549] [SPARK-9454] Change LDASuite tests to use vector
 comparisons

jkbradley Changes the current hacky string-comparison for vector compares.

Author: Feynman Liang <fliang@databricks.com>

Closes #7775 from feynmanliang/SPARK-9454-ldasuite-vector-compare and squashes the following commits:

bd91a82 [Feynman Liang] Remove println
905c76e [Feynman Liang] Fix string compare in distributed EM
2f24c13 [Feynman Liang] Improve LDASuite tests
---
 .../spark/mllib/clustering/LDASuite.scala     | 33 ++++++++-----------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index d74482d3a7598..c43e1e575c09c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -83,21 +83,14 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(model.topicsMatrix === localModel.topicsMatrix)
 
     // Check: topic summaries
-    //  The odd decimal formatting and sorting is a hack to do a robust comparison.
-    val roundedTopicSummary = model.describeTopics().map { case (terms, termWeights) =>
-      // cut values to 3 digits after the decimal place
-      terms.zip(termWeights).map { case (term, weight) =>
-        ("%.3f".format(weight).toDouble, term.toInt)
-      }
-    }.sortBy(_.mkString(""))
-    val roundedLocalTopicSummary = localModel.describeTopics().map { case (terms, termWeights) =>
-      // cut values to 3 digits after the decimal place
-      terms.zip(termWeights).map { case (term, weight) =>
-        ("%.3f".format(weight).toDouble, term.toInt)
-      }
-    }.sortBy(_.mkString(""))
-    roundedTopicSummary.zip(roundedLocalTopicSummary).foreach { case (t1, t2) =>
-      assert(t1 === t2)
+    val topicSummary = model.describeTopics().map { case (terms, termWeights) =>
+      Vectors.sparse(tinyVocabSize, terms, termWeights)
+    }.sortBy(_.toString)
+    val localTopicSummary = localModel.describeTopics().map { case (terms, termWeights) =>
+      Vectors.sparse(tinyVocabSize, terms, termWeights)
+    }.sortBy(_.toString)
+    topicSummary.zip(localTopicSummary).foreach { case (topics, topicsLocal) =>
+      assert(topics ~== topicsLocal absTol 0.01)
     }
 
     // Check: per-doc topic distributions
@@ -197,10 +190,12 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
 
     // verify the result, Note this generate the identical result as
     // [[https://github.com/Blei-Lab/onlineldavb]]
-    val topic1 = op.getLambda(0, ::).inner.toArray.map("%.4f".format(_)).mkString(", ")
-    val topic2 = op.getLambda(1, ::).inner.toArray.map("%.4f".format(_)).mkString(", ")
-    assert("1.1101, 1.2076, 1.3050, 0.8899, 0.7924, 0.6950" == topic1)
-    assert("0.8899, 0.7924, 0.6950, 1.1101, 1.2076, 1.3050" == topic2)
+    val topic1: Vector = Vectors.fromBreeze(op.getLambda(0, ::).t)
+    val topic2: Vector = Vectors.fromBreeze(op.getLambda(1, ::).t)
+    val expectedTopic1 = Vectors.dense(1.1101, 1.2076, 1.3050, 0.8899, 0.7924, 0.6950)
+    val expectedTopic2 = Vectors.dense(0.8899, 0.7924, 0.6950, 1.1101, 1.2076, 1.3050)
+    assert(topic1 ~== expectedTopic1 absTol 0.01)
+    assert(topic2 ~== expectedTopic2 absTol 0.01)
   }
 
   test("OnlineLDAOptimizer with toy data") {

From 0dbd6963d589a8f6ad344273f3da7df680ada515 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Thu, 30 Jul 2015 15:39:46 -0700
Subject: [PATCH 488/549] [SPARK-9479] [STREAMING] [TESTS] Fix
 ReceiverTrackerSuite failure for maven build and other potential test
 failures in Streaming

See https://issues.apache.org/jira/browse/SPARK-9479 for the failure cause.

The PR includes the following changes:
1. Make ReceiverTrackerSuite create StreamingContext in the test body.
2. Fix places that don't stop StreamingContext. I verified no SparkContext was stopped in the shutdown hook locally after this fix.
3. Fix an issue that `ReceiverTracker.endpoint` may be null.
4. Make sure stopping SparkContext in non-main thread won't fail other tests.

Author: zsxwing <zsxwing@gmail.com>

Closes #7797 from zsxwing/fix-ReceiverTrackerSuite and squashes the following commits:

3a4bb98 [zsxwing] Fix another potential NPE
d7497df [zsxwing] Fix ReceiverTrackerSuite; make sure StreamingContext in tests is closed
---
 .../StreamingLogisticRegressionSuite.scala    | 21 +++++--
 .../clustering/StreamingKMeansSuite.scala     | 17 ++++--
 .../StreamingLinearRegressionSuite.scala      | 21 +++++--
 .../streaming/scheduler/ReceiverTracker.scala | 12 +++-
 .../apache/spark/streaming/JavaAPISuite.java  |  1 +
 .../streaming/BasicOperationsSuite.scala      | 58 ++++++++++---------
 .../spark/streaming/InputStreamsSuite.scala   | 38 ++++++------
 .../spark/streaming/MasterFailureTest.scala   |  8 ++-
 .../streaming/StreamingContextSuite.scala     | 22 +++++--
 .../streaming/StreamingListenerSuite.scala    | 13 ++++-
 .../scheduler/ReceiverTrackerSuite.scala      | 56 +++++++++---------
 .../StreamingJobProgressListenerSuite.scala   | 19 ++++--
 12 files changed, 183 insertions(+), 103 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala
index fd653296c9d97..d7b291d5a6330 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala
@@ -24,13 +24,22 @@ import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.streaming.dstream.DStream
-import org.apache.spark.streaming.TestSuiteBase
+import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
 
 class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase {
 
   // use longer wait time to ensure job completion
   override def maxWaitTimeMillis: Int = 30000
 
+  var ssc: StreamingContext = _
+
+  override def afterFunction() {
+    super.afterFunction()
+    if (ssc != null) {
+      ssc.stop()
+    }
+  }
+
   // Test if we can accurately learn B for Y = logistic(BX) on streaming data
   test("parameter accuracy") {
 
@@ -50,7 +59,7 @@ class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase
     }
 
     // apply model training to input stream
-    val ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => {
+    ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => {
       model.trainOn(inputDStream)
       inputDStream.count()
     })
@@ -84,7 +93,7 @@ class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase
 
     // apply model training to input stream, storing the intermediate results
     // (we add a count to ensure the result is a DStream)
-    val ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => {
+    ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => {
       model.trainOn(inputDStream)
       inputDStream.foreachRDD(x => history.append(math.abs(model.latestModel().weights(0) - B)))
       inputDStream.count()
@@ -118,7 +127,7 @@ class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase
     }
 
     // apply model predictions to test stream
-    val ssc = setupStreams(testInput, (inputDStream: DStream[LabeledPoint]) => {
+    ssc = setupStreams(testInput, (inputDStream: DStream[LabeledPoint]) => {
       model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
     })
 
@@ -147,7 +156,7 @@ class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase
     }
 
     // train and predict
-    val ssc = setupStreams(testInput, (inputDStream: DStream[LabeledPoint]) => {
+    ssc = setupStreams(testInput, (inputDStream: DStream[LabeledPoint]) => {
       model.trainOn(inputDStream)
       model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
     })
@@ -167,7 +176,7 @@ class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase
       .setNumIterations(10)
     val numBatches = 10
     val emptyInput = Seq.empty[Seq[LabeledPoint]]
-    val ssc = setupStreams(emptyInput,
+    ssc = setupStreams(emptyInput,
       (inputDStream: DStream[LabeledPoint]) => {
         model.trainOn(inputDStream)
         model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
index ac01622b8a089..3645d29dccdb2 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.mllib.clustering
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.util.TestingUtils._
-import org.apache.spark.streaming.TestSuiteBase
+import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.util.random.XORShiftRandom
 
@@ -28,6 +28,15 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase {
 
   override def maxWaitTimeMillis: Int = 30000
 
+  var ssc: StreamingContext = _
+
+  override def afterFunction() {
+    super.afterFunction()
+    if (ssc != null) {
+      ssc.stop()
+    }
+  }
+
   test("accuracy for single center and equivalence to grand average") {
     // set parameters
     val numBatches = 10
@@ -46,7 +55,7 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase {
     val (input, centers) = StreamingKMeansDataGenerator(numPoints, numBatches, k, d, r, 42)
 
     // setup and run the model training
-    val ssc = setupStreams(input, (inputDStream: DStream[Vector]) => {
+    ssc = setupStreams(input, (inputDStream: DStream[Vector]) => {
       model.trainOn(inputDStream)
       inputDStream.count()
     })
@@ -82,7 +91,7 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase {
     val (input, centers) = StreamingKMeansDataGenerator(numPoints, numBatches, k, d, r, 42)
 
     // setup and run the model training
-    val ssc = setupStreams(input, (inputDStream: DStream[Vector]) => {
+    ssc = setupStreams(input, (inputDStream: DStream[Vector]) => {
       kMeans.trainOn(inputDStream)
       inputDStream.count()
     })
@@ -114,7 +123,7 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase {
       StreamingKMeansDataGenerator(numPoints, numBatches, k, d, r, 42, Array(Vectors.dense(0.0)))
 
     // setup and run the model training
-    val ssc = setupStreams(input, (inputDStream: DStream[Vector]) => {
+    ssc = setupStreams(input, (inputDStream: DStream[Vector]) => {
       kMeans.trainOn(inputDStream)
       inputDStream.count()
     })
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
index a2a4c5f6b8b70..34c07ed170816 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionSuite.scala
@@ -22,14 +22,23 @@ import scala.collection.mutable.ArrayBuffer
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.LinearDataGenerator
+import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
 import org.apache.spark.streaming.dstream.DStream
-import org.apache.spark.streaming.TestSuiteBase
 
 class StreamingLinearRegressionSuite extends SparkFunSuite with TestSuiteBase {
 
   // use longer wait time to ensure job completion
   override def maxWaitTimeMillis: Int = 20000
 
+  var ssc: StreamingContext = _
+
+  override def afterFunction() {
+    super.afterFunction()
+    if (ssc != null) {
+      ssc.stop()
+    }
+  }
+
   // Assert that two values are equal within tolerance epsilon
   def assertEqual(v1: Double, v2: Double, epsilon: Double) {
     def errorMessage = v1.toString + " did not equal " + v2.toString
@@ -62,7 +71,7 @@ class StreamingLinearRegressionSuite extends SparkFunSuite with TestSuiteBase {
     }
 
     // apply model training to input stream
-    val ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => {
+    ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => {
       model.trainOn(inputDStream)
       inputDStream.count()
     })
@@ -98,7 +107,7 @@ class StreamingLinearRegressionSuite extends SparkFunSuite with TestSuiteBase {
 
     // apply model training to input stream, storing the intermediate results
     // (we add a count to ensure the result is a DStream)
-    val ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => {
+    ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => {
       model.trainOn(inputDStream)
       inputDStream.foreachRDD(x => history.append(math.abs(model.latestModel().weights(0) - 10.0)))
       inputDStream.count()
@@ -129,7 +138,7 @@ class StreamingLinearRegressionSuite extends SparkFunSuite with TestSuiteBase {
     }
 
     // apply model predictions to test stream
-    val ssc = setupStreams(testInput, (inputDStream: DStream[LabeledPoint]) => {
+    ssc = setupStreams(testInput, (inputDStream: DStream[LabeledPoint]) => {
       model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
     })
     // collect the output as (true, estimated) tuples
@@ -156,7 +165,7 @@ class StreamingLinearRegressionSuite extends SparkFunSuite with TestSuiteBase {
     }
 
     // train and predict
-    val ssc = setupStreams(testInput, (inputDStream: DStream[LabeledPoint]) => {
+    ssc = setupStreams(testInput, (inputDStream: DStream[LabeledPoint]) => {
       model.trainOn(inputDStream)
       model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
     })
@@ -177,7 +186,7 @@ class StreamingLinearRegressionSuite extends SparkFunSuite with TestSuiteBase {
     val numBatches = 10
     val nPoints = 100
     val emptyInput = Seq.empty[Seq[LabeledPoint]]
-    val ssc = setupStreams(emptyInput,
+    ssc = setupStreams(emptyInput,
       (inputDStream: DStream[LabeledPoint]) => {
         model.trainOn(inputDStream)
         model.predictOnValues(inputDStream.map(x => (x.label, x.features)))
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
index 6270137951b5a..e076fb5ea174b 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala
@@ -223,7 +223,11 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
     // Signal the receivers to delete old block data
     if (WriteAheadLogUtils.enableReceiverLog(ssc.conf)) {
       logInfo(s"Cleanup old received batch data: $cleanupThreshTime")
-      endpoint.send(CleanupOldBlocks(cleanupThreshTime))
+      synchronized {
+        if (isTrackerStarted) {
+          endpoint.send(CleanupOldBlocks(cleanupThreshTime))
+        }
+      }
     }
   }
 
@@ -285,8 +289,10 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false
   }
 
   /** Update a receiver's maximum ingestion rate */
-  def sendRateUpdate(streamUID: Int, newRate: Long): Unit = {
-    endpoint.send(UpdateReceiverRateLimit(streamUID, newRate))
+  def sendRateUpdate(streamUID: Int, newRate: Long): Unit = synchronized {
+    if (isTrackerStarted) {
+      endpoint.send(UpdateReceiverRateLimit(streamUID, newRate))
+    }
   }
 
   /** Add new blocks for the given stream */
diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java
index a34f23475804a..e0718f73aa13f 100644
--- a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java
+++ b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java
@@ -1735,6 +1735,7 @@ public Integer call(String s) throws Exception {
   @SuppressWarnings("unchecked")
   @Test
   public void testContextGetOrCreate() throws InterruptedException {
+    ssc.stop();
 
     final SparkConf conf = new SparkConf()
         .setMaster("local[2]")
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala
index 08faeaa58f419..255376807c957 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala
@@ -81,39 +81,41 @@ class BasicOperationsSuite extends TestSuiteBase {
   test("repartition (more partitions)") {
     val input = Seq(1 to 100, 101 to 200, 201 to 300)
     val operation = (r: DStream[Int]) => r.repartition(5)
-    val ssc = setupStreams(input, operation, 2)
-    val output = runStreamsWithPartitions(ssc, 3, 3)
-    assert(output.size === 3)
-    val first = output(0)
-    val second = output(1)
-    val third = output(2)
-
-    assert(first.size === 5)
-    assert(second.size === 5)
-    assert(third.size === 5)
-
-    assert(first.flatten.toSet.equals((1 to 100).toSet) )
-    assert(second.flatten.toSet.equals((101 to 200).toSet))
-    assert(third.flatten.toSet.equals((201 to 300).toSet))
+    withStreamingContext(setupStreams(input, operation, 2)) { ssc =>
+      val output = runStreamsWithPartitions(ssc, 3, 3)
+      assert(output.size === 3)
+      val first = output(0)
+      val second = output(1)
+      val third = output(2)
+
+      assert(first.size === 5)
+      assert(second.size === 5)
+      assert(third.size === 5)
+
+      assert(first.flatten.toSet.equals((1 to 100).toSet))
+      assert(second.flatten.toSet.equals((101 to 200).toSet))
+      assert(third.flatten.toSet.equals((201 to 300).toSet))
+    }
   }
 
   test("repartition (fewer partitions)") {
     val input = Seq(1 to 100, 101 to 200, 201 to 300)
     val operation = (r: DStream[Int]) => r.repartition(2)
-    val ssc = setupStreams(input, operation, 5)
-    val output = runStreamsWithPartitions(ssc, 3, 3)
-    assert(output.size === 3)
-    val first = output(0)
-    val second = output(1)
-    val third = output(2)
-
-    assert(first.size === 2)
-    assert(second.size === 2)
-    assert(third.size === 2)
-
-    assert(first.flatten.toSet.equals((1 to 100).toSet))
-    assert(second.flatten.toSet.equals( (101 to 200).toSet))
-    assert(third.flatten.toSet.equals((201 to 300).toSet))
+    withStreamingContext(setupStreams(input, operation, 5)) { ssc =>
+      val output = runStreamsWithPartitions(ssc, 3, 3)
+      assert(output.size === 3)
+      val first = output(0)
+      val second = output(1)
+      val third = output(2)
+
+      assert(first.size === 2)
+      assert(second.size === 2)
+      assert(third.size === 2)
+
+      assert(first.flatten.toSet.equals((1 to 100).toSet))
+      assert(second.flatten.toSet.equals((101 to 200).toSet))
+      assert(third.flatten.toSet.equals((201 to 300).toSet))
+    }
   }
 
   test("groupByKey") {
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala
index b74d67c63a788..ec2852d9a0206 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala
@@ -325,27 +325,31 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
   }
 
   test("test track the number of input stream") {
-    val ssc = new StreamingContext(conf, batchDuration)
+    withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc =>
 
-    class TestInputDStream extends InputDStream[String](ssc) {
-      def start() { }
-      def stop() { }
-      def compute(validTime: Time): Option[RDD[String]] = None
-    }
+      class TestInputDStream extends InputDStream[String](ssc) {
+        def start() {}
 
-    class TestReceiverInputDStream extends ReceiverInputDStream[String](ssc) {
-      def getReceiver: Receiver[String] = null
-    }
+        def stop() {}
+
+        def compute(validTime: Time): Option[RDD[String]] = None
+      }
+
+      class TestReceiverInputDStream extends ReceiverInputDStream[String](ssc) {
+        def getReceiver: Receiver[String] = null
+      }
 
-    // Register input streams
-    val receiverInputStreams = Array(new TestReceiverInputDStream, new TestReceiverInputDStream)
-    val inputStreams = Array(new TestInputDStream, new TestInputDStream, new TestInputDStream)
+      // Register input streams
+      val receiverInputStreams = Array(new TestReceiverInputDStream, new TestReceiverInputDStream)
+      val inputStreams = Array(new TestInputDStream, new TestInputDStream, new TestInputDStream)
 
-    assert(ssc.graph.getInputStreams().length == receiverInputStreams.length + inputStreams.length)
-    assert(ssc.graph.getReceiverInputStreams().length == receiverInputStreams.length)
-    assert(ssc.graph.getReceiverInputStreams() === receiverInputStreams)
-    assert(ssc.graph.getInputStreams().map(_.id) === Array.tabulate(5)(i => i))
-    assert(receiverInputStreams.map(_.id) === Array(0, 1))
+      assert(ssc.graph.getInputStreams().length ==
+        receiverInputStreams.length + inputStreams.length)
+      assert(ssc.graph.getReceiverInputStreams().length == receiverInputStreams.length)
+      assert(ssc.graph.getReceiverInputStreams() === receiverInputStreams)
+      assert(ssc.graph.getInputStreams().map(_.id) === Array.tabulate(5)(i => i))
+      assert(receiverInputStreams.map(_.id) === Array(0, 1))
+    }
   }
 
   def testFileStream(newFilesOnly: Boolean) {
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala b/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala
index 6e9d4431090a2..0e64b57e0ffd8 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala
@@ -244,7 +244,13 @@ object MasterFailureTest extends Logging {
       } catch {
         case e: Exception => logError("Error running streaming context", e)
       }
-      if (killingThread.isAlive) killingThread.interrupt()
+      if (killingThread.isAlive) {
+        killingThread.interrupt()
+        // SparkContext.stop will set SparkEnv.env to null. We need to make sure SparkContext is
+        // stopped before running the next test. Otherwise, it's possible that we set SparkEnv.env
+        // to null after the next test creates the new SparkContext and fail the test.
+        killingThread.join()
+      }
       ssc.stop()
 
       logInfo("Has been killed = " + killed)
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index 4bba9691f8aa5..84a5fbb3d95eb 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -120,7 +120,7 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo
 
     val myConf = SparkContext.updatedConf(new SparkConf(false), master, appName)
     myConf.set("spark.streaming.checkpoint.directory", checkpointDirectory)
-    val ssc = new StreamingContext(myConf, batchDuration)
+    ssc = new StreamingContext(myConf, batchDuration)
     assert(ssc.checkpointDir != null)
   }
 
@@ -369,16 +369,22 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo
     }
     assert(exception.isInstanceOf[TestFailedDueToTimeoutException], "Did not wait for stop")
 
+    var t: Thread = null
     // test whether wait exits if context is stopped
     failAfter(10000 millis) { // 10 seconds because spark takes a long time to shutdown
-      new Thread() {
+      t = new Thread() {
         override def run() {
           Thread.sleep(500)
           ssc.stop()
         }
-      }.start()
+      }
+      t.start()
       ssc.awaitTermination()
     }
+    // SparkContext.stop will set SparkEnv.env to null. We need to make sure SparkContext is stopped
+    // before running the next test. Otherwise, it's possible that we set SparkEnv.env to null after
+    // the next test creates the new SparkContext and fail the test.
+    t.join()
   }
 
   test("awaitTermination after stop") {
@@ -430,16 +436,22 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo
       assert(ssc.awaitTerminationOrTimeout(500) === false)
     }
 
+    var t: Thread = null
     // test whether awaitTerminationOrTimeout() return true if context is stopped
     failAfter(10000 millis) { // 10 seconds because spark takes a long time to shutdown
-      new Thread() {
+      t = new Thread() {
         override def run() {
           Thread.sleep(500)
           ssc.stop()
         }
-      }.start()
+      }
+      t.start()
       assert(ssc.awaitTerminationOrTimeout(10000) === true)
     }
+    // SparkContext.stop will set SparkEnv.env to null. We need to make sure SparkContext is stopped
+    // before running the next test. Otherwise, it's possible that we set SparkEnv.env to null after
+    // the next test creates the new SparkContext and fail the test.
+    t.join()
   }
 
   test("getOrCreate") {
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala
index 4bc1dd4a30fc4..d840c349bbbc4 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala
@@ -36,13 +36,22 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers {
   val input = (1 to 4).map(Seq(_)).toSeq
   val operation = (d: DStream[Int]) => d.map(x => x)
 
+  var ssc: StreamingContext = _
+
+  override def afterFunction() {
+    super.afterFunction()
+    if (ssc != null) {
+      ssc.stop()
+    }
+  }
+
   // To make sure that the processing start and end times in collected
   // information are different for successive batches
   override def batchDuration: Duration = Milliseconds(100)
   override def actuallyWait: Boolean = true
 
   test("batch info reporting") {
-    val ssc = setupStreams(input, operation)
+    ssc = setupStreams(input, operation)
     val collector = new BatchInfoCollector
     ssc.addStreamingListener(collector)
     runStreams(ssc, input.size, input.size)
@@ -107,7 +116,7 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers {
   }
 
   test("receiver info reporting") {
-    val ssc = new StreamingContext("local[2]", "test", Milliseconds(1000))
+    ssc = new StreamingContext("local[2]", "test", Milliseconds(1000))
     val inputStream = ssc.receiverStream(new StreamingListenerSuiteReceiver)
     inputStream.foreachRDD(_.count)
 
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
index aff8b53f752fa..afad5f16dbc71 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala
@@ -29,36 +29,40 @@ import org.apache.spark.storage.StorageLevel
 /** Testsuite for receiver scheduling */
 class ReceiverTrackerSuite extends TestSuiteBase {
   val sparkConf = new SparkConf().setMaster("local[8]").setAppName("test")
-  val ssc = new StreamingContext(sparkConf, Milliseconds(100))
 
-  ignore("Receiver tracker - propagates rate limit") {
-    object ReceiverStartedWaiter extends StreamingListener {
-      @volatile
-      var started = false
+  test("Receiver tracker - propagates rate limit") {
+    withStreamingContext(new StreamingContext(sparkConf, Milliseconds(100))) { ssc =>
+      object ReceiverStartedWaiter extends StreamingListener {
+        @volatile
+        var started = false
 
-      override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = {
-        started = true
+        override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = {
+          started = true
+        }
       }
-    }
-
-    ssc.addStreamingListener(ReceiverStartedWaiter)
-    ssc.scheduler.listenerBus.start(ssc.sc)
-    SingletonTestRateReceiver.reset()
-
-    val newRateLimit = 100L
-    val inputDStream = new RateLimitInputDStream(ssc)
-    val tracker = new ReceiverTracker(ssc)
-    tracker.start()
 
-    // we wait until the Receiver has registered with the tracker,
-    // otherwise our rate update is lost
-    eventually(timeout(5 seconds)) {
-      assert(ReceiverStartedWaiter.started)
-    }
-    tracker.sendRateUpdate(inputDStream.id, newRateLimit)
-    // this is an async message, we need to wait a bit for it to be processed
-    eventually(timeout(3 seconds)) {
-      assert(inputDStream.getCurrentRateLimit.get === newRateLimit)
+      ssc.addStreamingListener(ReceiverStartedWaiter)
+      ssc.scheduler.listenerBus.start(ssc.sc)
+      SingletonTestRateReceiver.reset()
+
+      val newRateLimit = 100L
+      val inputDStream = new RateLimitInputDStream(ssc)
+      val tracker = new ReceiverTracker(ssc)
+      tracker.start()
+      try {
+        // we wait until the Receiver has registered with the tracker,
+        // otherwise our rate update is lost
+        eventually(timeout(5 seconds)) {
+          assert(ReceiverStartedWaiter.started)
+        }
+        tracker.sendRateUpdate(inputDStream.id, newRateLimit)
+        // this is an async message, we need to wait a bit for it to be processed
+        eventually(timeout(3 seconds)) {
+          assert(inputDStream.getCurrentRateLimit.get === newRateLimit)
+        }
+      } finally {
+        tracker.stop(false)
+      }
     }
   }
 }
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
index 0891309f956d2..995f1197ccdfd 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala
@@ -22,15 +22,24 @@ import java.util.Properties
 import org.scalatest.Matchers
 
 import org.apache.spark.scheduler.SparkListenerJobStart
+import org.apache.spark.streaming._
 import org.apache.spark.streaming.dstream.DStream
 import org.apache.spark.streaming.scheduler._
-import org.apache.spark.streaming.{Duration, Time, Milliseconds, TestSuiteBase}
 
 class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
 
   val input = (1 to 4).map(Seq(_)).toSeq
   val operation = (d: DStream[Int]) => d.map(x => x)
 
+  var ssc: StreamingContext = _
+
+  override def afterFunction() {
+    super.afterFunction()
+    if (ssc != null) {
+      ssc.stop()
+    }
+  }
+
   private def createJobStart(
       batchTime: Time, outputOpId: Int, jobId: Int): SparkListenerJobStart = {
     val properties = new Properties()
@@ -46,7 +55,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
 
   test("onBatchSubmitted, onBatchStarted, onBatchCompleted, " +
     "onReceiverStarted, onReceiverError, onReceiverStopped") {
-    val ssc = setupStreams(input, operation)
+    ssc = setupStreams(input, operation)
     val listener = new StreamingJobProgressListener(ssc)
 
     val streamIdToInputInfo = Map(
@@ -141,7 +150,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
   }
 
   test("Remove the old completed batches when exceeding the limit") {
-    val ssc = setupStreams(input, operation)
+    ssc = setupStreams(input, operation)
     val limit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 1000)
     val listener = new StreamingJobProgressListener(ssc)
 
@@ -158,7 +167,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
   }
 
   test("out-of-order onJobStart and onBatchXXX") {
-    val ssc = setupStreams(input, operation)
+    ssc = setupStreams(input, operation)
     val limit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 1000)
     val listener = new StreamingJobProgressListener(ssc)
 
@@ -209,7 +218,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers {
   }
 
   test("detect memory leak") {
-    val ssc = setupStreams(input, operation)
+    ssc = setupStreams(input, operation)
     val listener = new StreamingJobProgressListener(ssc)
 
     val limit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 1000)

From 7f7a319c4ce07f07a6bd68100cf0a4f1da66269e Mon Sep 17 00:00:00 2001
From: martinzapletal <zapletal-martin@email.cz>
Date: Thu, 30 Jul 2015 15:57:14 -0700
Subject: [PATCH 489/549] [SPARK-8671] [ML] Added isotonic regression to the
 pipeline API.

Author: martinzapletal <zapletal-martin@email.cz>

Closes #7517 from zapletal-martin/SPARK-8671-isotonic-regression-api and squashes the following commits:

8c435c1 [martinzapletal] Review https://github.com/apache/spark/pull/7517 feedback update.
bebbb86 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-8671-isotonic-regression-api
b68efc0 [martinzapletal] Added tests for param validation.
07c12bd [martinzapletal] Comments and refactoring.
834fcf7 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-8671-isotonic-regression-api
b611fee [martinzapletal] SPARK-8671. Added first version of isotonic regression to pipeline API
---
 .../ml/regression/IsotonicRegression.scala    | 144 +++++++++++++++++
 .../regression/IsotonicRegressionSuite.scala  | 148 ++++++++++++++++++
 2 files changed, 292 insertions(+)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
new file mode 100644
index 0000000000000..4ece8cf8cf0b6
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.regression
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.PredictorParams
+import org.apache.spark.ml.param.{Param, ParamMap, BooleanParam}
+import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
+import org.apache.spark.mllib.regression.{IsotonicRegression => MLlibIsotonicRegression}
+import org.apache.spark.mllib.regression.{IsotonicRegressionModel => MLlibIsotonicRegressionModel}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.types.{DoubleType, DataType}
+import org.apache.spark.sql.{Row, DataFrame}
+import org.apache.spark.storage.StorageLevel
+
+/**
+ * Params for isotonic regression.
+ */
+private[regression] trait IsotonicRegressionParams extends PredictorParams {
+
+  /**
+   * Param for weight column name.
+   * TODO: Move weightCol to sharedParams.
+   *
+   * @group param
+   */
+  final val weightCol: Param[String] =
+    new Param[String](this, "weightCol", "weight column name")
+
+  /** @group getParam */
+  final def getWeightCol: String = $(weightCol)
+
+  /**
+   * Param for isotonic parameter.
+   * Isotonic (increasing) or antitonic (decreasing) sequence.
+   * @group param
+   */
+  final val isotonic: BooleanParam =
+    new BooleanParam(this, "isotonic", "isotonic (increasing) or antitonic (decreasing) sequence")
+
+  /** @group getParam */
+  final def getIsotonicParam: Boolean = $(isotonic)
+}
+
+/**
+ * :: Experimental ::
+ * Isotonic regression.
+ *
+ * Currently implemented using parallelized pool adjacent violators algorithm.
+ * Only univariate (single feature) algorithm supported.
+ *
+ * Uses [[org.apache.spark.mllib.regression.IsotonicRegression]].
+ */
+@Experimental
+class IsotonicRegression(override val uid: String)
+  extends Regressor[Double, IsotonicRegression, IsotonicRegressionModel]
+  with IsotonicRegressionParams {
+
+  def this() = this(Identifiable.randomUID("isoReg"))
+
+  /**
+   * Set the isotonic parameter.
+   * Default is true.
+   * @group setParam
+   */
+  def setIsotonicParam(value: Boolean): this.type = set(isotonic, value)
+  setDefault(isotonic -> true)
+
+  /**
+   * Set weight column param.
+   * Default is weight.
+   * @group setParam
+   */
+  def setWeightParam(value: String): this.type = set(weightCol, value)
+  setDefault(weightCol -> "weight")
+
+  override private[ml] def featuresDataType: DataType = DoubleType
+
+  override def copy(extra: ParamMap): IsotonicRegression = defaultCopy(extra)
+
+  private[this] def extractWeightedLabeledPoints(
+      dataset: DataFrame): RDD[(Double, Double, Double)] = {
+
+    dataset.select($(labelCol), $(featuresCol), $(weightCol))
+      .map { case Row(label: Double, features: Double, weights: Double) =>
+        (label, features, weights)
+      }
+  }
+
+  override protected def train(dataset: DataFrame): IsotonicRegressionModel = {
+    SchemaUtils.checkColumnType(dataset.schema, $(weightCol), DoubleType)
+    // Extract columns from data.  If dataset is persisted, do not persist oldDataset.
+    val instances = extractWeightedLabeledPoints(dataset)
+    val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+    if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
+
+    val isotonicRegression = new MLlibIsotonicRegression().setIsotonic($(isotonic))
+    val parentModel = isotonicRegression.run(instances)
+
+    new IsotonicRegressionModel(uid, parentModel)
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Model fitted by IsotonicRegression.
+ * Predicts using a piecewise linear function.
+ *
+ * For detailed rules see [[org.apache.spark.mllib.regression.IsotonicRegressionModel.predict()]].
+ *
+ * @param parentModel A [[org.apache.spark.mllib.regression.IsotonicRegressionModel]]
+ *                    model trained by [[org.apache.spark.mllib.regression.IsotonicRegression]].
+ */
+class IsotonicRegressionModel private[ml] (
+    override val uid: String,
+    private[ml] val parentModel: MLlibIsotonicRegressionModel)
+  extends RegressionModel[Double, IsotonicRegressionModel]
+  with IsotonicRegressionParams {
+
+  override def featuresDataType: DataType = DoubleType
+
+  override protected def predict(features: Double): Double = {
+    parentModel.predict(features)
+  }
+
+  override def copy(extra: ParamMap): IsotonicRegressionModel = {
+    copyValues(new IsotonicRegressionModel(uid, parentModel), extra)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
new file mode 100644
index 0000000000000..66e4b170bae80
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.regression
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
+import org.apache.spark.sql.{DataFrame, Row}
+
+class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
+  private val schema = StructType(
+    Array(
+      StructField("label", DoubleType),
+      StructField("features", DoubleType),
+      StructField("weight", DoubleType)))
+
+  private val predictionSchema = StructType(Array(StructField("features", DoubleType)))
+
+  private def generateIsotonicInput(labels: Seq[Double]): DataFrame = {
+    val data = Seq.tabulate(labels.size)(i => Row(labels(i), i.toDouble, 1d))
+    val parallelData = sc.parallelize(data)
+
+    sqlContext.createDataFrame(parallelData, schema)
+  }
+
+  private def generatePredictionInput(features: Seq[Double]): DataFrame = {
+    val data = Seq.tabulate(features.size)(i => Row(features(i)))
+
+    val parallelData = sc.parallelize(data)
+    sqlContext.createDataFrame(parallelData, predictionSchema)
+  }
+
+  test("isotonic regression predictions") {
+    val dataset = generateIsotonicInput(Seq(1, 2, 3, 1, 6, 17, 16, 17, 18))
+    val trainer = new IsotonicRegression().setIsotonicParam(true)
+
+    val model = trainer.fit(dataset)
+
+    val predictions = model
+      .transform(dataset)
+      .select("prediction").map {
+        case Row(pred) => pred
+      }.collect()
+
+    assert(predictions === Array(1, 2, 2, 2, 6, 16.5, 16.5, 17, 18))
+
+    assert(model.parentModel.boundaries === Array(0, 1, 3, 4, 5, 6, 7, 8))
+    assert(model.parentModel.predictions === Array(1, 2, 2, 6, 16.5, 16.5, 17.0, 18.0))
+    assert(model.parentModel.isotonic)
+  }
+
+  test("antitonic regression predictions") {
+    val dataset = generateIsotonicInput(Seq(7, 5, 3, 5, 1))
+    val trainer = new IsotonicRegression().setIsotonicParam(false)
+
+    val model = trainer.fit(dataset)
+    val features = generatePredictionInput(Seq(-2.0, -1.0, 0.5, 0.75, 1.0, 2.0, 9.0))
+
+    val predictions = model
+      .transform(features)
+      .select("prediction").map {
+        case Row(pred) => pred
+      }.collect()
+
+    assert(predictions === Array(7, 7, 6, 5.5, 5, 4, 1))
+  }
+
+  test("params validation") {
+    val dataset = generateIsotonicInput(Seq(1, 2, 3))
+    val ir = new IsotonicRegression
+    ParamsSuite.checkParams(ir)
+    val model = ir.fit(dataset)
+    ParamsSuite.checkParams(model)
+  }
+
+  test("default params") {
+    val dataset = generateIsotonicInput(Seq(1, 2, 3))
+    val ir = new IsotonicRegression()
+    assert(ir.getLabelCol === "label")
+    assert(ir.getFeaturesCol === "features")
+    assert(ir.getWeightCol === "weight")
+    assert(ir.getPredictionCol === "prediction")
+    assert(ir.getIsotonicParam === true)
+
+    val model = ir.fit(dataset)
+    model.transform(dataset)
+      .select("label", "features", "prediction", "weight")
+      .collect()
+
+    assert(model.getLabelCol === "label")
+    assert(model.getFeaturesCol === "features")
+    assert(model.getWeightCol === "weight")
+    assert(model.getPredictionCol === "prediction")
+    assert(model.getIsotonicParam === true)
+    assert(model.hasParent)
+  }
+
+  test("set parameters") {
+    val isotonicRegression = new IsotonicRegression()
+      .setIsotonicParam(false)
+      .setWeightParam("w")
+      .setFeaturesCol("f")
+      .setLabelCol("l")
+      .setPredictionCol("p")
+
+    assert(isotonicRegression.getIsotonicParam === false)
+    assert(isotonicRegression.getWeightCol === "w")
+    assert(isotonicRegression.getFeaturesCol === "f")
+    assert(isotonicRegression.getLabelCol === "l")
+    assert(isotonicRegression.getPredictionCol === "p")
+  }
+
+  test("missing column") {
+    val dataset = generateIsotonicInput(Seq(1, 2, 3))
+
+    intercept[IllegalArgumentException] {
+      new IsotonicRegression().setWeightParam("w").fit(dataset)
+    }
+
+    intercept[IllegalArgumentException] {
+      new IsotonicRegression().setFeaturesCol("f").fit(dataset)
+    }
+
+    intercept[IllegalArgumentException] {
+      new IsotonicRegression().setLabelCol("l").fit(dataset)
+    }
+
+    intercept[IllegalArgumentException] {
+      new IsotonicRegression().fit(dataset).setFeaturesCol("f").transform(dataset)
+    }
+  }
+}

From be7be6d4c7d978c20e601d1f5f56ecb3479814cb Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley" <joseph@databricks.com>
Date: Thu, 30 Jul 2015 16:04:23 -0700
Subject: [PATCH 490/549] [SPARK-6684] [MLLIB] [ML] Add checkpointing to GBTs

Add checkpointing to GradientBoostedTrees, GBTClassifier, GBTRegressor

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #7804 from jkbradley/gbt-checkpoint3 and squashes the following commits:

3fbd7ba [Joseph K. Bradley] tiny fix
b3e160c [Joseph K. Bradley] unset checkpoint dir after test
9cc3a04 [Joseph K. Bradley] added checkpointing to GBTs
---
 .../spark/mllib/clustering/LDAOptimizer.scala |  1 +
 .../mllib/tree/GradientBoostedTrees.scala     | 48 +++++------
 .../tree/configuration/BoostingStrategy.scala |  3 +-
 .../classification/GBTClassifierSuite.scala   | 20 +++++
 .../ml/regression/GBTRegressorSuite.scala     | 20 ++++-
 .../tree/GradientBoostedTreesSuite.scala      | 79 +++++++++++--------
 6 files changed, 114 insertions(+), 57 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 9dbec41efeada..d6f8b29a43dfd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -144,6 +144,7 @@ final class EMLDAOptimizer extends LDAOptimizer {
     this.checkpointInterval = lda.getCheckpointInterval
     this.graphCheckpointer = new PeriodicGraphCheckpointer[TopicCounts, TokenCount](
       checkpointInterval, graph.vertices.sparkContext)
+    this.graphCheckpointer.update(this.graph)
     this.globalTopicTotals = computeGlobalTopicTotals()
     this
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index a835f96d5d0e3..9ce6faa137c41 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -20,6 +20,7 @@ package org.apache.spark.mllib.tree
 import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
+import org.apache.spark.mllib.impl.PeriodicRDDCheckpointer
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.BoostingStrategy
 import org.apache.spark.mllib.tree.configuration.Algo._
@@ -184,22 +185,28 @@ object GradientBoostedTrees extends Logging {
       false
     }
 
+    // Prepare periodic checkpointers
+    val predErrorCheckpointer = new PeriodicRDDCheckpointer[(Double, Double)](
+      treeStrategy.getCheckpointInterval, input.sparkContext)
+    val validatePredErrorCheckpointer = new PeriodicRDDCheckpointer[(Double, Double)](
+      treeStrategy.getCheckpointInterval, input.sparkContext)
+
     timer.stop("init")
 
     logDebug("##########")
     logDebug("Building tree 0")
     logDebug("##########")
-    var data = input
 
     // Initialize tree
     timer.start("building tree 0")
-    val firstTreeModel = new DecisionTree(treeStrategy).run(data)
+    val firstTreeModel = new DecisionTree(treeStrategy).run(input)
     val firstTreeWeight = 1.0
     baseLearners(0) = firstTreeModel
     baseLearnerWeights(0) = firstTreeWeight
 
     var predError: RDD[(Double, Double)] = GradientBoostedTreesModel.
       computeInitialPredictionAndError(input, firstTreeWeight, firstTreeModel, loss)
+    predErrorCheckpointer.update(predError)
     logDebug("error of gbt = " + predError.values.mean())
 
     // Note: A model of type regression is used since we require raw prediction
@@ -207,35 +214,34 @@ object GradientBoostedTrees extends Logging {
 
     var validatePredError: RDD[(Double, Double)] = GradientBoostedTreesModel.
       computeInitialPredictionAndError(validationInput, firstTreeWeight, firstTreeModel, loss)
+    if (validate) validatePredErrorCheckpointer.update(validatePredError)
     var bestValidateError = if (validate) validatePredError.values.mean() else 0.0
     var bestM = 1
 
-    // pseudo-residual for second iteration
-    data = predError.zip(input).map { case ((pred, _), point) =>
-      LabeledPoint(-loss.gradient(pred, point.label), point.features)
-    }
-
     var m = 1
-    while (m < numIterations) {
+    var doneLearning = false
+    while (m < numIterations && !doneLearning) {
+      // Update data with pseudo-residuals
+      val data = predError.zip(input).map { case ((pred, _), point) =>
+        LabeledPoint(-loss.gradient(pred, point.label), point.features)
+      }
+
       timer.start(s"building tree $m")
       logDebug("###################################################")
       logDebug("Gradient boosting tree iteration " + m)
       logDebug("###################################################")
       val model = new DecisionTree(treeStrategy).run(data)
       timer.stop(s"building tree $m")
-      // Create partial model
+      // Update partial model
       baseLearners(m) = model
       // Note: The setting of baseLearnerWeights is incorrect for losses other than SquaredError.
       //       Technically, the weight should be optimized for the particular loss.
       //       However, the behavior should be reasonable, though not optimal.
       baseLearnerWeights(m) = learningRate
-      // Note: A model of type regression is used since we require raw prediction
-      val partialModel = new GradientBoostedTreesModel(
-        Regression, baseLearners.slice(0, m + 1),
-        baseLearnerWeights.slice(0, m + 1))
 
       predError = GradientBoostedTreesModel.updatePredictionError(
         input, predError, baseLearnerWeights(m), baseLearners(m), loss)
+      predErrorCheckpointer.update(predError)
       logDebug("error of gbt = " + predError.values.mean())
 
       if (validate) {
@@ -246,21 +252,15 @@ object GradientBoostedTrees extends Logging {
 
         validatePredError = GradientBoostedTreesModel.updatePredictionError(
           validationInput, validatePredError, baseLearnerWeights(m), baseLearners(m), loss)
+        validatePredErrorCheckpointer.update(validatePredError)
         val currentValidateError = validatePredError.values.mean()
         if (bestValidateError - currentValidateError < validationTol) {
-          return new GradientBoostedTreesModel(
-            boostingStrategy.treeStrategy.algo,
-            baseLearners.slice(0, bestM),
-            baseLearnerWeights.slice(0, bestM))
+          doneLearning = true
         } else if (currentValidateError < bestValidateError) {
-            bestValidateError = currentValidateError
-            bestM = m + 1
+          bestValidateError = currentValidateError
+          bestM = m + 1
         }
       }
-      // Update data with pseudo-residuals
-      data = predError.zip(input).map { case ((pred, _), point) =>
-        LabeledPoint(-loss.gradient(pred, point.label), point.features)
-      }
       m += 1
     }
 
@@ -269,6 +269,8 @@ object GradientBoostedTrees extends Logging {
     logInfo("Internal timing for DecisionTree:")
     logInfo(s"$timer")
 
+    predErrorCheckpointer.deleteAllCheckpoints()
+    validatePredErrorCheckpointer.deleteAllCheckpoints()
     if (persistedInput) input.unpersist()
 
     if (validate) {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index 2d6b01524ff3d..9fd30c9b56319 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
@@ -36,7 +36,8 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss}
  *                     learning rate should be between in the interval (0, 1]
  * @param validationTol Useful when runWithValidation is used. If the error rate on the
  *                      validation input between two iterations is less than the validationTol
- *                      then stop. Ignored when [[run]] is used.
+ *                      then stop.  Ignored when
+ *                      [[org.apache.spark.mllib.tree.GradientBoostedTrees.run()]] is used.
  */
 @Experimental
 case class BoostingStrategy(
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
index 82c345491bb3c..a7bc77965fefd 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -28,6 +28,7 @@ import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
+import org.apache.spark.util.Utils
 
 
 /**
@@ -76,6 +77,25 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("Checkpointing") {
+    val tempDir = Utils.createTempDir()
+    val path = tempDir.toURI.toString
+    sc.setCheckpointDir(path)
+
+    val categoricalFeatures = Map.empty[Int, Int]
+    val df: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 2)
+    val gbt = new GBTClassifier()
+      .setMaxDepth(2)
+      .setLossType("logistic")
+      .setMaxIter(5)
+      .setStepSize(0.1)
+      .setCheckpointInterval(2)
+    val model = gbt.fit(df)
+
+    sc.checkpointDir = None
+    Utils.deleteRecursively(tempDir)
+  }
+
   // TODO: Reinstate test once runWithValidation is implemented   SPARK-7132
   /*
   test("runWithValidation stops early and performs better on a validation dataset") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
index 9682edcd9ba84..dbdce0c9dea54 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -25,7 +25,8 @@ import org.apache.spark.mllib.tree.{EnsembleTestHelper, GradientBoostedTrees =>
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.util.Utils
 
 
 /**
@@ -88,6 +89,23 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(predictions.min() < -1)
   }
 
+  test("Checkpointing") {
+    val tempDir = Utils.createTempDir()
+    val path = tempDir.toURI.toString
+    sc.setCheckpointDir(path)
+
+    val df = sqlContext.createDataFrame(data)
+    val gbt = new GBTRegressor()
+      .setMaxDepth(2)
+      .setMaxIter(5)
+      .setStepSize(0.1)
+      .setCheckpointInterval(2)
+    val model = gbt.fit(df)
+
+    sc.checkpointDir = None
+    Utils.deleteRecursively(tempDir)
+  }
+
   // TODO: Reinstate test once runWithValidation is implemented  SPARK-7132
   /*
   test("runWithValidation stops early and performs better on a validation dataset") {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
index 2521b3342181a..6fc9e8df621df 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
@@ -166,43 +166,58 @@ class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext
 
     val algos = Array(Regression, Regression, Classification)
     val losses = Array(SquaredError, AbsoluteError, LogLoss)
-    (algos zip losses) map {
-      case (algo, loss) => {
-        val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
-          categoricalFeaturesInfo = Map.empty)
-        val boostingStrategy =
-          new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
-        val gbtValidate = new GradientBoostedTrees(boostingStrategy)
-          .runWithValidation(trainRdd, validateRdd)
-        val numTrees = gbtValidate.numTrees
-        assert(numTrees !== numIterations)
-
-        // Test that it performs better on the validation dataset.
-        val gbt = new GradientBoostedTrees(boostingStrategy).run(trainRdd)
-        val (errorWithoutValidation, errorWithValidation) = {
-          if (algo == Classification) {
-            val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
-            (loss.computeError(gbt, remappedRdd), loss.computeError(gbtValidate, remappedRdd))
-          } else {
-            (loss.computeError(gbt, validateRdd), loss.computeError(gbtValidate, validateRdd))
-          }
-        }
-        assert(errorWithValidation <= errorWithoutValidation)
-
-        // Test that results from evaluateEachIteration comply with runWithValidation.
-        // Note that convergenceTol is set to 0.0
-        val evaluationArray = gbt.evaluateEachIteration(validateRdd, loss)
-        assert(evaluationArray.length === numIterations)
-        assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
-        var i = 1
-        while (i < numTrees) {
-          assert(evaluationArray(i) <= evaluationArray(i - 1))
-          i += 1
+    algos.zip(losses).foreach { case (algo, loss) =>
+      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
+        categoricalFeaturesInfo = Map.empty)
+      val boostingStrategy =
+        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
+      val gbtValidate = new GradientBoostedTrees(boostingStrategy)
+        .runWithValidation(trainRdd, validateRdd)
+      val numTrees = gbtValidate.numTrees
+      assert(numTrees !== numIterations)
+
+      // Test that it performs better on the validation dataset.
+      val gbt = new GradientBoostedTrees(boostingStrategy).run(trainRdd)
+      val (errorWithoutValidation, errorWithValidation) = {
+        if (algo == Classification) {
+          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
+          (loss.computeError(gbt, remappedRdd), loss.computeError(gbtValidate, remappedRdd))
+        } else {
+          (loss.computeError(gbt, validateRdd), loss.computeError(gbtValidate, validateRdd))
         }
       }
+      assert(errorWithValidation <= errorWithoutValidation)
+
+      // Test that results from evaluateEachIteration comply with runWithValidation.
+      // Note that convergenceTol is set to 0.0
+      val evaluationArray = gbt.evaluateEachIteration(validateRdd, loss)
+      assert(evaluationArray.length === numIterations)
+      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
+      var i = 1
+      while (i < numTrees) {
+        assert(evaluationArray(i) <= evaluationArray(i - 1))
+        i += 1
+      }
     }
   }
 
+  test("Checkpointing") {
+    val tempDir = Utils.createTempDir()
+    val path = tempDir.toURI.toString
+    sc.setCheckpointDir(path)
+
+    val rdd = sc.parallelize(GradientBoostedTreesSuite.data, 2)
+
+    val treeStrategy = new Strategy(algo = Regression, impurity = Variance, maxDepth = 2,
+      categoricalFeaturesInfo = Map.empty, checkpointInterval = 2)
+    val boostingStrategy = new BoostingStrategy(treeStrategy, SquaredError, 5, 0.1)
+
+    val gbt = GradientBoostedTrees.train(rdd, boostingStrategy)
+
+    sc.checkpointDir = None
+    Utils.deleteRecursively(tempDir)
+  }
+
 }
 
 private object GradientBoostedTreesSuite {

From e7905a9395c1a002f50bab29e16a729e14d4ed6f Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Thu, 30 Jul 2015 16:15:43 -0700
Subject: [PATCH 491/549] [SPARK-9463] [ML] Expose model coefficients with
 names in SparkR RFormula

Preview:

```
> summary(m)
            features coefficients
1        (Intercept)    1.6765001
2       Sepal_Length    0.3498801
3 Species.versicolor   -0.9833885
4  Species.virginica   -1.0075104

```

Design doc from umbrella task: https://docs.google.com/document/d/10NZNSEurN2EdWM31uFYsgayIPfCFHiuIu3pCWrUmP_c/edit

cc mengxr

Author: Eric Liang <ekl@databricks.com>

Closes #7771 from ericl/summary and squashes the following commits:

ccd54c3 [Eric Liang] second pass
a5ca93b [Eric Liang] comments
2772111 [Eric Liang] clean up
70483ef [Eric Liang] fix test
7c247d4 [Eric Liang] Merge branch 'master' into summary
3c55024 [Eric Liang] working
8c539aa [Eric Liang] first pass
---
 R/pkg/NAMESPACE                               |  3 ++-
 R/pkg/R/mllib.R                               | 26 ++++++++++++++++++
 R/pkg/inst/tests/test_mllib.R                 | 11 ++++++++
 .../spark/ml/feature/OneHotEncoder.scala      | 12 ++++-----
 .../apache/spark/ml/feature/RFormula.scala    | 12 ++++++++-
 .../apache/spark/ml/r/SparkRWrappers.scala    | 27 +++++++++++++++++--
 .../ml/regression/LinearRegression.scala      |  8 ++++--
 .../spark/ml/feature/OneHotEncoderSuite.scala |  8 +++---
 .../spark/ml/feature/RFormulaSuite.scala      | 18 +++++++++++++
 9 files changed, 108 insertions(+), 17 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 7f7a8a2e4de24..a329e14f25aeb 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -12,7 +12,8 @@ export("print.jobj")
 
 # MLlib integration
 exportMethods("glm",
-              "predict")
+              "predict",
+              "summary")
 
 # Job group lifecycle management methods
 export("setJobGroup",
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 6a8bacaa552c6..efddcc1d8d71c 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -71,3 +71,29 @@ setMethod("predict", signature(object = "PipelineModel"),
           function(object, newData) {
             return(dataFrame(callJMethod(object@model, "transform", newData@sdf)))
           })
+
+#' Get the summary of a model
+#'
+#' Returns the summary of a model produced by glm(), similarly to R's summary().
+#'
+#' @param model A fitted MLlib model
+#' @return a list with a 'coefficient' component, which is the matrix of coefficients. See
+#'         summary.glm for more information.
+#' @rdname glm
+#' @export
+#' @examples
+#'\dontrun{
+#' model <- glm(y ~ x, trainingData)
+#' summary(model)
+#'}
+setMethod("summary", signature(object = "PipelineModel"),
+          function(object) {
+            features <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                   "getModelFeatures", object@model)
+            weights <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                   "getModelWeights", object@model)
+            coefficients <- as.matrix(unlist(weights))
+            colnames(coefficients) <- c("Estimate")
+            rownames(coefficients) <- unlist(features)
+            return(list(coefficients = coefficients))
+          })
diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R
index 3bef69324770a..f272de78ad4a6 100644
--- a/R/pkg/inst/tests/test_mllib.R
+++ b/R/pkg/inst/tests/test_mllib.R
@@ -48,3 +48,14 @@ test_that("dot minus and intercept vs native glm", {
   rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 })
+
+test_that("summary coefficients match with native glm", {
+  training <- createDataFrame(sqlContext, iris)
+  stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
+  coefs <- as.vector(stats$coefficients)
+  rCoefs <- as.vector(coef(glm(Sepal.Width ~ Sepal.Length + Species, data = iris)))
+  expect_true(all(abs(rCoefs - coefs) < 1e-6))
+  expect_true(all(
+    as.character(stats$features) ==
+    c("(Intercept)", "Sepal_Length", "Species__versicolor", "Species__virginica")))
+})
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
index 3825942795645..9c60d4084ec46 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
@@ -66,7 +66,6 @@ class OneHotEncoder(override val uid: String) extends Transformer
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
   override def transformSchema(schema: StructType): StructType = {
-    val is = "_is_"
     val inputColName = $(inputCol)
     val outputColName = $(outputCol)
 
@@ -79,17 +78,17 @@ class OneHotEncoder(override val uid: String) extends Transformer
     val outputAttrNames: Option[Array[String]] = inputAttr match {
       case nominal: NominalAttribute =>
         if (nominal.values.isDefined) {
-          nominal.values.map(_.map(v => inputColName + is + v))
+          nominal.values
         } else if (nominal.numValues.isDefined) {
-          nominal.numValues.map(n => Array.tabulate(n)(i => inputColName + is + i))
+          nominal.numValues.map(n => Array.tabulate(n)(_.toString))
         } else {
           None
         }
       case binary: BinaryAttribute =>
         if (binary.values.isDefined) {
-          binary.values.map(_.map(v => inputColName + is + v))
+          binary.values
         } else {
-          Some(Array.tabulate(2)(i => inputColName + is + i))
+          Some(Array.tabulate(2)(_.toString))
         }
       case _: NumericAttribute =>
         throw new RuntimeException(
@@ -123,7 +122,6 @@ class OneHotEncoder(override val uid: String) extends Transformer
 
   override def transform(dataset: DataFrame): DataFrame = {
     // schema transformation
-    val is = "_is_"
     val inputColName = $(inputCol)
     val outputColName = $(outputCol)
     val shouldDropLast = $(dropLast)
@@ -142,7 +140,7 @@ class OneHotEncoder(override val uid: String) extends Transformer
             math.max(m0, m1)
           }
         ).toInt + 1
-      val outputAttrNames = Array.tabulate(numAttrs)(i => inputColName + is + i)
+      val outputAttrNames = Array.tabulate(numAttrs)(_.toString)
       val filtered = if (shouldDropLast) outputAttrNames.dropRight(1) else outputAttrNames
       val outputAttrs: Array[Attribute] =
         filtered.map(name => BinaryAttribute.defaultAttr.withName(name))
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 0b428d278d908..d1726917e4517 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.ml.feature
 
+import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 import scala.util.parsing.combinator.RegexParsers
 
@@ -91,11 +92,20 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
     // TODO(ekl) add support for feature interactions
     val encoderStages = ArrayBuffer[PipelineStage]()
     val tempColumns = ArrayBuffer[String]()
+    val takenNames = mutable.Set(dataset.columns: _*)
     val encodedTerms = resolvedFormula.terms.map { term =>
       dataset.schema(term) match {
         case column if column.dataType == StringType =>
           val indexCol = term + "_idx_" + uid
-          val encodedCol = term + "_onehot_" + uid
+          val encodedCol = {
+            var tmp = term
+            while (takenNames.contains(tmp)) {
+              tmp += "_"
+            }
+            tmp
+          }
+          takenNames.add(indexCol)
+          takenNames.add(encodedCol)
           encoderStages += new StringIndexer().setInputCol(term).setOutputCol(indexCol)
           encoderStages += new OneHotEncoder().setInputCol(indexCol).setOutputCol(encodedCol)
           tempColumns += indexCol
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala
index 9f70592ccad7e..f5a022c31ed90 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala
@@ -17,9 +17,10 @@
 
 package org.apache.spark.ml.api.r
 
+import org.apache.spark.ml.attribute._
 import org.apache.spark.ml.feature.RFormula
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
+import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
 import org.apache.spark.ml.{Pipeline, PipelineModel}
 import org.apache.spark.sql.DataFrame
 
@@ -44,4 +45,26 @@ private[r] object SparkRWrappers {
     val pipeline = new Pipeline().setStages(Array(formula, estimator))
     pipeline.fit(df)
   }
+
+  def getModelWeights(model: PipelineModel): Array[Double] = {
+    model.stages.last match {
+      case m: LinearRegressionModel =>
+        Array(m.intercept) ++ m.weights.toArray
+      case _: LogisticRegressionModel =>
+        throw new UnsupportedOperationException(
+          "No weights available for LogisticRegressionModel")  // SPARK-9492
+    }
+  }
+
+  def getModelFeatures(model: PipelineModel): Array[String] = {
+    model.stages.last match {
+      case m: LinearRegressionModel =>
+        val attrs = AttributeGroup.fromStructField(
+          m.summary.predictions.schema(m.summary.featuresCol))
+        Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
+      case _: LogisticRegressionModel =>
+        throw new UnsupportedOperationException(
+          "No features names available for LogisticRegressionModel")  // SPARK-9492
+    }
+  }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 89718e0f3e15a..3b85ba001b128 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -36,6 +36,7 @@ import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.types.StructField
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.StatCounter
 
@@ -146,9 +147,10 @@ class LinearRegression(override val uid: String)
 
       val model = new LinearRegressionModel(uid, weights, intercept)
       val trainingSummary = new LinearRegressionTrainingSummary(
-        model.transform(dataset).select($(predictionCol), $(labelCol)),
+        model.transform(dataset),
         $(predictionCol),
         $(labelCol),
+        $(featuresCol),
         Array(0D))
       return copyValues(model.setSummary(trainingSummary))
     }
@@ -221,9 +223,10 @@ class LinearRegression(override val uid: String)
 
     val model = copyValues(new LinearRegressionModel(uid, weights, intercept))
     val trainingSummary = new LinearRegressionTrainingSummary(
-      model.transform(dataset).select($(predictionCol), $(labelCol)),
+      model.transform(dataset),
       $(predictionCol),
       $(labelCol),
+      $(featuresCol),
       objectiveHistory)
     model.setSummary(trainingSummary)
   }
@@ -300,6 +303,7 @@ class LinearRegressionTrainingSummary private[regression] (
     predictions: DataFrame,
     predictionCol: String,
     labelCol: String,
+    val featuresCol: String,
     val objectiveHistory: Array[Double])
   extends LinearRegressionSummary(predictions, predictionCol, labelCol) {
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
index 65846a846b7b4..321eeb843941c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala
@@ -86,8 +86,8 @@ class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext {
     val output = encoder.transform(df)
     val group = AttributeGroup.fromStructField(output.schema("encoded"))
     assert(group.size === 2)
-    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("size_is_small").withIndex(0))
-    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("size_is_medium").withIndex(1))
+    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0))
+    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1))
   }
 
   test("input column without ML attribute") {
@@ -98,7 +98,7 @@ class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext {
     val output = encoder.transform(df)
     val group = AttributeGroup.fromStructField(output.schema("encoded"))
     assert(group.size === 2)
-    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("index_is_0").withIndex(0))
-    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("index_is_1").withIndex(1))
+    assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0))
+    assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1))
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index 8148c553e9051..6aed3243afce8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.ml.feature
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.attribute._
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.util.MLlibTestSparkContext
@@ -105,4 +106,21 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(result.schema.toString == resultSchema.toString)
     assert(result.collect() === expected.collect())
   }
+
+  test("attribute generation") {
+    val formula = new RFormula().setFormula("id ~ a + b")
+    val original = sqlContext.createDataFrame(
+      Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "baz", 5))
+    ).toDF("id", "a", "b")
+    val model = formula.fit(original)
+    val result = model.transform(original)
+    val attrs = AttributeGroup.fromStructField(result.schema("features"))
+    val expectedAttrs = new AttributeGroup(
+      "features",
+      Array(
+        new BinaryAttribute(Some("a__bar"), Some(1)),
+        new BinaryAttribute(Some("a__foo"), Some(2)),
+        new NumericAttribute(Some("b"), Some(3))))
+    assert(attrs === expectedAttrs)
+  }
 }

From 157840d1b14502a4f25cff53633c927998c6ada1 Mon Sep 17 00:00:00 2001
From: Hossein <hossein@databricks.com>
Date: Thu, 30 Jul 2015 16:16:17 -0700
Subject: [PATCH 492/549] [SPARK-8742] [SPARKR] Improve SparkR error messages
 for DataFrame API

This patch improves SparkR error message reporting, especially with DataFrame API. When there is a user error (e.g., malformed SQL query), the message of the cause is sent back through the RPC and the R client reads it and returns it back to user.

cc shivaram

Author: Hossein <hossein@databricks.com>

Closes #7742 from falaki/SPARK-8742 and squashes the following commits:

4f643c9 [Hossein] Not logging exceptions in RBackendHandler
4a8005c [Hossein] Returning stack track of causing exception from RBackendHandler
5cf17f0 [Hossein] Adding unit test for error messages from SQLContext
2af75d5 [Hossein] Reading error message in case of failure and stoping with that message
f479c99 [Hossein] Wrting exception cause message in JVM
---
 R/pkg/R/backend.R                                      |  4 +++-
 R/pkg/inst/tests/test_sparkSQL.R                       |  5 +++++
 .../scala/org/apache/spark/api/r/RBackendHandler.scala | 10 ++++++++--
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/R/pkg/R/backend.R b/R/pkg/R/backend.R
index 2fb6fae55f28c..49162838b8d1a 100644
--- a/R/pkg/R/backend.R
+++ b/R/pkg/R/backend.R
@@ -110,6 +110,8 @@ invokeJava <- function(isStatic, objId, methodName, ...) {
 
   # TODO: check the status code to output error information
   returnStatus <- readInt(conn)
-  stopifnot(returnStatus == 0)
+  if (returnStatus != 0) {
+    stop(readString(conn))
+  }
   readObject(conn)
 }
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index d5db97248c770..61c8a7ec7d837 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -1002,6 +1002,11 @@ test_that("crosstab() on a DataFrame", {
   expect_identical(expected, ordered)
 })
 
+test_that("SQL error message is returned from JVM", {
+  retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
+  expect_equal(grepl("Table Not Found: blah", retError), TRUE)
+})
+
 unlink(parquetPath)
 unlink(jsonPath)
 unlink(jsonPathNa)
diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
index a5de10fe89c42..14dac4ed28ce3 100644
--- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
+++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala
@@ -69,8 +69,11 @@ private[r] class RBackendHandler(server: RBackend)
             case e: Exception =>
               logError(s"Removing $objId failed", e)
               writeInt(dos, -1)
+              writeString(dos, s"Removing $objId failed: ${e.getMessage}")
           }
-        case _ => dos.writeInt(-1)
+        case _ =>
+          dos.writeInt(-1)
+          writeString(dos, s"Error: unknown method $methodName")
       }
     } else {
       handleMethodCall(isStatic, objId, methodName, numArgs, dis, dos)
@@ -146,8 +149,11 @@ private[r] class RBackendHandler(server: RBackend)
       }
     } catch {
       case e: Exception =>
-        logError(s"$methodName on $objId failed", e)
+        logError(s"$methodName on $objId failed")
         writeInt(dos, -1)
+        // Writing the error message of the cause for the exception. This will be returned
+        // to user in the R process.
+        writeString(dos, Utils.exceptionString(e.getCause))
     }
   }
 

From 04c8409107710fc9a625ee513d68c149745539f3 Mon Sep 17 00:00:00 2001
From: Calvin Jia <jia.calvin@gmail.com>
Date: Thu, 30 Jul 2015 16:32:40 -0700
Subject: [PATCH 493/549] [SPARK-9199] [CORE] Update Tachyon dependency from
 0.6.4 -> 0.7.0

No new dependencies are added. The exclusion changes are due to the change in tachyon-client 0.7.0's project structure.

There is no client side API change in Tachyon 0.7.0 so no code changes are required.

Author: Calvin Jia <jia.calvin@gmail.com>

Closes #7577 from calvinjia/SPARK-9199 and squashes the following commits:

4e81e40 [Calvin Jia] Update Tachyon dependency from 0.6.4 -> 0.7.0
---
 core/pom.xml         | 34 +++++-----------------------------
 make-distribution.sh |  2 +-
 2 files changed, 6 insertions(+), 30 deletions(-)

diff --git a/core/pom.xml b/core/pom.xml
index 6fa87ec6a24af..202678779150b 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -286,7 +286,7 @@
     <dependency>
       <groupId>org.tachyonproject</groupId>
       <artifactId>tachyon-client</artifactId>
-      <version>0.6.4</version>
+      <version>0.7.0</version>
       <exclusions>
         <exclusion>
           <groupId>org.apache.hadoop</groupId>
@@ -297,36 +297,12 @@
           <artifactId>curator-recipes</artifactId>
         </exclusion>
         <exclusion>
-          <groupId>org.eclipse.jetty</groupId>
-          <artifactId>jetty-jsp</artifactId>
+          <groupId>org.tachyonproject</groupId>
+          <artifactId>tachyon-underfs-glusterfs</artifactId>
         </exclusion>
         <exclusion>
-          <groupId>org.eclipse.jetty</groupId>
-          <artifactId>jetty-webapp</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.eclipse.jetty</groupId>
-          <artifactId>jetty-server</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.eclipse.jetty</groupId>
-          <artifactId>jetty-servlet</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>junit</groupId>
-          <artifactId>junit</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.powermock</groupId>
-          <artifactId>powermock-module-junit4</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.powermock</groupId>
-          <artifactId>powermock-api-mockito</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.apache.curator</groupId>
-          <artifactId>curator-test</artifactId>
+          <groupId>org.tachyonproject</groupId>
+          <artifactId>tachyon-underfs-s3</artifactId>
         </exclusion>
       </exclusions>
     </dependency>
diff --git a/make-distribution.sh b/make-distribution.sh
index cac7032bb2e87..4789b0e09cc8a 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -33,7 +33,7 @@ SPARK_HOME="$(cd "`dirname "$0"`"; pwd)"
 DISTDIR="$SPARK_HOME/dist"
 
 SPARK_TACHYON=false
-TACHYON_VERSION="0.6.4"
+TACHYON_VERSION="0.7.0"
 TACHYON_TGZ="tachyon-${TACHYON_VERSION}-bin.tar.gz"
 TACHYON_URL="https://github.com/amplab/tachyon/releases/download/v${TACHYON_VERSION}/${TACHYON_TGZ}"
 

From 1afdeb7b458f86e2641f062fb9ddc00e9c5c7531 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 30 Jul 2015 16:44:02 -0700
Subject: [PATCH 494/549] [STREAMING] [TEST] [HOTFIX] Fixed Kinesis test to not
 throw weird errors when Kinesis tests are enabled without AWS keys

If Kinesis tests are enabled by env ENABLE_KINESIS_TESTS = 1 but no AWS credentials are found, the desired behavior is the fail the test using with
```
Exception encountered when attempting to run a suite with class name: org.apache.spark.streaming.kinesis.KinesisBackedBlockRDDSuite *** ABORTED *** (3 seconds, 5 milliseconds)
[info]   java.lang.Exception: Kinesis tests enabled, but could get not AWS credentials
```

Instead KinesisStreamSuite fails with

```
[info] - basic operation *** FAILED *** (3 seconds, 35 milliseconds)
[info]   java.lang.IllegalArgumentException: requirement failed: Stream not yet created, call createStream() to create one
[info]   at scala.Predef$.require(Predef.scala:233)
[info]   at org.apache.spark.streaming.kinesis.KinesisTestUtils.streamName(KinesisTestUtils.scala:77)
[info]   at org.apache.spark.streaming.kinesis.KinesisTestUtils$$anonfun$deleteStream$1.apply(KinesisTestUtils.scala:150)
[info]   at org.apache.spark.streaming.kinesis.KinesisTestUtils$$anonfun$deleteStream$1.apply(KinesisTestUtils.scala:150)
[info]   at org.apache.spark.Logging$class.logWarning(Logging.scala:71)
[info]   at org.apache.spark.streaming.kinesis.KinesisTestUtils.logWarning(KinesisTestUtils.scala:39)
[info]   at org.apache.spark.streaming.kinesis.KinesisTestUtils.deleteStream(KinesisTestUtils.scala:150)
[info]   at org.apache.spark.streaming.kinesis.KinesisStreamSuite$$anonfun$3.apply$mcV$sp(KinesisStreamSuite.scala:111)
[info]   at org.apache.spark.streaming.kinesis.KinesisStreamSuite$$anonfun$3.apply(KinesisStreamSuite.scala:86)
[info]   at org.apache.spark.streaming.kinesis.KinesisStreamSuite$$anonfun$3.apply(KinesisStreamSuite.scala:86)
```
This is because attempting to delete a non-existent Kinesis stream throws uncaught exception. This PR fixes it.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #7809 from tdas/kinesis-test-hotfix and squashes the following commits:

7c372e6 [Tathagata Das] Fixed test
---
 .../streaming/kinesis/KinesisTestUtils.scala  | 27 ++++++++++---------
 .../kinesis/KinesisStreamSuite.scala          |  4 +--
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
index 0ff1b7ed0fd90..ca39358b75cb6 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
@@ -53,6 +53,8 @@ private class KinesisTestUtils(
 
   @volatile
   private var streamCreated = false
+
+  @volatile
   private var _streamName: String = _
 
   private lazy val kinesisClient = {
@@ -115,21 +117,9 @@ private class KinesisTestUtils(
     shardIdToSeqNumbers.toMap
   }
 
-  def describeStream(streamNameToDescribe: String = streamName): Option[StreamDescription] = {
-    try {
-      val describeStreamRequest = new DescribeStreamRequest().withStreamName(streamNameToDescribe)
-      val desc = kinesisClient.describeStream(describeStreamRequest).getStreamDescription()
-      Some(desc)
-    } catch {
-      case rnfe: ResourceNotFoundException =>
-        None
-    }
-  }
-
   def deleteStream(): Unit = {
     try {
-      if (describeStream().nonEmpty) {
-        val deleteStreamRequest = new DeleteStreamRequest()
+      if (streamCreated) {
         kinesisClient.deleteStream(streamName)
       }
     } catch {
@@ -149,6 +139,17 @@ private class KinesisTestUtils(
     }
   }
 
+  private def describeStream(streamNameToDescribe: String): Option[StreamDescription] = {
+    try {
+      val describeStreamRequest = new DescribeStreamRequest().withStreamName(streamNameToDescribe)
+      val desc = kinesisClient.describeStream(describeStreamRequest).getStreamDescription()
+      Some(desc)
+    } catch {
+      case rnfe: ResourceNotFoundException =>
+        None
+    }
+  }
+
   private def findNonExistentStreamName(): String = {
     var testStreamName: String = null
     do {
diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
index f9c952b9468bb..b88c9c6478d56 100644
--- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
+++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala
@@ -88,11 +88,11 @@ class KinesisStreamSuite extends KinesisFunSuite
     try {
       kinesisTestUtils.createStream()
       ssc = new StreamingContext(sc, Seconds(1))
-      val aWSCredentials = KinesisTestUtils.getAWSCredentials()
+      val awsCredentials = KinesisTestUtils.getAWSCredentials()
       val stream = KinesisUtils.createStream(ssc, kinesisAppName, kinesisTestUtils.streamName,
         kinesisTestUtils.endpointUrl, kinesisTestUtils.regionName, InitialPositionInStream.LATEST,
         Seconds(10), StorageLevel.MEMORY_ONLY,
-        aWSCredentials.getAWSAccessKeyId, aWSCredentials.getAWSSecretKey)
+        awsCredentials.getAWSAccessKeyId, awsCredentials.getAWSSecretKey)
 
       val collected = new mutable.HashSet[Int] with mutable.SynchronizedSet[Int]
       stream.map { bytes => new String(bytes).toInt }.foreachRDD { rdd =>

From ca71cc8c8b2d64b7756ae697c06876cd18b536dc Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 30 Jul 2015 16:57:38 -0700
Subject: [PATCH 495/549] [SPARK-9408] [PYSPARK] [MLLIB] Refactor linalg.py to
 /linalg

This is based on MechCoder 's PR https://github.com/apache/spark/pull/7731. Hopefully it could pass tests. MechCoder I tried to make minimal changes. If this passes Jenkins, we can merge this one first and then try to move `__init__.py` to `local.py` in a separate PR.

Closes #7731

Author: Xiangrui Meng <meng@databricks.com>

Closes #7746 from mengxr/SPARK-9408 and squashes the following commits:

0e05a3b [Xiangrui Meng] merge master
1135551 [Xiangrui Meng] add a comment for str(...)
c48cae0 [Xiangrui Meng] update tests
173a805 [Xiangrui Meng] move linalg.py to linalg/__init__.py
---
 dev/sparktestsupport/modules.py                        | 2 +-
 python/pyspark/mllib/{linalg.py => linalg/__init__.py} | 0
 python/pyspark/sql/types.py                            | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename python/pyspark/mllib/{linalg.py => linalg/__init__.py} (100%)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 030d982e99106..44600cb9523c1 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -323,7 +323,7 @@ def contains_file(self, filename):
         "pyspark.mllib.evaluation",
         "pyspark.mllib.feature",
         "pyspark.mllib.fpm",
-        "pyspark.mllib.linalg",
+        "pyspark.mllib.linalg.__init__",
         "pyspark.mllib.random",
         "pyspark.mllib.recommendation",
         "pyspark.mllib.regression",
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg/__init__.py
similarity index 100%
rename from python/pyspark/mllib/linalg.py
rename to python/pyspark/mllib/linalg/__init__.py
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 0976aea72c034..6f74b7162f7cc 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -648,7 +648,7 @@ def jsonValue(self):
 
     @classmethod
     def fromJson(cls, json):
-        pyUDT = str(json["pyClass"])
+        pyUDT = str(json["pyClass"])  # convert unicode to str
         split = pyUDT.rfind(".")
         pyModule = pyUDT[:split]
         pyClass = pyUDT[split+1:]

From df32669514afc0223ecdeca30fbfbe0b40baef3a Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Thu, 30 Jul 2015 17:16:03 -0700
Subject: [PATCH 496/549] [SPARK-7157][SQL] add sampleBy to DataFrame

This was previously committed but then reverted due to test failures (see #6769).

Author: Xiangrui Meng <meng@databricks.com>

Closes #7755 from rxin/SPARK-7157 and squashes the following commits:

fbf9044 [Xiangrui Meng] fix python test
542bd37 [Xiangrui Meng] update test
604fe6d [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7157
f051afd [Xiangrui Meng] use udf instead of building expression
f4e9425 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7157
8fb990b [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-7157
103beb3 [Xiangrui Meng] add Java-friendly sampleBy
991f26f [Xiangrui Meng] fix seed
4a14834 [Xiangrui Meng] move sampleBy to stat
832f7cc [Xiangrui Meng] add sampleBy to DataFrame
---
 python/pyspark/sql/dataframe.py               | 41 ++++++++++++++++++
 .../spark/sql/DataFrameStatFunctions.scala    | 42 +++++++++++++++++++
 .../apache/spark/sql/JavaDataFrameSuite.java  |  9 ++++
 .../apache/spark/sql/DataFrameStatSuite.scala | 12 +++++-
 4 files changed, 102 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index d76e051bd73a1..0f3480c239187 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -441,6 +441,42 @@ def sample(self, withReplacement, fraction, seed=None):
         rdd = self._jdf.sample(withReplacement, fraction, long(seed))
         return DataFrame(rdd, self.sql_ctx)
 
+    @since(1.5)
+    def sampleBy(self, col, fractions, seed=None):
+        """
+        Returns a stratified sample without replacement based on the
+        fraction given on each stratum.
+
+        :param col: column that defines strata
+        :param fractions:
+            sampling fraction for each stratum. If a stratum is not
+            specified, we treat its fraction as zero.
+        :param seed: random seed
+        :return: a new DataFrame that represents the stratified sample
+
+        >>> from pyspark.sql.functions import col
+        >>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key"))
+        >>> sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0)
+        >>> sampled.groupBy("key").count().orderBy("key").show()
+        +---+-----+
+        |key|count|
+        +---+-----+
+        |  0|    3|
+        |  1|    8|
+        +---+-----+
+
+        """
+        if not isinstance(col, str):
+            raise ValueError("col must be a string, but got %r" % type(col))
+        if not isinstance(fractions, dict):
+            raise ValueError("fractions must be a dict but got %r" % type(fractions))
+        for k, v in fractions.items():
+            if not isinstance(k, (float, int, long, basestring)):
+                raise ValueError("key must be float, int, long, or string, but got %r" % type(k))
+            fractions[k] = float(v)
+        seed = seed if seed is not None else random.randint(0, sys.maxsize)
+        return DataFrame(self._jdf.stat().sampleBy(col, self._jmap(fractions), seed), self.sql_ctx)
+
     @since(1.4)
     def randomSplit(self, weights, seed=None):
         """Randomly splits this :class:`DataFrame` with the provided weights.
@@ -1314,6 +1350,11 @@ def freqItems(self, cols, support=None):
 
     freqItems.__doc__ = DataFrame.freqItems.__doc__
 
+    def sampleBy(self, col, fractions, seed=None):
+        return self.df.sampleBy(col, fractions, seed)
+
+    sampleBy.__doc__ = DataFrame.sampleBy.__doc__
+
 
 def _test():
     import doctest
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 4ec58082e7aef..2e68e358f2f1f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -17,6 +17,10 @@
 
 package org.apache.spark.sql
 
+import java.{util => ju, lang => jl}
+
+import scala.collection.JavaConverters._
+
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.execution.stat._
 
@@ -166,4 +170,42 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
   def freqItems(cols: Seq[String]): DataFrame = {
     FrequentItems.singlePassFreqItems(df, cols, 0.01)
   }
+
+  /**
+   * Returns a stratified sample without replacement based on the fraction given on each stratum.
+   * @param col column that defines strata
+   * @param fractions sampling fraction for each stratum. If a stratum is not specified, we treat
+   *                  its fraction as zero.
+   * @param seed random seed
+   * @tparam T stratum type
+   * @return a new [[DataFrame]] that represents the stratified sample
+   *
+   * @since 1.5.0
+   */
+  def sampleBy[T](col: String, fractions: Map[T, Double], seed: Long): DataFrame = {
+    require(fractions.values.forall(p => p >= 0.0 && p <= 1.0),
+      s"Fractions must be in [0, 1], but got $fractions.")
+    import org.apache.spark.sql.functions.{rand, udf}
+    val c = Column(col)
+    val r = rand(seed)
+    val f = udf { (stratum: Any, x: Double) =>
+      x < fractions.getOrElse(stratum.asInstanceOf[T], 0.0)
+    }
+    df.filter(f(c, r))
+  }
+
+  /**
+   * Returns a stratified sample without replacement based on the fraction given on each stratum.
+   * @param col column that defines strata
+   * @param fractions sampling fraction for each stratum. If a stratum is not specified, we treat
+   *                  its fraction as zero.
+   * @param seed random seed
+   * @tparam T stratum type
+   * @return a new [[DataFrame]] that represents the stratified sample
+   *
+   * @since 1.5.0
+   */
+  def sampleBy[T](col: String, fractions: ju.Map[T, jl.Double], seed: Long): DataFrame = {
+    sampleBy(col, fractions.asScala.toMap.asInstanceOf[Map[T, Double]], seed)
+  }
 }
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
index 9e61d06f4036e..2c669bb59a0b5 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
@@ -226,4 +226,13 @@ public void testCovariance() {
     Double result = df.stat().cov("a", "b");
     Assert.assertTrue(Math.abs(result) < 1e-6);
   }
+
+  @Test
+  public void testSampleBy() {
+    DataFrame df = context.range(0, 100).select(col("id").mod(3).as("key"));
+    DataFrame sampled = df.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
+    Row[] actual = sampled.groupBy("key").count().orderBy("key").collect();
+    Row[] expected = new Row[] {RowFactory.create(0, 5), RowFactory.create(1, 8)};
+    Assert.assertArrayEquals(expected, actual);
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index 7ba4ba73e0cc9..07a675e64f527 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -21,9 +21,9 @@ import java.util.Random
 
 import org.scalatest.Matchers._
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.functions.col
 
-class DataFrameStatSuite extends SparkFunSuite  {
+class DataFrameStatSuite extends QueryTest {
 
   private val sqlCtx = org.apache.spark.sql.test.TestSQLContext
   import sqlCtx.implicits._
@@ -130,4 +130,12 @@ class DataFrameStatSuite extends SparkFunSuite  {
     val items2 = singleColResults.collect().head
     items2.getSeq[Double](0) should contain (-1.0)
   }
+
+  test("sampleBy") {
+    val df = sqlCtx.range(0, 100).select((col("id") % 3).as("key"))
+    val sampled = df.stat.sampleBy("key", Map(0 -> 0.1, 1 -> 0.2), 0L)
+    checkAnswer(
+      sampled.groupBy("key").count().orderBy("key"),
+      Seq(Row(0, 5), Row(1, 8)))
+  }
 }

From e7a0976e991f75a7bda99509e2b040daab965ae6 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Thu, 30 Jul 2015 17:17:27 -0700
Subject: [PATCH 497/549] [SPARK-9458][SPARK-9469][SQL] Code generate prefix
 computation in sorting & moves unsafe conversion out of TungstenSort.

Author: Reynold Xin <rxin@databricks.com>

Closes #7803 from rxin/SPARK-9458 and squashes the following commits:

5b032dc [Reynold Xin] Fix string.
b670dbb [Reynold Xin] [SPARK-9458][SPARK-9469][SQL] Code generate prefix computation in sorting & moves unsafe conversion out of TungstenSort.
---
 .../unsafe/sort/PrefixComparators.java        | 49 ++++++++------
 .../unsafe/sort/PrefixComparatorsSuite.scala  | 22 ++-----
 .../execution/UnsafeExternalRowSorter.java    | 27 ++++----
 .../sql/catalyst/expressions/SortOrder.scala  | 44 ++++++++++++-
 .../spark/sql/execution/SortPrefixUtils.scala | 64 +++----------------
 .../spark/sql/execution/SparkStrategies.scala |  4 +-
 .../sql/execution/joins/HashedRelation.scala  |  4 +-
 .../org/apache/spark/sql/execution/sort.scala | 64 ++++++++-----------
 .../execution/RowFormatConvertersSuite.scala  | 11 ++--
 ...ortSuite.scala => TungstenSortSuite.scala} | 10 +--
 10 files changed, 138 insertions(+), 161 deletions(-)
 rename sql/core/src/test/scala/org/apache/spark/sql/execution/{UnsafeExternalSortSuite.scala => TungstenSortSuite.scala} (87%)

diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
index 600aff7d15d8a..4d7e5b3dfba6e 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
@@ -28,9 +28,11 @@ public class PrefixComparators {
   private PrefixComparators() {}
 
   public static final StringPrefixComparator STRING = new StringPrefixComparator();
-  public static final IntegralPrefixComparator INTEGRAL = new IntegralPrefixComparator();
-  public static final FloatPrefixComparator FLOAT = new FloatPrefixComparator();
+  public static final StringPrefixComparatorDesc STRING_DESC = new StringPrefixComparatorDesc();
+  public static final LongPrefixComparator LONG = new LongPrefixComparator();
+  public static final LongPrefixComparatorDesc LONG_DESC = new LongPrefixComparatorDesc();
   public static final DoublePrefixComparator DOUBLE = new DoublePrefixComparator();
+  public static final DoublePrefixComparatorDesc DOUBLE_DESC = new DoublePrefixComparatorDesc();
 
   public static final class StringPrefixComparator extends PrefixComparator {
     @Override
@@ -38,50 +40,55 @@ public int compare(long aPrefix, long bPrefix) {
       return UnsignedLongs.compare(aPrefix, bPrefix);
     }
 
-    public long computePrefix(UTF8String value) {
+    public static long computePrefix(UTF8String value) {
       return value == null ? 0L : value.getPrefix();
     }
   }
 
-  /**
-   * Prefix comparator for all integral types (boolean, byte, short, int, long).
-   */
-  public static final class IntegralPrefixComparator extends PrefixComparator {
+  public static final class StringPrefixComparatorDesc extends PrefixComparator {
+    @Override
+    public int compare(long bPrefix, long aPrefix) {
+      return UnsignedLongs.compare(aPrefix, bPrefix);
+    }
+  }
+
+  public static final class LongPrefixComparator extends PrefixComparator {
     @Override
     public int compare(long a, long b) {
       return (a < b) ? -1 : (a > b) ? 1 : 0;
     }
+  }
 
-    public final long NULL_PREFIX = Long.MIN_VALUE;
+  public static final class LongPrefixComparatorDesc extends PrefixComparator {
+    @Override
+    public int compare(long b, long a) {
+      return (a < b) ? -1 : (a > b) ? 1 : 0;
+    }
   }
 
-  public static final class FloatPrefixComparator extends PrefixComparator {
+  public static final class DoublePrefixComparator extends PrefixComparator {
     @Override
     public int compare(long aPrefix, long bPrefix) {
-      float a = Float.intBitsToFloat((int) aPrefix);
-      float b = Float.intBitsToFloat((int) bPrefix);
-      return Utils.nanSafeCompareFloats(a, b);
+      double a = Double.longBitsToDouble(aPrefix);
+      double b = Double.longBitsToDouble(bPrefix);
+      return Utils.nanSafeCompareDoubles(a, b);
     }
 
-    public long computePrefix(float value) {
-      return Float.floatToIntBits(value) & 0xffffffffL;
+    public static long computePrefix(double value) {
+      return Double.doubleToLongBits(value);
     }
-
-    public final long NULL_PREFIX = computePrefix(Float.NEGATIVE_INFINITY);
   }
 
-  public static final class DoublePrefixComparator extends PrefixComparator {
+  public static final class DoublePrefixComparatorDesc extends PrefixComparator {
     @Override
-    public int compare(long aPrefix, long bPrefix) {
+    public int compare(long bPrefix, long aPrefix) {
       double a = Double.longBitsToDouble(aPrefix);
       double b = Double.longBitsToDouble(bPrefix);
       return Utils.nanSafeCompareDoubles(a, b);
     }
 
-    public long computePrefix(double value) {
+    public static long computePrefix(double value) {
       return Double.doubleToLongBits(value);
     }
-
-    public final long NULL_PREFIX = computePrefix(Double.NEGATIVE_INFINITY);
   }
 }
diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
index cf53a8ad21c60..26a2e96edaaa2 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala
@@ -29,8 +29,8 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
     def testPrefixComparison(s1: String, s2: String): Unit = {
       val utf8string1 = UTF8String.fromString(s1)
       val utf8string2 = UTF8String.fromString(s2)
-      val s1Prefix = PrefixComparators.STRING.computePrefix(utf8string1)
-      val s2Prefix = PrefixComparators.STRING.computePrefix(utf8string2)
+      val s1Prefix = PrefixComparators.StringPrefixComparator.computePrefix(utf8string1)
+      val s2Prefix = PrefixComparators.StringPrefixComparator.computePrefix(utf8string2)
       val prefixComparisonResult = PrefixComparators.STRING.compare(s1Prefix, s2Prefix)
 
       val cmp = UnsignedBytes.lexicographicalComparator().compare(
@@ -55,27 +55,15 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
     forAll { (s1: String, s2: String) => testPrefixComparison(s1, s2) }
   }
 
-  test("float prefix comparator handles NaN properly") {
-    val nan1: Float = java.lang.Float.intBitsToFloat(0x7f800001)
-    val nan2: Float = java.lang.Float.intBitsToFloat(0x7fffffff)
-    assert(nan1.isNaN)
-    assert(nan2.isNaN)
-    val nan1Prefix = PrefixComparators.FLOAT.computePrefix(nan1)
-    val nan2Prefix = PrefixComparators.FLOAT.computePrefix(nan2)
-    assert(nan1Prefix === nan2Prefix)
-    val floatMaxPrefix = PrefixComparators.FLOAT.computePrefix(Float.MaxValue)
-    assert(PrefixComparators.FLOAT.compare(nan1Prefix, floatMaxPrefix) === 1)
-  }
-
   test("double prefix comparator handles NaNs properly") {
     val nan1: Double = java.lang.Double.longBitsToDouble(0x7ff0000000000001L)
     val nan2: Double = java.lang.Double.longBitsToDouble(0x7fffffffffffffffL)
     assert(nan1.isNaN)
     assert(nan2.isNaN)
-    val nan1Prefix = PrefixComparators.DOUBLE.computePrefix(nan1)
-    val nan2Prefix = PrefixComparators.DOUBLE.computePrefix(nan2)
+    val nan1Prefix = PrefixComparators.DoublePrefixComparator.computePrefix(nan1)
+    val nan2Prefix = PrefixComparators.DoublePrefixComparator.computePrefix(nan2)
     assert(nan1Prefix === nan2Prefix)
-    val doubleMaxPrefix = PrefixComparators.DOUBLE.computePrefix(Double.MaxValue)
+    val doubleMaxPrefix = PrefixComparators.DoublePrefixComparator.computePrefix(Double.MaxValue)
     assert(PrefixComparators.DOUBLE.compare(nan1Prefix, doubleMaxPrefix) === 1)
   }
 
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
index 4c3f2c6557140..68c49feae938e 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java
@@ -48,7 +48,6 @@ final class UnsafeExternalRowSorter {
   private long numRowsInserted = 0;
 
   private final StructType schema;
-  private final UnsafeProjection unsafeProjection;
   private final PrefixComputer prefixComputer;
   private final UnsafeExternalSorter sorter;
 
@@ -62,7 +61,6 @@ public UnsafeExternalRowSorter(
       PrefixComparator prefixComparator,
       PrefixComputer prefixComputer) throws IOException {
     this.schema = schema;
-    this.unsafeProjection = UnsafeProjection.create(schema);
     this.prefixComputer = prefixComputer;
     final SparkEnv sparkEnv = SparkEnv.get();
     final TaskContext taskContext = TaskContext.get();
@@ -88,13 +86,12 @@ void setTestSpillFrequency(int frequency) {
   }
 
   @VisibleForTesting
-  void insertRow(InternalRow row) throws IOException {
-    UnsafeRow unsafeRow = unsafeProjection.apply(row);
+  void insertRow(UnsafeRow row) throws IOException {
     final long prefix = prefixComputer.computePrefix(row);
     sorter.insertRecord(
-      unsafeRow.getBaseObject(),
-      unsafeRow.getBaseOffset(),
-      unsafeRow.getSizeInBytes(),
+      row.getBaseObject(),
+      row.getBaseOffset(),
+      row.getSizeInBytes(),
       prefix
     );
     numRowsInserted++;
@@ -113,7 +110,7 @@ private void cleanupResources() {
   }
 
   @VisibleForTesting
-  Iterator<InternalRow> sort() throws IOException {
+  Iterator<UnsafeRow> sort() throws IOException {
     try {
       final UnsafeSorterIterator sortedIterator = sorter.getSortedIterator();
       if (!sortedIterator.hasNext()) {
@@ -121,7 +118,7 @@ Iterator<InternalRow> sort() throws IOException {
         // here in order to prevent memory leaks.
         cleanupResources();
       }
-      return new AbstractScalaRowIterator() {
+      return new AbstractScalaRowIterator<UnsafeRow>() {
 
         private final int numFields = schema.length();
         private UnsafeRow row = new UnsafeRow();
@@ -132,7 +129,7 @@ public boolean hasNext() {
         }
 
         @Override
-        public InternalRow next() {
+        public UnsafeRow next() {
           try {
             sortedIterator.loadNext();
             row.pointTo(
@@ -164,11 +161,11 @@ public InternalRow next() {
   }
 
 
-  public Iterator<InternalRow> sort(Iterator<InternalRow> inputIterator) throws IOException {
-      while (inputIterator.hasNext()) {
-        insertRow(inputIterator.next());
-      }
-      return sort();
+  public Iterator<UnsafeRow> sort(Iterator<UnsafeRow> inputIterator) throws IOException {
+    while (inputIterator.hasNext()) {
+      insertRow(inputIterator.next());
+    }
+    return sort();
   }
 
   /**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
index 3f436c0eb893c..9fe877f10fa08 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala
@@ -17,7 +17,10 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
+import org.apache.spark.sql.types._
+import org.apache.spark.util.collection.unsafe.sort.PrefixComparators.DoublePrefixComparator
 
 abstract sealed class SortDirection
 case object Ascending extends SortDirection
@@ -37,4 +40,43 @@ case class SortOrder(child: Expression, direction: SortDirection)
   override def nullable: Boolean = child.nullable
 
   override def toString: String = s"$child ${if (direction == Ascending) "ASC" else "DESC"}"
+
+  def isAscending: Boolean = direction == Ascending
+}
+
+/**
+ * An expression to generate a 64-bit long prefix used in sorting.
+ */
+case class SortPrefix(child: SortOrder) extends UnaryExpression {
+
+  override def eval(input: InternalRow): Any = throw new UnsupportedOperationException
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val childCode = child.child.gen(ctx)
+    val input = childCode.primitive
+    val DoublePrefixCmp = classOf[DoublePrefixComparator].getName
+
+    val (nullValue: Long, prefixCode: String) = child.child.dataType match {
+      case BooleanType =>
+        (Long.MinValue, s"$input ? 1L : 0L")
+      case _: IntegralType =>
+        (Long.MinValue, s"(long) $input")
+      case FloatType | DoubleType =>
+        (DoublePrefixComparator.computePrefix(Double.NegativeInfinity),
+          s"$DoublePrefixCmp.computePrefix((double)$input)")
+      case StringType => (0L, s"$input.getPrefix()")
+      case _ => (0L, "0L")
+    }
+
+    childCode.code +
+    s"""
+      |long ${ev.primitive} = ${nullValue}L;
+      |boolean ${ev.isNull} = false;
+      |if (!${childCode.isNull}) {
+      |  ${ev.primitive} = $prefixCode;
+      |}
+    """.stripMargin
+  }
+
+  override def dataType: DataType = LongType
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
index 2dee3542d6101..a2145b185ce90 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortPrefixUtils.scala
@@ -18,10 +18,8 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.SortOrder
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, PrefixComparator}
 
 
@@ -37,61 +35,15 @@ object SortPrefixUtils {
 
   def getPrefixComparator(sortOrder: SortOrder): PrefixComparator = {
     sortOrder.dataType match {
-      case StringType => PrefixComparators.STRING
-      case BooleanType | ByteType | ShortType | IntegerType | LongType => PrefixComparators.INTEGRAL
-      case FloatType => PrefixComparators.FLOAT
-      case DoubleType => PrefixComparators.DOUBLE
+      case StringType if sortOrder.isAscending => PrefixComparators.STRING
+      case StringType if !sortOrder.isAscending => PrefixComparators.STRING_DESC
+      case BooleanType | ByteType | ShortType | IntegerType | LongType if sortOrder.isAscending =>
+        PrefixComparators.LONG
+      case BooleanType | ByteType | ShortType | IntegerType | LongType if !sortOrder.isAscending =>
+        PrefixComparators.LONG_DESC
+      case FloatType | DoubleType if sortOrder.isAscending => PrefixComparators.DOUBLE
+      case FloatType | DoubleType if !sortOrder.isAscending => PrefixComparators.DOUBLE_DESC
       case _ => NoOpPrefixComparator
     }
   }
-
-  def getPrefixComputer(sortOrder: SortOrder): InternalRow => Long = {
-    sortOrder.dataType match {
-      case StringType => (row: InternalRow) => {
-        PrefixComparators.STRING.computePrefix(sortOrder.child.eval(row).asInstanceOf[UTF8String])
-      }
-      case BooleanType =>
-        (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else if (sortOrder.child.eval(row).asInstanceOf[Boolean]) 1
-          else 0
-        }
-      case ByteType =>
-        (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else sortOrder.child.eval(row).asInstanceOf[Byte]
-        }
-      case ShortType =>
-        (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else sortOrder.child.eval(row).asInstanceOf[Short]
-        }
-      case IntegerType =>
-        (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else sortOrder.child.eval(row).asInstanceOf[Int]
-        }
-      case LongType =>
-        (row: InternalRow) => {
-          val exprVal = sortOrder.child.eval(row)
-          if (exprVal == null) PrefixComparators.INTEGRAL.NULL_PREFIX
-          else sortOrder.child.eval(row).asInstanceOf[Long]
-        }
-      case FloatType => (row: InternalRow) => {
-        val exprVal = sortOrder.child.eval(row)
-        if (exprVal == null) PrefixComparators.FLOAT.NULL_PREFIX
-        else PrefixComparators.FLOAT.computePrefix(sortOrder.child.eval(row).asInstanceOf[Float])
-      }
-      case DoubleType => (row: InternalRow) => {
-        val exprVal = sortOrder.child.eval(row)
-        if (exprVal == null) PrefixComparators.DOUBLE.NULL_PREFIX
-        else PrefixComparators.DOUBLE.computePrefix(sortOrder.child.eval(row).asInstanceOf[Double])
-      }
-      case _ => (row: InternalRow) => 0L
-    }
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 52a9b02d373c7..03d24a88d4ecd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -341,8 +341,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
      */
     def getSortOperator(sortExprs: Seq[SortOrder], global: Boolean, child: SparkPlan): SparkPlan = {
       if (sqlContext.conf.unsafeEnabled && sqlContext.conf.codegenEnabled &&
-        UnsafeExternalSort.supportsSchema(child.schema)) {
-        execution.UnsafeExternalSort(sortExprs, global, child)
+        TungstenSort.supportsSchema(child.schema)) {
+        execution.TungstenSort(sortExprs, global, child)
       } else if (sqlContext.conf.externalSortEnabled) {
         execution.ExternalSort(sortExprs, global, child)
       } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index 26dbc911e9521..f88a45f48aee9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -229,7 +229,7 @@ private[joins] final class UnsafeHashedRelation(
       // write all the values as single byte array
       var totalSize = 0L
       var i = 0
-      while (i < values.size) {
+      while (i < values.length) {
         totalSize += values(i).getSizeInBytes + 4 + 4
         i += 1
       }
@@ -240,7 +240,7 @@ private[joins] final class UnsafeHashedRelation(
       out.writeInt(totalSize.toInt)
       out.write(key.getBytes)
       i = 0
-      while (i < values.size) {
+      while (i < values.length) {
         // [num of fields] [num of bytes] [row bytes]
         // write the integer in native order, so they can be read by UNSAFE.getInt()
         if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
index f82208868c3e3..6d903ab23c57f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala
@@ -17,16 +17,14 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.errors._
-import org.apache.spark.sql.catalyst.expressions.{Descending, BindReferences, Attribute, SortOrder}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.{UnspecifiedDistribution, OrderedDistribution, Distribution}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.CompletionIterator
 import org.apache.spark.util.collection.ExternalSorter
-import org.apache.spark.util.collection.unsafe.sort.PrefixComparator
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // This file defines various sort operators.
@@ -97,59 +95,53 @@ case class ExternalSort(
  * @param testSpillFrequency Method for configuring periodic spilling in unit tests. If set, will
  *                           spill every `frequency` records.
  */
-case class UnsafeExternalSort(
+case class TungstenSort(
     sortOrder: Seq[SortOrder],
     global: Boolean,
     child: SparkPlan,
     testSpillFrequency: Int = 0)
   extends UnaryNode {
 
-  private[this] val schema: StructType = child.schema
+  override def outputsUnsafeRows: Boolean = true
+  override def canProcessUnsafeRows: Boolean = true
+  override def canProcessSafeRows: Boolean = false
+
+  override def output: Seq[Attribute] = child.output
+
+  override def outputOrdering: Seq[SortOrder] = sortOrder
 
   override def requiredChildDistribution: Seq[Distribution] =
     if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil
 
-  protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") {
-    assert(codegenEnabled, "UnsafeExternalSort requires code generation to be enabled")
-    def doSort(iterator: Iterator[InternalRow]): Iterator[InternalRow] = {
-      val ordering = newOrdering(sortOrder, child.output)
-      val boundSortExpression = BindReferences.bindReference(sortOrder.head, child.output)
-      // Hack until we generate separate comparator implementations for ascending vs. descending
-      // (or choose to codegen them):
-      val prefixComparator = {
-        val comp = SortPrefixUtils.getPrefixComparator(boundSortExpression)
-        if (sortOrder.head.direction == Descending) {
-          new PrefixComparator {
-            override def compare(p1: Long, p2: Long): Int = -1 * comp.compare(p1, p2)
-          }
-        } else {
-          comp
-        }
-      }
-      val prefixComputer = {
-        val prefixComputer = SortPrefixUtils.getPrefixComputer(boundSortExpression)
-        new UnsafeExternalRowSorter.PrefixComputer {
-          override def computePrefix(row: InternalRow): Long = prefixComputer(row)
+  protected override def doExecute(): RDD[InternalRow] = {
+    val schema = child.schema
+    val childOutput = child.output
+    child.execute().mapPartitions({ iter =>
+      val ordering = newOrdering(sortOrder, childOutput)
+
+      // The comparator for comparing prefix
+      val boundSortExpression = BindReferences.bindReference(sortOrder.head, childOutput)
+      val prefixComparator = SortPrefixUtils.getPrefixComparator(boundSortExpression)
+
+      // The generator for prefix
+      val prefixProjection = UnsafeProjection.create(Seq(SortPrefix(boundSortExpression)))
+      val prefixComputer = new UnsafeExternalRowSorter.PrefixComputer {
+        override def computePrefix(row: InternalRow): Long = {
+          prefixProjection.apply(row).getLong(0)
         }
       }
+
       val sorter = new UnsafeExternalRowSorter(schema, ordering, prefixComparator, prefixComputer)
       if (testSpillFrequency > 0) {
         sorter.setTestSpillFrequency(testSpillFrequency)
       }
-      sorter.sort(iterator)
-    }
-    child.execute().mapPartitions(doSort, preservesPartitioning = true)
+      sorter.sort(iter.asInstanceOf[Iterator[UnsafeRow]])
+    }, preservesPartitioning = true)
   }
 
-  override def output: Seq[Attribute] = child.output
-
-  override def outputOrdering: Seq[SortOrder] = sortOrder
-
-  override def outputsUnsafeRows: Boolean = true
 }
 
-@DeveloperApi
-object UnsafeExternalSort {
+object TungstenSort {
   /**
    * Return true if UnsafeExternalSort can sort rows with the given schema, false otherwise.
    */
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
index 7b75f755918c1..707cd9c6d939b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/RowFormatConvertersSuite.scala
@@ -18,8 +18,7 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
-import org.apache.spark.sql.catalyst.expressions.IsNull
+import org.apache.spark.sql.catalyst.expressions.{Literal, IsNull}
 import org.apache.spark.sql.test.TestSQLContext
 
 class RowFormatConvertersSuite extends SparkPlanTest {
@@ -31,7 +30,7 @@ class RowFormatConvertersSuite extends SparkPlanTest {
 
   private val outputsSafe = ExternalSort(Nil, false, PhysicalRDD(Seq.empty, null))
   assert(!outputsSafe.outputsUnsafeRows)
-  private val outputsUnsafe = UnsafeExternalSort(Nil, false, PhysicalRDD(Seq.empty, null))
+  private val outputsUnsafe = TungstenSort(Nil, false, PhysicalRDD(Seq.empty, null))
   assert(outputsUnsafe.outputsUnsafeRows)
 
   test("planner should insert unsafe->safe conversions when required") {
@@ -41,14 +40,14 @@ class RowFormatConvertersSuite extends SparkPlanTest {
   }
 
   test("filter can process unsafe rows") {
-    val plan = Filter(IsNull(null), outputsUnsafe)
+    val plan = Filter(IsNull(IsNull(Literal(1))), outputsUnsafe)
     val preparedPlan = TestSQLContext.prepareForExecution.execute(plan)
-    assert(getConverters(preparedPlan).isEmpty)
+    assert(getConverters(preparedPlan).size === 1)
     assert(preparedPlan.outputsUnsafeRows)
   }
 
   test("filter can process safe rows") {
-    val plan = Filter(IsNull(null), outputsSafe)
+    val plan = Filter(IsNull(IsNull(Literal(1))), outputsSafe)
     val preparedPlan = TestSQLContext.prepareForExecution.execute(plan)
     assert(getConverters(preparedPlan).isEmpty)
     assert(!preparedPlan.outputsUnsafeRows)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala
similarity index 87%
rename from sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
rename to sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala
index 138636b0c65b8..450963547c798 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeExternalSortSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.types._
 
-class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
+class TungstenSortSuite extends SparkPlanTest with BeforeAndAfterAll {
 
   override def beforeAll(): Unit = {
     TestSQLContext.conf.setConf(SQLConf.CODEGEN_ENABLED, true)
@@ -39,7 +39,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
   test("sort followed by limit") {
     checkThatPlansAgree(
       (1 to 100).map(v => Tuple1(v)).toDF("a"),
-      (child: SparkPlan) => Limit(10, UnsafeExternalSort('a.asc :: Nil, true, child)),
+      (child: SparkPlan) => Limit(10, TungstenSort('a.asc :: Nil, true, child)),
       (child: SparkPlan) => Limit(10, Sort('a.asc :: Nil, global = true, child)),
       sortAnswers = false
     )
@@ -50,7 +50,7 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
     val stringLength = 1024 * 1024 * 2
     checkThatPlansAgree(
       Seq(Tuple1("a" * stringLength), Tuple1("b" * stringLength)).toDF("a").repartition(1),
-      UnsafeExternalSort(sortOrder, global = true, _: SparkPlan, testSpillFrequency = 1),
+      TungstenSort(sortOrder, global = true, _: SparkPlan, testSpillFrequency = 1),
       Sort(sortOrder, global = true, _: SparkPlan),
       sortAnswers = false
     )
@@ -70,11 +70,11 @@ class UnsafeExternalSortSuite extends SparkPlanTest with BeforeAndAfterAll {
         TestSQLContext.sparkContext.parallelize(Random.shuffle(inputData).map(v => Row(v))),
         StructType(StructField("a", dataType, nullable = true) :: Nil)
       )
-      assert(UnsafeExternalSort.supportsSchema(inputDf.schema))
+      assert(TungstenSort.supportsSchema(inputDf.schema))
       checkThatPlansAgree(
         inputDf,
         plan => ConvertToSafe(
-          UnsafeExternalSort(sortOrder, global = true, plan: SparkPlan, testSpillFrequency = 23)),
+          TungstenSort(sortOrder, global = true, plan: SparkPlan, testSpillFrequency = 23)),
         Sort(sortOrder, global = true, _: SparkPlan),
         sortAnswers = false
       )

From 0b1a464b6e061580a75b99a91b042069d76bbbfd Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 30 Jul 2015 17:18:32 -0700
Subject: [PATCH 498/549] [SPARK-9425] [SQL] support DecimalType in UnsafeRow

This PR brings the support of DecimalType in UnsafeRow, for precision <= 18, it's settable, otherwise it's not settable.

Author: Davies Liu <davies@databricks.com>

Closes #7758 from davies/unsafe_decimal and squashes the following commits:

478b1ba [Davies Liu] address comments
536314c [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_decimal
7c2e77a [Davies Liu] fix JoinedRow
76d6fa4 [Davies Liu] fix tests
99d3151 [Davies Liu] Merge branch 'master' of github.com:apache/spark into unsafe_decimal
d49c6ae [Davies Liu] support DecimalType in UnsafeRow
---
 .../expressions/SpecializedGetters.java       |   2 +-
 .../UnsafeFixedWidthAggregationMap.java       |  22 ++--
 .../sql/catalyst/expressions/UnsafeRow.java   |  53 +++++---
 .../expressions/UnsafeRowWriters.java         |  42 +++++++
 .../sql/catalyst/CatalystTypeConverters.scala |   9 +-
 .../spark/sql/catalyst/InternalRow.scala      |   4 +-
 .../sql/catalyst/expressions/Projection.scala |   7 +-
 .../expressions/codegen/CodeGenerator.scala   |   9 +-
 .../codegen/GenerateUnsafeProjection.scala    | 115 ++++++++++--------
 .../spark/sql/catalyst/expressions/rows.scala |   3 +-
 .../org/apache/spark/sql/types/Decimal.scala  |   6 +-
 .../spark/sql/types/GenericArrayData.scala    |   2 +-
 .../sql/catalyst/expressions/CastSuite.scala  |   5 +-
 .../expressions/DateExpressionsSuite.scala    |   2 +-
 .../UnsafeFixedWidthAggregationMapSuite.scala |   8 +-
 .../expressions/UnsafeRowConverterSuite.scala |  17 +--
 .../spark/sql/columnar/ColumnBuilder.scala    |   2 +-
 .../spark/sql/columnar/ColumnStats.scala      |   4 +-
 .../spark/sql/columnar/ColumnType.scala       |   2 +-
 .../sql/execution/GeneratedAggregate.scala    |   2 +-
 .../sql/execution/SparkSqlSerializer2.scala   |   2 +-
 .../sql/parquet/ParquetTableSupport.scala     |   4 +-
 .../spark/sql/columnar/ColumnStatsSuite.scala |  40 +++++-
 23 files changed, 237 insertions(+), 125 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
index f7cea13688876..e3d3ba7a9ccc0 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
@@ -41,7 +41,7 @@ public interface SpecializedGetters {
 
   double getDouble(int ordinal);
 
-  Decimal getDecimal(int ordinal);
+  Decimal getDecimal(int ordinal, int precision, int scale);
 
   UTF8String getUTF8String(int ordinal);
 
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
index 03f4c3ed8e6bb..f3b462778dc10 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
@@ -20,6 +20,8 @@
 import java.util.Iterator;
 
 import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.sql.types.DecimalType;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.unsafe.PlatformDependent;
@@ -61,26 +63,18 @@ public final class UnsafeFixedWidthAggregationMap {
 
   private final boolean enablePerfMetrics;
 
-  /**
-   * @return true if UnsafeFixedWidthAggregationMap supports grouping keys with the given schema,
-   *         false otherwise.
-   */
-  public static boolean supportsGroupKeySchema(StructType schema) {
-    for (StructField field: schema.fields()) {
-      if (!UnsafeRow.readableFieldTypes.contains(field.dataType())) {
-        return false;
-      }
-    }
-    return true;
-  }
-
   /**
    * @return true if UnsafeFixedWidthAggregationMap supports aggregation buffers with the given
    *         schema, false otherwise.
    */
   public static boolean supportsAggregationBufferSchema(StructType schema) {
     for (StructField field: schema.fields()) {
-      if (!UnsafeRow.settableFieldTypes.contains(field.dataType())) {
+      if (field.dataType() instanceof DecimalType) {
+        DecimalType dt = (DecimalType) field.dataType();
+        if (dt.precision() > Decimal.MAX_LONG_DIGITS()) {
+          return false;
+        }
+      } else if (!UnsafeRow.settableFieldTypes.contains(field.dataType())) {
         return false;
       }
     }
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 6d684bac37573..e7088edced1a1 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -19,6 +19,8 @@
 
 import java.io.IOException;
 import java.io.OutputStream;
+import java.math.BigDecimal;
+import java.math.BigInteger;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
@@ -65,12 +67,7 @@ public static int calculateBitSetWidthInBytes(int numFields) {
    */
   public static final Set<DataType> settableFieldTypes;
 
-  /**
-   * Fields types can be read(but not set (e.g. set() will throw UnsupportedOperationException).
-   */
-  public static final Set<DataType> readableFieldTypes;
-
-  // TODO: support DecimalType
+  // DecimalType(precision <= 18) is settable
   static {
     settableFieldTypes = Collections.unmodifiableSet(
       new HashSet<>(
@@ -86,16 +83,6 @@ public static int calculateBitSetWidthInBytes(int numFields) {
           DateType,
           TimestampType
         })));
-
-    // We support get() on a superset of the types for which we support set():
-    final Set<DataType> _readableFieldTypes = new HashSet<>(
-      Arrays.asList(new DataType[]{
-        StringType,
-        BinaryType,
-        CalendarIntervalType
-      }));
-    _readableFieldTypes.addAll(settableFieldTypes);
-    readableFieldTypes = Collections.unmodifiableSet(_readableFieldTypes);
   }
 
   //////////////////////////////////////////////////////////////////////////////
@@ -232,6 +219,21 @@ public void setFloat(int ordinal, float value) {
     PlatformDependent.UNSAFE.putFloat(baseObject, getFieldOffset(ordinal), value);
   }
 
+  @Override
+  public void setDecimal(int ordinal, Decimal value, int precision) {
+    assertIndexIsValid(ordinal);
+    if (value == null) {
+      setNullAt(ordinal);
+    } else {
+      if (precision <= Decimal.MAX_LONG_DIGITS()) {
+        setLong(ordinal, value.toUnscaledLong());
+      } else {
+        // TODO(davies): support update decimal (hold a bounded space even it's null)
+        throw new UnsupportedOperationException();
+      }
+    }
+  }
+
   @Override
   public Object get(int ordinal) {
     throw new UnsupportedOperationException();
@@ -256,7 +258,8 @@ public Object get(int ordinal, DataType dataType) {
     } else if (dataType instanceof DoubleType) {
       return getDouble(ordinal);
     } else if (dataType instanceof DecimalType) {
-      return getDecimal(ordinal);
+      DecimalType dt = (DecimalType) dataType;
+      return getDecimal(ordinal, dt.precision(), dt.scale());
     } else if (dataType instanceof DateType) {
       return getInt(ordinal);
     } else if (dataType instanceof TimestampType) {
@@ -322,6 +325,22 @@ public double getDouble(int ordinal) {
     return PlatformDependent.UNSAFE.getDouble(baseObject, getFieldOffset(ordinal));
   }
 
+  @Override
+  public Decimal getDecimal(int ordinal, int precision, int scale) {
+    assertIndexIsValid(ordinal);
+    if (isNullAt(ordinal)) {
+      return null;
+    }
+    if (precision <= Decimal.MAX_LONG_DIGITS()) {
+      return Decimal.apply(getLong(ordinal), precision, scale);
+    } else {
+      byte[] bytes = getBinary(ordinal);
+      BigInteger bigInteger = new BigInteger(bytes);
+      BigDecimal javaDecimal = new BigDecimal(bigInteger, scale);
+      return Decimal.apply(new scala.math.BigDecimal(javaDecimal), precision, scale);
+    }
+  }
+
   @Override
   public UTF8String getUTF8String(int ordinal) {
     assertIndexIsValid(ordinal);
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
index c3259e21c4a78..f43a285cd6cad 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRowWriters.java
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions;
 
 import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.types.ByteArray;
@@ -30,6 +31,47 @@
  */
 public class UnsafeRowWriters {
 
+  /** Writer for Decimal with precision under 18. */
+  public static class CompactDecimalWriter {
+
+    public static int getSize(Decimal input) {
+      return 0;
+    }
+
+    public static int write(UnsafeRow target, int ordinal, int cursor, Decimal input) {
+      target.setLong(ordinal, input.toUnscaledLong());
+      return 0;
+    }
+  }
+
+  /** Writer for Decimal with precision larger than 18. */
+  public static class DecimalWriter {
+
+    public static int getSize(Decimal input) {
+      // bounded size
+      return 16;
+    }
+
+    public static int write(UnsafeRow target, int ordinal, int cursor, Decimal input) {
+      final long offset = target.getBaseOffset() + cursor;
+      final byte[] bytes = input.toJavaBigDecimal().unscaledValue().toByteArray();
+      final int numBytes = bytes.length;
+      assert(numBytes <= 16);
+
+      // zero-out the bytes
+      PlatformDependent.UNSAFE.putLong(target.getBaseObject(), offset, 0L);
+      PlatformDependent.UNSAFE.putLong(target.getBaseObject(), offset + 8, 0L);
+
+      // Write the bytes to the variable length portion.
+      PlatformDependent.copyMemory(bytes, PlatformDependent.BYTE_ARRAY_OFFSET,
+        target.getBaseObject(), offset, numBytes);
+
+      // Set the fixed length portion.
+      target.setLong(ordinal, (((long) cursor) << 32) | ((long) numBytes));
+      return 16;
+    }
+  }
+
   /** Writer for UTF8String. */
   public static class UTF8StringWriter {
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 22452c0f201ef..7ca20fe97fbef 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -68,7 +68,7 @@ object CatalystTypeConverters {
       case StringType => StringConverter
       case DateType => DateConverter
       case TimestampType => TimestampConverter
-      case dt: DecimalType => BigDecimalConverter
+      case dt: DecimalType => new DecimalConverter(dt)
       case BooleanType => BooleanConverter
       case ByteType => ByteConverter
       case ShortType => ShortConverter
@@ -306,7 +306,8 @@ object CatalystTypeConverters {
       DateTimeUtils.toJavaTimestamp(row.getLong(column))
   }
 
-  private object BigDecimalConverter extends CatalystTypeConverter[Any, JavaBigDecimal, Decimal] {
+  private class DecimalConverter(dataType: DecimalType)
+    extends CatalystTypeConverter[Any, JavaBigDecimal, Decimal] {
     override def toCatalystImpl(scalaValue: Any): Decimal = scalaValue match {
       case d: BigDecimal => Decimal(d)
       case d: JavaBigDecimal => Decimal(d)
@@ -314,9 +315,11 @@ object CatalystTypeConverters {
     }
     override def toScala(catalystValue: Decimal): JavaBigDecimal = catalystValue.toJavaBigDecimal
     override def toScalaImpl(row: InternalRow, column: Int): JavaBigDecimal =
-      row.getDecimal(column).toJavaBigDecimal
+      row.getDecimal(column, dataType.precision, dataType.scale).toJavaBigDecimal
   }
 
+  private object BigDecimalConverter extends DecimalConverter(DecimalType.SYSTEM_DEFAULT)
+
   private abstract class PrimitiveConverter[T] extends CatalystTypeConverter[T, Any, Any] {
     final override def toScala(catalystValue: Any): Any = catalystValue
     final override def toCatalystImpl(scalaValue: T): Any = scalaValue
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index 486ba036548c8..b19bf4386b0ba 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -58,8 +58,8 @@ abstract class InternalRow extends Serializable with SpecializedGetters {
 
   override def getBinary(ordinal: Int): Array[Byte] = getAs[Array[Byte]](ordinal, BinaryType)
 
-  override def getDecimal(ordinal: Int): Decimal =
-    getAs[Decimal](ordinal, DecimalType.SYSTEM_DEFAULT)
+  override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal =
+    getAs[Decimal](ordinal, DecimalType(precision, scale))
 
   override def getInterval(ordinal: Int): CalendarInterval =
     getAs[CalendarInterval](ordinal, CalendarIntervalType)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index b3beb7e28f208..7c7664e4c1a91 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateUnsafeProjection, GenerateMutableProjection}
-import org.apache.spark.sql.types.{StructType, DataType}
+import org.apache.spark.sql.types.{Decimal, StructType, DataType}
 import org.apache.spark.unsafe.types.UTF8String
 
 /**
@@ -225,6 +225,11 @@ class JoinedRow extends InternalRow {
   override def getFloat(i: Int): Float =
     if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
 
+  override def getDecimal(i: Int, precision: Int, scale: Int): Decimal = {
+    if (i < row1.numFields) row1.getDecimal(i, precision, scale)
+    else row2.getDecimal(i - row1.numFields, precision, scale)
+  }
+
   override def getStruct(i: Int, numFields: Int): InternalRow = {
     if (i < row1.numFields) {
       row1.getStruct(i, numFields)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index c39e0df6fae2a..60e2863f7bbb0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -106,6 +106,7 @@ class CodeGenContext {
     val jt = javaType(dataType)
     dataType match {
       case _ if isPrimitiveType(jt) => s"$getter.get${primitiveTypeName(jt)}($ordinal)"
+      case t: DecimalType => s"$getter.getDecimal($ordinal, ${t.precision}, ${t.scale})"
       case StringType => s"$getter.getUTF8String($ordinal)"
       case BinaryType => s"$getter.getBinary($ordinal)"
       case CalendarIntervalType => s"$getter.getInterval($ordinal)"
@@ -120,10 +121,10 @@ class CodeGenContext {
    */
   def setColumn(row: String, dataType: DataType, ordinal: Int, value: String): String = {
     val jt = javaType(dataType)
-    if (isPrimitiveType(jt)) {
-      s"$row.set${primitiveTypeName(jt)}($ordinal, $value)"
-    } else {
-      s"$row.update($ordinal, $value)"
+    dataType match {
+      case _ if isPrimitiveType(jt) => s"$row.set${primitiveTypeName(jt)}($ordinal, $value)"
+      case t: DecimalType => s"$row.setDecimal($ordinal, $value, ${t.precision})"
+      case _ => s"$row.update($ordinal, $value)"
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index a662357fb6cf9..1d223986d9441 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -35,6 +35,8 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
   private val BinaryWriter = classOf[UnsafeRowWriters.BinaryWriter].getName
   private val IntervalWriter = classOf[UnsafeRowWriters.IntervalWriter].getName
   private val StructWriter = classOf[UnsafeRowWriters.StructWriter].getName
+  private val CompactDecimalWriter = classOf[UnsafeRowWriters.CompactDecimalWriter].getName
+  private val DecimalWriter = classOf[UnsafeRowWriters.DecimalWriter].getName
 
   /** Returns true iff we support this data type. */
   def canSupport(dataType: DataType): Boolean = dataType match {
@@ -42,9 +44,64 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
     case _: CalendarIntervalType => true
     case t: StructType => t.toSeq.forall(field => canSupport(field.dataType))
     case NullType => true
+    case t: DecimalType => true
     case _ => false
   }
 
+  def genAdditionalSize(dt: DataType, ev: GeneratedExpressionCode): String = dt match {
+    case t: DecimalType if t.precision > Decimal.MAX_LONG_DIGITS =>
+      s" + (${ev.isNull} ? 0 : $DecimalWriter.getSize(${ev.primitive}))"
+    case StringType =>
+      s" + (${ev.isNull} ? 0 : $StringWriter.getSize(${ev.primitive}))"
+    case BinaryType =>
+      s" + (${ev.isNull} ? 0 : $BinaryWriter.getSize(${ev.primitive}))"
+    case CalendarIntervalType =>
+      s" + (${ev.isNull} ? 0 : 16)"
+    case _: StructType =>
+      s" + (${ev.isNull} ? 0 : $StructWriter.getSize(${ev.primitive}))"
+    case _ => ""
+  }
+
+  def genFieldWriter(
+      ctx: CodeGenContext,
+      fieldType: DataType,
+      ev: GeneratedExpressionCode,
+      primitive: String,
+      index: Int,
+      cursor: String): String = fieldType match {
+    case _ if ctx.isPrimitiveType(fieldType) =>
+      s"${ctx.setColumn(primitive, fieldType, index, ev.primitive)}"
+    case t: DecimalType if t.precision <= Decimal.MAX_LONG_DIGITS =>
+      s"""
+       // make sure Decimal object has the same scale as DecimalType
+       if (${ev.primitive}.changePrecision(${t.precision}, ${t.scale})) {
+         $CompactDecimalWriter.write($primitive, $index, $cursor, ${ev.primitive});
+       } else {
+         $primitive.setNullAt($index);
+       }
+       """
+    case t: DecimalType if t.precision > Decimal.MAX_LONG_DIGITS =>
+      s"""
+       // make sure Decimal object has the same scale as DecimalType
+       if (${ev.primitive}.changePrecision(${t.precision}, ${t.scale})) {
+         $cursor += $DecimalWriter.write($primitive, $index, $cursor, ${ev.primitive});
+       } else {
+         $primitive.setNullAt($index);
+       }
+       """
+    case StringType =>
+      s"$cursor += $StringWriter.write($primitive, $index, $cursor, ${ev.primitive})"
+    case BinaryType =>
+      s"$cursor += $BinaryWriter.write($primitive, $index, $cursor, ${ev.primitive})"
+    case CalendarIntervalType =>
+      s"$cursor += $IntervalWriter.write($primitive, $index, $cursor, ${ev.primitive})"
+    case t: StructType =>
+      s"$cursor += $StructWriter.write($primitive, $index, $cursor, ${ev.primitive})"
+    case NullType => ""
+    case _ =>
+      throw new UnsupportedOperationException(s"Not supported DataType: $fieldType")
+  }
+
   /**
    * Generates the code to create an [[UnsafeRow]] object based on the input expressions.
    * @param ctx context for code generation
@@ -69,36 +126,12 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
     val allExprs = exprs.map(_.code).mkString("\n")
 
     val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length)
-    val additionalSize = expressions.zipWithIndex.map { case (e, i) =>
-      e.dataType match {
-        case StringType =>
-          s" + (${exprs(i).isNull} ? 0 : $StringWriter.getSize(${exprs(i).primitive}))"
-        case BinaryType =>
-          s" + (${exprs(i).isNull} ? 0 : $BinaryWriter.getSize(${exprs(i).primitive}))"
-        case CalendarIntervalType =>
-          s" + (${exprs(i).isNull} ? 0 : 16)"
-        case _: StructType =>
-          s" + (${exprs(i).isNull} ? 0 : $StructWriter.getSize(${exprs(i).primitive}))"
-        case _ => ""
-      }
+    val additionalSize = expressions.zipWithIndex.map {
+      case (e, i) => genAdditionalSize(e.dataType, exprs(i))
     }.mkString("")
 
     val writers = expressions.zipWithIndex.map { case (e, i) =>
-      val update = e.dataType match {
-        case dt if ctx.isPrimitiveType(dt) =>
-          s"${ctx.setColumn(ret, dt, i, exprs(i).primitive)}"
-        case StringType =>
-          s"$cursor += $StringWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
-        case BinaryType =>
-          s"$cursor += $BinaryWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
-        case CalendarIntervalType =>
-          s"$cursor += $IntervalWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
-        case t: StructType =>
-          s"$cursor += $StructWriter.write($ret, $i, $cursor, ${exprs(i).primitive})"
-        case NullType => ""
-        case _ =>
-          throw new UnsupportedOperationException(s"Not supported DataType: ${e.dataType}")
-      }
+      val update = genFieldWriter(ctx, e.dataType, exprs(i), ret, i, cursor)
       s"""if (${exprs(i).isNull}) {
             $ret.setNullAt($i);
           } else {
@@ -168,35 +201,11 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
 
     val fixedSize = 8 * exprs.length + UnsafeRow.calculateBitSetWidthInBytes(exprs.length)
     val additionalSize = schema.toSeq.map(_.dataType).zip(exprs).map { case (dt, ev) =>
-      dt match {
-        case StringType =>
-          s" + (${ev.isNull} ? 0 : $StringWriter.getSize(${ev.primitive}))"
-        case BinaryType =>
-          s" + (${ev.isNull} ? 0 : $BinaryWriter.getSize(${ev.primitive}))"
-        case CalendarIntervalType =>
-          s" + (${ev.isNull} ? 0 : 16)"
-        case _: StructType =>
-          s" + (${ev.isNull} ? 0 : $StructWriter.getSize(${ev.primitive}))"
-        case _ => ""
-      }
+      genAdditionalSize(dt, ev)
     }.mkString("")
 
     val writers = schema.toSeq.map(_.dataType).zip(exprs).zipWithIndex.map { case ((dt, ev), i) =>
-      val update = dt match {
-        case _ if ctx.isPrimitiveType(dt) =>
-          s"${ctx.setColumn(primitive, dt, i, exprs(i).primitive)}"
-        case StringType =>
-          s"$cursor += $StringWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
-        case BinaryType =>
-          s"$cursor += $BinaryWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
-        case CalendarIntervalType =>
-          s"$cursor += $IntervalWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
-        case t: StructType =>
-          s"$cursor += $StructWriter.write($primitive, $i, $cursor, ${exprs(i).primitive})"
-        case NullType => ""
-        case _ =>
-          throw new UnsupportedOperationException(s"Not supported DataType: $dt")
-      }
+      val update = genFieldWriter(ctx, dt, ev, primitive, i, cursor)
       s"""
           if (${exprs(i).isNull}) {
             $primitive.setNullAt($i);
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index b7c4ece4a16fe..df6ea586c87ba 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.types.{DataType, StructType, AtomicType}
+import org.apache.spark.sql.types.{Decimal, DataType, StructType, AtomicType}
 import org.apache.spark.unsafe.types.UTF8String
 
 /**
@@ -39,6 +39,7 @@ abstract class MutableRow extends InternalRow {
   def setShort(i: Int, value: Short): Unit = { update(i, value) }
   def setByte(i: Int, value: Byte): Unit = { update(i, value) }
   def setFloat(i: Int, value: Float): Unit = { update(i, value) }
+  def setDecimal(i: Int, value: Decimal, precision: Int) { update(i, value) }
   def setString(i: Int, value: String): Unit = {
     update(i, UTF8String.fromString(value))
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
index bc689810bc292..c0155eeb450a6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
@@ -188,6 +188,10 @@ final class Decimal extends Ordered[Decimal] with Serializable {
    * @return true if successful, false if overflow would occur
    */
   def changePrecision(precision: Int, scale: Int): Boolean = {
+    // fast path for UnsafeProjection
+    if (precision == this.precision && scale == this.scale) {
+      return true
+    }
     // First, update our longVal if we can, or transfer over to using a BigDecimal
     if (decimalVal.eq(null)) {
       if (scale < _scale) {
@@ -224,7 +228,7 @@ final class Decimal extends Ordered[Decimal] with Serializable {
       decimalVal = newVal
     } else {
       // We're still using Longs, but we should check whether we match the new precision
-      val p = POW_10(math.min(_precision, MAX_LONG_DIGITS))
+      val p = POW_10(math.min(precision, MAX_LONG_DIGITS))
       if (longVal <= -p || longVal >= p) {
         // Note that we shouldn't have been able to fix this by switching to BigDecimal
         return false
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala
index 7992ba947c069..35ace673fb3da 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala
@@ -43,7 +43,7 @@ class GenericArrayData(array: Array[Any]) extends ArrayData {
 
   override def getDouble(ordinal: Int): Double = getAs(ordinal)
 
-  override def getDecimal(ordinal: Int): Decimal = getAs(ordinal)
+  override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = getAs(ordinal)
 
   override def getUTF8String(ordinal: Int): UTF8String = getAs(ordinal)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 4f35b653d73c0..1ad70733eae03 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -242,10 +242,9 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
 
     checkEvaluation(cast(123L, DecimalType.USER_DEFAULT), Decimal(123))
     checkEvaluation(cast(123L, DecimalType(3, 0)), Decimal(123))
-    checkEvaluation(cast(123L, DecimalType(3, 1)), Decimal(123.0))
+    checkEvaluation(cast(123L, DecimalType(3, 1)), null)
 
-    // TODO: Fix the following bug and re-enable it.
-    // checkEvaluation(cast(123L, DecimalType(2, 0)), null)
+    checkEvaluation(cast(123L, DecimalType(2, 0)), null)
   }
 
   test("cast from boolean") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index fd1d6c1d25497..887e43621a941 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import java.sql.{Timestamp, Date}
+import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 import java.util.Calendar
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
index 6a907290f2dbe..c6b4c729de2f9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
@@ -55,13 +55,13 @@ class UnsafeFixedWidthAggregationMapSuite
   }
 
   test("supported schemas") {
+    assert(supportsAggregationBufferSchema(
+      StructType(StructField("x", DecimalType.USER_DEFAULT) :: Nil)))
+    assert(!supportsAggregationBufferSchema(
+      StructType(StructField("x", DecimalType.SYSTEM_DEFAULT) :: Nil)))
     assert(!supportsAggregationBufferSchema(StructType(StructField("x", StringType) :: Nil)))
-    assert(supportsGroupKeySchema(StructType(StructField("x", StringType) :: Nil)))
-
     assert(
       !supportsAggregationBufferSchema(StructType(StructField("x", ArrayType(IntegerType)) :: Nil)))
-    assert(
-      !supportsGroupKeySchema(StructType(StructField("x", ArrayType(IntegerType)) :: Nil)))
   }
 
   test("empty map") {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index b7bc17f89e82f..a0e1701339ea7 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -46,7 +46,6 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(unsafeRow.getLong(1) === 1)
     assert(unsafeRow.getInt(2) === 2)
 
-    // We can copy UnsafeRows as long as they don't reference ObjectPools
     val unsafeRowCopy = unsafeRow.copy()
     assert(unsafeRowCopy.getLong(0) === 0)
     assert(unsafeRowCopy.getLong(1) === 1)
@@ -122,8 +121,8 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       FloatType,
       DoubleType,
       StringType,
-      BinaryType
-      // DecimalType.Default,
+      BinaryType,
+      DecimalType.USER_DEFAULT
       // ArrayType(IntegerType)
     )
     val converter = UnsafeProjection.create(fieldTypes)
@@ -150,7 +149,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(createdFromNull.getDouble(7) === 0.0d)
     assert(createdFromNull.getUTF8String(8) === null)
     assert(createdFromNull.getBinary(9) === null)
-    // assert(createdFromNull.get(10) === null)
+    assert(createdFromNull.getDecimal(10, 10, 0) === null)
     // assert(createdFromNull.get(11) === null)
 
     // If we have an UnsafeRow with columns that are initially non-null and we null out those
@@ -168,7 +167,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
       r.setDouble(7, 700)
       r.update(8, UTF8String.fromString("hello"))
       r.update(9, "world".getBytes)
-      // r.update(10, Decimal(10))
+      r.setDecimal(10, Decimal(10), 10)
       // r.update(11, Array(11))
       r
     }
@@ -184,7 +183,8 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(setToNullAfterCreation.getDouble(7) === rowWithNoNullColumns.getDouble(7))
     assert(setToNullAfterCreation.getString(8) === rowWithNoNullColumns.getString(8))
     assert(setToNullAfterCreation.getBinary(9) === rowWithNoNullColumns.getBinary(9))
-    // assert(setToNullAfterCreation.get(10) === rowWithNoNullColumns.get(10))
+    assert(setToNullAfterCreation.getDecimal(10, 10, 0) ===
+      rowWithNoNullColumns.getDecimal(10, 10, 0))
     // assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11))
 
     for (i <- fieldTypes.indices) {
@@ -203,7 +203,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     setToNullAfterCreation.setDouble(7, 700)
     // setToNullAfterCreation.update(8, UTF8String.fromString("hello"))
     // setToNullAfterCreation.update(9, "world".getBytes)
-    // setToNullAfterCreation.update(10, Decimal(10))
+    setToNullAfterCreation.setDecimal(10, Decimal(10), 10)
     // setToNullAfterCreation.update(11, Array(11))
 
     assert(setToNullAfterCreation.isNullAt(0) === rowWithNoNullColumns.isNullAt(0))
@@ -216,7 +216,8 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
     assert(setToNullAfterCreation.getDouble(7) === rowWithNoNullColumns.getDouble(7))
     // assert(setToNullAfterCreation.getString(8) === rowWithNoNullColumns.getString(8))
     // assert(setToNullAfterCreation.get(9) === rowWithNoNullColumns.get(9))
-    // assert(setToNullAfterCreation.get(10) === rowWithNoNullColumns.get(10))
+    assert(setToNullAfterCreation.getDecimal(10, 10, 0) ===
+      rowWithNoNullColumns.getDecimal(10, 10, 0))
     // assert(setToNullAfterCreation.get(11) === rowWithNoNullColumns.get(11))
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
index 454b7b91a63f5..1620fc401ba6e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnBuilder.scala
@@ -114,7 +114,7 @@ private[sql] class FixedDecimalColumnBuilder(
     precision: Int,
     scale: Int)
   extends NativeColumnBuilder(
-    new FixedDecimalColumnStats,
+    new FixedDecimalColumnStats(precision, scale),
     FIXED_DECIMAL(precision, scale))
 
 // TODO (lian) Add support for array, struct and map
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
index 32a84b2676e07..af1a8ecca9b57 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala
@@ -234,14 +234,14 @@ private[sql] class BinaryColumnStats extends ColumnStats {
     InternalRow(null, null, nullCount, count, sizeInBytes)
 }
 
-private[sql] class FixedDecimalColumnStats extends ColumnStats {
+private[sql] class FixedDecimalColumnStats(precision: Int, scale: Int) extends ColumnStats {
   protected var upper: Decimal = null
   protected var lower: Decimal = null
 
   override def gatherStats(row: InternalRow, ordinal: Int): Unit = {
     super.gatherStats(row, ordinal)
     if (!row.isNullAt(ordinal)) {
-      val value = row.getDecimal(ordinal)
+      val value = row.getDecimal(ordinal, precision, scale)
       if (upper == null || value.compareTo(upper) > 0) upper = value
       if (lower == null || value.compareTo(lower) < 0) lower = value
       sizeInBytes += FIXED_DECIMAL.defaultSize
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
index 2863f6c230a9d..30f8fe320db3d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala
@@ -392,7 +392,7 @@ private[sql] case class FIXED_DECIMAL(precision: Int, scale: Int)
   }
 
   override def getField(row: InternalRow, ordinal: Int): Decimal = {
-    row.getDecimal(ordinal)
+    row.getDecimal(ordinal, precision, scale)
   }
 
   override def setField(row: MutableRow, ordinal: Int, value: Decimal): Unit = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index b85aada9d9d4c..d851eae3fcc71 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -202,7 +202,7 @@ case class GeneratedAggregate(
 
     val schemaSupportsUnsafe: Boolean = {
       UnsafeFixedWidthAggregationMap.supportsAggregationBufferSchema(aggregationBufferSchema) &&
-        UnsafeFixedWidthAggregationMap.supportsGroupKeySchema(groupKeySchema)
+        UnsafeProjection.canSupport(groupKeySchema)
     }
 
     child.execute().mapPartitions { iter =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
index c808442a4849b..e5bbd0aaed0a5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala
@@ -298,7 +298,7 @@ private[sql] object SparkSqlSerializer2 {
                 out.writeByte(NULL)
               } else {
                 out.writeByte(NOT_NULL)
-                val value = row.getDecimal(i)
+                val value = row.getDecimal(i, decimal.precision, decimal.scale)
                 val javaBigDecimal = value.toJavaBigDecimal
                 // First, write out the unscaled value.
                 val bytes: Array[Byte] = javaBigDecimal.unscaledValue().toByteArray
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index 79dd16b7b0c39..ec8da38a3d427 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -293,8 +293,8 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport {
         writer.addBinary(Binary.fromByteArray(record.getUTF8String(index).getBytes))
       case BinaryType =>
         writer.addBinary(Binary.fromByteArray(record.getBinary(index)))
-      case DecimalType.Fixed(precision, _) =>
-        writeDecimal(record.getDecimal(index), precision)
+      case DecimalType.Fixed(precision, scale) =>
+        writeDecimal(record.getDecimal(index, precision, scale), precision)
       case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer")
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
index 4499a7207031d..66014ddca0596 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnStatsSuite.scala
@@ -34,8 +34,7 @@ class ColumnStatsSuite extends SparkFunSuite {
   testColumnStats(classOf[DoubleColumnStats], DOUBLE,
     InternalRow(Double.MaxValue, Double.MinValue, 0))
   testColumnStats(classOf[StringColumnStats], STRING, InternalRow(null, null, 0))
-  testColumnStats(classOf[FixedDecimalColumnStats],
-    FIXED_DECIMAL(15, 10), InternalRow(null, null, 0))
+  testDecimalColumnStats(InternalRow(null, null, 0))
 
   def testColumnStats[T <: AtomicType, U <: ColumnStats](
       columnStatsClass: Class[U],
@@ -52,7 +51,7 @@ class ColumnStatsSuite extends SparkFunSuite {
     }
 
     test(s"$columnStatsName: non-empty") {
-      import ColumnarTestUtils._
+      import org.apache.spark.sql.columnar.ColumnarTestUtils._
 
       val columnStats = columnStatsClass.newInstance()
       val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
@@ -73,4 +72,39 @@ class ColumnStatsSuite extends SparkFunSuite {
       }
     }
   }
+
+  def testDecimalColumnStats[T <: AtomicType, U <: ColumnStats](initialStatistics: InternalRow) {
+
+    val columnStatsName = classOf[FixedDecimalColumnStats].getSimpleName
+    val columnType = FIXED_DECIMAL(15, 10)
+
+    test(s"$columnStatsName: empty") {
+      val columnStats = new FixedDecimalColumnStats(15, 10)
+      columnStats.collectedStatistics.toSeq.zip(initialStatistics.toSeq).foreach {
+        case (actual, expected) => assert(actual === expected)
+      }
+    }
+
+    test(s"$columnStatsName: non-empty") {
+      import org.apache.spark.sql.columnar.ColumnarTestUtils._
+
+      val columnStats = new FixedDecimalColumnStats(15, 10)
+      val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
+      rows.foreach(columnStats.gatherStats(_, 0))
+
+      val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType])
+      val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]]
+      val stats = columnStats.collectedStatistics
+
+      assertResult(values.min(ordering), "Wrong lower bound")(stats.genericGet(0))
+      assertResult(values.max(ordering), "Wrong upper bound")(stats.genericGet(1))
+      assertResult(10, "Wrong null count")(stats.genericGet(2))
+      assertResult(20, "Wrong row count")(stats.genericGet(3))
+      assertResult(stats.genericGet(4), "Wrong size in bytes") {
+        rows.map { row =>
+          if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0)
+        }.sum
+      }
+    }
+  }
 }

From 351eda0e2fd47c183c4298469970032097ad07a0 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Thu, 30 Jul 2015 17:22:51 -0700
Subject: [PATCH 499/549] [SPARK-6319][SQL] Throw AnalysisException when using
 BinaryType on Join and Aggregate

JIRA: https://issues.apache.org/jira/browse/SPARK-6319

Spark SQL uses plain byte arrays to represent binary values. However, the arrays are compared by reference rather than by values. Thus, we should not use BinaryType on Join and Aggregate in current implementation.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7787 from viirya/agg_no_binary_type and squashes the following commits:

4f76cac [Liang-Chi Hsieh] Throw AnalysisException when using BinaryType on Join and Aggregate.
---
 .../sql/catalyst/analysis/CheckAnalysis.scala | 20 +++++++++++++++++++
 .../spark/sql/DataFrameAggregateSuite.scala   | 11 +++++++++-
 .../org/apache/spark/sql/JoinSuite.scala      |  9 +++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index a373714832962..0ebc3d180a780 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -87,6 +87,18 @@ trait CheckAnalysis {
               s"join condition '${condition.prettyString}' " +
                 s"of type ${condition.dataType.simpleString} is not a boolean.")
 
+          case j @ Join(_, _, _, Some(condition)) =>
+            def checkValidJoinConditionExprs(expr: Expression): Unit = expr match {
+              case p: Predicate =>
+                p.asInstanceOf[Expression].children.foreach(checkValidJoinConditionExprs)
+              case e if e.dataType.isInstanceOf[BinaryType] =>
+                failAnalysis(s"expression ${e.prettyString} in join condition " +
+                  s"'${condition.prettyString}' can't be binary type.")
+              case _ => // OK
+            }
+
+            checkValidJoinConditionExprs(condition)
+
           case Aggregate(groupingExprs, aggregateExprs, child) =>
             def checkValidAggregateExpression(expr: Expression): Unit = expr match {
               case _: AggregateExpression => // OK
@@ -100,7 +112,15 @@ trait CheckAnalysis {
               case e => e.children.foreach(checkValidAggregateExpression)
             }
 
+            def checkValidGroupingExprs(expr: Expression): Unit = expr.dataType match {
+              case BinaryType =>
+                failAnalysis(s"grouping expression '${expr.prettyString}' in aggregate can " +
+                  s"not be binary type.")
+              case _ => // OK
+            }
+
             aggregateExprs.foreach(checkValidAggregateExpression)
+            aggregateExprs.foreach(checkValidGroupingExprs)
 
           case Sort(orders, _, _) =>
             orders.foreach { order =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index b26d3ab253a1d..228ece8065151 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql
 
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.DecimalType
+import org.apache.spark.sql.types.{BinaryType, DecimalType}
 
 
 class DataFrameAggregateSuite extends QueryTest {
@@ -191,4 +191,13 @@ class DataFrameAggregateSuite extends QueryTest {
       Row(null))
   }
 
+  test("aggregation can't work on binary type") {
+    val df = Seq(1, 1, 2, 2).map(i => Tuple1(i.toString)).toDF("c").select($"c" cast BinaryType)
+    intercept[AnalysisException] {
+      df.groupBy("c").agg(count("*"))
+    }
+    intercept[AnalysisException] {
+      df.distinct
+    }
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index 666f26bf620e1..27c08f64649ee 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -22,6 +22,7 @@ import org.scalatest.BeforeAndAfterEach
 import org.apache.spark.sql.TestData._
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.execution.joins._
+import org.apache.spark.sql.types.BinaryType
 
 
 class JoinSuite extends QueryTest with BeforeAndAfterEach {
@@ -489,4 +490,12 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
         Row(3, 2) :: Nil)
 
   }
+
+  test("Join can't work on binary type") {
+    val left = Seq(1, 1, 2, 2).map(i => Tuple1(i.toString)).toDF("c").select($"c" cast BinaryType)
+    val right = Seq(1, 1, 2, 2).map(i => Tuple1(i.toString)).toDF("d").select($"d" cast BinaryType)
+    intercept[AnalysisException] {
+      left.join(right, ($"left.N" === $"right.N"), "full")
+    }
+  }
 }

From 65fa4181c35135080870c1e4c1f904ada3a8cf59 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Thu, 30 Jul 2015 17:26:18 -0700
Subject: [PATCH 500/549] [SPARK-9077] [MLLIB] Improve error message for
 decision trees when numExamples < maxCategoriesPerFeature

Improve error message when number of examples is less than arity of high-arity categorical feature

CC jkbradley is this about what you had in mind? I know it's a starter, but was on my list to close out in the short term.

Author: Sean Owen <sowen@cloudera.com>

Closes #7800 from srowen/SPARK-9077 and squashes the following commits:

b8f6cdb [Sean Owen] Improve error message when number of examples is less than arity of high-arity categorical feature
---
 .../spark/mllib/tree/impl/DecisionTreeMetadata.scala      | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
index 380291ac22bd3..9fe264656ede7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
@@ -128,9 +128,13 @@ private[spark] object DecisionTreeMetadata extends Logging {
     // based on the number of training examples.
     if (strategy.categoricalFeaturesInfo.nonEmpty) {
       val maxCategoriesPerFeature = strategy.categoricalFeaturesInfo.values.max
+      val maxCategory =
+        strategy.categoricalFeaturesInfo.find(_._2 == maxCategoriesPerFeature).get._1
       require(maxCategoriesPerFeature <= maxPossibleBins,
-        s"DecisionTree requires maxBins (= $maxPossibleBins) >= max categories " +
-          s"in categorical features (= $maxCategoriesPerFeature)")
+        s"DecisionTree requires maxBins (= $maxPossibleBins) to be at least as large as the " +
+        s"number of values in each categorical feature, but categorical feature $maxCategory " +
+        s"has $maxCategoriesPerFeature values. Considering remove this and other categorical " +
+        "features with a large number of values, or add more training examples.")
     }
 
     val unorderedFeatures = new mutable.HashSet[Int]()

From 3c66ff727d4b47220e1ff363cea215189ed64f36 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Thu, 30 Jul 2015 17:38:48 -0700
Subject: [PATCH 501/549] [SPARK-9489] Remove unnecessary compatibility and
 requirements checks from Exchange

While reviewing yhuai's patch for SPARK-2205 (#7773), I noticed that Exchange's `compatible` check may be incorrectly returning `false` in many cases.  As far as I know, this is not actually a problem because the `compatible`, `meetsRequirements`, and `needsAnySort` checks are serving only as short-circuit performance optimizations that are not necessary for correctness.

In order to reduce code complexity, I think that we should remove these checks and unconditionally rewrite the operator's children.  This should be safe because we rewrite the tree in a single bottom-up pass.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7807 from JoshRosen/SPARK-9489 and squashes the following commits:

9d76ce9 [Josh Rosen] [SPARK-9489] Remove compatibleWith, meetsRequirements, and needsAnySort checks from Exchange
---
 .../plans/physical/partitioning.scala         | 35 ---------
 .../apache/spark/sql/execution/Exchange.scala | 76 +++++--------------
 2 files changed, 17 insertions(+), 94 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
index 2dcfa19fec383..f4d1dbaf28efe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -86,14 +86,6 @@ sealed trait Partitioning {
    */
   def satisfies(required: Distribution): Boolean
 
-  /**
-   * Returns true iff all distribution guarantees made by this partitioning can also be made
-   * for the `other` specified partitioning.
-   * For example, two [[HashPartitioning HashPartitioning]]s are
-   * only compatible if the `numPartitions` of them is the same.
-   */
-  def compatibleWith(other: Partitioning): Boolean
-
   /** Returns the expressions that are used to key the partitioning. */
   def keyExpressions: Seq[Expression]
 }
@@ -104,11 +96,6 @@ case class UnknownPartitioning(numPartitions: Int) extends Partitioning {
     case _ => false
   }
 
-  override def compatibleWith(other: Partitioning): Boolean = other match {
-    case UnknownPartitioning(_) => true
-    case _ => false
-  }
-
   override def keyExpressions: Seq[Expression] = Nil
 }
 
@@ -117,11 +104,6 @@ case object SinglePartition extends Partitioning {
 
   override def satisfies(required: Distribution): Boolean = true
 
-  override def compatibleWith(other: Partitioning): Boolean = other match {
-    case SinglePartition => true
-    case _ => false
-  }
-
   override def keyExpressions: Seq[Expression] = Nil
 }
 
@@ -130,11 +112,6 @@ case object BroadcastPartitioning extends Partitioning {
 
   override def satisfies(required: Distribution): Boolean = true
 
-  override def compatibleWith(other: Partitioning): Boolean = other match {
-    case SinglePartition => true
-    case _ => false
-  }
-
   override def keyExpressions: Seq[Expression] = Nil
 }
 
@@ -159,12 +136,6 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
     case _ => false
   }
 
-  override def compatibleWith(other: Partitioning): Boolean = other match {
-    case BroadcastPartitioning => true
-    case h: HashPartitioning if h == this => true
-    case _ => false
-  }
-
   override def keyExpressions: Seq[Expression] = expressions
 }
 
@@ -199,11 +170,5 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int)
     case _ => false
   }
 
-  override def compatibleWith(other: Partitioning): Boolean = other match {
-    case BroadcastPartitioning => true
-    case r: RangePartitioning if r == this => true
-    case _ => false
-  }
-
   override def keyExpressions: Seq[Expression] = ordering.map(_.child)
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
index 70e5031fb63c0..6bd57f010a990 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala
@@ -202,41 +202,6 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
 
   def apply(plan: SparkPlan): SparkPlan = plan.transformUp {
     case operator: SparkPlan =>
-      // True iff every child's outputPartitioning satisfies the corresponding
-      // required data distribution.
-      def meetsRequirements: Boolean =
-        operator.requiredChildDistribution.zip(operator.children).forall {
-          case (required, child) =>
-            val valid = child.outputPartitioning.satisfies(required)
-            logDebug(
-              s"${if (valid) "Valid" else "Invalid"} distribution," +
-                s"required: $required current: ${child.outputPartitioning}")
-            valid
-        }
-
-      // True iff any of the children are incorrectly sorted.
-      def needsAnySort: Boolean =
-        operator.requiredChildOrdering.zip(operator.children).exists {
-          case (required, child) => required.nonEmpty && required != child.outputOrdering
-        }
-
-      // True iff outputPartitionings of children are compatible with each other.
-      // It is possible that every child satisfies its required data distribution
-      // but two children have incompatible outputPartitionings. For example,
-      // A dataset is range partitioned by "a.asc" (RangePartitioning) and another
-      // dataset is hash partitioned by "a" (HashPartitioning). Tuples in these two
-      // datasets are both clustered by "a", but these two outputPartitionings are not
-      // compatible.
-      // TODO: ASSUMES TRANSITIVITY?
-      def compatible: Boolean =
-        operator.children
-          .map(_.outputPartitioning)
-          .sliding(2)
-          .forall {
-            case Seq(a) => true
-            case Seq(a, b) => a.compatibleWith(b)
-          }
-
       // Adds Exchange or Sort operators as required
       def addOperatorsIfNecessary(
           partitioning: Partitioning,
@@ -269,33 +234,26 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
         addSortIfNecessary(addShuffleIfNecessary(child))
       }
 
-      if (meetsRequirements && compatible && !needsAnySort) {
-        operator
-      } else {
-        // At least one child does not satisfies its required data distribution or
-        // at least one child's outputPartitioning is not compatible with another child's
-        // outputPartitioning. In this case, we need to add Exchange operators.
-        val requirements =
-          (operator.requiredChildDistribution, operator.requiredChildOrdering, operator.children)
+      val requirements =
+        (operator.requiredChildDistribution, operator.requiredChildOrdering, operator.children)
 
-        val fixedChildren = requirements.zipped.map {
-          case (AllTuples, rowOrdering, child) =>
-            addOperatorsIfNecessary(SinglePartition, rowOrdering, child)
-          case (ClusteredDistribution(clustering), rowOrdering, child) =>
-            addOperatorsIfNecessary(HashPartitioning(clustering, numPartitions), rowOrdering, child)
-          case (OrderedDistribution(ordering), rowOrdering, child) =>
-            addOperatorsIfNecessary(RangePartitioning(ordering, numPartitions), rowOrdering, child)
+      val fixedChildren = requirements.zipped.map {
+        case (AllTuples, rowOrdering, child) =>
+          addOperatorsIfNecessary(SinglePartition, rowOrdering, child)
+        case (ClusteredDistribution(clustering), rowOrdering, child) =>
+          addOperatorsIfNecessary(HashPartitioning(clustering, numPartitions), rowOrdering, child)
+        case (OrderedDistribution(ordering), rowOrdering, child) =>
+          addOperatorsIfNecessary(RangePartitioning(ordering, numPartitions), rowOrdering, child)
 
-          case (UnspecifiedDistribution, Seq(), child) =>
-            child
-          case (UnspecifiedDistribution, rowOrdering, child) =>
-            sqlContext.planner.BasicOperators.getSortOperator(rowOrdering, global = false, child)
+        case (UnspecifiedDistribution, Seq(), child) =>
+          child
+        case (UnspecifiedDistribution, rowOrdering, child) =>
+          sqlContext.planner.BasicOperators.getSortOperator(rowOrdering, global = false, child)
 
-          case (dist, ordering, _) =>
-            sys.error(s"Don't know how to ensure $dist with ordering $ordering")
-        }
-
-        operator.withNewChildren(fixedChildren)
+        case (dist, ordering, _) =>
+          sys.error(s"Don't know how to ensure $dist with ordering $ordering")
       }
+
+      operator.withNewChildren(fixedChildren)
   }
 }

From 9307f5653d19a6a2fda355a675ca9ea97e35611b Mon Sep 17 00:00:00 2001
From: cody koeninger <cody@koeninger.org>
Date: Thu, 30 Jul 2015 17:44:20 -0700
Subject: [PATCH 502/549] [SPARK-9472] [STREAMING] consistent hadoop
 configuration, streaming only

Author: cody koeninger <cody@koeninger.org>

Closes #7772 from koeninger/streaming-hadoop-config and squashes the following commits:

5267284 [cody koeninger] [SPARK-4229][Streaming] consistent hadoop configuration, streaming only
---
 .../main/scala/org/apache/spark/streaming/Checkpoint.scala | 3 ++-
 .../org/apache/spark/streaming/StreamingContext.scala      | 7 ++++---
 .../apache/spark/streaming/api/java/JavaPairDStream.scala  | 2 +-
 .../spark/streaming/api/java/JavaStreamingContext.scala    | 3 ++-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
index 65d4e933bf8e9..2780d5b6adbcf 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala
@@ -25,6 +25,7 @@ import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.conf.Configuration
 
 import org.apache.spark.{SparkException, SparkConf, Logging}
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.io.CompressionCodec
 import org.apache.spark.util.{MetadataCleaner, Utils}
 import org.apache.spark.streaming.scheduler.JobGenerator
@@ -100,7 +101,7 @@ object Checkpoint extends Logging {
     }
 
     val path = new Path(checkpointDir)
-    val fs = fsOption.getOrElse(path.getFileSystem(new Configuration()))
+    val fs = fsOption.getOrElse(path.getFileSystem(SparkHadoopUtil.get.conf))
     if (fs.exists(path)) {
       val statuses = fs.listStatus(path)
       if (statuses != null) {
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 92438f1b1fbf7..177e710ace54b 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -34,6 +34,7 @@ import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat}
 
 import org.apache.spark._
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.input.FixedLengthBinaryInputFormat
 import org.apache.spark.rdd.{RDD, RDDOperationScope}
 import org.apache.spark.serializer.SerializationDebugger
@@ -110,7 +111,7 @@ class StreamingContext private[streaming] (
    * Recreate a StreamingContext from a checkpoint file.
    * @param path Path to the directory that was specified as the checkpoint directory
    */
-  def this(path: String) = this(path, new Configuration)
+  def this(path: String) = this(path, SparkHadoopUtil.get.conf)
 
   /**
    * Recreate a StreamingContext from a checkpoint file using an existing SparkContext.
@@ -803,7 +804,7 @@ object StreamingContext extends Logging {
   def getActiveOrCreate(
       checkpointPath: String,
       creatingFunc: () => StreamingContext,
-      hadoopConf: Configuration = new Configuration(),
+      hadoopConf: Configuration = SparkHadoopUtil.get.conf,
       createOnError: Boolean = false
     ): StreamingContext = {
     ACTIVATION_LOCK.synchronized {
@@ -828,7 +829,7 @@ object StreamingContext extends Logging {
   def getOrCreate(
       checkpointPath: String,
       creatingFunc: () => StreamingContext,
-      hadoopConf: Configuration = new Configuration(),
+      hadoopConf: Configuration = SparkHadoopUtil.get.conf,
       createOnError: Boolean = false
     ): StreamingContext = {
     val checkpointOption = CheckpointReader.read(
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
index 959ac9c177f81..26383e420101e 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
@@ -788,7 +788,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])(
       keyClass: Class[_],
       valueClass: Class[_],
       outputFormatClass: Class[F],
-      conf: Configuration = new Configuration) {
+      conf: Configuration = dstream.context.sparkContext.hadoopConfiguration) {
     dstream.saveAsNewAPIHadoopFiles(prefix, suffix, keyClass, valueClass, outputFormatClass, conf)
   }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
index 40deb6d7ea79a..35cc3ce5cf468 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
@@ -33,6 +33,7 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
 import org.apache.spark.api.java.function.{Function => JFunction, Function2 => JFunction2}
 import org.apache.spark.api.java.function.{Function0 => JFunction0}
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming._
@@ -136,7 +137,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable {
    * Recreate a JavaStreamingContext from a checkpoint file.
    * @param path Path to the directory that was specified as the checkpoint directory
    */
-  def this(path: String) = this(new StreamingContext(path, new Configuration))
+  def this(path: String) = this(new StreamingContext(path, SparkHadoopUtil.get.conf))
 
   /**
    * Re-creates a JavaStreamingContext from a checkpoint file.

From 83670fc9e6fc9c7a6ae68dfdd3f9335ea72f4ab0 Mon Sep 17 00:00:00 2001
From: Daoyuan Wang <daoyuan.wang@intel.com>
Date: Thu, 30 Jul 2015 19:22:38 -0700
Subject: [PATCH 503/549] [SPARK-8176] [SPARK-8197] [SQL] function to_date/
 trunc

This PR is based on #6988 , thanks to adrian-wang .

This brings two SQL functions: to_date() and trunc().

Closes #6988

Author: Daoyuan Wang <daoyuan.wang@intel.com>
Author: Davies Liu <davies@databricks.com>

Closes #7805 from davies/to_date and squashes the following commits:

2c7beba [Davies Liu] Merge branch 'master' of github.com:apache/spark into to_date
310dd55 [Daoyuan Wang] remove dup test in rebase
980b092 [Daoyuan Wang] resolve rebase conflict
a476c5a [Daoyuan Wang] address comments from davies
d44ea5f [Daoyuan Wang] function to_date, trunc
---
 python/pyspark/sql/functions.py               | 30 +++++++
 .../catalyst/analysis/FunctionRegistry.scala  |  2 +
 .../expressions/datetimeFunctions.scala       | 88 ++++++++++++++++++-
 .../sql/catalyst/util/DateTimeUtils.scala     | 34 +++++++
 .../expressions/DateExpressionsSuite.scala    | 29 +++++-
 .../expressions/NonFoldableLiteral.scala      |  4 +
 .../org/apache/spark/sql/functions.scala      | 16 ++++
 .../apache/spark/sql/DateFunctionsSuite.scala | 44 ++++++++++
 8 files changed, 245 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index a7295e25f0aa5..8024a8de07c98 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -888,6 +888,36 @@ def months_between(date1, date2):
     return Column(sc._jvm.functions.months_between(_to_java_column(date1), _to_java_column(date2)))
 
 
+@since(1.5)
+def to_date(col):
+    """
+    Converts the column of StringType or TimestampType into DateType.
+
+    >>> df = sqlContext.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
+    >>> df.select(to_date(df.t).alias('date')).collect()
+    [Row(date=datetime.date(1997, 2, 28))]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.to_date(_to_java_column(col)))
+
+
+@since(1.5)
+def trunc(date, format):
+    """
+    Returns date truncated to the unit specified by the format.
+
+    :param format: 'year', 'YYYY', 'yy' or 'month', 'mon', 'mm'
+
+    >>> df = sqlContext.createDataFrame([('1997-02-28',)], ['d'])
+    >>> df.select(trunc(df.d, 'year').alias('year')).collect()
+    [Row(year=datetime.date(1997, 1, 1))]
+    >>> df.select(trunc(df.d, 'mon').alias('month')).collect()
+    [Row(month=datetime.date(1997, 2, 1))]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.trunc(_to_java_column(date), format))
+
+
 @since(1.5)
 def size(col):
     """
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 6c7c481fab8db..1bf7204a2515c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -223,6 +223,8 @@ object FunctionRegistry {
     expression[NextDay]("next_day"),
     expression[Quarter]("quarter"),
     expression[Second]("second"),
+    expression[ToDate]("to_date"),
+    expression[TruncDate]("trunc"),
     expression[UnixTimestamp]("unix_timestamp"),
     expression[WeekOfYear]("weekofyear"),
     expression[Year]("year"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index 9795673ee0664..6e7613340c032 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -507,7 +507,6 @@ case class FromUnixTime(sec: Expression, format: Expression)
       })
     }
   }
-
 }
 
 /**
@@ -696,3 +695,90 @@ case class MonthsBetween(date1: Expression, date2: Expression)
     })
   }
 }
+
+/**
+ * Returns the date part of a timestamp or string.
+ */
+case class ToDate(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  // Implicit casting of spark will accept string in both date and timestamp format, as
+  // well as TimestampType.
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType)
+
+  override def dataType: DataType = DateType
+
+  override def eval(input: InternalRow): Any = child.eval(input)
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, d => d)
+  }
+}
+
+/*
+ * Returns date truncated to the unit specified by the format.
+ */
+case class TruncDate(date: Expression, format: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+  override def left: Expression = date
+  override def right: Expression = format
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(DateType, StringType)
+  override def dataType: DataType = DateType
+  override def prettyName: String = "trunc"
+
+  lazy val minItemConst = DateTimeUtils.parseTruncLevel(format.eval().asInstanceOf[UTF8String])
+
+  override def eval(input: InternalRow): Any = {
+    val minItem = if (format.foldable) {
+      minItemConst
+    } else {
+      DateTimeUtils.parseTruncLevel(format.eval().asInstanceOf[UTF8String])
+    }
+    if (minItem == -1) {
+      // unknown format
+      null
+    } else {
+      val d = date.eval(input)
+      if (d == null) {
+        null
+      } else {
+        DateTimeUtils.truncDate(d.asInstanceOf[Int], minItem)
+      }
+    }
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
+
+    if (format.foldable) {
+      if (minItemConst == -1) {
+        s"""
+          boolean ${ev.isNull} = true;
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+        """
+      } else {
+        val d = date.gen(ctx)
+        s"""
+          ${d.code}
+          boolean ${ev.isNull} = ${d.isNull};
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (!${ev.isNull}) {
+            ${ev.primitive} = $dtu.truncDate(${d.primitive}, $minItemConst);
+          }
+        """
+      }
+    } else {
+      nullSafeCodeGen(ctx, ev, (dateVal, fmt) => {
+        val form = ctx.freshName("form")
+        s"""
+          int $form = $dtu.parseTruncLevel($fmt);
+          if ($form == -1) {
+            ${ev.isNull} = true;
+          } else {
+            ${ev.primitive} = $dtu.truncDate($dateVal, $form);
+          }
+        """
+      })
+    }
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 53abdf6618eac..5a7c25b8d508d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -779,4 +779,38 @@ object DateTimeUtils {
     }
     date + (lastDayOfMonthInYear - dayInYear)
   }
+
+  private val TRUNC_TO_YEAR = 1
+  private val TRUNC_TO_MONTH = 2
+  private val TRUNC_INVALID = -1
+
+  /**
+   * Returns the trunc date from original date and trunc level.
+   * Trunc level should be generated using `parseTruncLevel()`, should only be 1 or 2.
+   */
+  def truncDate(d: Int, level: Int): Int = {
+    if (level == TRUNC_TO_YEAR) {
+      d - DateTimeUtils.getDayInYear(d) + 1
+    } else if (level == TRUNC_TO_MONTH) {
+      d - DateTimeUtils.getDayOfMonth(d) + 1
+    } else {
+      throw new Exception(s"Invalid trunc level: $level")
+    }
+  }
+
+  /**
+   * Returns the truncate level, could be TRUNC_YEAR, TRUNC_MONTH, or TRUNC_INVALID,
+   * TRUNC_INVALID means unsupported truncate level.
+   */
+  def parseTruncLevel(format: UTF8String): Int = {
+    if (format == null) {
+      TRUNC_INVALID
+    } else {
+      format.toString.toUpperCase match {
+        case "YEAR" | "YYYY" | "YY" => TRUNC_TO_YEAR
+        case "MON" | "MONTH" | "MM" => TRUNC_TO_MONTH
+        case _ => TRUNC_INVALID
+      }
+    }
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index 887e43621a941..6c15c05da3094 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -351,6 +351,34 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       NextDay(Literal(Date.valueOf("2015-07-23")), Literal.create(null, StringType)), null)
   }
 
+  test("function to_date") {
+    checkEvaluation(
+      ToDate(Literal(Date.valueOf("2015-07-22"))),
+      DateTimeUtils.fromJavaDate(Date.valueOf("2015-07-22")))
+    checkEvaluation(ToDate(Literal.create(null, DateType)), null)
+  }
+
+  test("function trunc") {
+    def testTrunc(input: Date, fmt: String, expected: Date): Unit = {
+      checkEvaluation(TruncDate(Literal.create(input, DateType), Literal.create(fmt, StringType)),
+        expected)
+      checkEvaluation(
+        TruncDate(Literal.create(input, DateType), NonFoldableLiteral.create(fmt, StringType)),
+        expected)
+    }
+    val date = Date.valueOf("2015-07-22")
+    Seq("yyyy", "YYYY", "year", "YEAR", "yy", "YY").foreach{ fmt =>
+      testTrunc(date, fmt, Date.valueOf("2015-01-01"))
+    }
+    Seq("month", "MONTH", "mon", "MON", "mm", "MM").foreach { fmt =>
+      testTrunc(date, fmt, Date.valueOf("2015-07-01"))
+    }
+    testTrunc(date, "DD", null)
+    testTrunc(date, null, null)
+    testTrunc(null, "MON", null)
+    testTrunc(null, null, null)
+  }
+
   test("from_unixtime") {
     val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
     val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS"
@@ -405,5 +433,4 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       UnixTimestamp(Literal("2015-07-24"), Literal("not a valid format")), null)
   }
-
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NonFoldableLiteral.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NonFoldableLiteral.scala
index 0559fb80e7fce..31ecf4a9e810a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NonFoldableLiteral.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NonFoldableLiteral.scala
@@ -47,4 +47,8 @@ object NonFoldableLiteral {
     val lit = Literal(value)
     NonFoldableLiteral(lit.value, lit.dataType)
   }
+  def create(value: Any, dataType: DataType): NonFoldableLiteral = {
+    val lit = Literal.create(value, dataType)
+    NonFoldableLiteral(lit.value, lit.dataType)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 168894d66117d..46dc4605a5ccb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2181,6 +2181,22 @@ object functions {
    */
   def unix_timestamp(s: Column, p: String): Column = UnixTimestamp(s.expr, Literal(p))
 
+  /*
+   * Converts the column into DateType.
+   *
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def to_date(e: Column): Column = ToDate(e.expr)
+
+  /**
+   * Returns date truncated to the unit specified by the format.
+   *
+   * @group datetime_funcs
+   * @since 1.5.0
+   */
+  def trunc(date: Column, format: String): Column = TruncDate(date.expr, Literal(format))
+
   //////////////////////////////////////////////////////////////////////////////////////////////
   // Collection functions
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index b7267c413165a..8c596fad74ee4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -345,6 +345,50 @@ class DateFunctionsSuite extends QueryTest {
       Seq(Row(Date.valueOf("2015-07-30")), Row(Date.valueOf("2015-07-30"))))
   }
 
+  test("function to_date") {
+    val d1 = Date.valueOf("2015-07-22")
+    val d2 = Date.valueOf("2015-07-01")
+    val t1 = Timestamp.valueOf("2015-07-22 10:00:00")
+    val t2 = Timestamp.valueOf("2014-12-31 23:59:59")
+    val s1 = "2015-07-22 10:00:00"
+    val s2 = "2014-12-31"
+    val df = Seq((d1, t1, s1), (d2, t2, s2)).toDF("d", "t", "s")
+
+    checkAnswer(
+      df.select(to_date(col("t"))),
+      Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31"))))
+    checkAnswer(
+      df.select(to_date(col("d"))),
+      Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01"))))
+    checkAnswer(
+      df.select(to_date(col("s"))),
+      Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31"))))
+
+    checkAnswer(
+      df.selectExpr("to_date(t)"),
+      Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31"))))
+    checkAnswer(
+      df.selectExpr("to_date(d)"),
+      Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01"))))
+    checkAnswer(
+      df.selectExpr("to_date(s)"),
+      Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31"))))
+  }
+
+  test("function trunc") {
+    val df = Seq(
+      (1, Timestamp.valueOf("2015-07-22 10:00:00")),
+      (2, Timestamp.valueOf("2014-12-31 00:00:00"))).toDF("i", "t")
+
+    checkAnswer(
+      df.select(trunc(col("t"), "YY")),
+      Seq(Row(Date.valueOf("2015-01-01")), Row(Date.valueOf("2014-01-01"))))
+
+    checkAnswer(
+      df.selectExpr("trunc(t, 'Month')"),
+      Seq(Row(Date.valueOf("2015-07-01")), Row(Date.valueOf("2014-12-01"))))
+  }
+
   test("from_unixtime") {
     val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
     val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS"

From 4e5919bfb47a58bcbda90ae01c1bed2128ded983 Mon Sep 17 00:00:00 2001
From: Ram Sriharsha <rsriharsha@hw11853.local>
Date: Thu, 30 Jul 2015 23:02:11 -0700
Subject: [PATCH 504/549] [SPARK-7690] [ML] Multiclass classification Evaluator

Multiclass Classification Evaluator for ML Pipelines. F1 score, precision, recall, weighted precision and weighted recall are supported as available metrics.

Author: Ram Sriharsha <rsriharsha@hw11853.local>

Closes #7475 from harsha2010/SPARK-7690 and squashes the following commits:

9bf4ec7 [Ram Sriharsha] fix indentation
3f09a85 [Ram Sriharsha] cleanup doc
16115ae [Ram Sriharsha] code review fixes
032d2a3 [Ram Sriharsha] fix test
eec9865 [Ram Sriharsha] Fix Python Indentation
1dbeffd [Ram Sriharsha] Merge branch 'master' into SPARK-7690
68cea85 [Ram Sriharsha] Merge branch 'master' into SPARK-7690
54c03de [Ram Sriharsha] [SPARK-7690][ml][WIP] Multiclass Evaluator for ML Pipeline
---
 .../MulticlassClassificationEvaluator.scala   | 85 +++++++++++++++++++
 ...lticlassClassificationEvaluatorSuite.scala | 28 ++++++
 python/pyspark/ml/evaluation.py               | 66 ++++++++++++++
 3 files changed, 179 insertions(+)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluatorSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
new file mode 100644
index 0000000000000..44f779c1908d7
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.evaluation
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param}
+import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
+import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
+import org.apache.spark.mllib.evaluation.MulticlassMetrics
+import org.apache.spark.sql.{Row, DataFrame}
+import org.apache.spark.sql.types.DoubleType
+
+/**
+ * :: Experimental ::
+ * Evaluator for multiclass classification, which expects two input columns: score and label.
+ */
+@Experimental
+class MulticlassClassificationEvaluator (override val uid: String)
+  extends Evaluator with HasPredictionCol with HasLabelCol {
+
+  def this() = this(Identifiable.randomUID("mcEval"))
+
+  /**
+   * param for metric name in evaluation (supports `"f1"` (default), `"precision"`, `"recall"`,
+   * `"weightedPrecision"`, `"weightedRecall"`)
+   * @group param
+   */
+  val metricName: Param[String] = {
+    val allowedParams = ParamValidators.inArray(Array("f1", "precision",
+      "recall", "weightedPrecision", "weightedRecall"))
+    new Param(this, "metricName", "metric name in evaluation " +
+      "(f1|precision|recall|weightedPrecision|weightedRecall)", allowedParams)
+  }
+
+  /** @group getParam */
+  def getMetricName: String = $(metricName)
+
+  /** @group setParam */
+  def setMetricName(value: String): this.type = set(metricName, value)
+
+  /** @group setParam */
+  def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
+  /** @group setParam */
+  def setLabelCol(value: String): this.type = set(labelCol, value)
+
+  setDefault(metricName -> "f1")
+
+  override def evaluate(dataset: DataFrame): Double = {
+    val schema = dataset.schema
+    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
+    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
+
+    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
+      .map { case Row(prediction: Double, label: Double) =>
+      (prediction, label)
+    }
+    val metrics = new MulticlassMetrics(predictionAndLabels)
+    val metric = $(metricName) match {
+      case "f1" => metrics.weightedFMeasure
+      case "precision" => metrics.precision
+      case "recall" => metrics.recall
+      case "weightedPrecision" => metrics.weightedPrecision
+      case "weightedRecall" => metrics.weightedRecall
+    }
+    metric
+  }
+
+  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluatorSuite.scala
new file mode 100644
index 0000000000000..6d8412b0b3701
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluatorSuite.scala
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.evaluation
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
+
+class MulticlassClassificationEvaluatorSuite extends SparkFunSuite {
+
+  test("params") {
+    ParamsSuite.checkParams(new MulticlassClassificationEvaluator)
+  }
+}
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 595593a7f2cde..06e809352225b 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -214,6 +214,72 @@ def setParams(self, predictionCol="prediction", labelCol="label",
         kwargs = self.setParams._input_kwargs
         return self._set(**kwargs)
 
+
+@inherit_doc
+class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol):
+    """
+    Evaluator for Multiclass Classification, which expects two input
+    columns: prediction and label.
+    >>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
+    ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]
+    >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["prediction", "label"])
+    ...
+    >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
+    >>> evaluator.evaluate(dataset)
+    0.66...
+    >>> evaluator.evaluate(dataset, {evaluator.metricName: "precision"})
+    0.66...
+    >>> evaluator.evaluate(dataset, {evaluator.metricName: "recall"})
+    0.66...
+    """
+    # a placeholder to make it appear in the generated doc
+    metricName = Param(Params._dummy(), "metricName",
+                       "metric name in evaluation "
+                       "(f1|precision|recall|weightedPrecision|weightedRecall)")
+
+    @keyword_only
+    def __init__(self, predictionCol="prediction", labelCol="label",
+                 metricName="f1"):
+        """
+        __init__(self, predictionCol="prediction", labelCol="label", \
+                 metricName="f1")
+        """
+        super(MulticlassClassificationEvaluator, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid)
+        # param for metric name in evaluation (f1|precision|recall|weightedPrecision|weightedRecall)
+        self.metricName = Param(self, "metricName",
+                                "metric name in evaluation"
+                                " (f1|precision|recall|weightedPrecision|weightedRecall)")
+        self._setDefault(predictionCol="prediction", labelCol="label",
+                         metricName="f1")
+        kwargs = self.__init__._input_kwargs
+        self._set(**kwargs)
+
+    def setMetricName(self, value):
+        """
+        Sets the value of :py:attr:`metricName`.
+        """
+        self._paramMap[self.metricName] = value
+        return self
+
+    def getMetricName(self):
+        """
+        Gets the value of metricName or its default value.
+        """
+        return self.getOrDefault(self.metricName)
+
+    @keyword_only
+    def setParams(self, predictionCol="prediction", labelCol="label",
+                  metricName="f1"):
+        """
+        setParams(self, predictionCol="prediction", labelCol="label", \
+                  metricName="f1")
+        Sets params for multiclass classification evaluator.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
 if __name__ == "__main__":
     import doctest
     from pyspark.context import SparkContext

From 69b62f76fced18efa35a107c9be4bc22eba72878 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 30 Jul 2015 23:03:48 -0700
Subject: [PATCH 505/549] [SPARK-9214] [ML] [PySpark] support ml.NaiveBayes for
 Python

support ml.NaiveBayes for Python

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7568 from yanboliang/spark-9214 and squashes the following commits:

5ee3fd6 [Yanbo Liang] fix typos
3ecd046 [Yanbo Liang] fix typos
f9c94d1 [Yanbo Liang] change lambda_ to smoothing and fix other issues
180452a [Yanbo Liang] fix typos
7dda1f4 [Yanbo Liang] support ml.NaiveBayes for Python
---
 .../spark/ml/classification/NaiveBayes.scala  |  10 +-
 .../classification/JavaNaiveBayesSuite.java   |   4 +-
 .../ml/classification/NaiveBayesSuite.scala   |   6 +-
 python/pyspark/ml/classification.py           | 116 +++++++++++++++++-
 4 files changed, 125 insertions(+), 11 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index 1f547e4a98af7..5be35fe209291 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -38,11 +38,11 @@ private[ml] trait NaiveBayesParams extends PredictorParams {
    * (default = 1.0).
    * @group param
    */
-  final val lambda: DoubleParam = new DoubleParam(this, "lambda", "The smoothing parameter.",
+  final val smoothing: DoubleParam = new DoubleParam(this, "smoothing", "The smoothing parameter.",
     ParamValidators.gtEq(0))
 
   /** @group getParam */
-  final def getLambda: Double = $(lambda)
+  final def getSmoothing: Double = $(smoothing)
 
   /**
    * The model type which is a string (case-sensitive).
@@ -79,8 +79,8 @@ class NaiveBayes(override val uid: String)
    * Default is 1.0.
    * @group setParam
    */
-  def setLambda(value: Double): this.type = set(lambda, value)
-  setDefault(lambda -> 1.0)
+  def setSmoothing(value: Double): this.type = set(smoothing, value)
+  setDefault(smoothing -> 1.0)
 
   /**
    * Set the model type using a string (case-sensitive).
@@ -92,7 +92,7 @@ class NaiveBayes(override val uid: String)
 
   override protected def train(dataset: DataFrame): NaiveBayesModel = {
     val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
-    val oldModel = OldNaiveBayes.train(oldDataset, $(lambda), $(modelType))
+    val oldModel = OldNaiveBayes.train(oldDataset, $(smoothing), $(modelType))
     NaiveBayesModel.fromOld(oldModel, this)
   }
 
diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
index 09a9fba0c19cf..a700c9cddb206 100644
--- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
@@ -68,7 +68,7 @@ public void naiveBayesDefaultParams() {
     assert(nb.getLabelCol() == "label");
     assert(nb.getFeaturesCol() == "features");
     assert(nb.getPredictionCol() == "prediction");
-    assert(nb.getLambda() == 1.0);
+    assert(nb.getSmoothing() == 1.0);
     assert(nb.getModelType() == "multinomial");
   }
 
@@ -89,7 +89,7 @@ public void testNaiveBayes() {
     });
 
     DataFrame dataset = jsql.createDataFrame(jrdd, schema);
-    NaiveBayes nb = new NaiveBayes().setLambda(0.5).setModelType("multinomial");
+    NaiveBayes nb = new NaiveBayes().setSmoothing(0.5).setModelType("multinomial");
     NaiveBayesModel model = nb.fit(dataset);
 
     DataFrame predictionAndLabels = model.transform(dataset).select("prediction", "label");
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
index 76381a2741296..264bde3703c5f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -58,7 +58,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(nb.getLabelCol === "label")
     assert(nb.getFeaturesCol === "features")
     assert(nb.getPredictionCol === "prediction")
-    assert(nb.getLambda === 1.0)
+    assert(nb.getSmoothing === 1.0)
     assert(nb.getModelType === "multinomial")
   }
 
@@ -75,7 +75,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     val testDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
       piArray, thetaArray, nPoints, 42, "multinomial"))
-    val nb = new NaiveBayes().setLambda(1.0).setModelType("multinomial")
+    val nb = new NaiveBayes().setSmoothing(1.0).setModelType("multinomial")
     val model = nb.fit(testDataset)
 
     validateModelFit(pi, theta, model)
@@ -101,7 +101,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     val testDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
       piArray, thetaArray, nPoints, 45, "bernoulli"))
-    val nb = new NaiveBayes().setLambda(1.0).setModelType("bernoulli")
+    val nb = new NaiveBayes().setSmoothing(1.0).setModelType("bernoulli")
     val model = nb.fit(testDataset)
 
     validateModelFit(pi, theta, model)
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 5a82bc286d1e8..93ffcd40949b3 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -25,7 +25,8 @@
 
 __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier',
            'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel',
-           'RandomForestClassifier', 'RandomForestClassificationModel']
+           'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes',
+           'NaiveBayesModel']
 
 
 @inherit_doc
@@ -576,6 +577,119 @@ class GBTClassificationModel(TreeEnsembleModels):
     """
 
 
+@inherit_doc
+class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol):
+    """
+    Naive Bayes Classifiers.
+
+    >>> from pyspark.sql import Row
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sqlContext.createDataFrame([
+    ...     Row(label=0.0, features=Vectors.dense([0.0, 0.0])),
+    ...     Row(label=0.0, features=Vectors.dense([0.0, 1.0])),
+    ...     Row(label=1.0, features=Vectors.dense([1.0, 0.0]))])
+    >>> nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
+    >>> model = nb.fit(df)
+    >>> model.pi
+    DenseVector([-0.51..., -0.91...])
+    >>> model.theta
+    DenseMatrix(2, 2, [-1.09..., -0.40..., -0.40..., -1.09...], 1)
+    >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
+    >>> model.transform(test0).head().prediction
+    1.0
+    >>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
+    >>> model.transform(test1).head().prediction
+    1.0
+    """
+
+    # a placeholder to make it appear in the generated doc
+    smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " +
+                      "default is 1.0")
+    modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
+                      "(case-sensitive). Supported options: multinomial (default) and bernoulli.")
+
+    @keyword_only
+    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+                 smoothing=1.0, modelType="multinomial"):
+        """
+        __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+                 smoothing=1.0, modelType="multinomial")
+        """
+        super(NaiveBayes, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.classification.NaiveBayes", self.uid)
+        #: param for the smoothing parameter.
+        self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " +
+                               "default is 1.0")
+        #: param for the model type.
+        self.modelType = Param(self, "modelType", "The model type which is a string " +
+                               "(case-sensitive). Supported options: multinomial (default) " +
+                               "and bernoulli.")
+        self._setDefault(smoothing=1.0, modelType="multinomial")
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+                  smoothing=1.0, modelType="multinomial"):
+        """
+        setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+                  smoothing=1.0, modelType="multinomial")
+        Sets params for Naive Bayes.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def _create_model(self, java_model):
+        return NaiveBayesModel(java_model)
+
+    def setSmoothing(self, value):
+        """
+        Sets the value of :py:attr:`smoothing`.
+        """
+        self._paramMap[self.smoothing] = value
+        return self
+
+    def getSmoothing(self):
+        """
+        Gets the value of smoothing or its default value.
+        """
+        return self.getOrDefault(self.smoothing)
+
+    def setModelType(self, value):
+        """
+        Sets the value of :py:attr:`modelType`.
+        """
+        self._paramMap[self.modelType] = value
+        return self
+
+    def getModelType(self):
+        """
+        Gets the value of modelType or its default value.
+        """
+        return self.getOrDefault(self.modelType)
+
+
+class NaiveBayesModel(JavaModel):
+    """
+    Model fitted by NaiveBayes.
+    """
+
+    @property
+    def pi(self):
+        """
+        log of class priors.
+        """
+        return self._call_java("pi")
+
+    @property
+    def theta(self):
+        """
+        log of class conditional probabilities.
+        """
+        return self._call_java("theta")
+
+
 if __name__ == "__main__":
     import doctest
     from pyspark.context import SparkContext

From 0244170b66476abc4a39ed609a852f1a6fa455e7 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Thu, 30 Jul 2015 23:05:58 -0700
Subject: [PATCH 506/549] [SPARK-9152][SQL] Implement code generation for Like
 and RLike

JIRA: https://issues.apache.org/jira/browse/SPARK-9152

This PR implements code generation for `Like` and `RLike`.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7561 from viirya/like_rlike_codegen and squashes the following commits:

fe5641b [Liang-Chi Hsieh] Add test for NonFoldableLiteral.
ccd1b43 [Liang-Chi Hsieh] For comments.
0086723 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into like_rlike_codegen
50df9a8 [Liang-Chi Hsieh] Use nullSafeCodeGen.
8092a68 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into like_rlike_codegen
696d451 [Liang-Chi Hsieh] Check expression foldable.
48e5536 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into like_rlike_codegen
aea58e0 [Liang-Chi Hsieh] For comments.
46d946f [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into like_rlike_codegen
a0fb76e [Liang-Chi Hsieh] For comments.
6cffe3c [Liang-Chi Hsieh] For comments.
69f0fb6 [Liang-Chi Hsieh] Add code generation for Like and RLike.
---
 .../expressions/stringOperations.scala        | 105 ++++++++++++++----
 .../spark/sql/catalyst/util/StringUtils.scala |  47 ++++++++
 .../expressions/StringExpressionsSuite.scala  |  16 +++
 .../sql/catalyst/util/StringUtilsSuite.scala  |  34 ++++++
 4 files changed, 180 insertions(+), 22 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 79c0ca56a8e79..99a62343f138d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -21,8 +21,11 @@ import java.text.DecimalFormat
 import java.util.Locale
 import java.util.regex.{MatchResult, Pattern}
 
+import org.apache.commons.lang3.StringEscapeUtils
+
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.catalyst.util.StringUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -160,32 +163,51 @@ trait StringRegexExpression extends ImplicitCastInputTypes {
 case class Like(left: Expression, right: Expression)
   extends BinaryExpression with StringRegexExpression with CodegenFallback {
 
-  // replace the _ with .{1} exactly match 1 time of any character
-  // replace the % with .*, match 0 or more times with any character
-  override def escape(v: String): String =
-    if (!v.isEmpty) {
-      "(?s)" + (' ' +: v.init).zip(v).flatMap {
-        case (prev, '\\') => ""
-        case ('\\', c) =>
-          c match {
-            case '_' => "_"
-            case '%' => "%"
-            case _ => Pattern.quote("\\" + c)
-          }
-        case (prev, c) =>
-          c match {
-            case '_' => "."
-            case '%' => ".*"
-            case _ => Pattern.quote(Character.toString(c))
-          }
-      }.mkString
-    } else {
-      v
-    }
+  override def escape(v: String): String = StringUtils.escapeLikeRegex(v)
 
   override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches()
 
   override def toString: String = s"$left LIKE $right"
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val patternClass = classOf[Pattern].getName
+    val escapeFunc = StringUtils.getClass.getName.stripSuffix("$") + ".escapeLikeRegex"
+    val pattern = ctx.freshName("pattern")
+
+    if (right.foldable) {
+      val rVal = right.eval()
+      if (rVal != null) {
+        val regexStr =
+          StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString()))
+        ctx.addMutableState(patternClass, pattern,
+          s"""$pattern = ${patternClass}.compile("$regexStr");""")
+
+        // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again.
+        val eval = left.gen(ctx)
+        s"""
+          ${eval.code}
+          boolean ${ev.isNull} = ${eval.isNull};
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (!${ev.isNull}) {
+            ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).matches();
+          }
+        """
+      } else {
+        s"""
+          boolean ${ev.isNull} = true;
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+        """
+      }
+    } else {
+      nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
+        s"""
+          String rightStr = ${eval2}.toString();
+          ${patternClass} $pattern = ${patternClass}.compile($escapeFunc(rightStr));
+          ${ev.primitive} = $pattern.matcher(${eval1}.toString()).matches();
+        """
+      })
+    }
+  }
 }
 
 
@@ -195,6 +217,45 @@ case class RLike(left: Expression, right: Expression)
   override def escape(v: String): String = v
   override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0)
   override def toString: String = s"$left RLIKE $right"
+
+  override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val patternClass = classOf[Pattern].getName
+    val pattern = ctx.freshName("pattern")
+
+    if (right.foldable) {
+      val rVal = right.eval()
+      if (rVal != null) {
+        val regexStr =
+          StringEscapeUtils.escapeJava(rVal.asInstanceOf[UTF8String].toString())
+        ctx.addMutableState(patternClass, pattern,
+          s"""$pattern = ${patternClass}.compile("$regexStr");""")
+
+        // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again.
+        val eval = left.gen(ctx)
+        s"""
+          ${eval.code}
+          boolean ${ev.isNull} = ${eval.isNull};
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+          if (!${ev.isNull}) {
+            ${ev.primitive} = $pattern.matcher(${eval.primitive}.toString()).find(0);
+          }
+        """
+      } else {
+        s"""
+          boolean ${ev.isNull} = true;
+          ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+        """
+      }
+    } else {
+      nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
+        s"""
+          String rightStr = ${eval2}.toString();
+          ${patternClass} $pattern = ${patternClass}.compile(rightStr);
+          ${ev.primitive} = $pattern.matcher(${eval1}.toString()).find(0);
+        """
+      })
+    }
+  }
 }
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
new file mode 100644
index 0000000000000..9ddfb3a0d3759
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util
+
+import java.util.regex.Pattern
+
+object StringUtils {
+
+  // replace the _ with .{1} exactly match 1 time of any character
+  // replace the % with .*, match 0 or more times with any character
+  def escapeLikeRegex(v: String): String = {
+    if (!v.isEmpty) {
+      "(?s)" + (' ' +: v.init).zip(v).flatMap {
+        case (prev, '\\') => ""
+        case ('\\', c) =>
+          c match {
+            case '_' => "_"
+            case '%' => "%"
+            case _ => Pattern.quote("\\" + c)
+          }
+        case (prev, c) =>
+          c match {
+            case '_' => "."
+            case '%' => ".*"
+            case _ => Pattern.quote(Character.toString(c))
+          }
+      }.mkString
+    } else {
+      v
+    }
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 07b952531ec2e..3ecd0d374c46b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -191,6 +191,15 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Literal.create(null, StringType).like("a"), null)
     checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null)
     checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null)
+    checkEvaluation(
+      Literal.create("a", StringType).like(NonFoldableLiteral.create("a", StringType)), true)
+    checkEvaluation(
+      Literal.create("a", StringType).like(NonFoldableLiteral.create(null, StringType)), null)
+    checkEvaluation(
+      Literal.create(null, StringType).like(NonFoldableLiteral.create("a", StringType)), null)
+    checkEvaluation(
+      Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null)
+
     checkEvaluation("abdef" like "abdef", true)
     checkEvaluation("a_%b" like "a\\__b", true)
     checkEvaluation("addb" like "a_%b", true)
@@ -232,6 +241,13 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Literal.create(null, StringType) rlike "abdef", null)
     checkEvaluation("abdef" rlike Literal.create(null, StringType), null)
     checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null)
+    checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true)
+    checkEvaluation("abdef" rlike NonFoldableLiteral.create(null, StringType), null)
+    checkEvaluation(
+      Literal.create(null, StringType) rlike NonFoldableLiteral.create("abdef", StringType), null)
+    checkEvaluation(
+      Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null)
+
     checkEvaluation("abdef" rlike "abdef", true)
     checkEvaluation("abbbbc" rlike "a.*c", true)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala
new file mode 100644
index 0000000000000..d6f273f9e568a
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/StringUtilsSuite.scala
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.util
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.util.StringUtils._
+
+class StringUtilsSuite extends SparkFunSuite {
+
+  test("escapeLikeRegex") {
+    assert(escapeLikeRegex("abdef") === "(?s)\\Qa\\E\\Qb\\E\\Qd\\E\\Qe\\E\\Qf\\E")
+    assert(escapeLikeRegex("a\\__b") === "(?s)\\Qa\\E_.\\Qb\\E")
+    assert(escapeLikeRegex("a_%b") === "(?s)\\Qa\\E..*\\Qb\\E")
+    assert(escapeLikeRegex("a%\\%b") === "(?s)\\Qa\\E.*%\\Qb\\E")
+    assert(escapeLikeRegex("a%") === "(?s)\\Qa\\E.*")
+    assert(escapeLikeRegex("**") === "(?s)\\Q*\\E\\Q*\\E")
+    assert(escapeLikeRegex("a_b") === "(?s)\\Qa\\E.\\Qb\\E")
+  }
+}

From a3a85d73da053c8e2830759fbc68b734081fa4f3 Mon Sep 17 00:00:00 2001
From: WangTaoTheTonic <wangtao111@huawei.com>
Date: Thu, 30 Jul 2015 23:50:06 -0700
Subject: [PATCH 507/549] [SPARK-9496][SQL]do not print the password in config

https://issues.apache.org/jira/browse/SPARK-9496

We better do not print the password in log.

Author: WangTaoTheTonic <wangtao111@huawei.com>

Closes #7815 from WangTaoTheTonic/master and squashes the following commits:

c7a5145 [WangTaoTheTonic] do not print the password in config
---
 .../org/apache/spark/sql/hive/client/ClientWrapper.scala    | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index 8adda54754230..6e0912da5862d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -91,7 +91,11 @@ private[hive] class ClientWrapper(
         // this action explicit.
         initialConf.setClassLoader(initClassLoader)
         config.foreach { case (k, v) =>
-          logDebug(s"Hive Config: $k=$v")
+          if (k.toLowerCase.contains("password")) {
+            logDebug(s"Hive Config: $k=xxx")
+          } else {
+            logDebug(s"Hive Config: $k=$v")
+          }
           initialConf.set(k, v)
         }
         val newState = new SessionState(initialConf)

From 6bba7509a932aa4d39266df2d15b1370b7aabbec Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Fri, 31 Jul 2015 08:28:05 -0700
Subject: [PATCH 508/549] [SPARK-9500] add TernaryExpression to simplify
 ternary expressions

There lots of duplicated code in ternary expressions, create a TernaryExpression for them to reduce duplicated code.

cc chenghao-intel

Author: Davies Liu <davies@databricks.com>

Closes #7816 from davies/ternary and squashes the following commits:

ed2bf76 [Davies Liu] add TernaryExpression
---
 .../sql/catalyst/expressions/Expression.scala |  85 +++++
 .../expressions/codegen/CodeGenerator.scala   |   2 +-
 .../spark/sql/catalyst/expressions/math.scala |  66 +---
 .../expressions/stringOperations.scala        | 356 +++++-------------
 4 files changed, 183 insertions(+), 326 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index 8fc182607ce68..2842b3ec5a0c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -432,3 +432,88 @@ abstract class BinaryOperator extends BinaryExpression with ExpectsInputTypes {
 private[sql] object BinaryOperator {
   def unapply(e: BinaryOperator): Option[(Expression, Expression)] = Some((e.left, e.right))
 }
+
+/**
+ * An expression with three inputs and one output. The output is by default evaluated to null
+ * if any input is evaluated to null.
+ */
+abstract class TernaryExpression extends Expression {
+
+  override def foldable: Boolean = children.forall(_.foldable)
+
+  override def nullable: Boolean = children.exists(_.nullable)
+
+  /**
+   * Default behavior of evaluation according to the default nullability of BinaryExpression.
+   * If subclass of BinaryExpression override nullable, probably should also override this.
+   */
+  override def eval(input: InternalRow): Any = {
+    val exprs = children
+    val value1 = exprs(0).eval(input)
+    if (value1 != null) {
+      val value2 = exprs(1).eval(input)
+      if (value2 != null) {
+        val value3 = exprs(2).eval(input)
+        if (value3 != null) {
+          return nullSafeEval(value1, value2, value3)
+        }
+      }
+    }
+    null
+  }
+
+  /**
+   * Called by default [[eval]] implementation.  If subclass of BinaryExpression keep the default
+   * nullability, they can override this method to save null-check code.  If we need full control
+   * of evaluation process, we should override [[eval]].
+   */
+  protected def nullSafeEval(input1: Any, input2: Any, input3: Any): Any =
+    sys.error(s"BinaryExpressions must override either eval or nullSafeEval")
+
+  /**
+   * Short hand for generating binary evaluation code.
+   * If either of the sub-expressions is null, the result of this computation
+   * is assumed to be null.
+   *
+   * @param f accepts two variable names and returns Java code to compute the output.
+   */
+  protected def defineCodeGen(
+    ctx: CodeGenContext,
+    ev: GeneratedExpressionCode,
+    f: (String, String, String) => String): String = {
+    nullSafeCodeGen(ctx, ev, (eval1, eval2, eval3) => {
+      s"${ev.primitive} = ${f(eval1, eval2, eval3)};"
+    })
+  }
+
+  /**
+   * Short hand for generating binary evaluation code.
+   * If either of the sub-expressions is null, the result of this computation
+   * is assumed to be null.
+   *
+   * @param f function that accepts the 2 non-null evaluation result names of children
+   *          and returns Java code to compute the output.
+   */
+  protected def nullSafeCodeGen(
+    ctx: CodeGenContext,
+    ev: GeneratedExpressionCode,
+    f: (String, String, String) => String): String = {
+    val evals = children.map(_.gen(ctx))
+    val resultCode = f(evals(0).primitive, evals(1).primitive, evals(2).primitive)
+    s"""
+      ${evals(0).code}
+      boolean ${ev.isNull} = true;
+      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
+      if (!${evals(0).isNull}) {
+        ${evals(1).code}
+        if (!${evals(1).isNull}) {
+          ${evals(2).code}
+          if (!${evals(2).isNull}) {
+            ${ev.isNull} = false;  // resultCode could change nullability
+            $resultCode
+          }
+        }
+      }
+    """
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 60e2863f7bbb0..e50ec27fc2eb6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -305,7 +305,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
       evaluator.cook(code)
     } catch {
       case e: Exception =>
-        val msg = "failed to compile:\n " + CodeFormatter.format(code)
+        val msg = s"failed to compile: $e\n" + CodeFormatter.format(code)
         logError(msg, e)
         throw new Exception(msg, e)
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
index e6d807f6d897b..15ceb9193a8c5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/math.scala
@@ -165,69 +165,29 @@ case class Cosh(child: Expression) extends UnaryMathExpression(math.cosh, "COSH"
  * @param toBaseExpr to which base
  */
 case class Conv(numExpr: Expression, fromBaseExpr: Expression, toBaseExpr: Expression)
-  extends Expression with ImplicitCastInputTypes {
-
-  override def foldable: Boolean = numExpr.foldable && fromBaseExpr.foldable && toBaseExpr.foldable
-
-  override def nullable: Boolean = numExpr.nullable || fromBaseExpr.nullable || toBaseExpr.nullable
+  extends TernaryExpression with ImplicitCastInputTypes {
 
   override def children: Seq[Expression] = Seq(numExpr, fromBaseExpr, toBaseExpr)
-
   override def inputTypes: Seq[AbstractDataType] = Seq(StringType, IntegerType, IntegerType)
-
   override def dataType: DataType = StringType
 
-  /** Returns the result of evaluating this expression on a given input Row */
-  override def eval(input: InternalRow): Any = {
-    val num = numExpr.eval(input)
-    if (num != null) {
-      val fromBase = fromBaseExpr.eval(input)
-      if (fromBase != null) {
-        val toBase = toBaseExpr.eval(input)
-        if (toBase != null) {
-          NumberConverter.convert(
-            num.asInstanceOf[UTF8String].getBytes,
-            fromBase.asInstanceOf[Int],
-            toBase.asInstanceOf[Int])
-        } else {
-          null
-        }
-      } else {
-        null
-      }
-    } else {
-      null
-    }
+  override def nullSafeEval(num: Any, fromBase: Any, toBase: Any): Any = {
+    NumberConverter.convert(
+      num.asInstanceOf[UTF8String].getBytes,
+      fromBase.asInstanceOf[Int],
+      toBase.asInstanceOf[Int])
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val numGen = numExpr.gen(ctx)
-    val from = fromBaseExpr.gen(ctx)
-    val to = toBaseExpr.gen(ctx)
-
     val numconv = NumberConverter.getClass.getName.stripSuffix("$")
-    s"""
-       ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-       ${numGen.code}
-       boolean ${ev.isNull} = ${numGen.isNull};
-       if (!${ev.isNull}) {
-         ${from.code}
-         if (!${from.isNull}) {
-           ${to.code}
-           if (!${to.isNull}) {
-             ${ev.primitive} = $numconv.convert(${numGen.primitive}.getBytes(),
-               ${from.primitive}, ${to.primitive});
-             if (${ev.primitive} == null) {
-               ${ev.isNull} = true;
-             }
-           } else {
-             ${ev.isNull} = true;
-           }
-         } else {
-           ${ev.isNull} = true;
-         }
+    nullSafeCodeGen(ctx, ev, (num, from, to) =>
+      s"""
+       ${ev.primitive} = $numconv.convert($num.getBytes(), $from, $to);
+       if (${ev.primitive} == null) {
+         ${ev.isNull} = true;
        }
-     """
+       """
+    )
   }
 }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 99a62343f138d..684eac12bd6f0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -426,15 +426,13 @@ case class StringInstr(str: Expression, substr: Expression)
  * in given string after position pos.
  */
 case class StringLocate(substr: Expression, str: Expression, start: Expression)
-  extends Expression with ImplicitCastInputTypes with CodegenFallback {
+  extends TernaryExpression with ImplicitCastInputTypes with CodegenFallback {
 
   def this(substr: Expression, str: Expression) = {
     this(substr, str, Literal(0))
   }
 
   override def children: Seq[Expression] = substr :: str :: start :: Nil
-  override def foldable: Boolean = children.forall(_.foldable)
-  override def nullable: Boolean = substr.nullable || str.nullable
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
 
@@ -467,60 +465,18 @@ case class StringLocate(substr: Expression, str: Expression, start: Expression)
  * Returns str, left-padded with pad to a length of len.
  */
 case class StringLPad(str: Expression, len: Expression, pad: Expression)
-  extends Expression with ImplicitCastInputTypes {
+  extends TernaryExpression with ImplicitCastInputTypes {
 
   override def children: Seq[Expression] = str :: len :: pad :: Nil
-  override def foldable: Boolean = children.forall(_.foldable)
-  override def nullable: Boolean = children.exists(_.nullable)
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType, StringType)
 
-  override def eval(input: InternalRow): Any = {
-    val s = str.eval(input)
-    if (s == null) {
-      null
-    } else {
-      val l = len.eval(input)
-      if (l == null) {
-        null
-      } else {
-        val p = pad.eval(input)
-        if (p == null) {
-          null
-        } else {
-          val len = l.asInstanceOf[Int]
-          val str = s.asInstanceOf[UTF8String]
-          val pad = p.asInstanceOf[UTF8String]
-
-          str.lpad(len, pad)
-        }
-      }
-    }
+  override def nullSafeEval(str: Any, len: Any, pad: Any): Any = {
+    str.asInstanceOf[UTF8String].lpad(len.asInstanceOf[Int], pad.asInstanceOf[UTF8String])
   }
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val lenGen = len.gen(ctx)
-    val strGen = str.gen(ctx)
-    val padGen = pad.gen(ctx)
-
-    s"""
-      ${lenGen.code}
-      boolean ${ev.isNull} = ${lenGen.isNull};
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${ev.isNull}) {
-        ${strGen.code}
-        if (!${strGen.isNull}) {
-          ${padGen.code}
-          if (!${padGen.isNull}) {
-            ${ev.primitive} = ${strGen.primitive}.lpad(${lenGen.primitive}, ${padGen.primitive});
-          } else {
-            ${ev.isNull} = true;
-          }
-        } else {
-          ${ev.isNull} = true;
-        }
-      }
-     """
+    defineCodeGen(ctx, ev, (str, len, pad) => s"$str.lpad($len, $pad)")
   }
 
   override def prettyName: String = "lpad"
@@ -530,60 +486,18 @@ case class StringLPad(str: Expression, len: Expression, pad: Expression)
  * Returns str, right-padded with pad to a length of len.
  */
 case class StringRPad(str: Expression, len: Expression, pad: Expression)
-  extends Expression with ImplicitCastInputTypes {
+  extends TernaryExpression with ImplicitCastInputTypes {
 
   override def children: Seq[Expression] = str :: len :: pad :: Nil
-  override def foldable: Boolean = children.forall(_.foldable)
-  override def nullable: Boolean = children.exists(_.nullable)
   override def dataType: DataType = StringType
   override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType, StringType)
 
-  override def eval(input: InternalRow): Any = {
-    val s = str.eval(input)
-    if (s == null) {
-      null
-    } else {
-      val l = len.eval(input)
-      if (l == null) {
-        null
-      } else {
-        val p = pad.eval(input)
-        if (p == null) {
-          null
-        } else {
-          val len = l.asInstanceOf[Int]
-          val str = s.asInstanceOf[UTF8String]
-          val pad = p.asInstanceOf[UTF8String]
-
-          str.rpad(len, pad)
-        }
-      }
-    }
+  override def nullSafeEval(str: Any, len: Any, pad: Any): Any = {
+    str.asInstanceOf[UTF8String].rpad(len.asInstanceOf[Int], pad.asInstanceOf[UTF8String])
   }
 
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val lenGen = len.gen(ctx)
-    val strGen = str.gen(ctx)
-    val padGen = pad.gen(ctx)
-
-    s"""
-      ${lenGen.code}
-      boolean ${ev.isNull} = ${lenGen.isNull};
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${ev.isNull}) {
-        ${strGen.code}
-        if (!${strGen.isNull}) {
-          ${padGen.code}
-          if (!${padGen.isNull}) {
-            ${ev.primitive} = ${strGen.primitive}.rpad(${lenGen.primitive}, ${padGen.primitive});
-          } else {
-            ${ev.isNull} = true;
-          }
-        } else {
-          ${ev.isNull} = true;
-        }
-      }
-     """
+    defineCodeGen(ctx, ev, (str, len, pad) => s"$str.rpad($len, $pad)")
   }
 
   override def prettyName: String = "rpad"
@@ -745,68 +659,24 @@ case class StringSplit(str: Expression, pattern: Expression)
  * Defined for String and Binary types.
  */
 case class Substring(str: Expression, pos: Expression, len: Expression)
-  extends Expression with ImplicitCastInputTypes {
+  extends TernaryExpression with ImplicitCastInputTypes {
 
   def this(str: Expression, pos: Expression) = {
     this(str, pos, Literal(Integer.MAX_VALUE))
   }
 
-  override def foldable: Boolean = str.foldable && pos.foldable && len.foldable
-  override def nullable: Boolean = str.nullable || pos.nullable || len.nullable
-
   override def dataType: DataType = StringType
 
   override def inputTypes: Seq[DataType] = Seq(StringType, IntegerType, IntegerType)
 
   override def children: Seq[Expression] = str :: pos :: len :: Nil
 
-  override def eval(input: InternalRow): Any = {
-    val stringEval = str.eval(input)
-    if (stringEval != null) {
-      val posEval = pos.eval(input)
-      if (posEval != null) {
-        val lenEval = len.eval(input)
-        if (lenEval != null) {
-          stringEval.asInstanceOf[UTF8String]
-            .substringSQL(posEval.asInstanceOf[Int], lenEval.asInstanceOf[Int])
-        } else {
-          null
-        }
-      } else {
-        null
-      }
-    } else {
-      null
-    }
+  override def nullSafeEval(string: Any, pos: Any, len: Any): Any = {
+    string.asInstanceOf[UTF8String].substringSQL(pos.asInstanceOf[Int], len.asInstanceOf[Int])
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val strGen = str.gen(ctx)
-    val posGen = pos.gen(ctx)
-    val lenGen = len.gen(ctx)
-
-    val start = ctx.freshName("start")
-    val end = ctx.freshName("end")
-
-    s"""
-      ${strGen.code}
-      boolean ${ev.isNull} = ${strGen.isNull};
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${ev.isNull}) {
-        ${posGen.code}
-        if (!${posGen.isNull}) {
-          ${lenGen.code}
-          if (!${lenGen.isNull}) {
-            ${ev.primitive} = ${strGen.primitive}
-              .substringSQL(${posGen.primitive}, ${lenGen.primitive});
-          } else {
-            ${ev.isNull} = true;
-          }
-        } else {
-          ${ev.isNull} = true;
-        }
-      }
-     """
+    defineCodeGen(ctx, ev, (str, pos, len) => s"$str.substringSQL($pos, $len)")
   }
 }
 
@@ -986,7 +856,7 @@ case class Encode(value: Expression, charset: Expression)
  * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status.
  */
 case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression)
-  extends Expression with ImplicitCastInputTypes {
+  extends TernaryExpression with ImplicitCastInputTypes {
 
   // last regex in string, we will update the pattern iff regexp value changed.
   @transient private var lastRegex: UTF8String = _
@@ -998,40 +868,26 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
   // result buffer write by Matcher
   @transient private val result: StringBuffer = new StringBuffer
 
-  override def nullable: Boolean = subject.nullable || regexp.nullable || rep.nullable
-  override def foldable: Boolean = subject.foldable && regexp.foldable && rep.foldable
-
-  override def eval(input: InternalRow): Any = {
-    val s = subject.eval(input)
-    if (null != s) {
-      val p = regexp.eval(input)
-      if (null != p) {
-        val r = rep.eval(input)
-        if (null != r) {
-          if (!p.equals(lastRegex)) {
-            // regex value changed
-            lastRegex = p.asInstanceOf[UTF8String]
-            pattern = Pattern.compile(lastRegex.toString)
-          }
-          if (!r.equals(lastReplacementInUTF8)) {
-            // replacement string changed
-            lastReplacementInUTF8 = r.asInstanceOf[UTF8String]
-            lastReplacement = lastReplacementInUTF8.toString
-          }
-          val m = pattern.matcher(s.toString())
-          result.delete(0, result.length())
-
-          while (m.find) {
-            m.appendReplacement(result, lastReplacement)
-          }
-          m.appendTail(result)
+  override def nullSafeEval(s: Any, p: Any, r: Any): Any = {
+    if (!p.equals(lastRegex)) {
+      // regex value changed
+      lastRegex = p.asInstanceOf[UTF8String]
+      pattern = Pattern.compile(lastRegex.toString)
+    }
+    if (!r.equals(lastReplacementInUTF8)) {
+      // replacement string changed
+      lastReplacementInUTF8 = r.asInstanceOf[UTF8String]
+      lastReplacement = lastReplacementInUTF8.toString
+    }
+    val m = pattern.matcher(s.toString())
+    result.delete(0, result.length())
 
-          return UTF8String.fromString(result.toString)
-        }
-      }
+    while (m.find) {
+      m.appendReplacement(result, lastReplacement)
     }
+    m.appendTail(result)
 
-    null
+    UTF8String.fromString(result.toString)
   }
 
   override def dataType: DataType = StringType
@@ -1048,59 +904,43 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
 
     val termResult = ctx.freshName("result")
 
-    val classNameUTF8String = classOf[UTF8String].getCanonicalName
     val classNamePattern = classOf[Pattern].getCanonicalName
-    val classNameString = classOf[java.lang.String].getCanonicalName
     val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName
 
-    ctx.addMutableState(classNameUTF8String,
+    ctx.addMutableState("UTF8String",
       termLastRegex, s"${termLastRegex} = null;")
     ctx.addMutableState(classNamePattern,
       termPattern, s"${termPattern} = null;")
-    ctx.addMutableState(classNameString,
+    ctx.addMutableState("String",
       termLastReplacement, s"${termLastReplacement} = null;")
-    ctx.addMutableState(classNameUTF8String,
+    ctx.addMutableState("UTF8String",
       termLastReplacementInUTF8, s"${termLastReplacementInUTF8} = null;")
     ctx.addMutableState(classNameStringBuffer,
       termResult, s"${termResult} = new $classNameStringBuffer();")
 
-    val evalSubject = subject.gen(ctx)
-    val evalRegexp = regexp.gen(ctx)
-    val evalRep = rep.gen(ctx)
-
+    nullSafeCodeGen(ctx, ev, (subject, regexp, rep) => {
     s"""
-      ${evalSubject.code}
-      boolean ${ev.isNull} = true;
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      if (!${evalSubject.isNull}) {
-        ${evalRegexp.code}
-        if (!${evalRegexp.isNull}) {
-          ${evalRep.code}
-          if (!${evalRep.isNull}) {
-            if (!${evalRegexp.primitive}.equals(${termLastRegex})) {
-              // regex value changed
-              ${termLastRegex} = ${evalRegexp.primitive};
-              ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString());
-            }
-            if (!${evalRep.primitive}.equals(${termLastReplacementInUTF8})) {
-              // replacement string changed
-              ${termLastReplacementInUTF8} = ${evalRep.primitive};
-              ${termLastReplacement} = ${termLastReplacementInUTF8}.toString();
-            }
-            ${termResult}.delete(0, ${termResult}.length());
-            ${classOf[java.util.regex.Matcher].getCanonicalName} m =
-                                   ${termPattern}.matcher(${evalSubject.primitive}.toString());
+      if (!$regexp.equals(${termLastRegex})) {
+        // regex value changed
+        ${termLastRegex} = $regexp;
+        ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString());
+      }
+      if (!$rep.equals(${termLastReplacementInUTF8})) {
+        // replacement string changed
+        ${termLastReplacementInUTF8} = $rep;
+        ${termLastReplacement} = ${termLastReplacementInUTF8}.toString();
+      }
+      ${termResult}.delete(0, ${termResult}.length());
+      java.util.regex.Matcher m = ${termPattern}.matcher($subject.toString());
 
-            while (m.find()) {
-              m.appendReplacement(${termResult}, ${termLastReplacement});
-            }
-            m.appendTail(${termResult});
-            ${ev.primitive} = ${classNameUTF8String}.fromString(${termResult}.toString());
-            ${ev.isNull} = false;
-          }
-        }
+      while (m.find()) {
+        m.appendReplacement(${termResult}, ${termLastReplacement});
       }
+      m.appendTail(${termResult});
+      ${ev.primitive} = UTF8String.fromString(${termResult}.toString());
+      ${ev.isNull} = false;
     """
+    })
   }
 }
 
@@ -1110,7 +950,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
  * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status.
  */
 case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expression)
-  extends Expression with ImplicitCastInputTypes {
+  extends TernaryExpression with ImplicitCastInputTypes {
   def this(s: Expression, r: Expression) = this(s, r, Literal(1))
 
   // last regex in string, we will update the pattern iff regexp value changed.
@@ -1118,32 +958,19 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
   // last regex pattern, we cache it for performance concern
   @transient private var pattern: Pattern = _
 
-  override def nullable: Boolean = subject.nullable || regexp.nullable || idx.nullable
-  override def foldable: Boolean = subject.foldable && regexp.foldable && idx.foldable
-
-  override def eval(input: InternalRow): Any = {
-    val s = subject.eval(input)
-    if (null != s) {
-      val p = regexp.eval(input)
-      if (null != p) {
-        val r = idx.eval(input)
-        if (null != r) {
-          if (!p.equals(lastRegex)) {
-            // regex value changed
-            lastRegex = p.asInstanceOf[UTF8String]
-            pattern = Pattern.compile(lastRegex.toString)
-          }
-          val m = pattern.matcher(s.toString())
-          if (m.find) {
-            val mr: MatchResult = m.toMatchResult
-            return UTF8String.fromString(mr.group(r.asInstanceOf[Int]))
-          }
-          return UTF8String.EMPTY_UTF8
-        }
-      }
+  override def nullSafeEval(s: Any, p: Any, r: Any): Any = {
+    if (!p.equals(lastRegex)) {
+      // regex value changed
+      lastRegex = p.asInstanceOf[UTF8String]
+      pattern = Pattern.compile(lastRegex.toString)
+    }
+    val m = pattern.matcher(s.toString())
+    if (m.find) {
+      val mr: MatchResult = m.toMatchResult
+      UTF8String.fromString(mr.group(r.asInstanceOf[Int]))
+    } else {
+      UTF8String.EMPTY_UTF8
     }
-
-    null
   }
 
   override def dataType: DataType = StringType
@@ -1154,44 +981,29 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
   override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
     val termLastRegex = ctx.freshName("lastRegex")
     val termPattern = ctx.freshName("pattern")
-    val classNameUTF8String = classOf[UTF8String].getCanonicalName
     val classNamePattern = classOf[Pattern].getCanonicalName
 
-    ctx.addMutableState(classNameUTF8String, termLastRegex, s"${termLastRegex} = null;")
+    ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;")
     ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;")
 
-    val evalSubject = subject.gen(ctx)
-    val evalRegexp = regexp.gen(ctx)
-    val evalIdx = idx.gen(ctx)
-
-    s"""
-      ${evalSubject.code}
-      ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
-      boolean ${ev.isNull} = true;
-      if (!${evalSubject.isNull}) {
-        ${evalRegexp.code}
-        if (!${evalRegexp.isNull}) {
-          ${evalIdx.code}
-          if (!${evalIdx.isNull}) {
-            if (!${evalRegexp.primitive}.equals(${termLastRegex})) {
-              // regex value changed
-              ${termLastRegex} = ${evalRegexp.primitive};
-              ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString());
-            }
-            ${classOf[java.util.regex.Matcher].getCanonicalName} m =
-              ${termPattern}.matcher(${evalSubject.primitive}.toString());
-            if (m.find()) {
-              ${classOf[java.util.regex.MatchResult].getCanonicalName} mr = m.toMatchResult();
-              ${ev.primitive} = ${classNameUTF8String}.fromString(mr.group(${evalIdx.primitive}));
-              ${ev.isNull} = false;
-            } else {
-              ${ev.primitive} = ${classNameUTF8String}.EMPTY_UTF8;
-              ${ev.isNull} = false;
-            }
-          }
-        }
+    nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => {
+      s"""
+      if (!$regexp.equals(${termLastRegex})) {
+        // regex value changed
+        ${termLastRegex} = $regexp;
+        ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString());
       }
-    """
+      java.util.regex.Matcher m =
+        ${termPattern}.matcher($subject.toString());
+      if (m.find()) {
+        java.util.regex.MatchResult mr = m.toMatchResult();
+        ${ev.primitive} = UTF8String.fromString(mr.group($idx));
+        ${ev.isNull} = false;
+      } else {
+        ${ev.primitive} = UTF8String.EMPTY_UTF8;
+        ${ev.isNull} = false;
+      }"""
+    })
   }
 }
 

From 5f8a1d435526a601d0c435359df2b42ccc4dbbb3 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Fri, 31 Jul 2015 16:19:14 +0800
Subject: [PATCH 509/549] Make the maven build generate the test jar for Python
 MQTT tests

---
 external/mqtt/pom.xml                        | 28 +++++++++++++++
 external/mqtt/src/main/assembly/assembly.xml | 38 ++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 external/mqtt/src/main/assembly/assembly.xml

diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml
index 0e41e5781784b..69b309876a0db 100644
--- a/external/mqtt/pom.xml
+++ b/external/mqtt/pom.xml
@@ -78,5 +78,33 @@
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+
+    <plugins>
+      <!-- Assemble a jar with test dependencies for Python tests -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>test-jar-with-dependencies</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+            <configuration>
+              <!-- Make sure the file path is same as the sbt build -->
+              <finalName>spark-streaming-mqtt-test-${project.version}</finalName>
+              <outputDirectory>${project.build.directory}/scala-${scala.binary.version}/</outputDirectory>
+              <appendAssemblyId>false</appendAssemblyId>
+              <!-- Don't publish it since it's only for Python tests -->
+              <attach>false</attach>
+              <descriptors>
+                <descriptor>src/main/assembly/assembly.xml</descriptor>
+              </descriptors>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
   </build>
 </project>
diff --git a/external/mqtt/src/main/assembly/assembly.xml b/external/mqtt/src/main/assembly/assembly.xml
new file mode 100644
index 0000000000000..92318ec16dba8
--- /dev/null
+++ b/external/mqtt/src/main/assembly/assembly.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+<assembly>
+  <id>test-jar-with-dependencies</id>
+  <formats>
+    <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+
+  <dependencySets>
+    <dependencySet>
+      <useTransitiveDependencies>true</useTransitiveDependencies>
+      <scope>test</scope>
+      <useProjectArtifact>false</useProjectArtifact>
+      <excludes>
+        <exclude>org.apache.hadoop:*:jar</exclude>
+        <exclude>org.apache.spark:*:jar</exclude>
+        <exclude>org.apache.zookeeper:*:jar</exclude>
+        <exclude>org.apache.avro:*:jar</exclude>
+      </excludes>
+    </dependencySet>
+  </dependencySets>
+
+</assembly>

From fc0e57e5aba82a3f227fef05a843283e2ec893fc Mon Sep 17 00:00:00 2001
From: Yu ISHIKAWA <yuu.ishikawa@gmail.com>
Date: Fri, 31 Jul 2015 09:33:38 -0700
Subject: [PATCH 510/549] [SPARK-9053] [SPARKR] Fix spaces around parens, infix
 operators etc.

### JIRA
[[SPARK-9053] Fix spaces around parens, infix operators etc. - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9053)

### The Result of `lint-r`
[The result of lint-r at the rivision:a4c83cb1e4b066cd60264b6572fd3e51d160d26a](https://gist.github.com/yu-iskw/d253d7f8ef351f86443d)

Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com>

Closes #7584 from yu-iskw/SPARK-9053 and squashes the following commits:

613170f [Yu ISHIKAWA] Ignore a warning about a space before a left parentheses
ede61e1 [Yu ISHIKAWA] Ignores two warnings about a space before a left parentheses. TODO: After updating `lintr`, we will remove the ignores
de3e0db [Yu ISHIKAWA] Add '## nolint start' & '## nolint end' statement to ignore infix space warnings
e233ea8 [Yu ISHIKAWA] [SPARK-9053][SparkR] Fix spaces around parens, infix operators etc.
---
 R/pkg/R/DataFrame.R                     | 4 ++++
 R/pkg/R/RDD.R                           | 7 +++++--
 R/pkg/R/column.R                        | 2 +-
 R/pkg/R/context.R                       | 2 +-
 R/pkg/R/pairRDD.R                       | 2 +-
 R/pkg/R/utils.R                         | 4 ++--
 R/pkg/inst/tests/test_binary_function.R | 2 +-
 R/pkg/inst/tests/test_rdd.R             | 6 +++---
 R/pkg/inst/tests/test_sparkSQL.R        | 4 +++-
 9 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index f4c93d3c7dd67..b31ad3729e09b 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1322,9 +1322,11 @@ setMethod("write.df",
                                     "org.apache.spark.sql.parquet")
             }
             allModes <- c("append", "overwrite", "error", "ignore")
+            # nolint start
             if (!(mode %in% allModes)) {
               stop('mode should be one of "append", "overwrite", "error", "ignore"')
             }
+            # nolint end
             jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
             options <- varargsToEnv(...)
             if (!is.null(path)) {
@@ -1384,9 +1386,11 @@ setMethod("saveAsTable",
                                     "org.apache.spark.sql.parquet")
             }
             allModes <- c("append", "overwrite", "error", "ignore")
+            # nolint start
             if (!(mode %in% allModes)) {
               stop('mode should be one of "append", "overwrite", "error", "ignore"')
             }
+            # nolint end
             jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
             options <- varargsToEnv(...)
             callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index d2d096709245d..2a013b3dbb968 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -85,7 +85,9 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val)
 
   isPipelinable <- function(rdd) {
     e <- rdd@env
+    # nolint start
     !(e$isCached || e$isCheckpointed)
+    # nolint end
   }
 
   if (!inherits(prev, "PipelinedRDD") || !isPipelinable(prev)) {
@@ -97,7 +99,8 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val)
     # prev_serializedMode is used during the delayed computation of JRDD in getJRDD
   } else {
     pipelinedFunc <- function(partIndex, part) {
-      func(partIndex, prev@func(partIndex, part))
+      f <- prev@func
+      func(partIndex, f(partIndex, part))
     }
     .Object@func <- cleanClosure(pipelinedFunc)
     .Object@prev_jrdd <- prev@prev_jrdd # maintain the pipeline
@@ -841,7 +844,7 @@ setMethod("sampleRDD",
                 if (withReplacement) {
                   count <- rpois(1, fraction)
                   if (count > 0) {
-                    res[(len + 1):(len + count)] <- rep(list(elem), count)
+                    res[ (len + 1) : (len + count) ] <- rep(list(elem), count)
                     len <- len + count
                   }
                 } else {
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 2892e1416cc65..eeaf9f193b728 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -65,7 +65,7 @@ functions <- c("min", "max", "sum", "avg", "mean", "count", "abs", "sqrt",
                "acos", "asin", "atan", "cbrt", "ceiling", "cos", "cosh", "exp",
                "expm1", "floor", "log", "log10", "log1p", "rint", "sign",
                "sin", "sinh", "tan", "tanh", "toDegrees", "toRadians")
-binary_mathfunctions<- c("atan2", "hypot")
+binary_mathfunctions <- c("atan2", "hypot")
 
 createOperator <- function(op) {
   setMethod(op,
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 43be9c904fdf6..720990e1c6087 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -121,7 +121,7 @@ parallelize <- function(sc, coll, numSlices = 1) {
     numSlices <- length(coll)
 
   sliceLen <- ceiling(length(coll) / numSlices)
-  slices <- split(coll, rep(1:(numSlices + 1), each = sliceLen)[1:length(coll)])
+  slices <- split(coll, rep(1: (numSlices + 1), each = sliceLen)[1:length(coll)])
 
   # Serialize each slice: obtain a list of raws, or a list of lists (slices) of
   # 2-tuples of raws
diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
index 83801d3209700..199c3fd6ab1b2 100644
--- a/R/pkg/R/pairRDD.R
+++ b/R/pkg/R/pairRDD.R
@@ -879,7 +879,7 @@ setMethod("sampleByKey",
                   if (withReplacement) {
                     count <- rpois(1, frac)
                     if (count > 0) {
-                      res[(len + 1):(len + count)] <- rep(list(elem), count)
+                      res[ (len + 1) : (len + count) ] <- rep(list(elem), count)
                       len <- len + count
                     }
                   } else {
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 3f45589a50443..4f9f4d9cad2a8 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -32,7 +32,7 @@ convertJListToRList <- function(jList, flatten, logicalUpperBound = NULL,
   }
 
   results <- if (arrSize > 0) {
-    lapply(0:(arrSize - 1),
+    lapply(0 : (arrSize - 1),
           function(index) {
             obj <- callJMethod(jList, "get", as.integer(index))
 
@@ -572,7 +572,7 @@ mergePartitions <- function(rdd, zip) {
           keys <- list()
         }
         if (lengthOfValues > 1) {
-          values <- part[(lengthOfKeys + 1) : (len - 1)]
+          values <- part[ (lengthOfKeys + 1) : (len - 1) ]
         } else {
           values <- list()
         }
diff --git a/R/pkg/inst/tests/test_binary_function.R b/R/pkg/inst/tests/test_binary_function.R
index dca0657c57e0d..f054ac9a87d61 100644
--- a/R/pkg/inst/tests/test_binary_function.R
+++ b/R/pkg/inst/tests/test_binary_function.R
@@ -40,7 +40,7 @@ test_that("union on two RDDs", {
   expect_equal(actual, c(as.list(nums), mockFile))
   expect_equal(getSerializedMode(union.rdd), "byte")
 
-  rdd<- map(text.rdd, function(x) {x})
+  rdd <- map(text.rdd, function(x) {x})
   union.rdd <- unionRDD(rdd, text.rdd)
   actual <- collect(union.rdd)
   expect_equal(actual, as.list(c(mockFile, mockFile)))
diff --git a/R/pkg/inst/tests/test_rdd.R b/R/pkg/inst/tests/test_rdd.R
index 6c3aaab8c711e..71aed2bb9d6a8 100644
--- a/R/pkg/inst/tests/test_rdd.R
+++ b/R/pkg/inst/tests/test_rdd.R
@@ -250,7 +250,7 @@ test_that("flatMapValues() on pairwise RDDs", {
   expect_equal(actual, list(list(1,1), list(1,2), list(2,3), list(2,4)))
 
   # Generate x to x+1 for every value
-  actual <- collect(flatMapValues(intRdd, function(x) { x:(x + 1) }))
+  actual <- collect(flatMapValues(intRdd, function(x) { x: (x + 1) }))
   expect_equal(actual,
                list(list(1L, -1), list(1L, 0), list(2L, 100), list(2L, 101),
                     list(2L, 1), list(2L, 2), list(1L, 200), list(1L, 201)))
@@ -293,7 +293,7 @@ test_that("sumRDD() on RDDs", {
 })
 
 test_that("keyBy on RDDs", {
-  func <- function(x) { x*x }
+  func <- function(x) { x * x }
   keys <- keyBy(rdd, func)
   actual <- collect(keys)
   expect_equal(actual, lapply(nums, function(x) { list(func(x), x) }))
@@ -311,7 +311,7 @@ test_that("repartition/coalesce on RDDs", {
   r2 <- repartition(rdd, 6)
   expect_equal(numPartitions(r2), 6L)
   count <- length(collectPartition(r2, 0L))
-  expect_true(count >=0 && count <= 4)
+  expect_true(count >= 0 && count <= 4)
 
   # coalesce
   r3 <- coalesce(rdd, 1)
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 61c8a7ec7d837..aca41aa6dcf24 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -666,10 +666,12 @@ test_that("column binary mathfunctions", {
   expect_equal(collect(select(df, atan2(df$a, df$b)))[2, "ATAN2(a, b)"], atan2(2, 6))
   expect_equal(collect(select(df, atan2(df$a, df$b)))[3, "ATAN2(a, b)"], atan2(3, 7))
   expect_equal(collect(select(df, atan2(df$a, df$b)))[4, "ATAN2(a, b)"], atan2(4, 8))
+  ## nolint start
   expect_equal(collect(select(df, hypot(df$a, df$b)))[1, "HYPOT(a, b)"], sqrt(1^2 + 5^2))
   expect_equal(collect(select(df, hypot(df$a, df$b)))[2, "HYPOT(a, b)"], sqrt(2^2 + 6^2))
   expect_equal(collect(select(df, hypot(df$a, df$b)))[3, "HYPOT(a, b)"], sqrt(3^2 + 7^2))
   expect_equal(collect(select(df, hypot(df$a, df$b)))[4, "HYPOT(a, b)"], sqrt(4^2 + 8^2))
+  ## nolint end
 })
 
 test_that("string operators", {
@@ -876,7 +878,7 @@ test_that("parquetFile works with multiple input paths", {
   write.df(df, parquetPath2, "parquet", mode="overwrite")
   parquetDF <- parquetFile(sqlContext, parquetPath, parquetPath2)
   expect_is(parquetDF, "DataFrame")
-  expect_equal(count(parquetDF), count(df)*2)
+  expect_equal(count(parquetDF), count(df) * 2)
 })
 
 test_that("describe() on a DataFrame", {

From 04a49edfdb606c01fa4f8ae6e730ec4f9bd0cb6d Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Fri, 31 Jul 2015 09:34:10 -0700
Subject: [PATCH 511/549] [SPARK-9497] [SPARK-9509] [CORE] Use ask instead of
 askWithRetry

`RpcEndpointRef.askWithRetry` throws `SparkException` rather than `TimeoutException`. Use ask to replace it because we don't need to retry here.

Author: zsxwing <zsxwing@gmail.com>

Closes #7824 from zsxwing/SPARK-9497 and squashes the following commits:

7bfc2b4 [zsxwing] Use ask instead of askWithRetry
---
 .../scala/org/apache/spark/deploy/client/AppClient.scala     | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala
index 79b251e7e62fe..a659abf70395d 100644
--- a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala
@@ -27,7 +27,7 @@ import org.apache.spark.deploy.{ApplicationDescription, ExecutorState}
 import org.apache.spark.deploy.DeployMessages._
 import org.apache.spark.deploy.master.Master
 import org.apache.spark.rpc._
-import org.apache.spark.util.{ThreadUtils, Utils}
+import org.apache.spark.util.{RpcUtils, ThreadUtils, Utils}
 
 /**
  * Interface allowing applications to speak with a Spark deploy cluster. Takes a master URL,
@@ -248,7 +248,8 @@ private[spark] class AppClient(
   def stop() {
     if (endpoint != null) {
       try {
-        endpoint.askWithRetry[Boolean](StopAppClient)
+        val timeout = RpcUtils.askRpcTimeout(conf)
+        timeout.awaitResult(endpoint.ask[Boolean](StopAppClient))
       } catch {
         case e: TimeoutException =>
           logInfo("Stop request to Master timed out; it may already be shut down.")

From 478f844f25b2388675836015af04b6be94de0d8e Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Sat, 1 Aug 2015 00:44:51 +0800
Subject: [PATCH 512/549] Add unpack

---
 external/mqtt/src/main/assembly/assembly.xml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/external/mqtt/src/main/assembly/assembly.xml b/external/mqtt/src/main/assembly/assembly.xml
index 92318ec16dba8..13747d65cbee5 100644
--- a/external/mqtt/src/main/assembly/assembly.xml
+++ b/external/mqtt/src/main/assembly/assembly.xml
@@ -26,6 +26,7 @@
       <useTransitiveDependencies>true</useTransitiveDependencies>
       <scope>test</scope>
       <useProjectArtifact>false</useProjectArtifact>
+      <unpack>true</unpack>
       <excludes>
         <exclude>org.apache.hadoop:*:jar</exclude>
         <exclude>org.apache.spark:*:jar</exclude>

From 47278c5a1a5364cbd327569050c133eb49098134 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Sat, 1 Aug 2015 00:58:38 +0800
Subject: [PATCH 513/549] Include the project class files

---
 external/mqtt/src/main/assembly/assembly.xml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/external/mqtt/src/main/assembly/assembly.xml b/external/mqtt/src/main/assembly/assembly.xml
index 13747d65cbee5..ecab5b360eb3e 100644
--- a/external/mqtt/src/main/assembly/assembly.xml
+++ b/external/mqtt/src/main/assembly/assembly.xml
@@ -21,15 +21,20 @@
   </formats>
   <includeBaseDirectory>false</includeBaseDirectory>
 
+  <fileSets>
+    <fileSet>
+      <directory>${project.build.directory}/scala-${scala.binary.version}/test-classes</directory>
+      <outputDirectory>/</outputDirectory>
+    </fileSet>
+  </fileSets>
+
   <dependencySets>
     <dependencySet>
       <useTransitiveDependencies>true</useTransitiveDependencies>
       <scope>test</scope>
-      <useProjectArtifact>false</useProjectArtifact>
       <unpack>true</unpack>
       <excludes>
         <exclude>org.apache.hadoop:*:jar</exclude>
-        <exclude>org.apache.spark:*:jar</exclude>
         <exclude>org.apache.zookeeper:*:jar</exclude>
         <exclude>org.apache.avro:*:jar</exclude>
       </excludes>

From 27ae851ce16082775ffbcb5b8fc6bdbe65dc70fc Mon Sep 17 00:00:00 2001
From: tedyu <yuzhihong@gmail.com>
Date: Fri, 31 Jul 2015 18:16:55 +0100
Subject: [PATCH 514/549] [SPARK-9446] Clear Active SparkContext in stop()
 method

In thread 'stopped SparkContext remaining active' on mailing list, Andres observed the following in driver log:
```
15/07/29 15:17:09 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: ApplicationMaster has disassociated: <address removed>
15/07/29 15:17:09 INFO YarnClientSchedulerBackend: Shutting down all executors
Exception in thread "Yarn application state monitor" org.apache.spark.SparkException: Error asking standalone scheduler to shut down executors
        at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.stopExecutors(CoarseGrainedSchedulerBackend.scala:261)
        at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.stop(CoarseGrainedSchedulerBackend.scala:266)
        at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend.stop(YarnClientSchedulerBackend.scala:158)
        at org.apache.spark.scheduler.TaskSchedulerImpl.stop(TaskSchedulerImpl.scala:416)
        at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1411)
        at org.apache.spark.SparkContext.stop(SparkContext.scala:1644)
        at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$$anon$1.run(YarnClientSchedulerBackend.scala:139)
Caused by: java.lang.InterruptedException
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(AbstractQueuedSynchronizer.java:1325)
        at scala.concurrent.impl.Promise$DefaultPromise.tryAwait(Promise.scala:208)
        at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:218)
        at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223)
        at scala.concurrent.Await$$anonfun$result$1.apply(package.scala:190)
        at scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53)
        at scala.concurrent.Await$.result(package.scala:190)15/07/29 15:17:09 INFO YarnClientSchedulerBackend: Asking each executor to shut down

        at org.apache.spark.rpc.RpcEndpointRef.askWithRetry(RpcEndpointRef.scala:102)
        at org.apache.spark.rpc.RpcEndpointRef.askWithRetry(RpcEndpointRef.scala:78)
        at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.stopExecutors(CoarseGrainedSchedulerBackend.scala:257)
        ... 6 more
```
Effect of the above exception is that a stopped SparkContext is returned to user since SparkContext.clearActiveContext() is not called.

Author: tedyu <yuzhihong@gmail.com>

Closes #7756 from tedyu/master and squashes the following commits:

7339ff2 [tedyu] Move null assignment out of tryLogNonFatalError block
6e02cd9 [tedyu] Use Utils.tryLogNonFatalError to guard resource release
f5fb519 [tedyu] Clear Active SparkContext in stop() method using finally
---
 .../scala/org/apache/spark/SparkContext.scala | 50 ++++++++++++++-----
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index ac6ac6c216767..2d8aa25d81daa 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1689,33 +1689,57 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       Utils.removeShutdownHook(_shutdownHookRef)
     }
 
-    postApplicationEnd()
-    _ui.foreach(_.stop())
+    Utils.tryLogNonFatalError {
+      postApplicationEnd()
+    }
+    Utils.tryLogNonFatalError {
+      _ui.foreach(_.stop())
+    }
     if (env != null) {
-      env.metricsSystem.report()
+      Utils.tryLogNonFatalError {
+        env.metricsSystem.report()
+      }
     }
     if (metadataCleaner != null) {
-      metadataCleaner.cancel()
+      Utils.tryLogNonFatalError {
+        metadataCleaner.cancel()
+      }
+    }
+    Utils.tryLogNonFatalError {
+      _cleaner.foreach(_.stop())
+    }
+    Utils.tryLogNonFatalError {
+      _executorAllocationManager.foreach(_.stop())
     }
-    _cleaner.foreach(_.stop())
-    _executorAllocationManager.foreach(_.stop())
     if (_dagScheduler != null) {
-      _dagScheduler.stop()
+      Utils.tryLogNonFatalError {
+        _dagScheduler.stop()
+      }
       _dagScheduler = null
     }
     if (_listenerBusStarted) {
-      listenerBus.stop()
-      _listenerBusStarted = false
+      Utils.tryLogNonFatalError {
+        listenerBus.stop()
+        _listenerBusStarted = false
+      }
+    }
+    Utils.tryLogNonFatalError {
+      _eventLogger.foreach(_.stop())
     }
-    _eventLogger.foreach(_.stop())
     if (env != null && _heartbeatReceiver != null) {
-      env.rpcEnv.stop(_heartbeatReceiver)
+      Utils.tryLogNonFatalError {
+        env.rpcEnv.stop(_heartbeatReceiver)
+      }
+    }
+    Utils.tryLogNonFatalError {
+      _progressBar.foreach(_.stop())
     }
-    _progressBar.foreach(_.stop())
     _taskScheduler = null
     // TODO: Cache.stop()?
     if (_env != null) {
-      _env.stop()
+      Utils.tryLogNonFatalError {
+        _env.stop()
+      }
       SparkEnv.set(null)
     }
     SparkContext.clearActiveContext()

From 935615c63479da81dab544325e2bc2641891bffd Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Sat, 1 Aug 2015 02:00:48 +0800
Subject: [PATCH 515/549] Fix the flaky MQTT tests

---
 .../streaming/mqtt/MQTTStreamSuite.scala      | 10 +----
 .../spark/streaming/mqtt/MQTTTestUtils.scala  | 42 -------------------
 python/pyspark/streaming/tests.py             | 27 ++++++++----
 3 files changed, 22 insertions(+), 57 deletions(-)

diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala
index 98821c71c7aee..879be775ea9f9 100644
--- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala
+++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala
@@ -67,17 +67,11 @@ class MQTTStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter
       }
     }
 
-    MQTTTestUtils.registerStreamingListener(ssc)
-
     ssc.start()
 
-    // wait for the receiver to start before publishing data, or we risk failing
-    // the test nondeterministically. See SPARK-4631
-    MQTTTestUtils.waitForReceiverToStart(ssc)
-
-    MQTTTestUtils.publishData(topic, sendMessage)
-
+    // Retry it because we don't know when the receiver will start.
     eventually(timeout(10000 milliseconds), interval(100 milliseconds)) {
+      MQTTTestUtils.publishData(topic, sendMessage)
       assert(sendMessage.equals(receiveMessage(0)))
     }
     ssc.stop()
diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala
index 6c85019ae0723..47cc9af497778 100644
--- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala
+++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTTestUtils.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.streaming.mqtt
 
 import java.net.{ServerSocket, URI}
-import java.util.concurrent.{CountDownLatch, TimeUnit}
 
 import scala.language.postfixOps
 
@@ -28,10 +27,6 @@ import org.apache.commons.lang3.RandomUtils
 import org.eclipse.paho.client.mqttv3._
 import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence
 
-import org.apache.spark.streaming.StreamingContext
-import org.apache.spark.streaming.api.java.JavaStreamingContext
-import org.apache.spark.streaming.scheduler.StreamingListener
-import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted
 import org.apache.spark.util.Utils
 import org.apache.spark.{Logging, SparkConf}
 
@@ -47,8 +42,6 @@ private class MQTTTestUtils extends Logging {
   private var broker: BrokerService = _
   private var connector: TransportConnector = _
 
-  private var receiverStartedLatch = new CountDownLatch(1)
-
   def brokerUri: String = {
     s"$brokerHost:$brokerPort"
   }
@@ -73,7 +66,6 @@ private class MQTTTestUtils extends Logging {
       connector = null
     }
     Utils.deleteRecursively(persistenceDir)
-    receiverStartedLatch = null
   }
 
   private def findFreePort(): Int = {
@@ -114,38 +106,4 @@ private class MQTTTestUtils extends Logging {
     }
   }
 
-  /**
-   * Call this one before starting StreamingContext so that we won't miss the
-   * StreamingListenerReceiverStarted event.
-   */
-  def registerStreamingListener(jssc: JavaStreamingContext): Unit = {
-    registerStreamingListener(jssc.ssc)
-  }
-
-  /**
-   * Call this one before starting StreamingContext so that we won't miss the
-   * StreamingListenerReceiverStarted event.
-   */
-  def registerStreamingListener(ssc: StreamingContext): Unit = {
-    ssc.addStreamingListener(new StreamingListener {
-      override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) {
-        receiverStartedLatch.countDown()
-      }
-    })
-  }
-
-  /**
-   * Block until at least one receiver has started or timeout occurs.
-   */
-  def waitForReceiverToStart(jssc: JavaStreamingContext): Unit = {
-    waitForReceiverToStart(jssc.ssc)
-  }
-
-  /**
-   * Block until at least one receiver has started or timeout occurs.
-   */
-  def waitForReceiverToStart(ssc: StreamingContext): Unit = {
-    assert(
-      receiverStartedLatch.await(10, TimeUnit.SECONDS), "Timeout waiting for receiver to start.")
-  }
 }
diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py
index 0da312b89b72f..2e37aa96c614f 100644
--- a/python/pyspark/streaming/tests.py
+++ b/python/pyspark/streaming/tests.py
@@ -931,14 +931,27 @@ def test_mqtt_stream(self):
         """Test the Python MQTT stream API."""
         sendData = "MQTT demo for spark streaming"
         topic = self._randomTopic()
-        self._MQTTTestUtils.registerStreamingListener(self.ssc._jssc)
         result = self._startContext(topic)
-        self._MQTTTestUtils.waitForReceiverToStart(self.ssc._jssc)
-        self._MQTTTestUtils.publishData(topic, sendData)
-        self.wait_for(result, 1)
-        # Because "publishData" sends duplicate messages, here we should use > 0
-        self.assertTrue(len(result) > 0)
-        self.assertEqual(sendData, result[0])
+
+        def retry():
+            self._MQTTTestUtils.publishData(topic, sendData)
+            # Because "publishData" sends duplicate messages, here we should use > 0
+            self.assertTrue(len(result) > 0)
+            self.assertEqual(sendData, result[0])
+
+        # Retry it because we don't know when the receiver will start.
+        self._retry_or_timeout(retry)
+
+    def _retry_or_timeout(self, test_func):
+        start_time = time.time()
+        while True:
+            try:
+                test_func()
+                break
+            except:
+                if time.time() - start_time > self.timeout:
+                    raise
+                time.sleep(0.01)
 
 
 def search_kafka_assembly_jar():

From 0024da9157ba12ec84883a78441fa6835c1d0042 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Fri, 31 Jul 2015 11:07:34 -0700
Subject: [PATCH 516/549] [SQL] address comments for to_date/trunc

This PR address the comments in #7805

cc rxin

Author: Davies Liu <davies@databricks.com>

Closes #7817 from davies/trunc and squashes the following commits:

f729d5f [Davies Liu] rollback
cb7f7832 [Davies Liu] genCode() is protected
31e52ef [Davies Liu] fix style
ed1edc7 [Davies Liu] address comments for #7805
---
 .../catalyst/expressions/datetimeFunctions.scala  | 15 ++++++++-------
 .../spark/sql/catalyst/util/DateTimeUtils.scala   |  3 ++-
 .../expressions/ExpressionEvalHelper.scala        |  4 +---
 .../scala/org/apache/spark/sql/functions.scala    |  3 +++
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
index 6e7613340c032..07dea5b470b5f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeFunctions.scala
@@ -726,15 +726,16 @@ case class TruncDate(date: Expression, format: Expression)
   override def dataType: DataType = DateType
   override def prettyName: String = "trunc"
 
-  lazy val minItemConst = DateTimeUtils.parseTruncLevel(format.eval().asInstanceOf[UTF8String])
+  private lazy val truncLevel: Int =
+    DateTimeUtils.parseTruncLevel(format.eval().asInstanceOf[UTF8String])
 
   override def eval(input: InternalRow): Any = {
-    val minItem = if (format.foldable) {
-      minItemConst
+    val level = if (format.foldable) {
+      truncLevel
     } else {
       DateTimeUtils.parseTruncLevel(format.eval().asInstanceOf[UTF8String])
     }
-    if (minItem == -1) {
+    if (level == -1) {
       // unknown format
       null
     } else {
@@ -742,7 +743,7 @@ case class TruncDate(date: Expression, format: Expression)
       if (d == null) {
         null
       } else {
-        DateTimeUtils.truncDate(d.asInstanceOf[Int], minItem)
+        DateTimeUtils.truncDate(d.asInstanceOf[Int], level)
       }
     }
   }
@@ -751,7 +752,7 @@ case class TruncDate(date: Expression, format: Expression)
     val dtu = DateTimeUtils.getClass.getName.stripSuffix("$")
 
     if (format.foldable) {
-      if (minItemConst == -1) {
+      if (truncLevel == -1) {
         s"""
           boolean ${ev.isNull} = true;
           ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
@@ -763,7 +764,7 @@ case class TruncDate(date: Expression, format: Expression)
           boolean ${ev.isNull} = ${d.isNull};
           ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)};
           if (!${ev.isNull}) {
-            ${ev.primitive} = $dtu.truncDate(${d.primitive}, $minItemConst);
+            ${ev.primitive} = $dtu.truncDate(${d.primitive}, $truncLevel);
           }
         """
       }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 5a7c25b8d508d..032ed8a56a50e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -794,7 +794,8 @@ object DateTimeUtils {
     } else if (level == TRUNC_TO_MONTH) {
       d - DateTimeUtils.getDayOfMonth(d) + 1
     } else {
-      throw new Exception(s"Invalid trunc level: $level")
+      // caller make sure that this should never be reached
+      sys.error(s"Invalid trunc level: $level")
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
index 3c05e5c3b833c..a41185b4d8754 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala
@@ -18,11 +18,9 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.scalactic.TripleEqualsSupport.Spread
-import org.scalatest.Matchers._
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer
 import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 46dc4605a5ccb..5d82a5eadd94d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2192,6 +2192,9 @@ object functions {
   /**
    * Returns date truncated to the unit specified by the format.
    *
+   * @param format: 'year', 'yyyy', 'yy' for truncate by year,
+   *               or 'month', 'mon', 'mm' for truncate by month
+   *
    * @group datetime_funcs
    * @since 1.5.0
    */

From 6add4eddb39e7748a87da3e921ea3c7881d30a82 Mon Sep 17 00:00:00 2001
From: Alexander Ulanov <nashb@yandex.ru>
Date: Fri, 31 Jul 2015 11:22:40 -0700
Subject: [PATCH 517/549] [SPARK-9471] [ML] Multilayer Perceptron

This pull request contains the following feature for ML:
   - Multilayer Perceptron classifier

This implementation is based on our initial pull request with bgreeven: https://github.com/apache/spark/pull/1290 and inspired by very insightful suggestions from mengxr and witgo (I would like to thank all other people from the mentioned thread for useful discussions). The original code was extensively tested and benchmarked. Since then, I've addressed two main requirements that prevented the code from merging into the main branch:
   - Extensible interface, so it will be easy to implement new types of networks
     - Main building blocks are traits `Layer` and `LayerModel`. They are used for constructing layers of ANN. New layers can be added by extending the `Layer` and `LayerModel` traits. These traits are private in this release in order to save path to improve them based on community feedback
     - Back propagation is implemented in general form, so there is no need to change it (optimization algorithm) when new layers are implemented
   - Speed and scalability: this implementation has to be comparable in terms of speed to the state of the art single node implementations.
     - The developed benchmark for large ANN shows that the proposed code is on par with C++ CPU implementation and scales nicely with the number of workers. Details can be found here: https://github.com/avulanov/ann-benchmark

   - DBN and RBM by witgo https://github.com/witgo/spark/tree/ann-interface-gemm-dbn
   - Dropout https://github.com/avulanov/spark/tree/ann-interface-gemm

mengxr and dbtsai kindly agreed to perform code review.

Author: Alexander Ulanov <nashb@yandex.ru>
Author: Bert Greevenbosch <opensrc@bertgreevenbosch.nl>

Closes #7621 from avulanov/SPARK-2352-ann and squashes the following commits:

4806b6f [Alexander Ulanov] Addressing reviewers comments.
a7e7951 [Alexander Ulanov] Default blockSize: 100. Added documentation to blockSize parameter and DataStacker class
f69bb3d [Alexander Ulanov] Addressing reviewers comments.
374bea6 [Alexander Ulanov] Moving ANN to ML package. GradientDescent constructor is now spark private.
43b0ae2 [Alexander Ulanov] Addressing reviewers comments. Adding multiclass test.
9d18469 [Alexander Ulanov] Addressing reviewers comments: unnecessary copy of data in predict
35125ab [Alexander Ulanov] Style fix in tests
e191301 [Alexander Ulanov] Apache header
a226133 [Alexander Ulanov] Multilayer Perceptron regressor and classifier
---
 .../org/apache/spark/ml/ann/BreezeUtil.scala  |  63 ++
 .../scala/org/apache/spark/ml/ann/Layer.scala | 882 ++++++++++++++++++
 .../MultilayerPerceptronClassifier.scala      | 193 ++++
 .../org/apache/spark/ml/param/params.scala    |   5 +
 .../mllib/optimization/GradientDescent.scala  |   2 +-
 .../org/apache/spark/ml/ann/ANNSuite.scala    |  91 ++
 .../MultilayerPerceptronClassifierSuite.scala |  91 ++
 7 files changed, 1326 insertions(+), 1 deletion(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/ann/ANNSuite.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala b/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
new file mode 100644
index 0000000000000..7429f9d652ac5
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.ann
+
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
+import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
+
+/**
+ * In-place DGEMM and DGEMV for Breeze
+ */
+private[ann] object BreezeUtil {
+
+  // TODO: switch to MLlib BLAS interface
+  private def transposeString(a: BDM[Double]): String = if (a.isTranspose) "T" else "N"
+
+  /**
+   * DGEMM: C := alpha * A * B + beta * C
+   * @param alpha alpha
+   * @param a A
+   * @param b B
+   * @param beta beta
+   * @param c C
+   */
+  def dgemm(alpha: Double, a: BDM[Double], b: BDM[Double], beta: Double, c: BDM[Double]): Unit = {
+    // TODO: add code if matrices isTranspose!!!
+    require(a.cols == b.rows, "A & B Dimension mismatch!")
+    require(a.rows == c.rows, "A & C Dimension mismatch!")
+    require(b.cols == c.cols, "A & C Dimension mismatch!")
+    NativeBLAS.dgemm(transposeString(a), transposeString(b), c.rows, c.cols, a.cols,
+      alpha, a.data, a.offset, a.majorStride, b.data, b.offset, b.majorStride,
+      beta, c.data, c.offset, c.rows)
+  }
+
+  /**
+   * DGEMV: y := alpha * A * x + beta * y
+   * @param alpha alpha
+   * @param a A
+   * @param x x
+   * @param beta beta
+   * @param y y
+   */
+  def dgemv(alpha: Double, a: BDM[Double], x: BDV[Double], beta: Double, y: BDV[Double]): Unit = {
+    require(a.cols == x.length, "A & b Dimension mismatch!")
+    NativeBLAS.dgemv(transposeString(a), a.rows, a.cols,
+      alpha, a.data, a.offset, a.majorStride, x.data, x.offset, x.stride,
+      beta, y.data, y.offset, y.stride)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala b/mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala
new file mode 100644
index 0000000000000..b5258ff348477
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala
@@ -0,0 +1,882 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.ann
+
+import breeze.linalg.{*, DenseMatrix => BDM, DenseVector => BDV, Vector => BV, axpy => Baxpy,
+  sum => Bsum}
+import breeze.numerics.{log => Blog, sigmoid => Bsigmoid}
+
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.optimization._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.util.random.XORShiftRandom
+
+/**
+ * Trait that holds Layer properties, that are needed to instantiate it.
+ * Implements Layer instantiation.
+ *
+ */
+private[ann] trait Layer extends Serializable {
+  /**
+   * Returns the instance of the layer based on weights provided
+   * @param weights vector with layer weights
+   * @param position position of weights in the vector
+   * @return the layer model
+   */
+  def getInstance(weights: Vector, position: Int): LayerModel
+
+  /**
+   * Returns the instance of the layer with random generated weights
+   * @param seed seed
+   * @return the layer model
+   */
+  def getInstance(seed: Long): LayerModel
+}
+
+/**
+ * Trait that holds Layer weights (or parameters).
+ * Implements functions needed for forward propagation, computing delta and gradient.
+ * Can return weights in Vector format.
+ */
+private[ann] trait LayerModel extends Serializable {
+  /**
+   * number of weights
+   */
+  val size: Int
+
+  /**
+   * Evaluates the data (process the data through the layer)
+   * @param data data
+   * @return processed data
+   */
+  def eval(data: BDM[Double]): BDM[Double]
+
+  /**
+   * Computes the delta for back propagation
+   * @param nextDelta delta of the next layer
+   * @param input input data
+   * @return delta
+   */
+  def prevDelta(nextDelta: BDM[Double], input: BDM[Double]): BDM[Double]
+
+  /**
+   * Computes the gradient
+   * @param delta delta for this layer
+   * @param input input data
+   * @return gradient
+   */
+  def grad(delta: BDM[Double], input: BDM[Double]): Array[Double]
+
+  /**
+   * Returns weights for the layer in a single vector
+   * @return layer weights
+   */
+  def weights(): Vector
+}
+
+/**
+ * Layer properties of affine transformations, that is y=A*x+b
+ * @param numIn number of inputs
+ * @param numOut number of outputs
+ */
+private[ann] class AffineLayer(val numIn: Int, val numOut: Int) extends Layer {
+
+  override def getInstance(weights: Vector, position: Int): LayerModel = {
+    AffineLayerModel(this, weights, position)
+  }
+
+  override def getInstance(seed: Long = 11L): LayerModel = {
+    AffineLayerModel(this, seed)
+  }
+}
+
+/**
+ * Model of Affine layer y=A*x+b
+ * @param w weights (matrix A)
+ * @param b bias (vector b)
+ */
+private[ann] class AffineLayerModel private(w: BDM[Double], b: BDV[Double]) extends LayerModel {
+  val size = w.size + b.length
+  val gwb = new Array[Double](size)
+  private lazy val gw: BDM[Double] = new BDM[Double](w.rows, w.cols, gwb)
+  private lazy val gb: BDV[Double] = new BDV[Double](gwb, w.size)
+  private var z: BDM[Double] = null
+  private var d: BDM[Double] = null
+  private var ones: BDV[Double] = null
+
+  override def eval(data: BDM[Double]): BDM[Double] = {
+    if (z == null || z.cols != data.cols) z = new BDM[Double](w.rows, data.cols)
+    z(::, *) := b
+    BreezeUtil.dgemm(1.0, w, data, 1.0, z)
+    z
+  }
+
+  override def prevDelta(nextDelta: BDM[Double], input: BDM[Double]): BDM[Double] = {
+    if (d == null || d.cols != nextDelta.cols) d = new BDM[Double](w.cols, nextDelta.cols)
+    BreezeUtil.dgemm(1.0, w.t, nextDelta, 0.0, d)
+    d
+  }
+
+  override def grad(delta: BDM[Double], input: BDM[Double]): Array[Double] = {
+    BreezeUtil.dgemm(1.0 / input.cols, delta, input.t, 0.0, gw)
+    if (ones == null || ones.length != delta.cols) ones = BDV.ones[Double](delta.cols)
+    BreezeUtil.dgemv(1.0 / input.cols, delta, ones, 0.0, gb)
+    gwb
+  }
+
+  override def weights(): Vector = AffineLayerModel.roll(w, b)
+}
+
+/**
+ * Fabric for Affine layer models
+ */
+private[ann] object AffineLayerModel {
+
+  /**
+   * Creates a model of Affine layer
+   * @param layer layer properties
+   * @param weights vector with weights
+   * @param position position of weights in the vector
+   * @return model of Affine layer
+   */
+  def apply(layer: AffineLayer, weights: Vector, position: Int): AffineLayerModel = {
+    val (w, b) = unroll(weights, position, layer.numIn, layer.numOut)
+    new AffineLayerModel(w, b)
+  }
+
+  /**
+   * Creates a model of Affine layer
+   * @param layer layer properties
+   * @param seed seed
+   * @return model of Affine layer
+   */
+  def apply(layer: AffineLayer, seed: Long): AffineLayerModel = {
+    val (w, b) = randomWeights(layer.numIn, layer.numOut, seed)
+    new AffineLayerModel(w, b)
+  }
+
+  /**
+   * Unrolls the weights from the vector
+   * @param weights vector with weights
+   * @param position position of weights for this layer
+   * @param numIn number of layer inputs
+   * @param numOut number of layer outputs
+   * @return matrix A and vector b
+   */
+  def unroll(
+    weights: Vector,
+    position: Int,
+    numIn: Int,
+    numOut: Int): (BDM[Double], BDV[Double]) = {
+    val weightsCopy = weights.toArray
+    // TODO: the array is not copied to BDMs, make sure this is OK!
+    val a = new BDM[Double](numOut, numIn, weightsCopy, position)
+    val b = new BDV[Double](weightsCopy, position + (numOut * numIn), 1, numOut)
+    (a, b)
+  }
+
+  /**
+   * Roll the layer weights into a vector
+   * @param a matrix A
+   * @param b vector b
+   * @return vector of weights
+   */
+  def roll(a: BDM[Double], b: BDV[Double]): Vector = {
+    val result = new Array[Double](a.size + b.length)
+    // TODO: make sure that we need to copy!
+    System.arraycopy(a.toArray, 0, result, 0, a.size)
+    System.arraycopy(b.toArray, 0, result, a.size, b.length)
+    Vectors.dense(result)
+  }
+
+  /**
+   * Generate random weights for the layer
+   * @param numIn number of inputs
+   * @param numOut number of outputs
+   * @param seed seed
+   * @return (matrix A, vector b)
+   */
+  def randomWeights(numIn: Int, numOut: Int, seed: Long = 11L): (BDM[Double], BDV[Double]) = {
+    val rand: XORShiftRandom = new XORShiftRandom(seed)
+    val weights = BDM.fill[Double](numOut, numIn){ (rand.nextDouble * 4.8 - 2.4) / numIn }
+    val bias = BDV.fill[Double](numOut){ (rand.nextDouble * 4.8 - 2.4) / numIn }
+    (weights, bias)
+  }
+}
+
+/**
+ * Trait for functions and their derivatives for functional layers
+ */
+private[ann] trait ActivationFunction extends Serializable {
+
+  /**
+   * Implements a function
+   * @param x input data
+   * @param y output data
+   */
+  def eval(x: BDM[Double], y: BDM[Double]): Unit
+
+  /**
+   * Implements a derivative of a function (needed for the back propagation)
+   * @param x input data
+   * @param y output data
+   */
+  def derivative(x: BDM[Double], y: BDM[Double]): Unit
+
+  /**
+   * Implements a cross entropy error of a function.
+   * Needed if the functional layer that contains this function is the output layer
+   * of the network.
+   * @param target target output
+   * @param output computed output
+   * @param result intermediate result
+   * @return cross-entropy
+   */
+  def crossEntropy(target: BDM[Double], output: BDM[Double], result: BDM[Double]): Double
+
+  /**
+   * Implements a mean squared error of a function
+   * @param target target output
+   * @param output computed output
+   * @param result intermediate result
+   * @return mean squared error
+   */
+  def squared(target: BDM[Double], output: BDM[Double], result: BDM[Double]): Double
+}
+
+/**
+ * Implements in-place application of functions
+ */
+private[ann] object ActivationFunction {
+
+  def apply(x: BDM[Double], y: BDM[Double], func: Double => Double): Unit = {
+    var i = 0
+    while (i < x.rows) {
+      var j = 0
+      while (j < x.cols) {
+        y(i, j) = func(x(i, j))
+        j += 1
+      }
+      i += 1
+    }
+  }
+
+  def apply(
+    x1: BDM[Double],
+    x2: BDM[Double],
+    y: BDM[Double],
+    func: (Double, Double) => Double): Unit = {
+    var i = 0
+    while (i < x1.rows) {
+      var j = 0
+      while (j < x1.cols) {
+        y(i, j) = func(x1(i, j), x2(i, j))
+        j += 1
+      }
+      i += 1
+    }
+  }
+}
+
+/**
+ * Implements SoftMax activation function
+ */
+private[ann] class SoftmaxFunction extends ActivationFunction {
+  override def eval(x: BDM[Double], y: BDM[Double]): Unit = {
+    var j = 0
+    // find max value to make sure later that exponent is computable
+    while (j < x.cols) {
+      var i = 0
+      var max = Double.MinValue
+      while (i < x.rows) {
+        if (x(i, j) > max) {
+          max = x(i, j)
+        }
+        i += 1
+      }
+      var sum = 0.0
+      i = 0
+      while (i < x.rows) {
+        val res = Math.exp(x(i, j) - max)
+        y(i, j) = res
+        sum += res
+        i += 1
+      }
+      i = 0
+      while (i < x.rows) {
+        y(i, j) /= sum
+        i += 1
+      }
+      j += 1
+    }
+  }
+
+  override def crossEntropy(
+    output: BDM[Double],
+    target: BDM[Double],
+    result: BDM[Double]): Double = {
+    def m(o: Double, t: Double): Double = o - t
+    ActivationFunction(output, target, result, m)
+    -Bsum( target :* Blog(output)) / output.cols
+  }
+
+  override def derivative(x: BDM[Double], y: BDM[Double]): Unit = {
+    def sd(z: Double): Double = (1 - z) * z
+    ActivationFunction(x, y, sd)
+  }
+
+  override def squared(output: BDM[Double], target: BDM[Double], result: BDM[Double]): Double = {
+    throw new UnsupportedOperationException("Sorry, squared error is not defined for SoftMax.")
+  }
+}
+
+/**
+ * Implements Sigmoid activation function
+ */
+private[ann] class SigmoidFunction extends ActivationFunction {
+  override def eval(x: BDM[Double], y: BDM[Double]): Unit = {
+    def s(z: Double): Double = Bsigmoid(z)
+    ActivationFunction(x, y, s)
+  }
+
+  override def crossEntropy(
+    output: BDM[Double],
+    target: BDM[Double],
+    result: BDM[Double]): Double = {
+    def m(o: Double, t: Double): Double = o - t
+    ActivationFunction(output, target, result, m)
+    -Bsum(target :* Blog(output)) / output.cols
+  }
+
+  override def derivative(x: BDM[Double], y: BDM[Double]): Unit = {
+    def sd(z: Double): Double = (1 - z) * z
+    ActivationFunction(x, y, sd)
+  }
+
+  override def squared(output: BDM[Double], target: BDM[Double], result: BDM[Double]): Double = {
+    // TODO: make it readable
+    def m(o: Double, t: Double): Double = (o - t)
+    ActivationFunction(output, target, result, m)
+    val e = Bsum(result :* result) / 2 / output.cols
+    def m2(x: Double, o: Double) = x * (o - o * o)
+    ActivationFunction(result, output, result, m2)
+    e
+  }
+}
+
+/**
+ * Functional layer properties, y = f(x)
+ * @param activationFunction activation function
+ */
+private[ann] class FunctionalLayer (val activationFunction: ActivationFunction) extends Layer {
+  override def getInstance(weights: Vector, position: Int): LayerModel = getInstance(0L)
+
+  override def getInstance(seed: Long): LayerModel =
+    FunctionalLayerModel(this)
+}
+
+/**
+ * Functional layer model. Holds no weights.
+ * @param activationFunction activation function
+ */
+private[ann] class FunctionalLayerModel private (val activationFunction: ActivationFunction)
+  extends LayerModel {
+  val size = 0
+  // matrices for in-place computations
+  // outputs
+  private var f: BDM[Double] = null
+  // delta
+  private var d: BDM[Double] = null
+  // matrix for error computation
+  private var e: BDM[Double] = null
+  // delta gradient
+  private lazy val dg = new Array[Double](0)
+
+  override def eval(data: BDM[Double]): BDM[Double] = {
+    if (f == null || f.cols != data.cols) f = new BDM[Double](data.rows, data.cols)
+    activationFunction.eval(data, f)
+    f
+  }
+
+  override def prevDelta(nextDelta: BDM[Double], input: BDM[Double]): BDM[Double] = {
+    if (d == null || d.cols != nextDelta.cols) d = new BDM[Double](nextDelta.rows, nextDelta.cols)
+    activationFunction.derivative(input, d)
+    d :*= nextDelta
+    d
+  }
+
+  override def grad(delta: BDM[Double], input: BDM[Double]): Array[Double] = dg
+
+  override def weights(): Vector = Vectors.dense(new Array[Double](0))
+
+  def crossEntropy(output: BDM[Double], target: BDM[Double]): (BDM[Double], Double) = {
+    if (e == null || e.cols != output.cols) e = new BDM[Double](output.rows, output.cols)
+    val error = activationFunction.crossEntropy(output, target, e)
+    (e, error)
+  }
+
+  def squared(output: BDM[Double], target: BDM[Double]): (BDM[Double], Double) = {
+    if (e == null || e.cols != output.cols) e = new BDM[Double](output.rows, output.cols)
+    val error = activationFunction.squared(output, target, e)
+    (e, error)
+  }
+
+  def error(output: BDM[Double], target: BDM[Double]): (BDM[Double], Double) = {
+    // TODO: allow user pick error
+    activationFunction match {
+      case sigmoid: SigmoidFunction => squared(output, target)
+      case softmax: SoftmaxFunction => crossEntropy(output, target)
+    }
+  }
+}
+
+/**
+ * Fabric of functional layer models
+ */
+private[ann] object FunctionalLayerModel {
+  def apply(layer: FunctionalLayer): FunctionalLayerModel =
+    new FunctionalLayerModel(layer.activationFunction)
+}
+
+/**
+ * Trait for the artificial neural network (ANN) topology properties
+ */
+private[ann] trait Topology extends Serializable{
+  def getInstance(weights: Vector): TopologyModel
+  def getInstance(seed: Long): TopologyModel
+}
+
+/**
+ * Trait for ANN topology model
+ */
+private[ann] trait TopologyModel extends Serializable{
+  /**
+   * Forward propagation
+   * @param data input data
+   * @return array of outputs for each of the layers
+   */
+  def forward(data: BDM[Double]): Array[BDM[Double]]
+
+  /**
+   * Prediction of the model
+   * @param data input data
+   * @return prediction
+   */
+  def predict(data: Vector): Vector
+
+  /**
+   * Computes gradient for the network
+   * @param data input data
+   * @param target target output
+   * @param cumGradient cumulative gradient
+   * @param blockSize block size
+   * @return error
+   */
+  def computeGradient(data: BDM[Double], target: BDM[Double], cumGradient: Vector,
+                      blockSize: Int): Double
+
+  /**
+   * Returns the weights of the ANN
+   * @return weights
+   */
+  def weights(): Vector
+}
+
+/**
+ * Feed forward ANN
+ * @param layers
+ */
+private[ann] class FeedForwardTopology private(val layers: Array[Layer]) extends Topology {
+  override def getInstance(weights: Vector): TopologyModel = FeedForwardModel(this, weights)
+
+  override def getInstance(seed: Long): TopologyModel = FeedForwardModel(this, seed)
+}
+
+/**
+ * Factory for some of the frequently-used topologies
+ */
+private[ml] object FeedForwardTopology {
+  /**
+   * Creates a feed forward topology from the array of layers
+   * @param layers array of layers
+   * @return feed forward topology
+   */
+  def apply(layers: Array[Layer]): FeedForwardTopology = {
+    new FeedForwardTopology(layers)
+  }
+
+  /**
+   * Creates a multi-layer perceptron
+   * @param layerSizes sizes of layers including input and output size
+   * @param softmax wether to use SoftMax or Sigmoid function for an output layer.
+   *                Softmax is default
+   * @return multilayer perceptron topology
+   */
+  def multiLayerPerceptron(layerSizes: Array[Int], softmax: Boolean = true): FeedForwardTopology = {
+    val layers = new Array[Layer]((layerSizes.length - 1) * 2)
+    for(i <- 0 until layerSizes.length - 1){
+      layers(i * 2) = new AffineLayer(layerSizes(i), layerSizes(i + 1))
+      layers(i * 2 + 1) =
+        if (softmax && i == layerSizes.length - 2) {
+          new FunctionalLayer(new SoftmaxFunction())
+        } else {
+          new FunctionalLayer(new SigmoidFunction())
+        }
+    }
+    FeedForwardTopology(layers)
+  }
+}
+
+/**
+ * Model of Feed Forward Neural Network.
+ * Implements forward, gradient computation and can return weights in vector format.
+ * @param layerModels models of layers
+ * @param topology topology of the network
+ */
+private[ml] class FeedForwardModel private(
+    val layerModels: Array[LayerModel],
+    val topology: FeedForwardTopology) extends TopologyModel {
+  override def forward(data: BDM[Double]): Array[BDM[Double]] = {
+    val outputs = new Array[BDM[Double]](layerModels.length)
+    outputs(0) = layerModels(0).eval(data)
+    for (i <- 1 until layerModels.length) {
+      outputs(i) = layerModels(i).eval(outputs(i-1))
+    }
+    outputs
+  }
+
+  override def computeGradient(
+    data: BDM[Double],
+    target: BDM[Double],
+    cumGradient: Vector,
+    realBatchSize: Int): Double = {
+    val outputs = forward(data)
+    val deltas = new Array[BDM[Double]](layerModels.length)
+    val L = layerModels.length - 1
+    val (newE, newError) = layerModels.last match {
+      case flm: FunctionalLayerModel => flm.error(outputs.last, target)
+      case _ =>
+        throw new UnsupportedOperationException("Non-functional layer not supported at the top")
+    }
+    deltas(L) = new BDM[Double](0, 0)
+    deltas(L - 1) = newE
+    for (i <- (L - 2) to (0, -1)) {
+      deltas(i) = layerModels(i + 1).prevDelta(deltas(i + 1), outputs(i + 1))
+    }
+    val grads = new Array[Array[Double]](layerModels.length)
+    for (i <- 0 until layerModels.length) {
+      val input = if (i==0) data else outputs(i - 1)
+      grads(i) = layerModels(i).grad(deltas(i), input)
+    }
+    // update cumGradient
+    val cumGradientArray = cumGradient.toArray
+    var offset = 0
+    // TODO: extract roll
+    for (i <- 0 until grads.length) {
+      val gradArray = grads(i)
+      var k = 0
+      while (k < gradArray.length) {
+        cumGradientArray(offset + k) += gradArray(k)
+        k += 1
+      }
+      offset += gradArray.length
+    }
+    newError
+  }
+
+  // TODO: do we really need to copy the weights? they should be read-only
+  override def weights(): Vector = {
+    // TODO: extract roll
+    var size = 0
+    for (i <- 0 until layerModels.length) {
+      size += layerModels(i).size
+    }
+    val array = new Array[Double](size)
+    var offset = 0
+    for (i <- 0 until layerModels.length) {
+      val layerWeights = layerModels(i).weights().toArray
+      System.arraycopy(layerWeights, 0, array, offset, layerWeights.length)
+      offset += layerWeights.length
+    }
+    Vectors.dense(array)
+  }
+
+  override def predict(data: Vector): Vector = {
+    val size = data.size
+    val result = forward(new BDM[Double](size, 1, data.toArray))
+    Vectors.dense(result.last.toArray)
+  }
+}
+
+/**
+ * Fabric for feed forward ANN models
+ */
+private[ann] object FeedForwardModel {
+
+  /**
+   * Creates a model from a topology and weights
+   * @param topology topology
+   * @param weights weights
+   * @return model
+   */
+  def apply(topology: FeedForwardTopology, weights: Vector): FeedForwardModel = {
+    val layers = topology.layers
+    val layerModels = new Array[LayerModel](layers.length)
+    var offset = 0
+    for (i <- 0 until layers.length) {
+      layerModels(i) = layers(i).getInstance(weights, offset)
+      offset += layerModels(i).size
+    }
+    new FeedForwardModel(layerModels, topology)
+  }
+
+  /**
+   * Creates a model given a topology and seed
+   * @param topology topology
+   * @param seed seed for generating the weights
+   * @return model
+   */
+  def apply(topology: FeedForwardTopology, seed: Long = 11L): FeedForwardModel = {
+    val layers = topology.layers
+    val layerModels = new Array[LayerModel](layers.length)
+    var offset = 0
+    for(i <- 0 until layers.length){
+      layerModels(i) = layers(i).getInstance(seed)
+      offset += layerModels(i).size
+    }
+    new FeedForwardModel(layerModels, topology)
+  }
+}
+
+/**
+ * Neural network gradient. Does nothing but calling Model's gradient
+ * @param topology topology
+ * @param dataStacker data stacker
+ */
+private[ann] class ANNGradient(topology: Topology, dataStacker: DataStacker) extends Gradient {
+
+  override def compute(data: Vector, label: Double, weights: Vector): (Vector, Double) = {
+    val gradient = Vectors.zeros(weights.size)
+    val loss = compute(data, label, weights, gradient)
+    (gradient, loss)
+  }
+
+  override def compute(
+    data: Vector,
+    label: Double,
+    weights: Vector,
+    cumGradient: Vector): Double = {
+    val (input, target, realBatchSize) = dataStacker.unstack(data)
+    val model = topology.getInstance(weights)
+    model.computeGradient(input, target, cumGradient, realBatchSize)
+  }
+}
+
+/**
+ * Stacks pairs of training samples (input, output) in one vector allowing them to pass
+ * through Optimizer/Gradient interfaces. If stackSize is more than one, makes blocks
+ * or matrices of inputs and outputs and then stack them in one vector.
+ * This can be used for further batch computations after unstacking.
+ * @param stackSize stack size
+ * @param inputSize size of the input vectors
+ * @param outputSize size of the output vectors
+ */
+private[ann] class DataStacker(stackSize: Int, inputSize: Int, outputSize: Int)
+  extends Serializable {
+
+  /**
+   * Stacks the data
+   * @param data RDD of vector pairs
+   * @return RDD of double (always zero) and vector that contains the stacked vectors
+   */
+  def stack(data: RDD[(Vector, Vector)]): RDD[(Double, Vector)] = {
+    val stackedData = if (stackSize == 1) {
+      data.map { v =>
+        (0.0,
+          Vectors.fromBreeze(BDV.vertcat(
+            v._1.toBreeze.toDenseVector,
+            v._2.toBreeze.toDenseVector))
+          ) }
+    } else {
+      data.mapPartitions { it =>
+        it.grouped(stackSize).map { seq =>
+          val size = seq.size
+          val bigVector = new Array[Double](inputSize * size + outputSize * size)
+          var i = 0
+          seq.foreach { case (in, out) =>
+            System.arraycopy(in.toArray, 0, bigVector, i * inputSize, inputSize)
+            System.arraycopy(out.toArray, 0, bigVector,
+              inputSize * size + i * outputSize, outputSize)
+            i += 1
+          }
+          (0.0, Vectors.dense(bigVector))
+        }
+      }
+    }
+    stackedData
+  }
+
+  /**
+   * Unstack the stacked vectors into matrices for batch operations
+   * @param data stacked vector
+   * @return pair of matrices holding input and output data and the real stack size
+   */
+  def unstack(data: Vector): (BDM[Double], BDM[Double], Int) = {
+    val arrData = data.toArray
+    val realStackSize = arrData.length / (inputSize + outputSize)
+    val input = new BDM(inputSize, realStackSize, arrData)
+    val target = new BDM(outputSize, realStackSize, arrData, inputSize * realStackSize)
+    (input, target, realStackSize)
+  }
+}
+
+/**
+ * Simple updater
+ */
+private[ann] class ANNUpdater extends Updater {
+
+  override def compute(
+    weightsOld: Vector,
+    gradient: Vector,
+    stepSize: Double,
+    iter: Int,
+    regParam: Double): (Vector, Double) = {
+    val thisIterStepSize = stepSize
+    val brzWeights: BV[Double] = weightsOld.toBreeze.toDenseVector
+    Baxpy(-thisIterStepSize, gradient.toBreeze, brzWeights)
+    (Vectors.fromBreeze(brzWeights), 0)
+  }
+}
+
+/**
+ * MLlib-style trainer class that trains a network given the data and topology
+ * @param topology topology of ANN
+ * @param inputSize input size
+ * @param outputSize output size
+ */
+private[ml] class FeedForwardTrainer(
+    topology: Topology,
+    val inputSize: Int,
+    val outputSize: Int) extends Serializable {
+
+  // TODO: what if we need to pass random seed?
+  private var _weights = topology.getInstance(11L).weights()
+  private var _stackSize = 128
+  private var dataStacker = new DataStacker(_stackSize, inputSize, outputSize)
+  private var _gradient: Gradient = new ANNGradient(topology, dataStacker)
+  private var _updater: Updater = new ANNUpdater()
+  private var optimizer: Optimizer = LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(100)
+
+  /**
+   * Returns weights
+   * @return weights
+   */
+  def getWeights: Vector = _weights
+
+  /**
+   * Sets weights
+   * @param value weights
+   * @return trainer
+   */
+  def setWeights(value: Vector): FeedForwardTrainer = {
+    _weights = value
+    this
+  }
+
+  /**
+   * Sets the stack size
+   * @param value stack size
+   * @return trainer
+   */
+  def setStackSize(value: Int): FeedForwardTrainer = {
+    _stackSize = value
+    dataStacker = new DataStacker(value, inputSize, outputSize)
+    this
+  }
+
+  /**
+   * Sets the SGD optimizer
+   * @return SGD optimizer
+   */
+  def SGDOptimizer: GradientDescent = {
+    val sgd = new GradientDescent(_gradient, _updater)
+    optimizer = sgd
+    sgd
+  }
+
+  /**
+   * Sets the LBFGS optimizer
+   * @return LBGS optimizer
+   */
+  def LBFGSOptimizer: LBFGS = {
+    val lbfgs = new LBFGS(_gradient, _updater)
+    optimizer = lbfgs
+    lbfgs
+  }
+
+  /**
+   * Sets the updater
+   * @param value updater
+   * @return trainer
+   */
+  def setUpdater(value: Updater): FeedForwardTrainer = {
+    _updater = value
+    updateUpdater(value)
+    this
+  }
+
+  /**
+   * Sets the gradient
+   * @param value gradient
+   * @return trainer
+   */
+  def setGradient(value: Gradient): FeedForwardTrainer = {
+    _gradient = value
+    updateGradient(value)
+    this
+  }
+
+  private[this] def updateGradient(gradient: Gradient): Unit = {
+    optimizer match {
+      case lbfgs: LBFGS => lbfgs.setGradient(gradient)
+      case sgd: GradientDescent => sgd.setGradient(gradient)
+      case other => throw new UnsupportedOperationException(
+        s"Only LBFGS and GradientDescent are supported but got ${other.getClass}.")
+    }
+  }
+
+  private[this] def updateUpdater(updater: Updater): Unit = {
+    optimizer match {
+      case lbfgs: LBFGS => lbfgs.setUpdater(updater)
+      case sgd: GradientDescent => sgd.setUpdater(updater)
+      case other => throw new UnsupportedOperationException(
+        s"Only LBFGS and GradientDescent are supported but got ${other.getClass}.")
+    }
+  }
+
+  /**
+   * Trains the ANN
+   * @param data RDD of input and output vector pairs
+   * @return model
+   */
+  def train(data: RDD[(Vector, Vector)]): TopologyModel = {
+    val newWeights = optimizer.optimize(dataStacker.stack(data), getWeights)
+    topology.getInstance(newWeights)
+  }
+
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
new file mode 100644
index 0000000000000..8cd2103d7d5e6
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.classification
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.param.shared.{HasTol, HasMaxIter, HasSeed}
+import org.apache.spark.ml.{PredictorParams, PredictionModel, Predictor}
+import org.apache.spark.ml.param.{IntParam, ParamValidators, IntArrayParam, ParamMap}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.ml.ann.{FeedForwardTrainer, FeedForwardTopology}
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.sql.DataFrame
+
+/** Params for Multilayer Perceptron. */
+private[ml] trait MultilayerPerceptronParams extends PredictorParams
+  with HasSeed with HasMaxIter with HasTol {
+  /**
+   * Layer sizes including input size and output size.
+   * @group param
+   */
+  final val layers: IntArrayParam = new IntArrayParam(this, "layers",
+    "Sizes of layers from input layer to output layer" +
+      " E.g., Array(780, 100, 10) means 780 inputs, " +
+      "one hidden layer with 100 neurons and output layer of 10 neurons.",
+    // TODO: how to check ALSO that all elements are greater than 0?
+    ParamValidators.arrayLengthGt(1)
+  )
+
+  /** @group setParam */
+  def setLayers(value: Array[Int]): this.type = set(layers, value)
+
+  /** @group getParam */
+  final def getLayers: Array[Int] = $(layers)
+
+  /**
+   * Block size for stacking input data in matrices to speed up the computation.
+   * Data is stacked within partitions. If block size is more than remaining data in
+   * a partition then it is adjusted to the size of this data.
+   * Recommended size is between 10 and 1000.
+   * @group expertParam
+   */
+  final val blockSize: IntParam = new IntParam(this, "blockSize",
+    "Block size for stacking input data in matrices. Data is stacked within partitions." +
+      " If block size is more than remaining data in a partition then " +
+      "it is adjusted to the size of this data. Recommended size is between 10 and 1000",
+    ParamValidators.gt(0))
+
+  /** @group setParam */
+  def setBlockSize(value: Int): this.type = set(blockSize, value)
+
+  /** @group getParam */
+  final def getBlockSize: Int = $(blockSize)
+
+  /**
+   * Set the maximum number of iterations.
+   * Default is 100.
+   * @group setParam
+   */
+  def setMaxIter(value: Int): this.type = set(maxIter, value)
+
+  /**
+   * Set the convergence tolerance of iterations.
+   * Smaller value will lead to higher accuracy with the cost of more iterations.
+   * Default is 1E-4.
+   * @group setParam
+   */
+  def setTol(value: Double): this.type = set(tol, value)
+
+  /**
+   * Set the seed for weights initialization.
+   * @group setParam
+   */
+  def setSeed(value: Long): this.type = set(seed, value)
+
+  setDefault(maxIter -> 100, tol -> 1e-4, layers -> Array(1, 1), blockSize -> 128)
+}
+
+/** Label to vector converter. */
+private object LabelConverter {
+  // TODO: Use OneHotEncoder instead
+  /**
+   * Encodes a label as a vector.
+   * Returns a vector of given length with zeroes at all positions
+   * and value 1.0 at the position that corresponds to the label.
+   *
+   * @param labeledPoint labeled point
+   * @param labelCount total number of labels
+   * @return pair of features and vector encoding of a label
+   */
+  def encodeLabeledPoint(labeledPoint: LabeledPoint, labelCount: Int): (Vector, Vector) = {
+    val output = Array.fill(labelCount)(0.0)
+    output(labeledPoint.label.toInt) = 1.0
+    (labeledPoint.features, Vectors.dense(output))
+  }
+
+  /**
+   * Converts a vector to a label.
+   * Returns the position of the maximal element of a vector.
+   *
+   * @param output label encoded with a vector
+   * @return label
+   */
+  def decodeLabel(output: Vector): Double = {
+    output.argmax.toDouble
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Classifier trainer based on the Multilayer Perceptron.
+ * Each layer has sigmoid activation function, output layer has softmax.
+ * Number of inputs has to be equal to the size of feature vectors.
+ * Number of outputs has to be equal to the total number of labels.
+ *
+ */
+@Experimental
+class MultilayerPerceptronClassifier(override val uid: String)
+  extends Predictor[Vector, MultilayerPerceptronClassifier, MultilayerPerceptronClassifierModel]
+  with MultilayerPerceptronParams {
+
+  def this() = this(Identifiable.randomUID("mlpc"))
+
+  override def copy(extra: ParamMap): MultilayerPerceptronClassifier = defaultCopy(extra)
+
+  /**
+   * Train a model using the given dataset and parameters.
+   * Developers can implement this instead of [[fit()]] to avoid dealing with schema validation
+   * and copying parameters into the model.
+   *
+   * @param dataset Training dataset
+   * @return Fitted model
+   */
+  override protected def train(dataset: DataFrame): MultilayerPerceptronClassifierModel = {
+    val myLayers = $(layers)
+    val labels = myLayers.last
+    val lpData = extractLabeledPoints(dataset)
+    val data = lpData.map(lp => LabelConverter.encodeLabeledPoint(lp, labels))
+    val topology = FeedForwardTopology.multiLayerPerceptron(myLayers, true)
+    val FeedForwardTrainer = new FeedForwardTrainer(topology, myLayers(0), myLayers.last)
+    FeedForwardTrainer.LBFGSOptimizer.setConvergenceTol($(tol)).setNumIterations($(maxIter))
+    FeedForwardTrainer.setStackSize($(blockSize))
+    val mlpModel = FeedForwardTrainer.train(data)
+    new MultilayerPerceptronClassifierModel(uid, myLayers, mlpModel.weights())
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Classifier model based on the Multilayer Perceptron.
+ * Each layer has sigmoid activation function, output layer has softmax.
+ * @param uid uid
+ * @param layers array of layer sizes including input and output layers
+ * @param weights vector of initial weights for the model that consists of the weights of layers
+ * @return prediction model
+ */
+@Experimental
+class MultilayerPerceptronClassifierModel private[ml] (
+    override val uid: String,
+    layers: Array[Int],
+    weights: Vector)
+  extends PredictionModel[Vector, MultilayerPerceptronClassifierModel]
+  with Serializable {
+
+  private val mlpModel = FeedForwardTopology.multiLayerPerceptron(layers, true).getInstance(weights)
+
+  /**
+   * Predict label for the given features.
+   * This internal method is used to implement [[transform()]] and output [[predictionCol]].
+   */
+  override protected def predict(features: Vector): Double = {
+    LabelConverter.decodeLabel(mlpModel.predict(features))
+  }
+
+  override def copy(extra: ParamMap): MultilayerPerceptronClassifierModel = {
+    copyValues(new MultilayerPerceptronClassifierModel(uid, layers, weights), extra)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 954aa17e26a02..d68f5ff0053c9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -166,6 +166,11 @@ object ParamValidators {
   def inArray[T](allowed: java.util.List[T]): T => Boolean = { (value: T) =>
     allowed.contains(value)
   }
+
+  /** Check that the array length is greater than lowerBound. */
+  def arrayLengthGt[T](lowerBound: Double): Array[T] => Boolean = { (value: Array[T]) =>
+    value.length > lowerBound
+  }
 }
 
 // specialize primitive-typed params because Java doesn't recognize scala.Double, scala.Int, ...
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
index ab7611fd077ef..8f0d1e4aa010a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
@@ -32,7 +32,7 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector}
  * @param gradient Gradient function to be used.
  * @param updater Updater to be used to update weights after every iteration.
  */
-class GradientDescent private[mllib] (private var gradient: Gradient, private var updater: Updater)
+class GradientDescent private[spark] (private var gradient: Gradient, private var updater: Updater)
   extends Optimizer with Logging {
 
   private var stepSize: Double = 1.0
diff --git a/mllib/src/test/scala/org/apache/spark/ml/ann/ANNSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/ann/ANNSuite.scala
new file mode 100644
index 0000000000000..1292e57d7c01a
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/ann/ANNSuite.scala
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.ann
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+
+
+class ANNSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  // TODO: test for weights comparison with Weka MLP
+  test("ANN with Sigmoid learns XOR function with LBFGS optimizer") {
+    val inputs = Array(
+      Array(0.0, 0.0),
+      Array(0.0, 1.0),
+      Array(1.0, 0.0),
+      Array(1.0, 1.0)
+    )
+    val outputs = Array(0.0, 1.0, 1.0, 0.0)
+    val data = inputs.zip(outputs).map { case (features, label) =>
+      (Vectors.dense(features), Vectors.dense(label))
+    }
+    val rddData = sc.parallelize(data, 1)
+    val hiddenLayersTopology = Array(5)
+    val dataSample = rddData.first()
+    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
+    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
+    val initialWeights = FeedForwardModel(topology, 23124).weights()
+    val trainer = new FeedForwardTrainer(topology, 2, 1)
+    trainer.setWeights(initialWeights)
+    trainer.LBFGSOptimizer.setNumIterations(20)
+    val model = trainer.train(rddData)
+    val predictionAndLabels = rddData.map { case (input, label) =>
+      (model.predict(input)(0), label(0))
+    }.collect()
+    predictionAndLabels.foreach { case (p, l) =>
+      assert(math.round(p) === l)
+    }
+  }
+
+  test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") {
+    val inputs = Array(
+      Array(0.0, 0.0),
+      Array(0.0, 1.0),
+      Array(1.0, 0.0),
+      Array(1.0, 1.0)
+    )
+    val outputs = Array(
+      Array(1.0, 0.0),
+      Array(0.0, 1.0),
+      Array(0.0, 1.0),
+      Array(1.0, 0.0)
+    )
+    val data = inputs.zip(outputs).map { case (features, label) =>
+      (Vectors.dense(features), Vectors.dense(label))
+    }
+    val rddData = sc.parallelize(data, 1)
+    val hiddenLayersTopology = Array(5)
+    val dataSample = rddData.first()
+    val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size
+    val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false)
+    val initialWeights = FeedForwardModel(topology, 23124).weights()
+    val trainer = new FeedForwardTrainer(topology, 2, 2)
+    trainer.SGDOptimizer.setNumIterations(2000)
+    trainer.setWeights(initialWeights)
+    val model = trainer.train(rddData)
+    val predictionAndLabels = rddData.map { case (input, label) =>
+      (model.predict(input), label)
+    }.collect()
+    predictionAndLabels.foreach { case (p, l) =>
+      assert(p ~== l absTol 0.5)
+    }
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
new file mode 100644
index 0000000000000..ddc948f65df45
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.classification
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.classification.LogisticRegressionSuite._
+import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
+import org.apache.spark.mllib.evaluation.MulticlassMetrics
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+import org.apache.spark.sql.Row
+
+class MultilayerPerceptronClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  test("XOR function learning as binary classification problem with two outputs.") {
+    val dataFrame = sqlContext.createDataFrame(Seq(
+        (Vectors.dense(0.0, 0.0), 0.0),
+        (Vectors.dense(0.0, 1.0), 1.0),
+        (Vectors.dense(1.0, 0.0), 1.0),
+        (Vectors.dense(1.0, 1.0), 0.0))
+    ).toDF("features", "label")
+    val layers = Array[Int](2, 5, 2)
+    val trainer = new MultilayerPerceptronClassifier()
+      .setLayers(layers)
+      .setBlockSize(1)
+      .setSeed(11L)
+      .setMaxIter(100)
+    val model = trainer.fit(dataFrame)
+    val result = model.transform(dataFrame)
+    val predictionAndLabels = result.select("prediction", "label").collect()
+    predictionAndLabels.foreach { case Row(p: Double, l: Double) =>
+      assert(p == l)
+    }
+  }
+
+  // TODO: implement a more rigorous test
+  test("3 class classification with 2 hidden layers") {
+    val nPoints = 1000
+
+    // The following weights are taken from OneVsRestSuite.scala
+    // they represent 3-class iris dataset
+    val weights = Array(
+      -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
+      -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)
+
+    val xMean = Array(5.843, 3.057, 3.758, 1.199)
+    val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
+    val rdd = sc.parallelize(generateMultinomialLogisticInput(
+      weights, xMean, xVariance, true, nPoints, 42), 2)
+    val dataFrame = sqlContext.createDataFrame(rdd).toDF("label", "features")
+    val numClasses = 3
+    val numIterations = 100
+    val layers = Array[Int](4, 5, 4, numClasses)
+    val trainer = new MultilayerPerceptronClassifier()
+      .setLayers(layers)
+      .setBlockSize(1)
+      .setSeed(11L)
+      .setMaxIter(numIterations)
+    val model = trainer.fit(dataFrame)
+    val mlpPredictionAndLabels = model.transform(dataFrame).select("prediction", "label")
+      .map { case Row(p: Double, l: Double) => (p, l) }
+    // train multinomial logistic regression
+    val lr = new LogisticRegressionWithLBFGS()
+      .setIntercept(true)
+      .setNumClasses(numClasses)
+    lr.optimizer.setRegParam(0.0)
+      .setNumIterations(numIterations)
+    val lrModel = lr.run(rdd)
+    val lrPredictionAndLabels = lrModel.predict(rdd.map(_.features)).zip(rdd.map(_.label))
+    // MLP's predictions should not differ a lot from LR's.
+    val lrMetrics = new MulticlassMetrics(lrPredictionAndLabels)
+    val mlpMetrics = new MulticlassMetrics(mlpPredictionAndLabels)
+    assert(mlpMetrics.confusionMatrix ~== lrMetrics.confusionMatrix absTol 100)
+  }
+}

From 4011a947154d97a9ffb5a71f077481a12534d36b Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Fri, 31 Jul 2015 11:50:15 -0700
Subject: [PATCH 518/549] [SPARK-9231] [MLLIB] DistributedLDAModel method for
 top topics per document

jira: https://issues.apache.org/jira/browse/SPARK-9231

Helper method in DistributedLDAModel of this form:
```
/**
 * For each document, return the top k weighted topics for that document.
 * return RDD of (doc ID, topic indices, topic weights)
 */
def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])]
```

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #7785 from hhbyyh/topTopicsPerdoc and squashes the following commits:

30ad153 [Yuhao Yang] small fix
fd24580 [Yuhao Yang] add topTopics per document to DistributedLDAModel
---
 .../spark/mllib/clustering/LDAModel.scala     | 19 ++++++++++++++++++-
 .../spark/mllib/clustering/LDASuite.scala     | 13 ++++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 6cfad3fbbdb87..82281a0daf008 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.clustering
 
-import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum}
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argtopk, normalize, sum}
 import breeze.numerics.{exp, lgamma}
 import org.apache.hadoop.fs.Path
 import org.json4s.DefaultFormats
@@ -591,6 +591,23 @@ class DistributedLDAModel private[clustering] (
     JavaPairRDD.fromRDD(topicDistributions.asInstanceOf[RDD[(java.lang.Long, Vector)]])
   }
 
+  /**
+   * For each document, return the top k weighted topics for that document and their weights.
+   * @return RDD of (doc ID, topic indices, topic weights)
+   */
+  def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = {
+    graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) =>
+      val topIndices = argtopk(topicCounts, k)
+      val sumCounts = sum(topicCounts)
+      val weights = if (sumCounts != 0) {
+        topicCounts(topIndices) / sumCounts
+      } else {
+        topicCounts(topIndices)
+      }
+      (docID.toLong, topIndices.toArray, weights.toArray)
+    }
+  }
+
   // TODO:
   // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ???
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index c43e1e575c09c..695ee3b82efc5 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.clustering
 
-import breeze.linalg.{DenseMatrix => BDM, max, argmax}
+import breeze.linalg.{DenseMatrix => BDM, argtopk, max, argmax}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.graphx.Edge
@@ -108,6 +108,17 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(topicDistribution.toArray.sum ~== 1.0 absTol 1e-5)
     }
 
+    val top2TopicsPerDoc = model.topTopicsPerDocument(2).map(t => (t._1, (t._2, t._3)))
+    model.topicDistributions.join(top2TopicsPerDoc).collect().foreach {
+      case (docId, (topicDistribution, (indices, weights))) =>
+        assert(indices.length == 2)
+        assert(weights.length == 2)
+        val bdvTopicDist = topicDistribution.toBreeze
+        val top2Indices = argtopk(bdvTopicDist, 2)
+        assert(top2Indices.toArray === indices)
+        assert(bdvTopicDist(top2Indices).toArray === weights)
+    }
+
     // Check: log probabilities
     assert(model.logLikelihood < 0.0)
     assert(model.logPrior < 0.0)

From e8bdcdeabb2df139a656f86686cdb53c891b1f4b Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 31 Jul 2015 11:56:52 -0700
Subject: [PATCH 519/549] [SPARK-6885] [ML] decision tree support predict class
 probabilities

Decision tree support predict class probabilities.
Implement the prediction probabilities function referred the old DecisionTree API and the [sklean API](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/tree.py#L593).
I make the DecisionTreeClassificationModel inherit from ProbabilisticClassificationModel, make the predictRaw to return the raw counts vector and make raw2probabilityInPlace/predictProbability return the probabilities for each prediction.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7694 from yanboliang/spark-6885 and squashes the following commits:

08d5b7f [Yanbo Liang] fix ImpurityStats null parameters and raw2probabilityInPlace sum = 0 issue
2174278 [Yanbo Liang] solve merge conflicts
7e90ba8 [Yanbo Liang] fix typos
33ae183 [Yanbo Liang] fix annotation
ff043d3 [Yanbo Liang] raw2probabilityInPlace should operate in-place
c32d6ce [Yanbo Liang] optimize calculateImpurityStats function again
6167fb0 [Yanbo Liang] optimize calculateImpurityStats function
fbbe2ec [Yanbo Liang] eliminate duplicated struct and code
beb1634 [Yanbo Liang] try to eliminate impurityStats for each LearningNode
99e8943 [Yanbo Liang] code optimization
5ec3323 [Yanbo Liang] implement InformationGainAndImpurityStats
227c91b [Yanbo Liang] refactor LearningNode to store ImpurityCalculator
d746ffc [Yanbo Liang] decision tree support predict class probabilities
---
 .../DecisionTreeClassifier.scala              |  40 ++++--
 .../ml/classification/GBTClassifier.scala     |   2 +-
 .../RandomForestClassifier.scala              |   2 +-
 .../ml/regression/DecisionTreeRegressor.scala |   2 +-
 .../spark/ml/regression/GBTRegressor.scala    |   2 +-
 .../ml/regression/RandomForestRegressor.scala |   2 +-
 .../scala/org/apache/spark/ml/tree/Node.scala |  80 ++++++-----
 .../spark/ml/tree/impl/RandomForest.scala     | 126 ++++++++----------
 .../spark/mllib/tree/impurity/Entropy.scala   |   2 +-
 .../spark/mllib/tree/impurity/Gini.scala      |   2 +-
 .../spark/mllib/tree/impurity/Impurity.scala  |   2 +-
 .../spark/mllib/tree/impurity/Variance.scala  |   2 +-
 .../tree/model/InformationGainStats.scala     |  61 ++++++++-
 .../DecisionTreeClassifierSuite.scala         |  30 ++++-
 .../classification/GBTClassifierSuite.scala   |   2 +-
 .../RandomForestClassifierSuite.scala         |   2 +-
 16 files changed, 229 insertions(+), 130 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 36fe1bd40469c..f27cfd0331419 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -18,12 +18,11 @@
 package org.apache.spark.ml.classification
 
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeClassifierParams}
 import org.apache.spark.ml.tree.impl.RandomForest
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => OldStrategy}
 import org.apache.spark.mllib.tree.model.{DecisionTreeModel => OldDecisionTreeModel}
@@ -39,7 +38,7 @@ import org.apache.spark.sql.DataFrame
  */
 @Experimental
 final class DecisionTreeClassifier(override val uid: String)
-  extends Predictor[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel]
+  extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel]
   with DecisionTreeParams with TreeClassifierParams {
 
   def this() = this(Identifiable.randomUID("dtc"))
@@ -106,8 +105,9 @@ object DecisionTreeClassifier {
 @Experimental
 final class DecisionTreeClassificationModel private[ml] (
     override val uid: String,
-    override val rootNode: Node)
-  extends PredictionModel[Vector, DecisionTreeClassificationModel]
+    override val rootNode: Node,
+    override val numClasses: Int)
+  extends ProbabilisticClassificationModel[Vector, DecisionTreeClassificationModel]
   with DecisionTreeModel with Serializable {
 
   require(rootNode != null,
@@ -117,14 +117,36 @@ final class DecisionTreeClassificationModel private[ml] (
    * Construct a decision tree classification model.
    * @param rootNode  Root node of tree, with other nodes attached.
    */
-  def this(rootNode: Node) = this(Identifiable.randomUID("dtc"), rootNode)
+  def this(rootNode: Node, numClasses: Int) =
+    this(Identifiable.randomUID("dtc"), rootNode, numClasses)
 
   override protected def predict(features: Vector): Double = {
-    rootNode.predict(features)
+    rootNode.predictImpl(features).prediction
+  }
+
+  override protected def predictRaw(features: Vector): Vector = {
+    Vectors.dense(rootNode.predictImpl(features).impurityStats.stats.clone())
+  }
+
+  override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
+    rawPrediction match {
+      case dv: DenseVector =>
+        var i = 0
+        val size = dv.size
+        val sum = dv.values.sum
+        while (i < size) {
+          dv.values(i) = if (sum != 0) dv.values(i) / sum else 0.0
+          i += 1
+        }
+        dv
+      case sv: SparseVector =>
+        throw new RuntimeException("Unexpected error in DecisionTreeClassificationModel:" +
+          " raw2probabilityInPlace encountered SparseVector")
+    }
   }
 
   override def copy(extra: ParamMap): DecisionTreeClassificationModel = {
-    copyValues(new DecisionTreeClassificationModel(uid, rootNode), extra)
+    copyValues(new DecisionTreeClassificationModel(uid, rootNode, numClasses), extra)
   }
 
   override def toString: String = {
@@ -149,6 +171,6 @@ private[ml] object DecisionTreeClassificationModel {
         s" DecisionTreeClassificationModel (new API).  Algo is: ${oldModel.algo}")
     val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures)
     val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtc")
-    new DecisionTreeClassificationModel(uid, rootNode)
+    new DecisionTreeClassificationModel(uid, rootNode, -1)
   }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index eb0b1a0a405fc..c3891a9599262 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -190,7 +190,7 @@ final class GBTClassificationModel(
   override protected def predict(features: Vector): Double = {
     // TODO: When we add a generic Boosting class, handle transform there?  SPARK-7129
     // Classifies by thresholding sum of weighted tree predictions
-    val treePredictions = _trees.map(_.rootNode.predict(features))
+    val treePredictions = _trees.map(_.rootNode.predictImpl(features).prediction)
     val prediction = blas.ddot(numTrees, treePredictions, 1, _treeWeights, 1)
     if (prediction > 0.0) 1.0 else 0.0
   }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index bc19bd6df894f..0c7eb4a662fdb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -160,7 +160,7 @@ final class RandomForestClassificationModel private[ml] (
     // Ignore the weights since all are 1.0 for now.
     val votes = new Array[Double](numClasses)
     _trees.view.foreach { tree =>
-      val prediction = tree.rootNode.predict(features).toInt
+      val prediction = tree.rootNode.predictImpl(features).prediction.toInt
       votes(prediction) = votes(prediction) + 1.0 // 1.0 = weight
     }
     Vectors.dense(votes)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index 6f3340c2f02be..4d30e4b5548aa 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -110,7 +110,7 @@ final class DecisionTreeRegressionModel private[ml] (
   def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode)
 
   override protected def predict(features: Vector): Double = {
-    rootNode.predict(features)
+    rootNode.predictImpl(features).prediction
   }
 
   override def copy(extra: ParamMap): DecisionTreeRegressionModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index e38dc73ee0ba7..5633bc320273a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -180,7 +180,7 @@ final class GBTRegressionModel(
   override protected def predict(features: Vector): Double = {
     // TODO: When we add a generic Boosting class, handle transform there?  SPARK-7129
     // Classifies by thresholding sum of weighted tree predictions
-    val treePredictions = _trees.map(_.rootNode.predict(features))
+    val treePredictions = _trees.map(_.rootNode.predictImpl(features).prediction)
     blas.ddot(numTrees, treePredictions, 1, _treeWeights, 1)
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 506a878c2553b..17fb1ad5e15d4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -143,7 +143,7 @@ final class RandomForestRegressionModel private[ml] (
     // TODO: When we add a generic Bagging class, handle transform there.  SPARK-7128
     // Predict average of tree predictions.
     // Ignore the weights since all are 1.0 for now.
-    _trees.map(_.rootNode.predict(features)).sum / numTrees
+    _trees.map(_.rootNode.predictImpl(features).prediction).sum / numTrees
   }
 
   override def copy(extra: ParamMap): RandomForestRegressionModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
index bbc2427ca7d3d..8879352a600a9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
@@ -19,8 +19,9 @@ package org.apache.spark.ml.tree
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.tree.impurity.ImpurityCalculator
 import org.apache.spark.mllib.tree.model.{InformationGainStats => OldInformationGainStats,
-  Node => OldNode, Predict => OldPredict}
+  Node => OldNode, Predict => OldPredict, ImpurityStats}
 
 /**
  * :: DeveloperApi ::
@@ -38,8 +39,15 @@ sealed abstract class Node extends Serializable {
   /** Impurity measure at this node (for training data) */
   def impurity: Double
 
+  /**
+   * Statistics aggregated from training data at this node, used to compute prediction, impurity,
+   * and probabilities.
+   * For classification, the array of class counts must be normalized to a probability distribution.
+   */
+  private[tree] def impurityStats: ImpurityCalculator
+
   /** Recursive prediction helper method */
-  private[ml] def predict(features: Vector): Double = prediction
+  private[ml] def predictImpl(features: Vector): LeafNode
 
   /**
    * Get the number of nodes in tree below this node, including leaf nodes.
@@ -75,7 +83,8 @@ private[ml] object Node {
     if (oldNode.isLeaf) {
       // TODO: Once the implementation has been moved to this API, then include sufficient
       //       statistics here.
-      new LeafNode(prediction = oldNode.predict.predict, impurity = oldNode.impurity)
+      new LeafNode(prediction = oldNode.predict.predict,
+        impurity = oldNode.impurity, impurityStats = null)
     } else {
       val gain = if (oldNode.stats.nonEmpty) {
         oldNode.stats.get.gain
@@ -85,7 +94,7 @@ private[ml] object Node {
       new InternalNode(prediction = oldNode.predict.predict, impurity = oldNode.impurity,
         gain = gain, leftChild = fromOld(oldNode.leftNode.get, categoricalFeatures),
         rightChild = fromOld(oldNode.rightNode.get, categoricalFeatures),
-        split = Split.fromOld(oldNode.split.get, categoricalFeatures))
+        split = Split.fromOld(oldNode.split.get, categoricalFeatures), impurityStats = null)
     }
   }
 }
@@ -99,11 +108,13 @@ private[ml] object Node {
 @DeveloperApi
 final class LeafNode private[ml] (
     override val prediction: Double,
-    override val impurity: Double) extends Node {
+    override val impurity: Double,
+    override val impurityStats: ImpurityCalculator) extends Node {
 
-  override def toString: String = s"LeafNode(prediction = $prediction, impurity = $impurity)"
+  override def toString: String =
+    s"LeafNode(prediction = $prediction, impurity = $impurity)"
 
-  override private[ml] def predict(features: Vector): Double = prediction
+  override private[ml] def predictImpl(features: Vector): LeafNode = this
 
   override private[tree] def numDescendants: Int = 0
 
@@ -115,9 +126,8 @@ final class LeafNode private[ml] (
   override private[tree] def subtreeDepth: Int = 0
 
   override private[ml] def toOld(id: Int): OldNode = {
-    // NOTE: We do NOT store 'prob' in the new API currently.
-    new OldNode(id, new OldPredict(prediction, prob = 0.0), impurity, isLeaf = true,
-      None, None, None, None)
+    new OldNode(id, new OldPredict(prediction, prob = impurityStats.prob(prediction)),
+      impurity, isLeaf = true, None, None, None, None)
   }
 }
 
@@ -139,17 +149,18 @@ final class InternalNode private[ml] (
     val gain: Double,
     val leftChild: Node,
     val rightChild: Node,
-    val split: Split) extends Node {
+    val split: Split,
+    override val impurityStats: ImpurityCalculator) extends Node {
 
   override def toString: String = {
     s"InternalNode(prediction = $prediction, impurity = $impurity, split = $split)"
   }
 
-  override private[ml] def predict(features: Vector): Double = {
+  override private[ml] def predictImpl(features: Vector): LeafNode = {
     if (split.shouldGoLeft(features)) {
-      leftChild.predict(features)
+      leftChild.predictImpl(features)
     } else {
-      rightChild.predict(features)
+      rightChild.predictImpl(features)
     }
   }
 
@@ -172,9 +183,8 @@ final class InternalNode private[ml] (
   override private[ml] def toOld(id: Int): OldNode = {
     assert(id.toLong * 2 < Int.MaxValue, "Decision Tree could not be converted from new to old API"
       + " since the old API does not support deep trees.")
-    // NOTE: We do NOT store 'prob' in the new API currently.
-    new OldNode(id, new OldPredict(prediction, prob = 0.0), impurity, isLeaf = false,
-      Some(split.toOld), Some(leftChild.toOld(OldNode.leftChildIndex(id))),
+    new OldNode(id, new OldPredict(prediction, prob = impurityStats.prob(prediction)), impurity,
+      isLeaf = false, Some(split.toOld), Some(leftChild.toOld(OldNode.leftChildIndex(id))),
       Some(rightChild.toOld(OldNode.rightChildIndex(id))),
       Some(new OldInformationGainStats(gain, impurity, leftChild.impurity, rightChild.impurity,
         new OldPredict(leftChild.prediction, prob = 0.0),
@@ -223,36 +233,36 @@ private object InternalNode {
  *
  * @param id  We currently use the same indexing as the old implementation in
  *            [[org.apache.spark.mllib.tree.model.Node]], but this will change later.
- * @param predictionStats  Predicted label + class probability (for classification).
- *                         We will later modify this to store aggregate statistics for labels
- *                         to provide all class probabilities (for classification) and maybe a
- *                         distribution (for regression).
  * @param isLeaf  Indicates whether this node will definitely be a leaf in the learned tree,
  *                so that we do not need to consider splitting it further.
- * @param stats  Old structure for storing stats about information gain, prediction, etc.
- *               This is legacy and will be modified in the future.
+ * @param stats  Impurity statistics for this node.
  */
 private[tree] class LearningNode(
     var id: Int,
-    var predictionStats: OldPredict,
-    var impurity: Double,
     var leftChild: Option[LearningNode],
     var rightChild: Option[LearningNode],
     var split: Option[Split],
     var isLeaf: Boolean,
-    var stats: Option[OldInformationGainStats]) extends Serializable {
+    var stats: ImpurityStats) extends Serializable {
 
   /**
    * Convert this [[LearningNode]] to a regular [[Node]], and recurse on any children.
    */
   def toNode: Node = {
     if (leftChild.nonEmpty) {
-      assert(rightChild.nonEmpty && split.nonEmpty && stats.nonEmpty,
+      assert(rightChild.nonEmpty && split.nonEmpty && stats != null,
         "Unknown error during Decision Tree learning.  Could not convert LearningNode to Node.")
-      new InternalNode(predictionStats.predict, impurity, stats.get.gain,
-        leftChild.get.toNode, rightChild.get.toNode, split.get)
+      new InternalNode(stats.impurityCalculator.predict, stats.impurity, stats.gain,
+        leftChild.get.toNode, rightChild.get.toNode, split.get, stats.impurityCalculator)
     } else {
-      new LeafNode(predictionStats.predict, impurity)
+      if (stats.valid) {
+        new LeafNode(stats.impurityCalculator.predict, stats.impurity,
+          stats.impurityCalculator)
+      } else {
+        // Here we want to keep same behavior with the old mllib.DecisionTreeModel
+        new LeafNode(stats.impurityCalculator.predict, -1.0, stats.impurityCalculator)
+      }
+
     }
   }
 
@@ -263,16 +273,14 @@ private[tree] object LearningNode {
   /** Create a node with some of its fields set. */
   def apply(
       id: Int,
-      predictionStats: OldPredict,
-      impurity: Double,
-      isLeaf: Boolean): LearningNode = {
-    new LearningNode(id, predictionStats, impurity, None, None, None, false, None)
+      isLeaf: Boolean,
+      stats: ImpurityStats): LearningNode = {
+    new LearningNode(id, None, None, None, false, stats)
   }
 
   /** Create an empty node with the given node index.  Values must be set later on. */
   def emptyNode(nodeIndex: Int): LearningNode = {
-    new LearningNode(nodeIndex, new OldPredict(Double.NaN, Double.NaN), Double.NaN,
-      None, None, None, false, None)
+    new LearningNode(nodeIndex, None, None, None, false, null)
   }
 
   // The below indexing methods were copied from spark.mllib.tree.model.Node
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
index 15b56bd844bad..a8b90d9d266a1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
@@ -31,7 +31,7 @@ import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => O
 import org.apache.spark.mllib.tree.impl.{BaggedPoint, DTStatsAggregator, DecisionTreeMetadata,
   TimeTracker}
 import org.apache.spark.mllib.tree.impurity.ImpurityCalculator
-import org.apache.spark.mllib.tree.model.{InformationGainStats, Predict}
+import org.apache.spark.mllib.tree.model.ImpurityStats
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.random.{SamplingUtils, XORShiftRandom}
@@ -180,13 +180,17 @@ private[ml] object RandomForest extends Logging {
     parentUID match {
       case Some(uid) =>
         if (strategy.algo == OldAlgo.Classification) {
-          topNodes.map(rootNode => new DecisionTreeClassificationModel(uid, rootNode.toNode))
+          topNodes.map { rootNode =>
+            new DecisionTreeClassificationModel(uid, rootNode.toNode, strategy.getNumClasses)
+          }
         } else {
           topNodes.map(rootNode => new DecisionTreeRegressionModel(uid, rootNode.toNode))
         }
       case None =>
         if (strategy.algo == OldAlgo.Classification) {
-          topNodes.map(rootNode => new DecisionTreeClassificationModel(rootNode.toNode))
+          topNodes.map { rootNode =>
+            new DecisionTreeClassificationModel(rootNode.toNode, strategy.getNumClasses)
+          }
         } else {
           topNodes.map(rootNode => new DecisionTreeRegressionModel(rootNode.toNode))
         }
@@ -549,9 +553,9 @@ private[ml] object RandomForest extends Logging {
         }
 
         // find best split for each node
-        val (split: Split, stats: InformationGainStats, predict: Predict) =
+        val (split: Split, stats: ImpurityStats) =
           binsToBestSplit(aggStats, splits, featuresForNode, nodes(nodeIndex))
-        (nodeIndex, (split, stats, predict))
+        (nodeIndex, (split, stats))
     }.collectAsMap()
 
     timer.stop("chooseSplits")
@@ -568,17 +572,15 @@ private[ml] object RandomForest extends Logging {
         val nodeIndex = node.id
         val nodeInfo = treeToNodeToIndexInfo(treeIndex)(nodeIndex)
         val aggNodeIndex = nodeInfo.nodeIndexInGroup
-        val (split: Split, stats: InformationGainStats, predict: Predict) =
+        val (split: Split, stats: ImpurityStats) =
           nodeToBestSplits(aggNodeIndex)
         logDebug("best split = " + split)
 
         // Extract info for this node.  Create children if not leaf.
         val isLeaf =
           (stats.gain <= 0) || (LearningNode.indexToLevel(nodeIndex) == metadata.maxDepth)
-        node.predictionStats = predict
         node.isLeaf = isLeaf
-        node.stats = Some(stats)
-        node.impurity = stats.impurity
+        node.stats = stats
         logDebug("Node = " + node)
 
         if (!isLeaf) {
@@ -587,9 +589,9 @@ private[ml] object RandomForest extends Logging {
           val leftChildIsLeaf = childIsLeaf || (stats.leftImpurity == 0.0)
           val rightChildIsLeaf = childIsLeaf || (stats.rightImpurity == 0.0)
           node.leftChild = Some(LearningNode(LearningNode.leftChildIndex(nodeIndex),
-            stats.leftPredict, stats.leftImpurity, leftChildIsLeaf))
+            leftChildIsLeaf, ImpurityStats.getEmptyImpurityStats(stats.leftImpurityCalculator)))
           node.rightChild = Some(LearningNode(LearningNode.rightChildIndex(nodeIndex),
-            stats.rightPredict, stats.rightImpurity, rightChildIsLeaf))
+            rightChildIsLeaf, ImpurityStats.getEmptyImpurityStats(stats.rightImpurityCalculator)))
 
           if (nodeIdCache.nonEmpty) {
             val nodeIndexUpdater = NodeIndexUpdater(
@@ -621,28 +623,44 @@ private[ml] object RandomForest extends Logging {
   }
 
   /**
-   * Calculate the information gain for a given (feature, split) based upon left/right aggregates.
+   * Calculate the impurity statistics for a give (feature, split) based upon left/right aggregates.
+   * @param stats the recycle impurity statistics for this feature's all splits,
+   *              only 'impurity' and 'impurityCalculator' are valid between each iteration
    * @param leftImpurityCalculator left node aggregates for this (feature, split)
    * @param rightImpurityCalculator right node aggregate for this (feature, split)
-   * @return information gain and statistics for split
+   * @param metadata learning and dataset metadata for DecisionTree
+   * @return Impurity statistics for this (feature, split)
    */
-  private def calculateGainForSplit(
+  private def calculateImpurityStats(
+      stats: ImpurityStats,
       leftImpurityCalculator: ImpurityCalculator,
       rightImpurityCalculator: ImpurityCalculator,
-      metadata: DecisionTreeMetadata,
-      impurity: Double): InformationGainStats = {
+      metadata: DecisionTreeMetadata): ImpurityStats = {
+
+    val parentImpurityCalculator: ImpurityCalculator = if (stats == null) {
+      leftImpurityCalculator.copy.add(rightImpurityCalculator)
+    } else {
+      stats.impurityCalculator
+    }
+
+    val impurity: Double = if (stats == null) {
+      parentImpurityCalculator.calculate()
+    } else {
+      stats.impurity
+    }
+
     val leftCount = leftImpurityCalculator.count
     val rightCount = rightImpurityCalculator.count
 
+    val totalCount = leftCount + rightCount
+
     // If left child or right child doesn't satisfy minimum instances per node,
     // then this split is invalid, return invalid information gain stats.
     if ((leftCount < metadata.minInstancesPerNode) ||
       (rightCount < metadata.minInstancesPerNode)) {
-      return InformationGainStats.invalidInformationGainStats
+      return ImpurityStats.getInvalidImpurityStats(parentImpurityCalculator)
     }
 
-    val totalCount = leftCount + rightCount
-
     val leftImpurity = leftImpurityCalculator.calculate() // Note: This equals 0 if count = 0
     val rightImpurity = rightImpurityCalculator.calculate()
 
@@ -654,39 +672,11 @@ private[ml] object RandomForest extends Logging {
     // if information gain doesn't satisfy minimum information gain,
     // then this split is invalid, return invalid information gain stats.
     if (gain < metadata.minInfoGain) {
-      return InformationGainStats.invalidInformationGainStats
+      return ImpurityStats.getInvalidImpurityStats(parentImpurityCalculator)
     }
 
-    // calculate left and right predict
-    val leftPredict = calculatePredict(leftImpurityCalculator)
-    val rightPredict = calculatePredict(rightImpurityCalculator)
-
-    new InformationGainStats(gain, impurity, leftImpurity, rightImpurity,
-      leftPredict, rightPredict)
-  }
-
-  private def calculatePredict(impurityCalculator: ImpurityCalculator): Predict = {
-    val predict = impurityCalculator.predict
-    val prob = impurityCalculator.prob(predict)
-    new Predict(predict, prob)
-  }
-
-  /**
-   * Calculate predict value for current node, given stats of any split.
-   * Note that this function is called only once for each node.
-   * @param leftImpurityCalculator left node aggregates for a split
-   * @param rightImpurityCalculator right node aggregates for a split
-   * @return predict value and impurity for current node
-   */
-  private def calculatePredictImpurity(
-      leftImpurityCalculator: ImpurityCalculator,
-      rightImpurityCalculator: ImpurityCalculator): (Predict, Double) = {
-    val parentNodeAgg = leftImpurityCalculator.copy
-    parentNodeAgg.add(rightImpurityCalculator)
-    val predict = calculatePredict(parentNodeAgg)
-    val impurity = parentNodeAgg.calculate()
-
-    (predict, impurity)
+    new ImpurityStats(gain, impurity, parentImpurityCalculator,
+      leftImpurityCalculator, rightImpurityCalculator)
   }
 
   /**
@@ -698,14 +688,14 @@ private[ml] object RandomForest extends Logging {
       binAggregates: DTStatsAggregator,
       splits: Array[Array[Split]],
       featuresForNode: Option[Array[Int]],
-      node: LearningNode): (Split, InformationGainStats, Predict) = {
+      node: LearningNode): (Split, ImpurityStats) = {
 
-    // Calculate prediction and impurity if current node is top node
+    // Calculate InformationGain and ImpurityStats if current node is top node
     val level = LearningNode.indexToLevel(node.id)
-    var predictionAndImpurity: Option[(Predict, Double)] = if (level == 0) {
-      None
+    var gainAndImpurityStats: ImpurityStats = if (level ==0) {
+      null
     } else {
-      Some((node.predictionStats, node.impurity))
+      node.stats
     }
 
     // For each (feature, split), calculate the gain, and select the best (feature, split).
@@ -734,11 +724,9 @@ private[ml] object RandomForest extends Logging {
               val rightChildStats =
                 binAggregates.getImpurityCalculator(nodeFeatureOffset, numSplits)
               rightChildStats.subtract(leftChildStats)
-              predictionAndImpurity = Some(predictionAndImpurity.getOrElse(
-                calculatePredictImpurity(leftChildStats, rightChildStats)))
-              val gainStats = calculateGainForSplit(leftChildStats,
-                rightChildStats, binAggregates.metadata, predictionAndImpurity.get._2)
-              (splitIdx, gainStats)
+              gainAndImpurityStats = calculateImpurityStats(gainAndImpurityStats,
+                leftChildStats, rightChildStats, binAggregates.metadata)
+              (splitIdx, gainAndImpurityStats)
             }.maxBy(_._2.gain)
           (splits(featureIndex)(bestFeatureSplitIndex), bestFeatureGainStats)
         } else if (binAggregates.metadata.isUnordered(featureIndex)) {
@@ -750,11 +738,9 @@ private[ml] object RandomForest extends Logging {
               val leftChildStats = binAggregates.getImpurityCalculator(leftChildOffset, splitIndex)
               val rightChildStats =
                 binAggregates.getImpurityCalculator(rightChildOffset, splitIndex)
-              predictionAndImpurity = Some(predictionAndImpurity.getOrElse(
-                calculatePredictImpurity(leftChildStats, rightChildStats)))
-              val gainStats = calculateGainForSplit(leftChildStats,
-                rightChildStats, binAggregates.metadata, predictionAndImpurity.get._2)
-              (splitIndex, gainStats)
+              gainAndImpurityStats = calculateImpurityStats(gainAndImpurityStats,
+                leftChildStats, rightChildStats, binAggregates.metadata)
+              (splitIndex, gainAndImpurityStats)
             }.maxBy(_._2.gain)
           (splits(featureIndex)(bestFeatureSplitIndex), bestFeatureGainStats)
         } else {
@@ -825,11 +811,9 @@ private[ml] object RandomForest extends Logging {
               val rightChildStats =
                 binAggregates.getImpurityCalculator(nodeFeatureOffset, lastCategory)
               rightChildStats.subtract(leftChildStats)
-              predictionAndImpurity = Some(predictionAndImpurity.getOrElse(
-                calculatePredictImpurity(leftChildStats, rightChildStats)))
-              val gainStats = calculateGainForSplit(leftChildStats,
-                rightChildStats, binAggregates.metadata, predictionAndImpurity.get._2)
-              (splitIndex, gainStats)
+              gainAndImpurityStats = calculateImpurityStats(gainAndImpurityStats,
+                leftChildStats, rightChildStats, binAggregates.metadata)
+              (splitIndex, gainAndImpurityStats)
             }.maxBy(_._2.gain)
           val categoriesForSplit =
             categoriesSortedByCentroid.map(_._1.toDouble).slice(0, bestFeatureSplitIndex + 1)
@@ -839,7 +823,7 @@ private[ml] object RandomForest extends Logging {
         }
       }.maxBy(_._2.gain)
 
-    (bestSplit, bestSplitStats, predictionAndImpurity.get._1)
+    (bestSplit, bestSplitStats)
   }
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala
index 5ac10f3fd32dd..0768204c33914 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala
@@ -118,7 +118,7 @@ private[tree] class EntropyAggregator(numClasses: Int)
  * (node, feature, bin).
  * @param stats  Array of sufficient statistics for a (node, feature, bin).
  */
-private[tree] class EntropyCalculator(stats: Array[Double]) extends ImpurityCalculator(stats) {
+private[spark] class EntropyCalculator(stats: Array[Double]) extends ImpurityCalculator(stats) {
 
   /**
    * Make a deep copy of this [[ImpurityCalculator]].
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala
index 19d318203c344..d0077db6832e3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala
@@ -114,7 +114,7 @@ private[tree] class GiniAggregator(numClasses: Int)
  * (node, feature, bin).
  * @param stats  Array of sufficient statistics for a (node, feature, bin).
  */
-private[tree] class GiniCalculator(stats: Array[Double]) extends ImpurityCalculator(stats) {
+private[spark] class GiniCalculator(stats: Array[Double]) extends ImpurityCalculator(stats) {
 
   /**
    * Make a deep copy of this [[ImpurityCalculator]].
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
index 578749d85a4e6..86cee7e430b0a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
@@ -95,7 +95,7 @@ private[spark] abstract class ImpurityAggregator(val statsSize: Int) extends Ser
  * (node, feature, bin).
  * @param stats  Array of sufficient statistics for a (node, feature, bin).
  */
-private[spark] abstract class ImpurityCalculator(val stats: Array[Double]) {
+private[spark] abstract class ImpurityCalculator(val stats: Array[Double]) extends Serializable {
 
   /**
    * Make a deep copy of this [[ImpurityCalculator]].
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala
index 7104a7fa4dd4c..04d0cd24e6632 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala
@@ -98,7 +98,7 @@ private[tree] class VarianceAggregator()
  * (node, feature, bin).
  * @param stats  Array of sufficient statistics for a (node, feature, bin).
  */
-private[tree] class VarianceCalculator(stats: Array[Double]) extends ImpurityCalculator(stats) {
+private[spark] class VarianceCalculator(stats: Array[Double]) extends ImpurityCalculator(stats) {
 
   require(stats.size == 3,
     s"VarianceCalculator requires sufficient statistics array stats to be of length 3," +
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
index dc9e0f9f51ffb..508bf9c1bdb47 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.mllib.tree.model
 
 import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.mllib.tree.impurity.ImpurityCalculator
 
 /**
  * :: DeveloperApi ::
@@ -66,7 +67,6 @@ class InformationGainStats(
   }
 }
 
-
 private[spark] object InformationGainStats {
   /**
    * An [[org.apache.spark.mllib.tree.model.InformationGainStats]] object to
@@ -76,3 +76,62 @@ private[spark] object InformationGainStats {
   val invalidInformationGainStats = new InformationGainStats(Double.MinValue, -1.0, -1.0, -1.0,
     new Predict(0.0, 0.0), new Predict(0.0, 0.0))
 }
+
+/**
+ * :: DeveloperApi ::
+ * Impurity statistics for each split
+ * @param gain information gain value
+ * @param impurity current node impurity
+ * @param impurityCalculator impurity statistics for current node
+ * @param leftImpurityCalculator impurity statistics for left child node
+ * @param rightImpurityCalculator impurity statistics for right child node
+ * @param valid whether the current split satisfies minimum info gain or
+ *              minimum number of instances per node
+ */
+@DeveloperApi
+private[spark] class ImpurityStats(
+    val gain: Double,
+    val impurity: Double,
+    val impurityCalculator: ImpurityCalculator,
+    val leftImpurityCalculator: ImpurityCalculator,
+    val rightImpurityCalculator: ImpurityCalculator,
+    val valid: Boolean = true) extends Serializable {
+
+  override def toString: String = {
+    s"gain = $gain, impurity = $impurity, left impurity = $leftImpurity, " +
+      s"right impurity = $rightImpurity"
+  }
+
+  def leftImpurity: Double = if (leftImpurityCalculator != null) {
+    leftImpurityCalculator.calculate()
+  } else {
+    -1.0
+  }
+
+  def rightImpurity: Double = if (rightImpurityCalculator != null) {
+    rightImpurityCalculator.calculate()
+  } else {
+    -1.0
+  }
+}
+
+private[spark] object ImpurityStats {
+
+  /**
+   * Return an [[org.apache.spark.mllib.tree.model.ImpurityStats]] object to
+   * denote that current split doesn't satisfies minimum info gain or
+   * minimum number of instances per node.
+   */
+  def getInvalidImpurityStats(impurityCalculator: ImpurityCalculator): ImpurityStats = {
+    new ImpurityStats(Double.MinValue, impurityCalculator.calculate(),
+      impurityCalculator, null, null, false)
+  }
+
+  /**
+   * Return an [[org.apache.spark.mllib.tree.model.ImpurityStats]] object
+   * that only 'impurity' and 'impurityCalculator' are defined.
+   */
+  def getEmptyImpurityStats(impurityCalculator: ImpurityCalculator): ImpurityStats = {
+    new ImpurityStats(Double.NaN, impurityCalculator.calculate(), impurityCalculator, null, null)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
index 73b4805c4c597..c7bbf1ce07a23 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
@@ -21,12 +21,13 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.impl.TreeTests
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.ml.tree.LeafNode
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.Row
 
 class DecisionTreeClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
 
@@ -57,7 +58,7 @@ class DecisionTreeClassifierSuite extends SparkFunSuite with MLlibTestSparkConte
 
   test("params") {
     ParamsSuite.checkParams(new DecisionTreeClassifier)
-    val model = new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0))
+    val model = new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 2)
     ParamsSuite.checkParams(model)
   }
 
@@ -231,6 +232,31 @@ class DecisionTreeClassifierSuite extends SparkFunSuite with MLlibTestSparkConte
     compareAPIs(rdd, dt, categoricalFeatures = Map.empty[Int, Int], numClasses)
   }
 
+  test("predictRaw and predictProbability") {
+    val rdd = continuousDataPointsForMulticlassRDD
+    val dt = new DecisionTreeClassifier()
+      .setImpurity("Gini")
+      .setMaxDepth(4)
+      .setMaxBins(100)
+    val categoricalFeatures = Map(0 -> 3)
+    val numClasses = 3
+
+    val newData: DataFrame = TreeTests.setMetadata(rdd, categoricalFeatures, numClasses)
+    val newTree = dt.fit(newData)
+
+    val predictions = newTree.transform(newData)
+      .select(newTree.getPredictionCol, newTree.getRawPredictionCol, newTree.getProbabilityCol)
+      .collect()
+
+    predictions.foreach { case Row(pred: Double, rawPred: Vector, probPred: Vector) =>
+      assert(pred === rawPred.argmax,
+        s"Expected prediction $pred but calculated ${rawPred.argmax} from rawPrediction.")
+      val sum = rawPred.toArray.sum
+      assert(Vectors.dense(rawPred.toArray.map(_ / sum)) === probPred,
+        "probability prediction mismatch")
+    }
+  }
+
   /////////////////////////////////////////////////////////////////////////////
   // Tests of model save/load
   /////////////////////////////////////////////////////////////////////////////
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
index a7bc77965fefd..d4b5896c12c06 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -58,7 +58,7 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("params") {
     ParamsSuite.checkParams(new GBTClassifier)
     val model = new GBTClassificationModel("gbtc",
-      Array(new DecisionTreeRegressionModel("dtr", new LeafNode(0.0, 0.0))),
+      Array(new DecisionTreeRegressionModel("dtr", new LeafNode(0.0, 0.0, null))),
       Array(1.0))
     ParamsSuite.checkParams(model)
   }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
index ab711c8e4b215..dbb2577c6204d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
@@ -66,7 +66,7 @@ class RandomForestClassifierSuite extends SparkFunSuite with MLlibTestSparkConte
   test("params") {
     ParamsSuite.checkParams(new RandomForestClassifier)
     val model = new RandomForestClassificationModel("rfc",
-      Array(new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0))), 2)
+      Array(new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 2)), 2)
     ParamsSuite.checkParams(model)
   }
 

From 0a1d2ca42c8b31d6b0e70163795f0185d4622f87 Mon Sep 17 00:00:00 2001
From: Iulian Dragos <jaguarul@gmail.com>
Date: Fri, 31 Jul 2015 12:04:03 -0700
Subject: [PATCH 520/549] [SPARK-8979] Add a PID based rate estimator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on #7600

/cc tdas

Author: Iulian Dragos <jaguarul@gmail.com>
Author: François Garillot <francois@garillot.net>

Closes #7648 from dragos/topic/streaming-bp/pid and squashes the following commits:

aa5b097 [Iulian Dragos] Add more comments, made all PID constant parameters positive, a couple more tests.
93b74f8 [Iulian Dragos] Better explanation of historicalError.
7975b0c [Iulian Dragos] Add configuration for PID.
26cfd78 [Iulian Dragos] A couple of variable renames.
d0bdf7c [Iulian Dragos] Update to latest version of the code, various style and name improvements.
d58b845 [François Garillot] [SPARK-8979][Streaming] Implements a PIDRateEstimator
---
 .../dstream/ReceiverInputDStream.scala        |   2 +-
 .../scheduler/rate/PIDRateEstimator.scala     | 124 ++++++++++++++++
 .../scheduler/rate/RateEstimator.scala        |  18 ++-
 .../rate/PIDRateEstimatorSuite.scala          | 137 ++++++++++++++++++
 4 files changed, 276 insertions(+), 5 deletions(-)
 create mode 100644 streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala
 create mode 100644 streaming/src/test/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimatorSuite.scala

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
index 646a8c3530a62..670ef8d296a0b 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
@@ -46,7 +46,7 @@ abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingCont
    */
   override protected[streaming] val rateController: Option[RateController] = {
     if (RateController.isBackPressureEnabled(ssc.conf)) {
-      RateEstimator.create(ssc.conf).map { new ReceiverRateController(id, _) }
+      Some(new ReceiverRateController(id, RateEstimator.create(ssc.conf, ssc.graph.batchDuration)))
     } else {
       None
     }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala
new file mode 100644
index 0000000000000..6ae56a68ad88c
--- /dev/null
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler.rate
+
+/**
+ * Implements a proportional-integral-derivative (PID) controller which acts on
+ * the speed of ingestion of elements into Spark Streaming. A PID controller works
+ * by calculating an '''error''' between a measured output and a desired value. In the
+ * case of Spark Streaming the error is the difference between the measured processing
+ * rate (number of elements/processing delay) and the previous rate.
+ *
+ * @see https://en.wikipedia.org/wiki/PID_controller
+ *
+ * @param batchDurationMillis the batch duration, in milliseconds
+ * @param proportional how much the correction should depend on the current
+ *        error. This term usually provides the bulk of correction and should be positive or zero.
+ *        A value too large would make the controller overshoot the setpoint, while a small value
+ *        would make the controller too insensitive. The default value is 1.
+ * @param integral how much the correction should depend on the accumulation
+ *        of past errors. This value should be positive or 0. This term accelerates the movement
+ *        towards the desired value, but a large value may lead to overshooting. The default value
+ *        is 0.2.
+ * @param derivative how much the correction should depend on a prediction
+ *        of future errors, based on current rate of change. This value should be positive or 0.
+ *        This term is not used very often, as it impacts stability of the system. The default
+ *        value is 0.
+ */
+private[streaming] class PIDRateEstimator(
+    batchIntervalMillis: Long,
+    proportional: Double = 1D,
+    integral: Double = .2D,
+    derivative: Double = 0D)
+  extends RateEstimator {
+
+  private var firstRun: Boolean = true
+  private var latestTime: Long = -1L
+  private var latestRate: Double = -1D
+  private var latestError: Double = -1L
+
+  require(
+    batchIntervalMillis > 0,
+    s"Specified batch interval $batchIntervalMillis in PIDRateEstimator is invalid.")
+  require(
+    proportional >= 0,
+    s"Proportional term $proportional in PIDRateEstimator should be >= 0.")
+  require(
+    integral >= 0,
+    s"Integral term $integral in PIDRateEstimator should be >= 0.")
+  require(
+    derivative >= 0,
+    s"Derivative term $derivative in PIDRateEstimator should be >= 0.")
+
+
+  def compute(time: Long, // in milliseconds
+      numElements: Long,
+      processingDelay: Long, // in milliseconds
+      schedulingDelay: Long // in milliseconds
+    ): Option[Double] = {
+
+    this.synchronized {
+      if (time > latestTime && processingDelay > 0 && batchIntervalMillis > 0) {
+
+        // in seconds, should be close to batchDuration
+        val delaySinceUpdate = (time - latestTime).toDouble / 1000
+
+        // in elements/second
+        val processingRate = numElements.toDouble / processingDelay * 1000
+
+        // In our system `error` is the difference between the desired rate and the measured rate
+        // based on the latest batch information. We consider the desired rate to be latest rate,
+        // which is what this estimator calculated for the previous batch.
+        // in elements/second
+        val error = latestRate - processingRate
+
+        // The error integral, based on schedulingDelay as an indicator for accumulated errors.
+        // A scheduling delay s corresponds to s * processingRate overflowing elements. Those
+        // are elements that couldn't be processed in previous batches, leading to this delay.
+        // In the following, we assume the processingRate didn't change too much.
+        // From the number of overflowing elements we can calculate the rate at which they would be
+        // processed by dividing it by the batch interval. This rate is our "historical" error,
+        // or integral part, since if we subtracted this rate from the previous "calculated rate",
+        // there wouldn't have been any overflowing elements, and the scheduling delay would have
+        // been zero.
+        // (in elements/second)
+        val historicalError = schedulingDelay.toDouble * processingRate / batchIntervalMillis
+
+        // in elements/(second ^ 2)
+        val dError = (error - latestError) / delaySinceUpdate
+
+        val newRate = (latestRate - proportional * error -
+                                    integral * historicalError -
+                                    derivative * dError).max(0.0)
+        latestTime = time
+        if (firstRun) {
+          latestRate = processingRate
+          latestError = 0D
+          firstRun = false
+
+          None
+        } else {
+          latestRate = newRate
+          latestError = error
+
+          Some(newRate)
+        }
+      } else None
+    }
+  }
+}
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
index a08685119e5d5..17ccebc1ed41b 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
@@ -19,6 +19,7 @@ package org.apache.spark.streaming.scheduler.rate
 
 import org.apache.spark.SparkConf
 import org.apache.spark.SparkException
+import org.apache.spark.streaming.Duration
 
 /**
  * A component that estimates the rate at wich an InputDStream should ingest
@@ -48,12 +49,21 @@ object RateEstimator {
   /**
    * Return a new RateEstimator based on the value of `spark.streaming.RateEstimator`.
    *
-   * @return None if there is no configured estimator, otherwise an instance of RateEstimator
+   * The only known estimator right now is `pid`.
+   *
+   * @return An instance of RateEstimator
    * @throws IllegalArgumentException if there is a configured RateEstimator that doesn't match any
    *         known estimators.
    */
-  def create(conf: SparkConf): Option[RateEstimator] =
-    conf.getOption("spark.streaming.backpressure.rateEstimator").map { estimator =>
-      throw new IllegalArgumentException(s"Unkown rate estimator: $estimator")
+  def create(conf: SparkConf, batchInterval: Duration): RateEstimator =
+    conf.get("spark.streaming.backpressure.rateEstimator", "pid") match {
+      case "pid" =>
+        val proportional = conf.getDouble("spark.streaming.backpressure.pid.proportional", 1.0)
+        val integral = conf.getDouble("spark.streaming.backpressure.pid.integral", 0.2)
+        val derived = conf.getDouble("spark.streaming.backpressure.pid.derived", 0.0)
+        new PIDRateEstimator(batchInterval.milliseconds, proportional, integral, derived)
+
+      case estimator =>
+        throw new IllegalArgumentException(s"Unkown rate estimator: $estimator")
     }
 }
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimatorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimatorSuite.scala
new file mode 100644
index 0000000000000..97c32d8f2d59e
--- /dev/null
+++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimatorSuite.scala
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.streaming.scheduler.rate
+
+import scala.util.Random
+
+import org.scalatest.Inspectors.forAll
+import org.scalatest.Matchers
+
+import org.apache.spark.{SparkConf, SparkFunSuite}
+import org.apache.spark.streaming.Seconds
+
+class PIDRateEstimatorSuite extends SparkFunSuite with Matchers {
+
+  test("the right estimator is created") {
+    val conf = new SparkConf
+    conf.set("spark.streaming.backpressure.rateEstimator", "pid")
+    val pid = RateEstimator.create(conf, Seconds(1))
+    pid.getClass should equal(classOf[PIDRateEstimator])
+  }
+
+  test("estimator checks ranges") {
+    intercept[IllegalArgumentException] {
+      new PIDRateEstimator(0, 1, 2, 3)
+    }
+    intercept[IllegalArgumentException] {
+      new PIDRateEstimator(100, -1, 2, 3)
+    }
+    intercept[IllegalArgumentException] {
+      new PIDRateEstimator(100, 0, -1, 3)
+    }
+    intercept[IllegalArgumentException] {
+      new PIDRateEstimator(100, 0, 0, -1)
+    }
+  }
+
+  private def createDefaultEstimator: PIDRateEstimator = {
+    new PIDRateEstimator(20, 1D, 0D, 0D)
+  }
+
+  test("first bound is None") {
+    val p = createDefaultEstimator
+    p.compute(0, 10, 10, 0) should equal(None)
+  }
+
+  test("second bound is rate") {
+    val p = createDefaultEstimator
+    p.compute(0, 10, 10, 0)
+    // 1000 elements / s
+    p.compute(10, 10, 10, 0) should equal(Some(1000))
+  }
+
+  test("works even with no time between updates") {
+    val p = createDefaultEstimator
+    p.compute(0, 10, 10, 0)
+    p.compute(10, 10, 10, 0)
+    p.compute(10, 10, 10, 0) should equal(None)
+  }
+
+  test("bound is never negative") {
+    val p = new PIDRateEstimator(20, 1D, 1D, 0D)
+    // prepare a series of batch updates, one every 20ms, 0 processed elements, 2ms of processing
+    // this might point the estimator to try and decrease the bound, but we test it never
+    // goes below zero, which would be nonsensical.
+    val times = List.tabulate(50)(x => x * 20) // every 20ms
+    val elements = List.fill(50)(0) // no processing
+    val proc = List.fill(50)(20) // 20ms of processing
+    val sched = List.fill(50)(100) // strictly positive accumulation
+    val res = for (i <- List.range(0, 50)) yield p.compute(times(i), elements(i), proc(i), sched(i))
+    res.head should equal(None)
+    res.tail should equal(List.fill(49)(Some(0D)))
+  }
+
+  test("with no accumulated or positive error, |I| > 0, follow the processing speed") {
+    val p = new PIDRateEstimator(20, 1D, 1D, 0D)
+    // prepare a series of batch updates, one every 20ms with an increasing number of processed
+    // elements in each batch, but constant processing time, and no accumulated error. Even though
+    // the integral part is non-zero, the estimated rate should follow only the proportional term
+    val times = List.tabulate(50)(x => x * 20) // every 20ms
+    val elements = List.tabulate(50)(x => x * 20) // increasing
+    val proc = List.fill(50)(20) // 20ms of processing
+    val sched = List.fill(50)(0)
+    val res = for (i <- List.range(0, 50)) yield p.compute(times(i), elements(i), proc(i), sched(i))
+    res.head should equal(None)
+    res.tail should equal(List.tabulate(50)(x => Some(x * 1000D)).tail)
+  }
+
+  test("with no accumulated but some positive error, |I| > 0, follow the processing speed") {
+    val p = new PIDRateEstimator(20, 1D, 1D, 0D)
+    // prepare a series of batch updates, one every 20ms with an decreasing number of processed
+    // elements in each batch, but constant processing time, and no accumulated error. Even though
+    // the integral part is non-zero, the estimated rate should follow only the proportional term,
+    // asking for less and less elements
+    val times = List.tabulate(50)(x => x * 20) // every 20ms
+    val elements = List.tabulate(50)(x => (50 - x) * 20) // decreasing
+    val proc = List.fill(50)(20) // 20ms of processing
+    val sched = List.fill(50)(0)
+    val res = for (i <- List.range(0, 50)) yield p.compute(times(i), elements(i), proc(i), sched(i))
+    res.head should equal(None)
+    res.tail should equal(List.tabulate(50)(x => Some((50 - x) * 1000D)).tail)
+  }
+
+  test("with some accumulated and some positive error, |I| > 0, stay below the processing speed") {
+    val p = new PIDRateEstimator(20, 1D, .01D, 0D)
+    val times = List.tabulate(50)(x => x * 20) // every 20ms
+    val rng = new Random()
+    val elements = List.tabulate(50)(x => rng.nextInt(1000))
+    val procDelayMs = 20
+    val proc = List.fill(50)(procDelayMs) // 20ms of processing
+    val sched = List.tabulate(50)(x => rng.nextInt(19)) // random wait
+    val speeds = elements map ((x) => x.toDouble / procDelayMs * 1000)
+
+    val res = for (i <- List.range(0, 50)) yield p.compute(times(i), elements(i), proc(i), sched(i))
+    res.head should equal(None)
+    forAll(List.range(1, 50)) { (n) =>
+      res(n) should not be None
+      if (res(n).get > 0 && sched(n) > 0) {
+        res(n).get should be < speeds(n)
+      }
+    }
+  }
+}

From 39ab199a3f735b7658ab3331d3e2fb03441aec13 Mon Sep 17 00:00:00 2001
From: Herman van Hovell <hvanhovell@questtec.nl>
Date: Fri, 31 Jul 2015 12:07:18 -0700
Subject: [PATCH 521/549] [SPARK-8640] [SQL] Enable Processing of Multiple
 Window Frames in a Single Window Operator

This PR enables the processing of multiple window frames in a single window operator. This should improve the performance of processing multiple window expressions wich share partition by/order by clauses, because it will be more efficient with respect to memory use and group processing.

Author: Herman van Hovell <hvanhovell@questtec.nl>

Closes #7515 from hvanhovell/SPARK-8640 and squashes the following commits:

f0e1c21 [Herman van Hovell] Changed Window Logical/Physical plans to use partition by/order by specs directly instead of using WindowSpec.
e1711c2 [Herman van Hovell] Enabled the processing of multiple window frames in a single Window operator.
---
 .../sql/catalyst/analysis/Analyzer.scala      | 12 +++++++-----
 .../plans/logical/basicOperators.scala        |  3 ++-
 .../spark/sql/execution/SparkStrategies.scala |  5 +++--
 .../apache/spark/sql/execution/Window.scala   | 19 ++++++++++---------
 .../sql/hive/execution/HivePlanTest.scala     | 18 ++++++++++++++++++
 5 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 265f3d1e41765..51d910b258647 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -347,7 +347,7 @@ class Analyzer(
             val newOutput = oldVersion.generatorOutput.map(_.newInstance())
             (oldVersion, oldVersion.copy(generatorOutput = newOutput))
 
-          case oldVersion @ Window(_, windowExpressions, _, child)
+          case oldVersion @ Window(_, windowExpressions, _, _, child)
               if AttributeSet(windowExpressions.map(_.toAttribute)).intersect(conflictingAttributes)
                 .nonEmpty =>
             (oldVersion, oldVersion.copy(windowExpressions = newAliases(windowExpressions)))
@@ -825,7 +825,7 @@ class Analyzer(
         }.asInstanceOf[NamedExpression]
       }
 
-      // Second, we group extractedWindowExprBuffer based on their Window Spec.
+      // Second, we group extractedWindowExprBuffer based on their Partition and Order Specs.
       val groupedWindowExpressions = extractedWindowExprBuffer.groupBy { expr =>
         val distinctWindowSpec = expr.collect {
           case window: WindowExpression => window.windowSpec
@@ -841,7 +841,8 @@ class Analyzer(
           failAnalysis(s"$expr has multiple Window Specifications ($distinctWindowSpec)." +
             s"Please file a bug report with this error message, stack trace, and the query.")
         } else {
-          distinctWindowSpec.head
+          val spec = distinctWindowSpec.head
+          (spec.partitionSpec, spec.orderSpec)
         }
       }.toSeq
 
@@ -850,9 +851,10 @@ class Analyzer(
       var currentChild = child
       var i = 0
       while (i < groupedWindowExpressions.size) {
-        val (windowSpec, windowExpressions) = groupedWindowExpressions(i)
+        val ((partitionSpec, orderSpec), windowExpressions) = groupedWindowExpressions(i)
         // Set currentChild to the newly created Window operator.
-        currentChild = Window(currentChild.output, windowExpressions, windowSpec, currentChild)
+        currentChild = Window(currentChild.output, windowExpressions,
+          partitionSpec, orderSpec, currentChild)
 
         // Move to next Window Spec.
         i += 1
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
index a67f8de6b733a..aacfc86ab0e49 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala
@@ -228,7 +228,8 @@ case class Aggregate(
 case class Window(
     projectList: Seq[Attribute],
     windowExpressions: Seq[NamedExpression],
-    windowSpec: WindowSpecDefinition,
+    partitionSpec: Seq[Expression],
+    orderSpec: Seq[SortOrder],
     child: LogicalPlan) extends UnaryNode {
 
   override def output: Seq[Attribute] =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 03d24a88d4ecd..4aff52d992e6b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -389,8 +389,9 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
           execution.Aggregate(partial = false, group, agg, planLater(child)) :: Nil
         }
       }
-      case logical.Window(projectList, windowExpressions, spec, child) =>
-        execution.Window(projectList, windowExpressions, spec, planLater(child)) :: Nil
+      case logical.Window(projectList, windowExprs, partitionSpec, orderSpec, child) =>
+        execution.Window(
+          projectList, windowExprs, partitionSpec, orderSpec, planLater(child)) :: Nil
       case logical.Sample(lb, ub, withReplacement, seed, child) =>
         execution.Sample(lb, ub, withReplacement, seed, planLater(child)) :: Nil
       case logical.LocalRelation(output, data) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
index 91c8a02e2b5bc..fe9f2c7028171 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala
@@ -80,23 +80,24 @@ import scala.collection.mutable
 case class Window(
     projectList: Seq[Attribute],
     windowExpression: Seq[NamedExpression],
-    windowSpec: WindowSpecDefinition,
+    partitionSpec: Seq[Expression],
+    orderSpec: Seq[SortOrder],
     child: SparkPlan)
   extends UnaryNode {
 
   override def output: Seq[Attribute] = projectList ++ windowExpression.map(_.toAttribute)
 
   override def requiredChildDistribution: Seq[Distribution] = {
-    if (windowSpec.partitionSpec.isEmpty) {
+    if (partitionSpec.isEmpty) {
       // Only show warning when the number of bytes is larger than 100 MB?
       logWarning("No Partition Defined for Window operation! Moving all data to a single "
         + "partition, this can cause serious performance degradation.")
       AllTuples :: Nil
-    } else ClusteredDistribution(windowSpec.partitionSpec) :: Nil
+    } else ClusteredDistribution(partitionSpec) :: Nil
   }
 
   override def requiredChildOrdering: Seq[Seq[SortOrder]] =
-    Seq(windowSpec.partitionSpec.map(SortOrder(_, Ascending)) ++ windowSpec.orderSpec)
+    Seq(partitionSpec.map(SortOrder(_, Ascending)) ++ orderSpec)
 
   override def outputOrdering: Seq[SortOrder] = child.outputOrdering
 
@@ -115,12 +116,12 @@ case class Window(
       case RangeFrame =>
         val (exprs, current, bound) = if (offset == 0) {
           // Use the entire order expression when the offset is 0.
-          val exprs = windowSpec.orderSpec.map(_.child)
+          val exprs = orderSpec.map(_.child)
           val projection = newMutableProjection(exprs, child.output)
-          (windowSpec.orderSpec, projection(), projection())
-        } else if (windowSpec.orderSpec.size == 1) {
+          (orderSpec, projection(), projection())
+        } else if (orderSpec.size == 1) {
           // Use only the first order expression when the offset is non-null.
-          val sortExpr = windowSpec.orderSpec.head
+          val sortExpr = orderSpec.head
           val expr = sortExpr.child
           // Create the projection which returns the current 'value'.
           val current = newMutableProjection(expr :: Nil, child.output)()
@@ -250,7 +251,7 @@ case class Window(
 
         // Get all relevant projections.
         val result = createResultProjection(unboundExpressions)
-        val grouping = newProjection(windowSpec.partitionSpec, child.output)
+        val grouping = newProjection(partitionSpec, child.output)
 
         // Manage the stream and the grouping.
         var nextRow: InternalRow = EmptyRow
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HivePlanTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HivePlanTest.scala
index bdb53ddf59c19..ba56a8a6b689c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HivePlanTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HivePlanTest.scala
@@ -17,7 +17,10 @@
 
 package org.apache.spark.sql.hive.execution
 
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.catalyst.plans.logical
+import org.apache.spark.sql.expressions.Window
 import org.apache.spark.sql.hive.test.TestHive
 
 class HivePlanTest extends QueryTest {
@@ -31,4 +34,19 @@ class HivePlanTest extends QueryTest {
 
     comparePlans(optimized, correctAnswer)
   }
+
+  test("window expressions sharing the same partition by and order by clause") {
+    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
+    val window = Window.
+      partitionBy($"grp").
+      orderBy($"val")
+    val query = df.select(
+      $"id",
+      sum($"val").over(window.rowsBetween(-1, 1)),
+      sum($"val").over(window.rangeBetween(-1, 1))
+    )
+    val plan = query.queryExecution.analyzed
+    assert(plan.collect{ case w: logical.Window => w }.size === 1,
+      "Should have only 1 Window operator.")
+  }
 }

From 3afc1de89cb4de9f8ea74003dd1e6b5b006d06f0 Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Fri, 31 Jul 2015 12:09:48 -0700
Subject: [PATCH 522/549] [SPARK-8564] [STREAMING] Add the Python API for
 Kinesis

This PR adds the Python API for Kinesis, including a Python example and a simple unit test.

Author: zsxwing <zsxwing@gmail.com>

Closes #6955 from zsxwing/kinesis-python and squashes the following commits:

e42e471 [zsxwing] Merge branch 'master' into kinesis-python
455f7ea [zsxwing] Remove streaming_kinesis_asl_assembly module and simply add the source folder to streaming_kinesis_asl module
32e6451 [zsxwing] Merge remote-tracking branch 'origin/master' into kinesis-python
5082d28 [zsxwing] Fix the syntax error for Python 2.6
fca416b [zsxwing] Fix wrong comparison
96670ff [zsxwing] Fix the compilation error after merging master
756a128 [zsxwing] Merge branch 'master' into kinesis-python
6c37395 [zsxwing] Print stack trace for debug
7c5cfb0 [zsxwing] RUN_KINESIS_TESTS -> ENABLE_KINESIS_TESTS
cc9d071 [zsxwing] Fix the python test errors
466b425 [zsxwing] Add python tests for Kinesis
e33d505 [zsxwing] Merge remote-tracking branch 'origin/master' into kinesis-python
3da2601 [zsxwing] Fix the kinesis folder
687446b [zsxwing] Fix the error message and the maven output path
add2beb [zsxwing] Merge branch 'master' into kinesis-python
4957c0b [zsxwing] Add the Python API for Kinesis
---
 dev/run-tests.py                              |   3 +-
 dev/sparktestsupport/modules.py               |   9 +-
 docs/streaming-kinesis-integration.md         |  19 +++
 extras/kinesis-asl-assembly/pom.xml           | 103 ++++++++++++++++
 .../streaming/kinesis_wordcount_asl.py        |  81 +++++++++++++
 .../streaming/kinesis/KinesisTestUtils.scala  |  19 ++-
 .../streaming/kinesis/KinesisUtils.scala      |  78 +++++++++---
 pom.xml                                       |   1 +
 project/SparkBuild.scala                      |   6 +-
 python/pyspark/streaming/kinesis.py           | 112 ++++++++++++++++++
 python/pyspark/streaming/tests.py             |  86 +++++++++++++-
 11 files changed, 492 insertions(+), 25 deletions(-)
 create mode 100644 extras/kinesis-asl-assembly/pom.xml
 create mode 100644 extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
 create mode 100644 python/pyspark/streaming/kinesis.py

diff --git a/dev/run-tests.py b/dev/run-tests.py
index 29420da9aa956..b6d181418f027 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -301,7 +301,8 @@ def build_spark_sbt(hadoop_version):
     sbt_goals = ["package",
                  "assembly/assembly",
                  "streaming-kafka-assembly/assembly",
-                 "streaming-flume-assembly/assembly"]
+                 "streaming-flume-assembly/assembly",
+                 "streaming-kinesis-asl-assembly/assembly"]
     profiles_and_goals = build_profiles + sbt_goals
 
     print("[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments: ",
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 44600cb9523c1..956dc81b62e93 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -138,6 +138,7 @@ def contains_file(self, filename):
     dependencies=[],
     source_file_regexes=[
         "extras/kinesis-asl/",
+        "extras/kinesis-asl-assembly/",
     ],
     build_profile_flags=[
         "-Pkinesis-asl",
@@ -300,7 +301,13 @@ def contains_file(self, filename):
 
 pyspark_streaming = Module(
     name="pyspark-streaming",
-    dependencies=[pyspark_core, streaming, streaming_kafka, streaming_flume_assembly],
+    dependencies=[
+        pyspark_core,
+        streaming,
+        streaming_kafka,
+        streaming_flume_assembly,
+        streaming_kinesis_asl
+    ],
     source_file_regexes=[
         "python/pyspark/streaming"
     ],
diff --git a/docs/streaming-kinesis-integration.md b/docs/streaming-kinesis-integration.md
index aa9749afbc867..a7bcaec6fcd84 100644
--- a/docs/streaming-kinesis-integration.md
+++ b/docs/streaming-kinesis-integration.md
@@ -51,6 +51,17 @@ A Kinesis stream can be set up at one of the valid Kinesis endpoints with 1 or m
 	See the [API docs](api/java/index.html?org/apache/spark/streaming/kinesis/KinesisUtils.html)
 	and the [example]({{site.SPARK_GITHUB_URL}}/tree/master/extras/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java). Refer to the next subsection for instructions to run the example.
 
+	</div>
+	<div data-lang="python" markdown="1">
+		from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
+
+		kinesisStream = KinesisUtils.createStream(
+			streamingContext, [Kinesis app name], [Kinesis stream name], [endpoint URL],
+			[region name], [initial position], [checkpoint interval], StorageLevel.MEMORY_AND_DISK_2)
+
+	See the [API docs](api/python/pyspark.streaming.html#pyspark.streaming.kinesis.KinesisUtils)
+	and the [example]({{site.SPARK_GITHUB_URL}}/tree/master/extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py). Refer to the next subsection for instructions to run the example.
+
 	</div>
 	</div>
 
@@ -135,6 +146,14 @@ To run the example,
 
         bin/run-example streaming.JavaKinesisWordCountASL [Kinesis app name] [Kinesis stream name] [endpoint URL]
 
+	</div>
+	<div data-lang="python" markdown="1">
+
+        bin/spark-submit --jars extras/kinesis-asl/target/scala-*/\
+            spark-streaming-kinesis-asl-assembly_*.jar \
+            extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py \
+            [Kinesis app name] [Kinesis stream name] [endpoint URL] [region name]
+
 	</div>
 	</div>
 
diff --git a/extras/kinesis-asl-assembly/pom.xml b/extras/kinesis-asl-assembly/pom.xml
new file mode 100644
index 0000000000000..70d2c9c58f54e
--- /dev/null
+++ b/extras/kinesis-asl-assembly/pom.xml
@@ -0,0 +1,103 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.spark</groupId>
+    <artifactId>spark-parent_2.10</artifactId>
+    <version>1.5.0-SNAPSHOT</version>
+    <relativePath>../../pom.xml</relativePath>
+  </parent>
+
+  <groupId>org.apache.spark</groupId>
+  <artifactId>spark-streaming-kinesis-asl-assembly_2.10</artifactId>
+  <packaging>jar</packaging>
+  <name>Spark Project Kinesis Assembly</name>
+  <url>http://spark.apache.org/</url>
+
+  <properties>
+    <sbt.project.name>streaming-kinesis-asl-assembly</sbt.project.name>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-streaming-kinesis-asl_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+  <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
+  <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+  <plugins>
+    <plugin>
+      <groupId>org.apache.maven.plugins</groupId>
+      <artifactId>maven-shade-plugin</artifactId>
+      <configuration>
+        <shadedArtifactAttached>false</shadedArtifactAttached>
+        <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kinesis-asl-assembly-${project.version}.jar</outputFile>
+        <artifactSet>
+          <includes>
+            <include>*:*</include>
+          </includes>
+        </artifactSet>
+        <filters>
+          <filter>
+            <artifact>*:*</artifact>
+            <excludes>
+              <exclude>META-INF/*.SF</exclude>
+              <exclude>META-INF/*.DSA</exclude>
+              <exclude>META-INF/*.RSA</exclude>
+            </excludes>
+          </filter>
+        </filters>
+      </configuration>
+      <executions>
+        <execution>
+          <phase>package</phase>
+          <goals>
+            <goal>shade</goal>
+          </goals>
+          <configuration>
+            <transformers>
+              <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+              <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+                <resource>reference.conf</resource>
+              </transformer>
+              <transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
+                <resource>log4j.properties</resource>
+              </transformer>
+              <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"/>
+              <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"/>
+            </transformers>
+          </configuration>
+        </execution>
+      </executions>
+    </plugin>
+  </plugins>
+</build>
+</project>
+
diff --git a/extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py b/extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
new file mode 100644
index 0000000000000..f428f64da3c42
--- /dev/null
+++ b/extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py
@@ -0,0 +1,81 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+  Consumes messages from a Amazon Kinesis streams and does wordcount.
+
+  This example spins up 1 Kinesis Receiver per shard for the given stream.
+  It then starts pulling from the last checkpointed sequence number of the given stream.
+
+  Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>
+    <app-name> is the name of the consumer app, used to track the read data in DynamoDB
+    <stream-name> name of the Kinesis stream (ie. mySparkStream)
+    <endpoint-url> endpoint of the Kinesis service
+      (e.g. https://kinesis.us-east-1.amazonaws.com)
+
+
+  Example:
+      # export AWS keys if necessary
+      $ export AWS_ACCESS_KEY_ID=<your-access-key>
+      $ export AWS_SECRET_KEY=<your-secret-key>
+
+      # run the example
+      $ bin/spark-submit -jar extras/kinesis-asl/target/scala-*/\
+        spark-streaming-kinesis-asl-assembly_*.jar \
+        extras/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py \
+        myAppName mySparkStream https://kinesis.us-east-1.amazonaws.com
+
+  There is a companion helper class called KinesisWordProducerASL which puts dummy data
+  onto the Kinesis stream.
+
+  This code uses the DefaultAWSCredentialsProviderChain to find credentials
+  in the following order:
+      Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
+      Java System Properties - aws.accessKeyId and aws.secretKey
+      Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs
+      Instance profile credentials - delivered through the Amazon EC2 metadata service
+  For more information, see
+      http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html
+
+  See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on
+  the Kinesis Spark Streaming integration.
+"""
+import sys
+
+from pyspark import SparkContext
+from pyspark.streaming import StreamingContext
+from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
+
+if __name__ == "__main__":
+    if len(sys.argv) != 5:
+        print(
+            "Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>",
+            file=sys.stderr)
+        sys.exit(-1)
+
+    sc = SparkContext(appName="PythonStreamingKinesisWordCountAsl")
+    ssc = StreamingContext(sc, 1)
+    appName, streamName, endpointUrl, regionName = sys.argv[1:]
+    lines = KinesisUtils.createStream(
+        ssc, appName, streamName, endpointUrl, regionName, InitialPositionInStream.LATEST, 2)
+    counts = lines.flatMap(lambda line: line.split(" ")) \
+        .map(lambda word: (word, 1)) \
+        .reduceByKey(lambda a, b: a+b)
+    counts.pprint()
+
+    ssc.start()
+    ssc.awaitTermination()
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
index ca39358b75cb6..255ac27f793ba 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala
@@ -36,9 +36,15 @@ import org.apache.spark.Logging
 /**
  * Shared utility methods for performing Kinesis tests that actually transfer data
  */
-private class KinesisTestUtils(
-    val endpointUrl: String = "https://kinesis.us-west-2.amazonaws.com",
-    _regionName: String = "") extends Logging {
+private class KinesisTestUtils(val endpointUrl: String, _regionName: String) extends Logging {
+
+  def this() {
+    this("https://kinesis.us-west-2.amazonaws.com", "")
+  }
+
+  def this(endpointUrl: String) {
+    this(endpointUrl, "")
+  }
 
   val regionName = if (_regionName.length == 0) {
     RegionUtils.getRegionByEndpoint(endpointUrl).getName()
@@ -117,6 +123,13 @@ private class KinesisTestUtils(
     shardIdToSeqNumbers.toMap
   }
 
+  /**
+   * Expose a Python friendly API.
+   */
+  def pushData(testData: java.util.List[Int]): Unit = {
+    pushData(scala.collection.JavaConversions.asScalaBuffer(testData))
+  }
+
   def deleteStream(): Unit = {
     try {
       if (streamCreated) {
diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
index e5acab50181e1..7dab17eba8483 100644
--- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
+++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala
@@ -86,19 +86,19 @@ object KinesisUtils {
    * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
    * @param regionName   Name of region used by the Kinesis Client Library (KCL) to update
    *                     DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
-   * @param awsAccessKeyId  AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain)
-   * @param awsSecretKey  AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain)
-   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
-   *                            See the Kinesis Spark Streaming documentation for more
-   *                            details on the different types of checkpoints.
    * @param initialPositionInStream  In the absence of Kinesis checkpoint info, this is the
    *                                 worker's initial starting position in the stream.
    *                                 The values are either the beginning of the stream
    *                                 per Kinesis' limit of 24 hours
    *                                 (InitialPositionInStream.TRIM_HORIZON) or
    *                                 the tip of the stream (InitialPositionInStream.LATEST).
+   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
+   *                            See the Kinesis Spark Streaming documentation for more
+   *                            details on the different types of checkpoints.
    * @param storageLevel Storage level to use for storing the received objects.
    *                     StorageLevel.MEMORY_AND_DISK_2 is recommended.
+   * @param awsAccessKeyId  AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain)
+   * @param awsSecretKey  AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain)
    */
   def createStream(
       ssc: StreamingContext,
@@ -130,7 +130,7 @@ object KinesisUtils {
    * - The Kinesis application name used by the Kinesis Client Library (KCL) will be the app name in
    *   [[org.apache.spark.SparkConf]].
    *
-   * @param ssc Java StreamingContext object
+   * @param ssc StreamingContext object
    * @param streamName   Kinesis stream name
    * @param endpointUrl  Endpoint url of Kinesis service
    *                     (e.g., https://kinesis.us-east-1.amazonaws.com)
@@ -175,15 +175,15 @@ object KinesisUtils {
    * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
    * @param regionName   Name of region used by the Kinesis Client Library (KCL) to update
    *                     DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
-   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
-   *                            See the Kinesis Spark Streaming documentation for more
-   *                            details on the different types of checkpoints.
    * @param initialPositionInStream  In the absence of Kinesis checkpoint info, this is the
    *                                 worker's initial starting position in the stream.
    *                                 The values are either the beginning of the stream
    *                                 per Kinesis' limit of 24 hours
    *                                 (InitialPositionInStream.TRIM_HORIZON) or
    *                                 the tip of the stream (InitialPositionInStream.LATEST).
+   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
+   *                            See the Kinesis Spark Streaming documentation for more
+   *                            details on the different types of checkpoints.
    * @param storageLevel Storage level to use for storing the received objects.
    *                     StorageLevel.MEMORY_AND_DISK_2 is recommended.
    */
@@ -206,8 +206,8 @@ object KinesisUtils {
    * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis.
    *
    * Note:
-   *  The given AWS credentials will get saved in DStream checkpoints if checkpointing
-   *  is enabled. Make sure that your checkpoint directory is secure.
+   * The given AWS credentials will get saved in DStream checkpoints if checkpointing
+   * is enabled. Make sure that your checkpoint directory is secure.
    *
    * @param jssc Java StreamingContext object
    * @param kinesisAppName  Kinesis application name used by the Kinesis Client Library
@@ -216,19 +216,19 @@ object KinesisUtils {
    * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
    * @param regionName   Name of region used by the Kinesis Client Library (KCL) to update
    *                     DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
-   * @param awsAccessKeyId  AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain)
-   * @param awsSecretKey  AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain)
-   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
-   *                            See the Kinesis Spark Streaming documentation for more
-   *                            details on the different types of checkpoints.
    * @param initialPositionInStream  In the absence of Kinesis checkpoint info, this is the
    *                                 worker's initial starting position in the stream.
    *                                 The values are either the beginning of the stream
    *                                 per Kinesis' limit of 24 hours
    *                                 (InitialPositionInStream.TRIM_HORIZON) or
    *                                 the tip of the stream (InitialPositionInStream.LATEST).
+   * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
+   *                            See the Kinesis Spark Streaming documentation for more
+   *                            details on the different types of checkpoints.
    * @param storageLevel Storage level to use for storing the received objects.
    *                     StorageLevel.MEMORY_AND_DISK_2 is recommended.
+   * @param awsAccessKeyId  AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain)
+   * @param awsSecretKey  AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain)
    */
   def createStream(
       jssc: JavaStreamingContext,
@@ -297,3 +297,49 @@ object KinesisUtils {
     }
   }
 }
+
+/**
+ * This is a helper class that wraps the methods in KinesisUtils into more Python-friendly class and
+ * function so that it can be easily instantiated and called from Python's KinesisUtils.
+ */
+private class KinesisUtilsPythonHelper {
+
+  def getInitialPositionInStream(initialPositionInStream: Int): InitialPositionInStream = {
+    initialPositionInStream match {
+      case 0 => InitialPositionInStream.LATEST
+      case 1 => InitialPositionInStream.TRIM_HORIZON
+      case _ => throw new IllegalArgumentException(
+        "Illegal InitialPositionInStream. Please use " +
+          "InitialPositionInStream.LATEST or InitialPositionInStream.TRIM_HORIZON")
+    }
+  }
+
+  def createStream(
+      jssc: JavaStreamingContext,
+      kinesisAppName: String,
+      streamName: String,
+      endpointUrl: String,
+      regionName: String,
+      initialPositionInStream: Int,
+      checkpointInterval: Duration,
+      storageLevel: StorageLevel,
+      awsAccessKeyId: String,
+      awsSecretKey: String
+      ): JavaReceiverInputDStream[Array[Byte]] = {
+    if (awsAccessKeyId == null && awsSecretKey != null) {
+      throw new IllegalArgumentException("awsSecretKey is set but awsAccessKeyId is null")
+    }
+    if (awsAccessKeyId != null && awsSecretKey == null) {
+      throw new IllegalArgumentException("awsAccessKeyId is set but awsSecretKey is null")
+    }
+    if (awsAccessKeyId == null && awsSecretKey == null) {
+      KinesisUtils.createStream(jssc, kinesisAppName, streamName, endpointUrl, regionName,
+        getInitialPositionInStream(initialPositionInStream), checkpointInterval, storageLevel)
+    } else {
+      KinesisUtils.createStream(jssc, kinesisAppName, streamName, endpointUrl, regionName,
+        getInitialPositionInStream(initialPositionInStream), checkpointInterval, storageLevel,
+        awsAccessKeyId, awsSecretKey)
+    }
+  }
+
+}
diff --git a/pom.xml b/pom.xml
index 35fc8c44bc1b0..e351c7c19df96 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1642,6 +1642,7 @@
       <id>kinesis-asl</id>
       <modules>
         <module>extras/kinesis-asl</module>
+        <module>extras/kinesis-asl-assembly</module>
       </modules>
     </profile>
 
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 61a05d375d99e..9a33baa7c6ce1 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -45,8 +45,8 @@ object BuildCommons {
     sparkKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl",
     "kinesis-asl").map(ProjectRef(buildLocation, _))
 
-  val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly) =
-    Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly")
+  val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKinesisAslAssembly) =
+    Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-kinesis-asl-assembly")
       .map(ProjectRef(buildLocation, _))
 
   val tools = ProjectRef(buildLocation, "tools")
@@ -382,7 +382,7 @@ object Assembly {
         .getOrElse(SbtPomKeys.effectivePom.value.getProperties.get("hadoop.version").asInstanceOf[String])
     },
     jarName in assembly <<= (version, moduleName, hadoopVersion) map { (v, mName, hv) =>
-      if (mName.contains("streaming-flume-assembly") || mName.contains("streaming-kafka-assembly")) {
+      if (mName.contains("streaming-flume-assembly") || mName.contains("streaming-kafka-assembly") || mName.contains("streaming-kinesis-asl-assembly")) {
         // This must match the same name used in maven (see external/kafka-assembly/pom.xml)
         s"${mName}-${v}.jar"
       } else {
diff --git a/python/pyspark/streaming/kinesis.py b/python/pyspark/streaming/kinesis.py
new file mode 100644
index 0000000000000..bcfe2703fecf9
--- /dev/null
+++ b/python/pyspark/streaming/kinesis.py
@@ -0,0 +1,112 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from py4j.java_gateway import Py4JJavaError
+
+from pyspark.serializers import PairDeserializer, NoOpSerializer
+from pyspark.storagelevel import StorageLevel
+from pyspark.streaming import DStream
+
+__all__ = ['KinesisUtils', 'InitialPositionInStream', 'utf8_decoder']
+
+
+def utf8_decoder(s):
+    """ Decode the unicode as UTF-8 """
+    return s and s.decode('utf-8')
+
+
+class KinesisUtils(object):
+
+    @staticmethod
+    def createStream(ssc, kinesisAppName, streamName, endpointUrl, regionName,
+                     initialPositionInStream, checkpointInterval,
+                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
+                     awsAccessKeyId=None, awsSecretKey=None, decoder=utf8_decoder):
+        """
+        Create an input stream that pulls messages from a Kinesis stream. This uses the
+        Kinesis Client Library (KCL) to pull messages from Kinesis.
+
+        Note: The given AWS credentials will get saved in DStream checkpoints if checkpointing is
+        enabled. Make sure that your checkpoint directory is secure.
+
+        :param ssc:  StreamingContext object
+        :param kinesisAppName:  Kinesis application name used by the Kinesis Client Library (KCL) to
+                                update DynamoDB
+        :param streamName:  Kinesis stream name
+        :param endpointUrl:  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
+        :param regionName:  Name of region used by the Kinesis Client Library (KCL) to update
+                            DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
+        :param initialPositionInStream:  In the absence of Kinesis checkpoint info, this is the
+                                         worker's initial starting position in the stream. The
+                                         values are either the beginning of the stream per Kinesis'
+                                         limit of 24 hours (InitialPositionInStream.TRIM_HORIZON) or
+                                         the tip of the stream (InitialPositionInStream.LATEST).
+        :param checkpointInterval:  Checkpoint interval for Kinesis checkpointing. See the Kinesis
+                                    Spark Streaming documentation for more details on the different
+                                    types of checkpoints.
+        :param storageLevel:  Storage level to use for storing the received objects (default is
+                              StorageLevel.MEMORY_AND_DISK_2)
+        :param awsAccessKeyId:  AWS AccessKeyId (default is None. If None, will use
+                                DefaultAWSCredentialsProviderChain)
+        :param awsSecretKey:  AWS SecretKey (default is None. If None, will use
+                              DefaultAWSCredentialsProviderChain)
+        :param decoder:  A function used to decode value (default is utf8_decoder)
+        :return: A DStream object
+        """
+        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
+        jduration = ssc._jduration(checkpointInterval)
+
+        try:
+            # Use KinesisUtilsPythonHelper to access Scala's KinesisUtils
+            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\
+                .loadClass("org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper")
+            helper = helperClass.newInstance()
+            jstream = helper.createStream(ssc._jssc, kinesisAppName, streamName, endpointUrl,
+                                          regionName, initialPositionInStream, jduration, jlevel,
+                                          awsAccessKeyId, awsSecretKey)
+        except Py4JJavaError as e:
+            if 'ClassNotFoundException' in str(e.java_exception):
+                KinesisUtils._printErrorMsg(ssc.sparkContext)
+            raise e
+        stream = DStream(jstream, ssc, NoOpSerializer())
+        return stream.map(lambda v: decoder(v))
+
+    @staticmethod
+    def _printErrorMsg(sc):
+        print("""
+________________________________________________________________________________________________
+
+  Spark Streaming's Kinesis libraries not found in class path. Try one of the following.
+
+  1. Include the Kinesis library and its dependencies with in the
+     spark-submit command as
+
+     $ bin/spark-submit --packages org.apache.spark:spark-streaming-kinesis-asl:%s ...
+
+  2. Download the JAR of the artifact from Maven Central http://search.maven.org/,
+     Group Id = org.apache.spark, Artifact Id = spark-streaming-kinesis-asl-assembly, Version = %s.
+     Then, include the jar in the spark-submit command as
+
+     $ bin/spark-submit --jars <spark-streaming-kinesis-asl-assembly.jar> ...
+
+________________________________________________________________________________________________
+
+""" % (sc.version, sc.version))
+
+
+class InitialPositionInStream(object):
+    LATEST, TRIM_HORIZON = (0, 1)
diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py
index 4ecae1e4bf282..5cd544b2144ef 100644
--- a/python/pyspark/streaming/tests.py
+++ b/python/pyspark/streaming/tests.py
@@ -36,9 +36,11 @@
     import unittest
 
 from pyspark.context import SparkConf, SparkContext, RDD
+from pyspark.storagelevel import StorageLevel
 from pyspark.streaming.context import StreamingContext
 from pyspark.streaming.kafka import Broker, KafkaUtils, OffsetRange, TopicAndPartition
 from pyspark.streaming.flume import FlumeUtils
+from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
 
 
 class PySparkStreamingTestCase(unittest.TestCase):
@@ -891,6 +893,67 @@ def test_flume_polling_multiple_hosts(self):
         self._testMultipleTimes(self._testFlumePollingMultipleHosts)
 
 
+class KinesisStreamTests(PySparkStreamingTestCase):
+
+    def test_kinesis_stream_api(self):
+        # Don't start the StreamingContext because we cannot test it in Jenkins
+        kinesisStream1 = KinesisUtils.createStream(
+            self.ssc, "myAppNam", "mySparkStream",
+            "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
+            InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2)
+        kinesisStream2 = KinesisUtils.createStream(
+            self.ssc, "myAppNam", "mySparkStream",
+            "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
+            InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2,
+            "awsAccessKey", "awsSecretKey")
+
+    def test_kinesis_stream(self):
+        if os.environ.get('ENABLE_KINESIS_TESTS') != '1':
+            print("Skip test_kinesis_stream")
+            return
+
+        import random
+        kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000)))
+        kinesisTestUtilsClz = \
+            self.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
+                .loadClass("org.apache.spark.streaming.kinesis.KinesisTestUtils")
+        kinesisTestUtils = kinesisTestUtilsClz.newInstance()
+        try:
+            kinesisTestUtils.createStream()
+            aWSCredentials = kinesisTestUtils.getAWSCredentials()
+            stream = KinesisUtils.createStream(
+                self.ssc, kinesisAppName, kinesisTestUtils.streamName(),
+                kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(),
+                InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
+                aWSCredentials.getAWSAccessKeyId(), aWSCredentials.getAWSSecretKey())
+
+            outputBuffer = []
+
+            def get_output(_, rdd):
+                for e in rdd.collect():
+                    outputBuffer.append(e)
+
+            stream.foreachRDD(get_output)
+            self.ssc.start()
+
+            testData = [i for i in range(1, 11)]
+            expectedOutput = set([str(i) for i in testData])
+            start_time = time.time()
+            while time.time() - start_time < 120:
+                kinesisTestUtils.pushData(testData)
+                if expectedOutput == set(outputBuffer):
+                    break
+                time.sleep(10)
+            self.assertEqual(expectedOutput, set(outputBuffer))
+        except:
+            import traceback
+            traceback.print_exc()
+            raise
+        finally:
+            kinesisTestUtils.deleteStream()
+            kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
+
+
 def search_kafka_assembly_jar():
     SPARK_HOME = os.environ["SPARK_HOME"]
     kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-assembly")
@@ -926,10 +989,31 @@ def search_flume_assembly_jar():
     else:
         return jars[0]
 
+
+def search_kinesis_asl_assembly_jar():
+    SPARK_HOME = os.environ["SPARK_HOME"]
+    kinesis_asl_assembly_dir = os.path.join(SPARK_HOME, "extras/kinesis-asl-assembly")
+    jars = glob.glob(
+        os.path.join(kinesis_asl_assembly_dir,
+                     "target/scala-*/spark-streaming-kinesis-asl-assembly-*.jar"))
+    if not jars:
+        raise Exception(
+            ("Failed to find Spark Streaming Kinesis ASL assembly jar in %s. " %
+             kinesis_asl_assembly_dir) + "You need to build Spark with "
+            "'build/sbt -Pkinesis-asl assembly/assembly streaming-kinesis-asl-assembly/assembly' "
+            "or 'build/mvn -Pkinesis-asl package' before running this test")
+    elif len(jars) > 1:
+        raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs in %s; please "
+                         "remove all but one") % kinesis_asl_assembly_dir)
+    else:
+        return jars[0]
+
+
 if __name__ == "__main__":
     kafka_assembly_jar = search_kafka_assembly_jar()
     flume_assembly_jar = search_flume_assembly_jar()
-    jars = "%s,%s" % (kafka_assembly_jar, flume_assembly_jar)
+    kinesis_asl_assembly_jar = search_kinesis_asl_assembly_jar()
+    jars = "%s,%s,%s" % (kafka_assembly_jar, flume_assembly_jar, kinesis_asl_assembly_jar)
 
     os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars
     unittest.main()

From d04634701413410938a133358fe1d9fbc077645e Mon Sep 17 00:00:00 2001
From: zsxwing <zsxwing@gmail.com>
Date: Fri, 31 Jul 2015 12:10:55 -0700
Subject: [PATCH 523/549] [SPARK-9504] [STREAMING] [TESTS] Use eventually to
 fix the flaky test

The previous code uses `ssc.awaitTerminationOrTimeout(500)`. Since nobody will stop it during `awaitTerminationOrTimeout`, it's just like `sleep(500)`. In a super overloaded Jenkins worker, the receiver may be not able to start in 500 milliseconds. Verified this in the log of https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/39149/ There is no log about starting the receiver before this failure. That's why `assert(runningCount > 0)` failed.

This PR replaces `awaitTerminationOrTimeout` with `eventually` which should be more reliable.

Author: zsxwing <zsxwing@gmail.com>

Closes #7823 from zsxwing/SPARK-9504 and squashes the following commits:

7af66a6 [zsxwing] Remove wrong assertion
5ba2c99 [zsxwing] Use eventually to fix the flaky test
---
 .../apache/spark/streaming/StreamingContextSuite.scala | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
index 84a5fbb3d95eb..b7db280f63588 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala
@@ -261,7 +261,7 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo
     for (i <- 1 to 4) {
       logInfo("==================================\n\n\n")
       ssc = new StreamingContext(sc, Milliseconds(100))
-      var runningCount = 0
+      @volatile var runningCount = 0
       TestReceiver.counter.set(1)
       val input = ssc.receiverStream(new TestReceiver)
       input.count().foreachRDD { rdd =>
@@ -270,14 +270,14 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with Timeo
         logInfo("Count = " + count + ", Running count = " + runningCount)
       }
       ssc.start()
-      ssc.awaitTerminationOrTimeout(500)
+      eventually(timeout(10.seconds), interval(10.millis)) {
+        assert(runningCount > 0)
+      }
       ssc.stop(stopSparkContext = false, stopGracefully = true)
       logInfo("Running count = " + runningCount)
       logInfo("TestReceiver.counter = " + TestReceiver.counter.get())
-      assert(runningCount > 0)
       assert(
-        (TestReceiver.counter.get() == runningCount + 1) ||
-          (TestReceiver.counter.get() == runningCount + 2),
+        TestReceiver.counter.get() == runningCount + 1,
         "Received records = " + TestReceiver.counter.get() + ", " +
           "processed records = " + runningCount
       )

From a8340fa7df17e3f0a3658f8b8045ab840845a72a Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Fri, 31 Jul 2015 12:12:22 -0700
Subject: [PATCH 524/549] [SPARK-9481] Add logLikelihood to LocalLDAModel

jkbradley Exposes `bound` (variational log likelihood bound) through public API as `logLikelihood`. Also adds unit tests, some DRYing of `LDASuite`, and includes unit tests mentioned in #7760

Author: Feynman Liang <fliang@databricks.com>

Closes #7801 from feynmanliang/SPARK-9481-logLikelihood and squashes the following commits:

6d1b2c9 [Feynman Liang] Negate perplexity definition
5f62b20 [Feynman Liang] Add logLikelihood
---
 .../spark/mllib/clustering/LDAModel.scala     |  20 ++-
 .../spark/mllib/clustering/LDASuite.scala     | 129 +++++++++---------
 2 files changed, 78 insertions(+), 71 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 82281a0daf008..ff7035d2246c2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -217,22 +217,28 @@ class LocalLDAModel private[clustering] (
     LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration,
       gammaShape)
   }
-  // TODO
-  // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
+
+  // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
+  /**
+   * Calculates a lower bound on the log likelihood of the entire corpus.
+   * @param documents test corpus to use for calculating log likelihood
+   * @return variational lower bound on the log likelihood of the entire corpus
+   */
+  def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents,
+    docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k,
+    vocabSize)
 
   /**
-   * Calculate the log variational bound on perplexity. See Equation (16) in original Online
+   * Calculate an upper bound bound on perplexity. See Equation (16) in original Online
    * LDA paper.
    * @param documents test corpus to use for calculating perplexity
-   * @return the log perplexity per word
+   * @return variational upper bound on log perplexity per word
    */
   def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
     val corpusWords = documents
       .map { case (_, termCounts) => termCounts.toArray.sum }
       .sum()
-    val batchVariationalBound = bound(documents, docConcentration,
-      topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize)
-    val perWordBound = batchVariationalBound / corpusWords
+    val perWordBound = -logLikelihood(documents) / corpusWords
 
     perWordBound
   }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 695ee3b82efc5..79d2a1cafd1fa 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -210,16 +210,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
   }
 
   test("OnlineLDAOptimizer with toy data") {
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-
-    val docs = sc.parallelize(toydata)
+    val docs = sc.parallelize(toyData)
     val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
       .setGammaShape(1e10)
     val lda = new LDA().setK(2)
@@ -242,30 +233,45 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
-  test("LocalLDAModel logPerplexity") {
-    val k = 2
-    val vocabSize = 6
-    val alpha = 0.01
-    val eta = 0.01
-    val gammaShape = 100
-    // obtained from LDA model trained in gensim, see below
-    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
-      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
-      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+  test("LocalLDAModel logLikelihood") {
+    val ldaModel: LocalLDAModel = toyModel
 
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-    val docs = sc.parallelize(toydata)
+    val docsSingleWord = sc.parallelize(Array(Vectors.sparse(6, Array(0), Array(1)))
+      .zipWithIndex
+      .map { case (wordCounts, docId) => (docId.toLong, wordCounts) })
+    val docsRepeatedWord = sc.parallelize(Array(Vectors.sparse(6, Array(0), Array(5)))
+      .zipWithIndex
+      .map { case (wordCounts, docId) => (docId.toLong, wordCounts) })
 
+    /* Verify results using gensim:
+       import numpy as np
+       from gensim import models
+       corpus = [
+          [(0, 1.0), (1, 1.0)],
+          [(1, 1.0), (2, 1.0)],
+          [(0, 1.0), (2, 1.0)],
+          [(3, 1.0), (4, 1.0)],
+          [(3, 1.0), (5, 1.0)],
+          [(4, 1.0), (5, 1.0)]]
+       np.random.seed(2345)
+       lda = models.ldamodel.LdaModel(
+          corpus=corpus, alpha=0.01, eta=0.01, num_topics=2, update_every=0, passes=100,
+          decay=0.51, offset=1024)
+       docsSingleWord = [[(0, 1.0)]]
+       docsRepeatedWord = [[(0, 5.0)]]
+       print(lda.bound(docsSingleWord))
+       > -25.9706969833
+       print(lda.bound(docsRepeatedWord))
+       > -31.4413908227
+     */
 
-    val ldaModel: LocalLDAModel = new LocalLDAModel(
-      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+    assert(ldaModel.logLikelihood(docsSingleWord) ~== -25.971 relTol 1E-3D)
+    assert(ldaModel.logLikelihood(docsRepeatedWord) ~== -31.441  relTol 1E-3D)
+  }
+
+  test("LocalLDAModel logPerplexity") {
+    val docs = sc.parallelize(toyData)
+    val ldaModel: LocalLDAModel = toyModel
 
     /* Verify results using gensim:
        import numpy as np
@@ -285,32 +291,13 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
        > -3.69051285096
      */
 
-    assert(ldaModel.logPerplexity(docs) ~== -3.690D relTol 1E-3D)
+    // Gensim's definition of perplexity is negative our (and Stanford NLP's) definition
+    assert(ldaModel.logPerplexity(docs) ~== 3.690D relTol 1E-3D)
   }
 
   test("LocalLDAModel predict") {
-    val k = 2
-    val vocabSize = 6
-    val alpha = 0.01
-    val eta = 0.01
-    val gammaShape = 100
-    // obtained from LDA model trained in gensim, see below
-    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
-      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
-      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
-
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-    val docs = sc.parallelize(toydata)
-
-    val ldaModel: LocalLDAModel = new LocalLDAModel(
-      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+    val docs = sc.parallelize(toyData)
+    val ldaModel: LocalLDAModel = toyModel
 
     /* Verify results using gensim:
        import numpy as np
@@ -351,16 +338,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
   }
 
   test("OnlineLDAOptimizer with asymmetric prior") {
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-
-    val docs = sc.parallelize(toydata)
+    val docs = sc.parallelize(toyData)
     val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
       .setGammaShape(1e10)
     val lda = new LDA().setK(2)
@@ -531,4 +509,27 @@ private[clustering] object LDASuite {
   def getNonEmptyDoc(corpus: Array[(Long, Vector)]): Array[(Long, Vector)] = corpus.filter {
     case (_, wc: Vector) => Vectors.norm(wc, p = 1.0) != 0.0
   }
+
+  def toyData: Array[(Long, Vector)] = Array(
+    Vectors.sparse(6, Array(0, 1), Array(1, 1)),
+    Vectors.sparse(6, Array(1, 2), Array(1, 1)),
+    Vectors.sparse(6, Array(0, 2), Array(1, 1)),
+    Vectors.sparse(6, Array(3, 4), Array(1, 1)),
+    Vectors.sparse(6, Array(3, 5), Array(1, 1)),
+    Vectors.sparse(6, Array(4, 5), Array(1, 1))
+  ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
+
+  def toyModel: LocalLDAModel = {
+    val k = 2
+    val vocabSize = 6
+    val alpha = 0.01
+    val eta = 0.01
+    val gammaShape = 100
+    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
+      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
+      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+    val ldaModel: LocalLDAModel = new LocalLDAModel(
+      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+    ldaModel
+  }
 }

From c0686668ae6a92b6bb4801a55c3b78aedbee816a Mon Sep 17 00:00:00 2001
From: CodingCat <zhunansjtu@gmail.com>
Date: Fri, 31 Jul 2015 20:27:00 +0100
Subject: [PATCH 525/549] [SPARK-9202] capping maximum number of
 executor&driver information kept in Worker

https://issues.apache.org/jira/browse/SPARK-9202

Author: CodingCat <zhunansjtu@gmail.com>

Closes #7714 from CodingCat/SPARK-9202 and squashes the following commits:

23977fb [CodingCat] add comments about why we don't synchronize finishedExecutors & finishedDrivers
dc9772d [CodingCat] addressing the comments
e125241 [CodingCat] stylistic fix
80bfe52 [CodingCat] fix JsonProtocolSuite
d7d9485 [CodingCat] styistic fix and respect insert ordering
031755f [CodingCat] add license info & stylistic fix
c3b5361 [CodingCat] test cases and docs
c557b3a [CodingCat] applications are fine
9cac751 [CodingCat] application is fine...
ad87ed7 [CodingCat] trimFinishedExecutorsAndDrivers
---
 .../apache/spark/deploy/worker/Worker.scala   | 124 ++++++++++------
 .../spark/deploy/worker/ui/WorkerWebUI.scala  |   4 +-
 .../apache/spark/deploy/DeployTestUtils.scala |  89 ++++++++++++
 .../spark/deploy/JsonProtocolSuite.scala      |  59 ++------
 .../spark/deploy/worker/WorkerSuite.scala     | 133 +++++++++++++++++-
 docs/configuration.md                         |  14 ++
 6 files changed, 329 insertions(+), 94 deletions(-)
 create mode 100644 core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala

diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
index 82e9578bbcba5..0276c24f85368 100755
--- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
@@ -25,7 +25,7 @@ import java.util.concurrent._
 import java.util.concurrent.{Future => JFuture, ScheduledFuture => JScheduledFuture}
 
 import scala.collection.JavaConversions._
-import scala.collection.mutable.{HashMap, HashSet}
+import scala.collection.mutable.{HashMap, HashSet, LinkedHashMap}
 import scala.concurrent.ExecutionContext
 import scala.util.Random
 import scala.util.control.NonFatal
@@ -115,13 +115,18 @@ private[worker] class Worker(
     }
 
   var workDir: File = null
-  val finishedExecutors = new HashMap[String, ExecutorRunner]
+  val finishedExecutors = new LinkedHashMap[String, ExecutorRunner]
   val drivers = new HashMap[String, DriverRunner]
   val executors = new HashMap[String, ExecutorRunner]
-  val finishedDrivers = new HashMap[String, DriverRunner]
+  val finishedDrivers = new LinkedHashMap[String, DriverRunner]
   val appDirectories = new HashMap[String, Seq[String]]
   val finishedApps = new HashSet[String]
 
+  val retainedExecutors = conf.getInt("spark.worker.ui.retainedExecutors",
+    WorkerWebUI.DEFAULT_RETAINED_EXECUTORS)
+  val retainedDrivers = conf.getInt("spark.worker.ui.retainedDrivers",
+    WorkerWebUI.DEFAULT_RETAINED_DRIVERS)
+
   // The shuffle service is not actually started unless configured.
   private val shuffleService = new ExternalShuffleService(conf, securityMgr)
 
@@ -461,25 +466,7 @@ private[worker] class Worker(
       }
 
     case executorStateChanged @ ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
-      sendToMaster(executorStateChanged)
-      val fullId = appId + "/" + execId
-      if (ExecutorState.isFinished(state)) {
-        executors.get(fullId) match {
-          case Some(executor) =>
-            logInfo("Executor " + fullId + " finished with state " + state +
-              message.map(" message " + _).getOrElse("") +
-              exitStatus.map(" exitStatus " + _).getOrElse(""))
-            executors -= fullId
-            finishedExecutors(fullId) = executor
-            coresUsed -= executor.cores
-            memoryUsed -= executor.memory
-          case None =>
-            logInfo("Unknown Executor " + fullId + " finished with state " + state +
-              message.map(" message " + _).getOrElse("") +
-              exitStatus.map(" exitStatus " + _).getOrElse(""))
-        }
-        maybeCleanupApplication(appId)
-      }
+      handleExecutorStateChanged(executorStateChanged)
 
     case KillExecutor(masterUrl, appId, execId) =>
       if (masterUrl != activeMasterUrl) {
@@ -523,24 +510,8 @@ private[worker] class Worker(
       }
     }
 
-    case driverStageChanged @ DriverStateChanged(driverId, state, exception) => {
-      state match {
-        case DriverState.ERROR =>
-          logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}")
-        case DriverState.FAILED =>
-          logWarning(s"Driver $driverId exited with failure")
-        case DriverState.FINISHED =>
-          logInfo(s"Driver $driverId exited successfully")
-        case DriverState.KILLED =>
-          logInfo(s"Driver $driverId was killed by user")
-        case _ =>
-          logDebug(s"Driver $driverId changed state to $state")
-      }
-      sendToMaster(driverStageChanged)
-      val driver = drivers.remove(driverId).get
-      finishedDrivers(driverId) = driver
-      memoryUsed -= driver.driverDesc.mem
-      coresUsed -= driver.driverDesc.cores
+    case driverStateChanged @ DriverStateChanged(driverId, state, exception) => {
+      handleDriverStateChanged(driverStateChanged)
     }
 
     case ReregisterWithMaster =>
@@ -614,6 +585,78 @@ private[worker] class Worker(
     webUi.stop()
     metricsSystem.stop()
   }
+
+  private def trimFinishedExecutorsIfNecessary(): Unit = {
+    // do not need to protect with locks since both WorkerPage and Restful server get data through
+    // thread-safe RpcEndPoint
+    if (finishedExecutors.size > retainedExecutors) {
+      finishedExecutors.take(math.max(finishedExecutors.size / 10, 1)).foreach {
+        case (executorId, _) => finishedExecutors.remove(executorId)
+      }
+    }
+  }
+
+  private def trimFinishedDriversIfNecessary(): Unit = {
+    // do not need to protect with locks since both WorkerPage and Restful server get data through
+    // thread-safe RpcEndPoint
+    if (finishedDrivers.size > retainedDrivers) {
+      finishedDrivers.take(math.max(finishedDrivers.size / 10, 1)).foreach {
+        case (driverId, _) => finishedDrivers.remove(driverId)
+      }
+    }
+  }
+
+  private[worker] def handleDriverStateChanged(driverStateChanged: DriverStateChanged): Unit = {
+    val driverId = driverStateChanged.driverId
+    val exception = driverStateChanged.exception
+    val state = driverStateChanged.state
+    state match {
+      case DriverState.ERROR =>
+        logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}")
+      case DriverState.FAILED =>
+        logWarning(s"Driver $driverId exited with failure")
+      case DriverState.FINISHED =>
+        logInfo(s"Driver $driverId exited successfully")
+      case DriverState.KILLED =>
+        logInfo(s"Driver $driverId was killed by user")
+      case _ =>
+        logDebug(s"Driver $driverId changed state to $state")
+    }
+    sendToMaster(driverStateChanged)
+    val driver = drivers.remove(driverId).get
+    finishedDrivers(driverId) = driver
+    trimFinishedDriversIfNecessary()
+    memoryUsed -= driver.driverDesc.mem
+    coresUsed -= driver.driverDesc.cores
+  }
+
+  private[worker] def handleExecutorStateChanged(executorStateChanged: ExecutorStateChanged):
+    Unit = {
+    sendToMaster(executorStateChanged)
+    val state = executorStateChanged.state
+    if (ExecutorState.isFinished(state)) {
+      val appId = executorStateChanged.appId
+      val fullId = appId + "/" + executorStateChanged.execId
+      val message = executorStateChanged.message
+      val exitStatus = executorStateChanged.exitStatus
+      executors.get(fullId) match {
+        case Some(executor) =>
+          logInfo("Executor " + fullId + " finished with state " + state +
+            message.map(" message " + _).getOrElse("") +
+            exitStatus.map(" exitStatus " + _).getOrElse(""))
+          executors -= fullId
+          finishedExecutors(fullId) = executor
+          trimFinishedExecutorsIfNecessary()
+          coresUsed -= executor.cores
+          memoryUsed -= executor.memory
+        case None =>
+          logInfo("Unknown Executor " + fullId + " finished with state " + state +
+            message.map(" message " + _).getOrElse("") +
+            exitStatus.map(" exitStatus " + _).getOrElse(""))
+      }
+      maybeCleanupApplication(appId)
+    }
+  }
 }
 
 private[deploy] object Worker extends Logging {
@@ -669,5 +712,4 @@ private[deploy] object Worker extends Logging {
       cmd
     }
   }
-
 }
diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala
index 334a5b10142aa..709a27233598c 100644
--- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala
@@ -53,6 +53,8 @@ class WorkerWebUI(
   }
 }
 
-private[ui] object WorkerWebUI {
+private[worker] object WorkerWebUI {
   val STATIC_RESOURCE_BASE = SparkUI.STATIC_RESOURCE_DIR
+  val DEFAULT_RETAINED_DRIVERS = 1000
+  val DEFAULT_RETAINED_EXECUTORS = 1000
 }
diff --git a/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala b/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala
new file mode 100644
index 0000000000000..967aa0976f0ce
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy
+
+import java.io.File
+import java.util.Date
+
+import org.apache.spark.deploy.master.{ApplicationInfo, DriverInfo, WorkerInfo}
+import org.apache.spark.deploy.worker.{DriverRunner, ExecutorRunner}
+import org.apache.spark.{SecurityManager, SparkConf}
+
+private[deploy] object DeployTestUtils {
+  def createAppDesc(): ApplicationDescription = {
+    val cmd = new Command("mainClass", List("arg1", "arg2"), Map(), Seq(), Seq(), Seq())
+    new ApplicationDescription("name", Some(4), 1234, cmd, "appUiUrl")
+  }
+
+  def createAppInfo() : ApplicationInfo = {
+    val appInfo = new ApplicationInfo(JsonConstants.appInfoStartTime,
+      "id", createAppDesc(), JsonConstants.submitDate, null, Int.MaxValue)
+    appInfo.endTime = JsonConstants.currTimeInMillis
+    appInfo
+  }
+
+  def createDriverCommand(): Command = new Command(
+    "org.apache.spark.FakeClass", Seq("some arg --and-some options -g foo"),
+    Map(("K1", "V1"), ("K2", "V2")), Seq("cp1", "cp2"), Seq("lp1", "lp2"), Seq("-Dfoo")
+  )
+
+  def createDriverDesc(): DriverDescription =
+    new DriverDescription("hdfs://some-dir/some.jar", 100, 3, false, createDriverCommand())
+
+  def createDriverInfo(): DriverInfo = new DriverInfo(3, "driver-3",
+    createDriverDesc(), new Date())
+
+  def createWorkerInfo(): WorkerInfo = {
+    val workerInfo = new WorkerInfo("id", "host", 8080, 4, 1234, null, 80, "publicAddress")
+    workerInfo.lastHeartbeat = JsonConstants.currTimeInMillis
+    workerInfo
+  }
+
+  def createExecutorRunner(execId: Int): ExecutorRunner = {
+    new ExecutorRunner(
+      "appId",
+      execId,
+      createAppDesc(),
+      4,
+      1234,
+      null,
+      "workerId",
+      "host",
+      123,
+      "publicAddress",
+      new File("sparkHome"),
+      new File("workDir"),
+      "akka://worker",
+      new SparkConf,
+      Seq("localDir"),
+      ExecutorState.RUNNING)
+  }
+
+  def createDriverRunner(driverId: String): DriverRunner = {
+    val conf = new SparkConf()
+    new DriverRunner(
+      conf,
+      driverId,
+      new File("workDir"),
+      new File("sparkHome"),
+      createDriverDesc(),
+      null,
+      "akka://worker",
+      new SecurityManager(conf))
+  }
+}
diff --git a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
index 08529e0ef2806..0a9f128a3a6b6 100644
--- a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.deploy
 
-import java.io.File
 import java.util.Date
 
 import com.fasterxml.jackson.core.JsonParseException
@@ -25,12 +24,14 @@ import org.json4s._
 import org.json4s.jackson.JsonMethods
 
 import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, WorkerStateResponse}
-import org.apache.spark.deploy.master.{ApplicationInfo, DriverInfo, RecoveryState, WorkerInfo}
-import org.apache.spark.deploy.worker.{DriverRunner, ExecutorRunner}
-import org.apache.spark.{JsonTestUtils, SecurityManager, SparkConf, SparkFunSuite}
+import org.apache.spark.deploy.master.{ApplicationInfo, RecoveryState}
+import org.apache.spark.deploy.worker.ExecutorRunner
+import org.apache.spark.{JsonTestUtils, SparkFunSuite}
 
 class JsonProtocolSuite extends SparkFunSuite with JsonTestUtils {
 
+  import org.apache.spark.deploy.DeployTestUtils._
+
   test("writeApplicationInfo") {
     val output = JsonProtocol.writeApplicationInfo(createAppInfo())
     assertValidJson(output)
@@ -50,7 +51,7 @@ class JsonProtocolSuite extends SparkFunSuite with JsonTestUtils {
   }
 
   test("writeExecutorRunner") {
-    val output = JsonProtocol.writeExecutorRunner(createExecutorRunner())
+    val output = JsonProtocol.writeExecutorRunner(createExecutorRunner(123))
     assertValidJson(output)
     assertValidDataInJson(output, JsonMethods.parse(JsonConstants.executorRunnerJsonStr))
   }
@@ -77,9 +78,10 @@ class JsonProtocolSuite extends SparkFunSuite with JsonTestUtils {
 
   test("writeWorkerState") {
     val executors = List[ExecutorRunner]()
-    val finishedExecutors = List[ExecutorRunner](createExecutorRunner(), createExecutorRunner())
-    val drivers = List(createDriverRunner())
-    val finishedDrivers = List(createDriverRunner(), createDriverRunner())
+    val finishedExecutors = List[ExecutorRunner](createExecutorRunner(123),
+      createExecutorRunner(123))
+    val drivers = List(createDriverRunner("driverId"))
+    val finishedDrivers = List(createDriverRunner("driverId"), createDriverRunner("driverId"))
     val stateResponse = new WorkerStateResponse("host", 8080, "workerId", executors,
       finishedExecutors, drivers, finishedDrivers, "masterUrl", 4, 1234, 4, 1234, "masterWebUiUrl")
     val output = JsonProtocol.writeWorkerState(stateResponse)
@@ -87,47 +89,6 @@ class JsonProtocolSuite extends SparkFunSuite with JsonTestUtils {
     assertValidDataInJson(output, JsonMethods.parse(JsonConstants.workerStateJsonStr))
   }
 
-  def createAppDesc(): ApplicationDescription = {
-    val cmd = new Command("mainClass", List("arg1", "arg2"), Map(), Seq(), Seq(), Seq())
-    new ApplicationDescription("name", Some(4), 1234, cmd, "appUiUrl")
-  }
-
-  def createAppInfo() : ApplicationInfo = {
-    val appInfo = new ApplicationInfo(JsonConstants.appInfoStartTime,
-      "id", createAppDesc(), JsonConstants.submitDate, null, Int.MaxValue)
-    appInfo.endTime = JsonConstants.currTimeInMillis
-    appInfo
-  }
-
-  def createDriverCommand(): Command = new Command(
-    "org.apache.spark.FakeClass", Seq("some arg --and-some options -g foo"),
-    Map(("K1", "V1"), ("K2", "V2")), Seq("cp1", "cp2"), Seq("lp1", "lp2"), Seq("-Dfoo")
-  )
-
-  def createDriverDesc(): DriverDescription =
-    new DriverDescription("hdfs://some-dir/some.jar", 100, 3, false, createDriverCommand())
-
-  def createDriverInfo(): DriverInfo = new DriverInfo(3, "driver-3",
-    createDriverDesc(), new Date())
-
-  def createWorkerInfo(): WorkerInfo = {
-    val workerInfo = new WorkerInfo("id", "host", 8080, 4, 1234, null, 80, "publicAddress")
-    workerInfo.lastHeartbeat = JsonConstants.currTimeInMillis
-    workerInfo
-  }
-
-  def createExecutorRunner(): ExecutorRunner = {
-    new ExecutorRunner("appId", 123, createAppDesc(), 4, 1234, null, "workerId", "host", 123,
-      "publicAddress", new File("sparkHome"), new File("workDir"), "akka://worker",
-      new SparkConf, Seq("localDir"), ExecutorState.RUNNING)
-  }
-
-  def createDriverRunner(): DriverRunner = {
-    val conf = new SparkConf()
-    new DriverRunner(conf, "driverId", new File("workDir"), new File("sparkHome"),
-      createDriverDesc(), null, "akka://worker", new SecurityManager(conf))
-  }
-
   def assertValidJson(json: JValue) {
     try {
       JsonMethods.parse(JsonMethods.compact(json))
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala
index 0f4d3b28d09df..faed4bdc68447 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala
@@ -17,13 +17,18 @@
 
 package org.apache.spark.deploy.worker
 
-import org.apache.spark.{SparkConf, SparkFunSuite}
-import org.apache.spark.deploy.Command
-
 import org.scalatest.Matchers
 
+import org.apache.spark.deploy.DeployMessages.{DriverStateChanged, ExecutorStateChanged}
+import org.apache.spark.deploy.master.DriverState
+import org.apache.spark.deploy.{Command, ExecutorState}
+import org.apache.spark.rpc.{RpcAddress, RpcEnv}
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
+
 class WorkerSuite extends SparkFunSuite with Matchers {
 
+  import org.apache.spark.deploy.DeployTestUtils._
+
   def cmd(javaOpts: String*): Command = {
     Command("", Seq.empty, Map.empty, Seq.empty, Seq.empty, Seq(javaOpts : _*))
   }
@@ -56,4 +61,126 @@ class WorkerSuite extends SparkFunSuite with Matchers {
           "-Dspark.ssl.useNodeLocalConf=true", "-Dspark.ssl.opt1=y", "-Dspark.ssl.opt2=z")
 
   }
+
+  test("test clearing of finishedExecutors (small number of executors)") {
+    val conf = new SparkConf()
+    conf.set("spark.worker.ui.retainedExecutors", 2.toString)
+    val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf))
+    val worker = new Worker(rpcEnv, 50000, 20, 1234 * 5, Array.fill(1)(RpcAddress("1.2.3.4", 1234)),
+      "sparkWorker1", "Worker", "/tmp", conf, new SecurityManager(conf))
+    // initialize workers
+    for (i <- 0 until 5) {
+      worker.executors += s"app1/$i" -> createExecutorRunner(i)
+    }
+    // initialize ExecutorStateChanged Message
+    worker.handleExecutorStateChanged(
+      ExecutorStateChanged("app1", 0, ExecutorState.EXITED, None, None))
+    assert(worker.finishedExecutors.size === 1)
+    assert(worker.executors.size === 4)
+    for (i <- 1 until 5) {
+      worker.handleExecutorStateChanged(
+        ExecutorStateChanged("app1", i, ExecutorState.EXITED, None, None))
+      assert(worker.finishedExecutors.size === 2)
+      if (i > 1) {
+        assert(!worker.finishedExecutors.contains(s"app1/${i - 2}"))
+      }
+      assert(worker.executors.size === 4 - i)
+    }
+  }
+
+  test("test clearing of finishedExecutors (more executors)") {
+    val conf = new SparkConf()
+    conf.set("spark.worker.ui.retainedExecutors", 30.toString)
+    val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf))
+    val worker = new Worker(rpcEnv, 50000, 20, 1234 * 5, Array.fill(1)(RpcAddress("1.2.3.4", 1234)),
+      "sparkWorker1", "Worker", "/tmp", conf, new SecurityManager(conf))
+    // initialize workers
+    for (i <- 0 until 50) {
+      worker.executors += s"app1/$i" -> createExecutorRunner(i)
+    }
+    // initialize ExecutorStateChanged Message
+    worker.handleExecutorStateChanged(
+      ExecutorStateChanged("app1", 0, ExecutorState.EXITED, None, None))
+    assert(worker.finishedExecutors.size === 1)
+    assert(worker.executors.size === 49)
+    for (i <- 1 until 50) {
+      val expectedValue = {
+        if (worker.finishedExecutors.size < 30) {
+          worker.finishedExecutors.size + 1
+        } else {
+          28
+        }
+      }
+      worker.handleExecutorStateChanged(
+        ExecutorStateChanged("app1", i, ExecutorState.EXITED, None, None))
+      if (expectedValue == 28) {
+        for (j <- i - 30 until i - 27) {
+          assert(!worker.finishedExecutors.contains(s"app1/$j"))
+        }
+      }
+      assert(worker.executors.size === 49 - i)
+      assert(worker.finishedExecutors.size === expectedValue)
+    }
+  }
+
+  test("test clearing of finishedDrivers (small number of drivers)") {
+    val conf = new SparkConf()
+    conf.set("spark.worker.ui.retainedDrivers", 2.toString)
+    val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf))
+    val worker = new Worker(rpcEnv, 50000, 20, 1234 * 5, Array.fill(1)(RpcAddress("1.2.3.4", 1234)),
+      "sparkWorker1", "Worker", "/tmp", conf, new SecurityManager(conf))
+    // initialize workers
+    for (i <- 0 until 5) {
+      val driverId = s"driverId-$i"
+      worker.drivers += driverId -> createDriverRunner(driverId)
+    }
+    // initialize DriverStateChanged Message
+    worker.handleDriverStateChanged(DriverStateChanged("driverId-0", DriverState.FINISHED, None))
+    assert(worker.drivers.size === 4)
+    assert(worker.finishedDrivers.size === 1)
+    for (i <- 1 until 5) {
+      val driverId = s"driverId-$i"
+      worker.handleDriverStateChanged(DriverStateChanged(driverId, DriverState.FINISHED, None))
+      if (i > 1) {
+        assert(!worker.finishedDrivers.contains(s"driverId-${i - 2}"))
+      }
+      assert(worker.drivers.size === 4 - i)
+      assert(worker.finishedDrivers.size === 2)
+    }
+  }
+
+  test("test clearing of finishedDrivers (more drivers)") {
+    val conf = new SparkConf()
+    conf.set("spark.worker.ui.retainedDrivers", 30.toString)
+    val rpcEnv = RpcEnv.create("test", "localhost", 12345, conf, new SecurityManager(conf))
+    val worker = new Worker(rpcEnv, 50000, 20, 1234 * 5, Array.fill(1)(RpcAddress("1.2.3.4", 1234)),
+      "sparkWorker1", "Worker", "/tmp", conf, new SecurityManager(conf))
+    // initialize workers
+    for (i <- 0 until 50) {
+      val driverId = s"driverId-$i"
+      worker.drivers += driverId -> createDriverRunner(driverId)
+    }
+    // initialize DriverStateChanged Message
+    worker.handleDriverStateChanged(DriverStateChanged("driverId-0", DriverState.FINISHED, None))
+    assert(worker.finishedDrivers.size === 1)
+    assert(worker.drivers.size === 49)
+    for (i <- 1 until 50) {
+      val expectedValue = {
+        if (worker.finishedDrivers.size < 30) {
+          worker.finishedDrivers.size + 1
+        } else {
+          28
+        }
+      }
+      val driverId = s"driverId-$i"
+      worker.handleDriverStateChanged(DriverStateChanged(driverId, DriverState.FINISHED, None))
+      if (expectedValue == 28) {
+        for (j <- i - 30 until i - 27) {
+          assert(!worker.finishedDrivers.contains(s"driverId-$j"))
+        }
+      }
+      assert(worker.drivers.size === 49 - i)
+      assert(worker.finishedDrivers.size === expectedValue)
+    }
+  }
 }
diff --git a/docs/configuration.md b/docs/configuration.md
index fd236137cb96e..24b606356a149 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -557,6 +557,20 @@ Apart from these, the following properties are also available, and may be useful
     collecting.
   </td>
 </tr>
+<tr>
+  <td><code>spark.worker.ui.retainedExecutors</code></td>
+  <td>1000</td>
+  <td>
+    How many finished executors the Spark UI and status APIs remember before garbage collecting.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.worker.ui.retainedDrivers</code></td>
+  <td>1000</td>
+  <td>
+    How many finished drivers the Spark UI and status APIs remember before garbage collecting.
+  </td>
+</tr>
 </table>
 
 #### Compression and Serialization

From 3c0d2e55210735e0df2f8febb5f63c224af230e3 Mon Sep 17 00:00:00 2001
From: Meihua Wu <meihuawu@umich.edu>
Date: Fri, 31 Jul 2015 13:01:10 -0700
Subject: [PATCH 526/549] [SPARK-9246] [MLLIB] DistributedLDAModel predict top
 docs per topic

Add topDocumentsPerTopic to DistributedLDAModel.

Add ScalaDoc and unit tests.

Author: Meihua Wu <meihuawu@umich.edu>

Closes #7769 from rotationsymmetry/SPARK-9246 and squashes the following commits:

1029e79c [Meihua Wu] clean up code comments
a023b82 [Meihua Wu] Update tests to use Long for doc index.
91e5998 [Meihua Wu] Use Long for doc index.
b9f70cf [Meihua Wu] Revise topDocumentsPerTopic
26ff3f6 [Meihua Wu] Add topDocumentsPerTopic, scala doc and unit tests
---
 .../spark/mllib/clustering/LDAModel.scala     | 37 +++++++++++++++++++
 .../spark/mllib/clustering/LDASuite.scala     | 22 +++++++++++
 2 files changed, 59 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index ff7035d2246c2..0cdac84eeb591 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -516,6 +516,43 @@ class DistributedLDAModel private[clustering] (
     }
   }
 
+  /**
+   * Return the top documents for each topic
+   *
+   * This is approximate; it may not return exactly the top-weighted documents for each topic.
+   * To get a more precise set of top documents, increase maxDocumentsPerTopic.
+   *
+   * @param maxDocumentsPerTopic  Maximum number of documents to collect for each topic.
+   * @return  Array over topics.  Each element represent as a pair of matching arrays:
+   *          (IDs for the documents, weights of the topic in these documents).
+   *          For each topic, documents are sorted in order of decreasing topic weights.
+   */
+  def topDocumentsPerTopic(maxDocumentsPerTopic: Int): Array[(Array[Long], Array[Double])] = {
+    val numTopics = k
+    val topicsInQueues: Array[BoundedPriorityQueue[(Double, Long)]] =
+      topicDistributions.mapPartitions { docVertices =>
+        // For this partition, collect the most common docs for each topic in queues:
+        //  queues(topic) = queue of (doc topic, doc ID).
+        val queues =
+          Array.fill(numTopics)(new BoundedPriorityQueue[(Double, Long)](maxDocumentsPerTopic))
+        for ((docId, docTopics) <- docVertices) {
+          var topic = 0
+          while (topic < numTopics) {
+            queues(topic) += (docTopics(topic) -> docId)
+            topic += 1
+          }
+        }
+        Iterator(queues)
+      }.treeReduce { (q1, q2) =>
+        q1.zip(q2).foreach { case (a, b) => a ++= b }
+        q1
+      }
+    topicsInQueues.map { q =>
+      val (docTopics, docs) = q.toArray.sortBy(-_._1).unzip
+      (docs.toArray, docTopics.toArray)
+    }
+  }
+
   // TODO
   // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 79d2a1cafd1fa..f2b94707fd0ff 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -122,6 +122,28 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     // Check: log probabilities
     assert(model.logLikelihood < 0.0)
     assert(model.logPrior < 0.0)
+
+    // Check: topDocumentsPerTopic
+    // Compare it with top documents per topic derived from topicDistributions
+    val topDocsByTopicDistributions = { n: Int =>
+      Range(0, k).map { topic =>
+        val (doc, docWeights) = topicDistributions.sortBy(-_._2(topic)).take(n).unzip
+        (doc.toArray, docWeights.map(_(topic)).toArray)
+      }.toArray
+    }
+
+    // Top 3 documents per topic
+    model.topDocumentsPerTopic(3).zip(topDocsByTopicDistributions(3)).foreach {case (t1, t2) =>
+      assert(t1._1 === t2._1)
+      assert(t1._2 === t2._2)
+    }
+
+    // All documents per topic
+    val q = tinyCorpus.length
+    model.topDocumentsPerTopic(q).zip(topDocsByTopicDistributions(q)).foreach {case (t1, t2) =>
+      assert(t1._1 === t2._1)
+      assert(t1._2 === t2._2)
+    }
   }
 
   test("vertex indexing") {

From 060c79aab58efd4ce7353a1b00534de0d9e1de0b Mon Sep 17 00:00:00 2001
From: Sameer Abhyankar <sabhyankar@sabhyankar-MBP.local>
Date: Fri, 31 Jul 2015 13:08:55 -0700
Subject: [PATCH 527/549] [SPARK-9056] [STREAMING] Rename configuration
 `spark.streaming.minRememberDuration` to
 `spark.streaming.fileStream.minRememberDuration`

Rename configuration `spark.streaming.minRememberDuration` to `spark.streaming.fileStream.minRememberDuration`

Author: Sameer Abhyankar <sabhyankar@sabhyankar-MBP.local>
Author: Sameer Abhyankar <sabhyankar@sabhyankar-MBP.Samavihome>

Closes #7740 from sabhyankar/spark_branch_9056 and squashes the following commits:

d5b2f1f [Sameer Abhyankar] Correct deprecated version to 1.5
1268133 [Sameer Abhyankar] Add {} and indentation
ddf9844 [Sameer Abhyankar] Change 4 space indentation to 2 space indentation
1819b5f [Sameer Abhyankar] Use spark.streaming.fileStream.minRememberDuration property in lieu of spark.streaming.minRememberDuration
---
 core/src/main/scala/org/apache/spark/SparkConf.scala        | 4 +++-
 .../apache/spark/streaming/dstream/FileInputDStream.scala   | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 4161792976c7b..08bab4bf2739f 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -548,7 +548,9 @@ private[spark] object SparkConf extends Logging {
     "spark.rpc.askTimeout" -> Seq(
       AlternateConfig("spark.akka.askTimeout", "1.4")),
     "spark.rpc.lookupTimeout" -> Seq(
-      AlternateConfig("spark.akka.lookupTimeout", "1.4"))
+      AlternateConfig("spark.akka.lookupTimeout", "1.4")),
+    "spark.streaming.fileStream.minRememberDuration" -> Seq(
+      AlternateConfig("spark.streaming.minRememberDuration", "1.5"))
     )
 
   /**
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala
index dd4da9d9ca6a2..c358f5b5bd70b 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala
@@ -86,8 +86,10 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]](
    * Files with mod times older than this "window" of remembering will be ignored. So if new
    * files are visible within this window, then the file will get selected in the next batch.
    */
-  private val minRememberDurationS =
-    Seconds(ssc.conf.getTimeAsSeconds("spark.streaming.minRememberDuration", "60s"))
+  private val minRememberDurationS = {
+    Seconds(ssc.conf.getTimeAsSeconds("spark.streaming.fileStream.minRememberDuration",
+      ssc.conf.get("spark.streaming.minRememberDuration", "60s")))
+  }
 
   // This is a def so that it works during checkpoint recovery:
   private def clock = ssc.scheduler.clock

From fbef566a107b47e5fddde0ea65b8587d5039062d Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 31 Jul 2015 13:11:42 -0700
Subject: [PATCH 528/549] [SPARK-9308] [ML] ml.NaiveBayesModel support
 predicting class probabilities

Make NaiveBayesModel support predicting class probabilities, inherit from ProbabilisticClassificationModel.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7672 from yanboliang/spark-9308 and squashes the following commits:

25e224c [Yanbo Liang] raw2probabilityInPlace should operate in-place
3ee56d6 [Yanbo Liang] change predictRaw and raw2probabilityInPlace
c07e7a2 [Yanbo Liang] ml.NaiveBayesModel support predicting class probabilities
---
 .../spark/ml/classification/NaiveBayes.scala  | 65 ++++++++++++++-----
 .../ml/classification/NaiveBayesSuite.scala   | 54 ++++++++++++++-
 2 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index 5be35fe209291..b46b676204e0e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -69,7 +69,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams {
  * The input feature values must be nonnegative.
  */
 class NaiveBayes(override val uid: String)
-  extends Predictor[Vector, NaiveBayes, NaiveBayesModel]
+  extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel]
   with NaiveBayesParams {
 
   def this() = this(Identifiable.randomUID("nb"))
@@ -106,7 +106,7 @@ class NaiveBayesModel private[ml] (
     override val uid: String,
     val pi: Vector,
     val theta: Matrix)
-  extends PredictionModel[Vector, NaiveBayesModel] with NaiveBayesParams {
+  extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] with NaiveBayesParams {
 
   import OldNaiveBayes.{Bernoulli, Multinomial}
 
@@ -129,29 +129,62 @@ class NaiveBayesModel private[ml] (
       throw new UnknownError(s"Invalid modelType: ${$(modelType)}.")
   }
 
-  override protected def predict(features: Vector): Double = {
+  override val numClasses: Int = pi.size
+
+  private def multinomialCalculation(features: Vector) = {
+    val prob = theta.multiply(features)
+    BLAS.axpy(1.0, pi, prob)
+    prob
+  }
+
+  private def bernoulliCalculation(features: Vector) = {
+    features.foreachActive((_, value) =>
+      if (value != 0.0 && value != 1.0) {
+        throw new SparkException(
+          s"Bernoulli naive Bayes requires 0 or 1 feature values but found $features.")
+      }
+    )
+    val prob = thetaMinusNegTheta.get.multiply(features)
+    BLAS.axpy(1.0, pi, prob)
+    BLAS.axpy(1.0, negThetaSum.get, prob)
+    prob
+  }
+
+  override protected def predictRaw(features: Vector): Vector = {
     $(modelType) match {
       case Multinomial =>
-        val prob = theta.multiply(features)
-        BLAS.axpy(1.0, pi, prob)
-        prob.argmax
+        multinomialCalculation(features)
       case Bernoulli =>
-        features.foreachActive{ (index, value) =>
-          if (value != 0.0 && value != 1.0) {
-            throw new SparkException(
-              s"Bernoulli naive Bayes requires 0 or 1 feature values but found $features")
-          }
-        }
-        val prob = thetaMinusNegTheta.get.multiply(features)
-        BLAS.axpy(1.0, pi, prob)
-        BLAS.axpy(1.0, negThetaSum.get, prob)
-        prob.argmax
+        bernoulliCalculation(features)
       case _ =>
         // This should never happen.
         throw new UnknownError(s"Invalid modelType: ${$(modelType)}.")
     }
   }
 
+  override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
+    rawPrediction match {
+      case dv: DenseVector =>
+        var i = 0
+        val size = dv.size
+        val maxLog = dv.values.max
+        while (i < size) {
+          dv.values(i) = math.exp(dv.values(i) - maxLog)
+          i += 1
+        }
+        val probSum = dv.values.sum
+        i = 0
+        while (i < size) {
+          dv.values(i) = dv.values(i) / probSum
+          i += 1
+        }
+        dv
+      case sv: SparseVector =>
+        throw new RuntimeException("Unexpected error in NaiveBayesModel:" +
+          " raw2probabilityInPlace encountered SparseVector")
+    }
+  }
+
   override def copy(extra: ParamMap): NaiveBayesModel = {
     copyValues(new NaiveBayesModel(uid, pi, theta).setParent(this.parent), extra)
   }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
index 264bde3703c5f..aea3d9b694490 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -17,8 +17,11 @@
 
 package org.apache.spark.ml.classification
 
+import breeze.linalg.{Vector => BV}
+
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.mllib.classification.NaiveBayes
 import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
@@ -28,6 +31,8 @@ import org.apache.spark.sql.Row
 
 class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
+  import NaiveBayes.{Multinomial, Bernoulli}
+
   def validatePrediction(predictionAndLabels: DataFrame): Unit = {
     val numOfErrorPredictions = predictionAndLabels.collect().count {
       case Row(prediction: Double, label: Double) =>
@@ -46,6 +51,43 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(model.theta.map(math.exp) ~== thetaData.map(math.exp) absTol 0.05, "theta mismatch")
   }
 
+  def expectedMultinomialProbabilities(model: NaiveBayesModel, feature: Vector): Vector = {
+    val logClassProbs: BV[Double] = model.pi.toBreeze + model.theta.multiply(feature).toBreeze
+    val classProbs = logClassProbs.toArray.map(math.exp)
+    val classProbsSum = classProbs.sum
+    Vectors.dense(classProbs.map(_ / classProbsSum))
+  }
+
+  def expectedBernoulliProbabilities(model: NaiveBayesModel, feature: Vector): Vector = {
+    val negThetaMatrix = model.theta.map(v => math.log(1.0 - math.exp(v)))
+    val negFeature = Vectors.dense(feature.toArray.map(v => 1.0 - v))
+    val piTheta: BV[Double] = model.pi.toBreeze + model.theta.multiply(feature).toBreeze
+    val logClassProbs: BV[Double] = piTheta + negThetaMatrix.multiply(negFeature).toBreeze
+    val classProbs = logClassProbs.toArray.map(math.exp)
+    val classProbsSum = classProbs.sum
+    Vectors.dense(classProbs.map(_ / classProbsSum))
+  }
+
+  def validateProbabilities(
+      featureAndProbabilities: DataFrame,
+      model: NaiveBayesModel,
+      modelType: String): Unit = {
+    featureAndProbabilities.collect().foreach {
+      case Row(features: Vector, probability: Vector) => {
+        assert(probability.toArray.sum ~== 1.0 relTol 1.0e-10)
+        val expected = modelType match {
+          case Multinomial =>
+            expectedMultinomialProbabilities(model, features)
+          case Bernoulli =>
+            expectedBernoulliProbabilities(model, features)
+          case _ =>
+            throw new UnknownError(s"Invalid modelType: $modelType.")
+        }
+        assert(probability ~== expected relTol 1.0e-10)
+      }
+    }
+  }
+
   test("params") {
     ParamsSuite.checkParams(new NaiveBayes)
     val model = new NaiveBayesModel("nb", pi = Vectors.dense(Array(0.2, 0.8)),
@@ -83,9 +125,13 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     val validationDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
       piArray, thetaArray, nPoints, 17, "multinomial"))
-    val predictionAndLabels = model.transform(validationDataset).select("prediction", "label")
 
+    val predictionAndLabels = model.transform(validationDataset).select("prediction", "label")
     validatePrediction(predictionAndLabels)
+
+    val featureAndProbabilities = model.transform(validationDataset)
+      .select("features", "probability")
+    validateProbabilities(featureAndProbabilities, model, "multinomial")
   }
 
   test("Naive Bayes Bernoulli") {
@@ -109,8 +155,12 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     val validationDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
       piArray, thetaArray, nPoints, 20, "bernoulli"))
-    val predictionAndLabels = model.transform(validationDataset).select("prediction", "label")
 
+    val predictionAndLabels = model.transform(validationDataset).select("prediction", "label")
     validatePrediction(predictionAndLabels)
+
+    val featureAndProbabilities = model.transform(validationDataset)
+      .select("features", "probability")
+    validateProbabilities(featureAndProbabilities, model, "bernoulli")
   }
 }

From 815c8245f47e61226a04e2e02f508457b5e9e536 Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Fri, 31 Jul 2015 13:45:12 -0700
Subject: [PATCH 529/549] [SPARK-9466] [SQL] Increate two timeouts in CliSuite.

Hopefully this can resolve the flakiness of this suite.

JIRA: https://issues.apache.org/jira/browse/SPARK-9466

Author: Yin Huai <yhuai@databricks.com>

Closes #7777 from yhuai/SPARK-9466 and squashes the following commits:

e0e3a86 [Yin Huai] Increate the timeout.
---
 .../org/apache/spark/sql/hive/thriftserver/CliSuite.scala     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index 13b0c5951dddc..df80d04b40801 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -137,7 +137,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging {
   }
 
   test("Single command with --database") {
-    runCliWithin(1.minute)(
+    runCliWithin(2.minute)(
       "CREATE DATABASE hive_test_db;"
         -> "OK",
       "USE hive_test_db;"
@@ -148,7 +148,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging {
         -> "Time taken: "
     )
 
-    runCliWithin(1.minute, Seq("--database", "hive_test_db", "-e", "SHOW TABLES;"))(
+    runCliWithin(2.minute, Seq("--database", "hive_test_db", "-e", "SHOW TABLES;"))(
       ""
         -> "OK",
       ""

From 873ab0f9692d8ea6220abdb8d9200041068372a8 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Fri, 31 Jul 2015 13:45:28 -0700
Subject: [PATCH 530/549] [SPARK-9490] [DOCS] [MLLIB] MLlib evaluation metrics
 guide example python code uses deprecated print statement

Use print(x) not print x for Python 3 in eval examples
CC sethah mengxr -- just wanted to close this out before 1.5

Author: Sean Owen <sowen@cloudera.com>

Closes #7822 from srowen/SPARK-9490 and squashes the following commits:

01abeba [Sean Owen] Change "print x" to "print(x)" in the rest of the docs too
bd7f7fb [Sean Owen] Use print(x) not print x for Python 3 in eval examples
---
 docs/ml-guide.md                    |  2 +-
 docs/mllib-evaluation-metrics.md    | 66 ++++++++++++++---------------
 docs/mllib-feature-extraction.md    |  2 +-
 docs/mllib-statistics.md            | 20 ++++-----
 docs/quick-start.md                 |  2 +-
 docs/sql-programming-guide.md       |  6 +--
 docs/streaming-programming-guide.md |  2 +-
 7 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index 8c46adf256a9a..b6ca50e98db02 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -561,7 +561,7 @@ test = sc.parallelize([(4L, "spark i j k"),
 prediction = model.transform(test)
 selected = prediction.select("id", "text", "prediction")
 for row in selected.collect():
-    print row
+    print(row)
 
 sc.stop()
 {% endhighlight %}
diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md
index 4ca0bb06b26a6..7066d5c97418c 100644
--- a/docs/mllib-evaluation-metrics.md
+++ b/docs/mllib-evaluation-metrics.md
@@ -302,10 +302,10 @@ predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp
 metrics = BinaryClassificationMetrics(predictionAndLabels)
 
 # Area under precision-recall curve
-print "Area under PR = %s" % metrics.areaUnderPR
+print("Area under PR = %s" % metrics.areaUnderPR)
 
 # Area under ROC curve
-print "Area under ROC = %s" % metrics.areaUnderROC
+print("Area under ROC = %s" % metrics.areaUnderROC)
 
 {% endhighlight %}
 
@@ -606,24 +606,24 @@ metrics = MulticlassMetrics(predictionAndLabels)
 precision = metrics.precision()
 recall = metrics.recall()
 f1Score = metrics.fMeasure()
-print "Summary Stats"
-print "Precision = %s" % precision
-print "Recall = %s" % recall
-print "F1 Score = %s" % f1Score
+print("Summary Stats")
+print("Precision = %s" % precision)
+print("Recall = %s" % recall)
+print("F1 Score = %s" % f1Score)
 
 # Statistics by class
 labels = data.map(lambda lp: lp.label).distinct().collect()
 for label in sorted(labels):
-    print "Class %s precision = %s" % (label, metrics.precision(label))
-    print "Class %s recall = %s" % (label, metrics.recall(label))
-    print "Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))
+    print("Class %s precision = %s" % (label, metrics.precision(label)))
+    print("Class %s recall = %s" % (label, metrics.recall(label)))
+    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
 
 # Weighted stats
-print "Weighted recall = %s" % metrics.weightedRecall
-print "Weighted precision = %s" % metrics.weightedPrecision
-print "Weighted F(1) Score = %s" % metrics.weightedFMeasure()
-print "Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)
-print "Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate
+print("Weighted recall = %s" % metrics.weightedRecall)
+print("Weighted precision = %s" % metrics.weightedPrecision)
+print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
+print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
+print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
 {% endhighlight %}
 
 </div>
@@ -881,28 +881,28 @@ scoreAndLabels = sc.parallelize([
 metrics = MultilabelMetrics(scoreAndLabels)
 
 # Summary stats
-print "Recall = %s" % metrics.recall()
-print "Precision = %s" % metrics.precision()
-print "F1 measure = %s" % metrics.f1Measure()
-print "Accuracy = %s" % metrics.accuracy
+print("Recall = %s" % metrics.recall())
+print("Precision = %s" % metrics.precision())
+print("F1 measure = %s" % metrics.f1Measure())
+print("Accuracy = %s" % metrics.accuracy)
 
 # Individual label stats
 labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect()
 for label in labels:
-    print "Class %s precision = %s" % (label, metrics.precision(label))
-    print "Class %s recall = %s" % (label, metrics.recall(label))
-    print "Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))
+    print("Class %s precision = %s" % (label, metrics.precision(label)))
+    print("Class %s recall = %s" % (label, metrics.recall(label)))
+    print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label)))
 
 # Micro stats
-print "Micro precision = %s" % metrics.microPrecision
-print "Micro recall = %s" % metrics.microRecall
-print "Micro F1 measure = %s" % metrics.microF1Measure
+print("Micro precision = %s" % metrics.microPrecision)
+print("Micro recall = %s" % metrics.microRecall)
+print("Micro F1 measure = %s" % metrics.microF1Measure)
 
 # Hamming loss
-print "Hamming loss = %s" % metrics.hammingLoss
+print("Hamming loss = %s" % metrics.hammingLoss)
 
 # Subset accuracy
-print "Subset accuracy = %s" % metrics.subsetAccuracy
+print("Subset accuracy = %s" % metrics.subsetAccuracy)
 
 {% endhighlight %}
 
@@ -1283,10 +1283,10 @@ scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1])
 metrics = RegressionMetrics(scoreAndLabels)
 
 # Root mean sqaured error
-print "RMSE = %s" % metrics.rootMeanSquaredError
+print("RMSE = %s" % metrics.rootMeanSquaredError)
 
 # R-squared
-print "R-squared = %s" % metrics.r2
+print("R-squared = %s" % metrics.r2)
 
 {% endhighlight %}
 
@@ -1479,17 +1479,17 @@ valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.l
 metrics = RegressionMetrics(valuesAndPreds)
 
 # Squared Error
-print "MSE = %s" % metrics.meanSquaredError
-print "RMSE = %s" % metrics.rootMeanSquaredError
+print("MSE = %s" % metrics.meanSquaredError)
+print("RMSE = %s" % metrics.rootMeanSquaredError)
 
 # R-squared
-print "R-squared = %s" % metrics.r2
+print("R-squared = %s" % metrics.r2)
 
 # Mean absolute error
-print "MAE = %s" % metrics.meanAbsoluteError
+print("MAE = %s" % metrics.meanAbsoluteError)
 
 # Explained variance
-print "Explained variance = %s" % metrics.explainedVariance
+print("Explained variance = %s" % metrics.explainedVariance)
 
 {% endhighlight %}
 
diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index a69e41e2a1936..de86aba2ae627 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -221,7 +221,7 @@ model = word2vec.fit(inp)
 synonyms = model.findSynonyms('china', 40)
 
 for word, cosine_distance in synonyms:
-    print "{}: {}".format(word, cosine_distance)
+    print("{}: {}".format(word, cosine_distance))
 {% endhighlight %}
 </div>
 </div>
diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index de5d6485f9b5f..be04d0b4b53a8 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -95,9 +95,9 @@ mat = ... # an RDD of Vectors
 
 # Compute column summary statistics.
 summary = Statistics.colStats(mat)
-print summary.mean()
-print summary.variance()
-print summary.numNonzeros()
+print(summary.mean())
+print(summary.variance())
+print(summary.numNonzeros())
 
 {% endhighlight %}
 </div>
@@ -183,12 +183,12 @@ seriesY = ... # must have the same number of partitions and cardinality as serie
 
 # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a 
 # method is not specified, Pearson's method will be used by default. 
-print Statistics.corr(seriesX, seriesY, method="pearson")
+print(Statistics.corr(seriesX, seriesY, method="pearson"))
 
 data = ... # an RDD of Vectors
 # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
 # If a method is not specified, Pearson's method will be used by default. 
-print Statistics.corr(data, method="pearson")
+print(Statistics.corr(data, method="pearson"))
 
 {% endhighlight %}
 </div>
@@ -398,14 +398,14 @@ vec = Vectors.dense(...) # a vector composed of the frequencies of events
 # compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
 # the test runs against a uniform distribution.
 goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-print goodnessOfFitTestResult # summary of the test including the p-value, degrees of freedom,
-                              # test statistic, the method used, and the null hypothesis.
+print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom,
+                               # test statistic, the method used, and the null hypothesis.
 
 mat = Matrices.dense(...) # a contingency matrix
 
 # conduct Pearson's independence test on the input contingency matrix
 independenceTestResult = Statistics.chiSqTest(mat)
-print independenceTestResult  # summary of the test including the p-value, degrees of freedom...
+print(independenceTestResult)  # summary of the test including the p-value, degrees of freedom...
 
 obs = sc.parallelize(...)  # LabeledPoint(feature, label) .
 
@@ -415,8 +415,8 @@ obs = sc.parallelize(...)  # LabeledPoint(feature, label) .
 featureTestResults = Statistics.chiSqTest(obs)
 
 for i, result in enumerate(featureTestResults):
-    print "Column $d:" % (i + 1)
-    print result
+    print("Column $d:" % (i + 1))
+    print(result)
 {% endhighlight %}
 </div>
 
diff --git a/docs/quick-start.md b/docs/quick-start.md
index bb39e4111f244..ce2cc9d2169cd 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -406,7 +406,7 @@ logData = sc.textFile(logFile).cache()
 numAs = logData.filter(lambda s: 'a' in s).count()
 numBs = logData.filter(lambda s: 'b' in s).count()
 
-print "Lines with a: %i, lines with b: %i" % (numAs, numBs)
+print("Lines with a: %i, lines with b: %i" % (numAs, numBs))
 {% endhighlight %}
 
 
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 95945eb7fc8a0..d31baa080cbce 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -570,7 +570,7 @@ teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 1
 # The results of SQL queries are RDDs and support all the normal RDD operations.
 teenNames = teenagers.map(lambda p: "Name: " + p.name)
 for teenName in teenNames.collect():
-  print teenName
+  print(teenName)
 {% endhighlight %}
 
 </div>
@@ -752,7 +752,7 @@ results = sqlContext.sql("SELECT name FROM people")
 # The results of SQL queries are RDDs and support all the normal RDD operations.
 names = results.map(lambda p: "Name: " + p.name)
 for name in names.collect():
-  print name
+  print(name)
 {% endhighlight %}
 
 </div>
@@ -1006,7 +1006,7 @@ parquetFile.registerTempTable("parquetFile");
 teenagers = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
 teenNames = teenagers.map(lambda p: "Name: " + p.name)
 for teenName in teenNames.collect():
-  print teenName
+  print(teenName)
 {% endhighlight %}
 
 </div>
diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md
index 2f3013b533eb0..4663b3f14c527 100644
--- a/docs/streaming-programming-guide.md
+++ b/docs/streaming-programming-guide.md
@@ -1525,7 +1525,7 @@ def getSqlContextInstance(sparkContext):
 words = ... # DStream of strings
 
 def process(time, rdd):
-    print "========= %s =========" % str(time)
+    print("========= %s =========" % str(time))
     try:
         # Get the singleton instance of SQLContext
         sqlContext = getSqlContextInstance(rdd.context)

From 6e5fd613ea4b9aa0ab485ba681277a51a4367168 Mon Sep 17 00:00:00 2001
From: Sean Owen <sowen@cloudera.com>
Date: Fri, 31 Jul 2015 21:51:55 +0100
Subject: [PATCH 531/549] [SPARK-9507] [BUILD] Remove dependency reduced POM
 hack now that shade plugin is updated

Update to shade plugin 2.4.1, which removes the need for the dependency-reduced-POM workaround and the 'release' profile. Fix management of shade plugin version so children inherit it; bump assembly plugin version while here

See https://issues.apache.org/jira/browse/SPARK-8819

I verified that `mvn clean package -DskipTests` works with Maven 3.3.3.

pwendell are you up for trying this for the 1.5.0 release?

Author: Sean Owen <sowen@cloudera.com>

Closes #7826 from srowen/SPARK-9507 and squashes the following commits:

e0b0fd2 [Sean Owen] Update to shade plugin 2.4.1, which removes the need for the dependency-reduced-POM workaround and the 'release' profile. Fix management of shade plugin version so children inherit it; bump assembly plugin version while here
---
 dev/create-release/create-release.sh |  4 ++--
 pom.xml                              | 33 +++++-----------------------
 2 files changed, 8 insertions(+), 29 deletions(-)

diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh
index 86a7a4068c40e..4311c8c9e4ca6 100755
--- a/dev/create-release/create-release.sh
+++ b/dev/create-release/create-release.sh
@@ -118,13 +118,13 @@ if [[ ! "$@" =~ --skip-publish ]]; then
 
   rm -rf $SPARK_REPO
 
-  build/mvn -DskipTests -Pyarn -Phive -Prelease\
+  build/mvn -DskipTests -Pyarn -Phive \
     -Phive-thriftserver -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
     clean install
 
   ./dev/change-scala-version.sh 2.11
   
-  build/mvn -DskipTests -Pyarn -Phive -Prelease\
+  build/mvn -DskipTests -Pyarn -Phive \
     -Dscala-2.11 -Phadoop-2.2 -Pspark-ganglia-lgpl -Pkinesis-asl \
     clean install
 
diff --git a/pom.xml b/pom.xml
index e351c7c19df96..1371a1b6bd9f1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -160,9 +160,6 @@
     <fasterxml.jackson.version>2.4.4</fasterxml.jackson.version>
     <snappy.version>1.1.1.7</snappy.version>
     <netlib.java.version>1.1.2</netlib.java.version>
-    <!-- For maven shade plugin (see SPARK-8819) -->
-    <create.dependency.reduced.pom>false</create.dependency.reduced.pom>
-
     <test.java.home>${java.home}</test.java.home>
 
     <!--
@@ -1376,7 +1373,12 @@
         <plugin>
           <groupId>org.apache.maven.plugins</groupId>
           <artifactId>maven-assembly-plugin</artifactId>
-          <version>2.5.3</version>
+          <version>2.5.5</version>
+        </plugin>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-shade-plugin</artifactId>
+          <version>2.4.1</version>
         </plugin>
         <plugin>
           <groupId>org.apache.maven.plugins</groupId>
@@ -1470,11 +1472,8 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-shade-plugin</artifactId>
-        <version>2.3</version>
         <configuration>
           <shadedArtifactAttached>false</shadedArtifactAttached>
-          <!-- Work around MSHADE-148. See SPARK-8819. -->
-          <createDependencyReducedPom>${create.dependency.reduced.pom}</createDependencyReducedPom>
           <artifactSet>
             <includes>
               <!-- At a minimum we must include this to force effective pom generation -->
@@ -1836,26 +1835,6 @@
       </properties>
     </profile>
 
-    <profile>
-      <!--
-          Use this profile only for making Spark releases. Note that due to SPARK-8819,
-          you must use maven version 3.2.x or before to avoid running into MSHADE-148.
-      -->
-      <id>release</id>
-      <properties>
-        <!--
-            The maven shade plugin has a bug where enabling the `createDependencyReducedPom`
-            property causes maven to go into an infinite loop (MSHADE-148). This is only an
-            issue for the Spark build if the maven version is 3.3.x or newer (SPARK-8819).
-
-            However, since disabling this property has the side effect of not resolving
-            variables in the released pom files (SPARK-8781), we need to enable this during
-            releases.
-        -->
-        <create.dependency.reduced.pom>true</create.dependency.reduced.pom>
-      </properties>
-    </profile>
-
     <!--
       These empty profiles are available in some sub-modules. Declare them here so that
       maven does not complain when they're provided on the command line for a sub-module

From 82f47b811607a1eeeecba437fe0ffc15d4e5f9ec Mon Sep 17 00:00:00 2001
From: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
Date: Fri, 31 Jul 2015 14:02:44 -0700
Subject: [PATCH 532/549] [SPARK-9510] [SPARKR] Remaining SparkR style fixes

With the change in this patch, I get no more warnings from `./dev/lint-r` in my machine

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes #7834 from shivaram/sparkr-style-fixes and squashes the following commits:

716cd8e [Shivaram Venkataraman] Remaining SparkR style fixes
---
 R/pkg/R/RDD.R                    | 6 +++---
 R/pkg/inst/tests/test_sparkSQL.R | 4 +++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index 2a013b3dbb968..051e441d4e063 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -1264,12 +1264,12 @@ setMethod("pipeRDD",
           signature(x = "RDD", command = "character"),
           function(x, command, env = list()) {
             func <- function(part) {
-              trim.trailing.func <- function(x) {
+              trim_trailing_func <- function(x) {
                 sub("[\r\n]*$", "", toString(x))
               }
-              input <- unlist(lapply(part, trim.trailing.func))
+              input <- unlist(lapply(part, trim_trailing_func))
               res <- system2(command, stdout = TRUE, input = input, env = env)
-              lapply(res, trim.trailing.func)
+              lapply(res, trim_trailing_func)
             }
             lapplyPartition(x, func)
           })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index aca41aa6dcf24..25f697314f803 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -128,7 +128,9 @@ test_that("create DataFrame from RDD", {
   expect_equal(dtypes(df2), list(c("name", "string"), c("age", "int"), c("height", "float")))
   expect_equal(collect(where(df2, df2$name == "Bob")), c("Bob", 16, 176.5))
 
-  localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18), height=c(164.10, 181.4, 173.7))
+  localDF <- data.frame(name=c("John", "Smith", "Sarah"),
+                        age=c(19, 23, 18),
+                        height=c(164.10, 181.4, 173.7))
   df <- createDataFrame(sqlContext, localDF, schema)
   expect_is(df, "DataFrame")
   expect_equal(count(df), 3)

From 710c2b5dd2dc6b8d947303ad8dfae4539b63fe11 Mon Sep 17 00:00:00 2001
From: Hossein <hossein@databricks.com>
Date: Fri, 31 Jul 2015 14:07:41 -0700
Subject: [PATCH 533/549] [SPARK-9324] [SPARK-9322] [SPARK-9321] [SPARKR] Some
 aliases for R-like functions in DataFrames

Adds following aliases:
* unique (distinct)
* rbind (unionAll): accepts many DataFrames
* nrow (count)
* ncol
* dim
* names (columns): along with the replacement function to change names

Author: Hossein <hossein@databricks.com>

Closes #7764 from falaki/sparkR-alias and squashes the following commits:

56016f5 [Hossein] Updated R documentation
5e4a4d0 [Hossein] Removed extra code
f51cbef [Hossein] Merge branch 'master' into sparkR-alias
c1b88bd [Hossein] Moved setGeneric and other comments applied
d9307f8 [Hossein] Added tests
b5aa988 [Hossein] Added dim, ncol, nrow, names, rbind, and unique functions to DataFrames
---
 R/pkg/NAMESPACE                  |  6 +++
 R/pkg/R/DataFrame.R              | 90 ++++++++++++++++++++++++++++++++
 R/pkg/R/generics.R               |  4 ++
 R/pkg/inst/tests/test_sparkSQL.R | 22 ++++++--
 4 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index a329e14f25aeb..ff116cb1fbde2 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -29,6 +29,7 @@ exportMethods("arrange",
               "count",
               "crosstab",
               "describe",
+              "dim",
               "distinct",
               "dropna",
               "dtypes",
@@ -45,11 +46,15 @@ exportMethods("arrange",
               "isLocal",
               "join",
               "limit",
+              "names",
+              "ncol",
+              "nrow",
               "orderBy",
               "mutate",
               "names",
               "persist",
               "printSchema",
+              "rbind",
               "registerTempTable",
               "rename",
               "repartition",
@@ -66,6 +71,7 @@ exportMethods("arrange",
               "summarize",
               "take",
               "unionAll",
+              "unique",
               "unpersist",
               "where",
               "withColumn",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index b31ad3729e09b..b4065d2944bdc 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -255,6 +255,16 @@ setMethod("names",
             columns(x)
           })
 
+#' @rdname columns
+setMethod("names<-",
+          signature(x = "DataFrame"),
+          function(x, value) {
+            if (!is.null(value)) {
+              sdf <- callJMethod(x@sdf, "toDF", listToSeq(as.list(value)))
+              dataFrame(sdf)
+            }
+          })
+
 #' Register Temporary Table
 #'
 #' Registers a DataFrame as a Temporary Table in the SQLContext
@@ -473,6 +483,18 @@ setMethod("distinct",
             dataFrame(sdf)
           })
 
+#' @title Distinct rows in a DataFrame
+#
+#' @description Returns a new DataFrame containing distinct rows in this DataFrame
+#'
+#' @rdname unique
+#' @aliases unique
+setMethod("unique",
+          signature(x = "DataFrame"),
+          function(x) {
+            distinct(x)
+          })
+
 #' Sample
 #'
 #' Return a sampled subset of this DataFrame using a random seed.
@@ -534,6 +556,58 @@ setMethod("count",
             callJMethod(x@sdf, "count")
           })
 
+#' @title Number of rows for a DataFrame
+#' @description Returns number of rows in a DataFrames
+#'
+#' @name nrow
+#'
+#' @rdname nrow
+#' @aliases count
+setMethod("nrow",
+          signature(x = "DataFrame"),
+          function(x) {
+            count(x)
+          })
+
+#' Returns the number of columns in a DataFrame
+#'
+#' @param x a SparkSQL DataFrame
+#'
+#' @rdname ncol
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlContext, path)
+#' ncol(df)
+#' }
+setMethod("ncol",
+          signature(x = "DataFrame"),
+          function(x) {
+            length(columns(x))
+          })
+
+#' Returns the dimentions (number of rows and columns) of a DataFrame
+#' @param x a SparkSQL DataFrame
+#'
+#' @rdname dim
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlContext, path)
+#' dim(df)
+#' }
+setMethod("dim",
+          signature(x = "DataFrame"),
+          function(x) {
+            c(count(x), ncol(x))
+          })
+
 #' Collects all the elements of a Spark DataFrame and coerces them into an R data.frame.
 #'
 #' @param x A SparkSQL DataFrame
@@ -1231,6 +1305,22 @@ setMethod("unionAll",
             dataFrame(unioned)
           })
 
+#' @title Union two or more DataFrames
+#
+#' @description Returns a new DataFrame containing rows of all parameters.
+#
+#' @rdname rbind
+#' @aliases unionAll
+setMethod("rbind",
+          signature(... = "DataFrame"),
+          function(x, ..., deparse.level = 1) {
+            if (nargs() == 3) {
+              unionAll(x, ...)
+            } else {
+              unionAll(x, Recall(..., deparse.level = 1))
+            }
+          })
+
 #' Intersect
 #'
 #' Return a new DataFrame containing rows only in both this DataFrame
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index a3a121058e165..71d1e348c4efb 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -669,3 +669,7 @@ setGeneric("upper", function(x) { standardGeneric("upper") })
 #' @rdname glm
 #' @export
 setGeneric("glm")
+
+#' @rdname rbind
+#' @export
+setGeneric("rbind", signature = "...")
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 25f697314f803..9faee8d59c3af 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -88,6 +88,9 @@ test_that("create DataFrame from RDD", {
   df <- createDataFrame(sqlContext, rdd, list("a", "b"))
   expect_is(df, "DataFrame")
   expect_equal(count(df), 10)
+  expect_equal(nrow(df), 10)
+  expect_equal(ncol(df), 2)
+  expect_equal(dim(df), c(10, 2))
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
 
@@ -491,7 +494,7 @@ test_that("head() and first() return the correct data", {
   expect_equal(nrow(testFirst), 1)
 })
 
-test_that("distinct() on DataFrames", {
+test_that("distinct() and unique on DataFrames", {
   lines <- c("{\"name\":\"Michael\"}",
              "{\"name\":\"Andy\", \"age\":30}",
              "{\"name\":\"Justin\", \"age\":19}",
@@ -503,6 +506,10 @@ test_that("distinct() on DataFrames", {
   uniques <- distinct(df)
   expect_is(uniques, "DataFrame")
   expect_equal(count(uniques), 3)
+
+  uniques2 <- unique(df)
+  expect_is(uniques2, "DataFrame")
+  expect_equal(count(uniques2), 3)
 })
 
 test_that("sample on a DataFrame", {
@@ -815,7 +822,7 @@ test_that("isLocal()", {
   expect_false(isLocal(df))
 })
 
-test_that("unionAll(), except(), and intersect() on a DataFrame", {
+test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
   df <- jsonFile(sqlContext, jsonPath)
 
   lines <- c("{\"name\":\"Bob\", \"age\":24}",
@@ -830,6 +837,11 @@ test_that("unionAll(), except(), and intersect() on a DataFrame", {
   expect_equal(count(unioned), 6)
   expect_equal(first(unioned)$name, "Michael")
 
+  unioned2 <- arrange(rbind(unioned, df, df2), df$age)
+  expect_is(unioned2, "DataFrame")
+  expect_equal(count(unioned2), 12)
+  expect_equal(first(unioned2)$name, "Michael")
+
   excepted <- arrange(except(df, df2), desc(df$age))
   expect_is(unioned, "DataFrame")
   expect_equal(count(excepted), 2)
@@ -853,7 +865,7 @@ test_that("withColumn() and withColumnRenamed()", {
   expect_equal(columns(newDF2)[1], "newerAge")
 })
 
-test_that("mutate() and rename()", {
+test_that("mutate(), rename() and names()", {
   df <- jsonFile(sqlContext, jsonPath)
   newDF <- mutate(df, newAge = df$age + 2)
   expect_equal(length(columns(newDF)), 3)
@@ -863,6 +875,10 @@ test_that("mutate() and rename()", {
   newDF2 <- rename(df, newerAge = df$age)
   expect_equal(length(columns(newDF2)), 2)
   expect_equal(columns(newDF2)[1], "newerAge")
+
+  names(newDF2) <- c("newerName", "evenNewerAge")
+  expect_equal(length(names(newDF2)), 2)
+  expect_equal(names(newDF2)[1], "newerName")
 })
 
 test_that("write.df() on DataFrame and works with parquetFile", {

From 3fc0cb92001798167a14c1377362a3335397dd4c Mon Sep 17 00:00:00 2001
From: Yin Huai <yhuai@databricks.com>
Date: Fri, 31 Jul 2015 14:13:06 -0700
Subject: [PATCH 534/549] [SPARK-9233] [SQL] Enable code-gen in window function
 unit tests

Since code-gen is enabled by default, it is better to run window function tests with code-gen.

https://issues.apache.org/jira/browse/SPARK-9233

Author: Yin Huai <yhuai@databricks.com>

Closes #7832 from yhuai/SPARK-9233 and squashes the following commits:

4e4e4cc [Yin Huai] style
ca80e07 [Yin Huai] Test window function with codegen.
---
 .../sql/catalyst/analysis/Analyzer.scala      |  9 ++++-
 .../HiveWindowFunctionQuerySuite.scala        | 38 +++----------------
 2 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 51d910b258647..f5daba1543da9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -853,8 +853,13 @@ class Analyzer(
       while (i < groupedWindowExpressions.size) {
         val ((partitionSpec, orderSpec), windowExpressions) = groupedWindowExpressions(i)
         // Set currentChild to the newly created Window operator.
-        currentChild = Window(currentChild.output, windowExpressions,
-          partitionSpec, orderSpec, currentChild)
+        currentChild =
+          Window(
+            currentChild.output,
+            windowExpressions,
+            partitionSpec,
+            orderSpec,
+            currentChild)
 
         // Move to next Window Spec.
         i += 1
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
index 24a758f53170a..92bb9e6d73af1 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
@@ -32,7 +32,7 @@ import org.apache.spark.util.Utils
  * for different tests and there are a few properties needed to let Hive generate golden
  * files, every `createQueryTest` calls should explicitly set `reset` to `false`.
  */
-abstract class HiveWindowFunctionQueryBaseSuite extends HiveComparisonTest with BeforeAndAfter {
+class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfter {
   private val originalTimeZone = TimeZone.getDefault
   private val originalLocale = Locale.getDefault
   private val testTempDir = Utils.createTempDir()
@@ -759,21 +759,7 @@ abstract class HiveWindowFunctionQueryBaseSuite extends HiveComparisonTest with
     """.stripMargin, reset = false)
 }
 
-class HiveWindowFunctionQueryWithoutCodeGenSuite extends HiveWindowFunctionQueryBaseSuite {
-  var originalCodegenEnabled: Boolean = _
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    originalCodegenEnabled = conf.codegenEnabled
-    sql("set spark.sql.codegen=false")
-  }
-
-  override def afterAll(): Unit = {
-    sql(s"set spark.sql.codegen=$originalCodegenEnabled")
-    super.afterAll()
-  }
-}
-
-abstract class HiveWindowFunctionQueryFileBaseSuite
+class HiveWindowFunctionQueryFileSuite
   extends HiveCompatibilitySuite with BeforeAndAfter {
   private val originalTimeZone = TimeZone.getDefault
   private val originalLocale = Locale.getDefault
@@ -789,11 +775,11 @@ abstract class HiveWindowFunctionQueryFileBaseSuite
     // The following settings are used for generating golden files with Hive.
     // We have to use kryo to correctly let Hive serialize plans with window functions.
     // This is used to generate golden files.
-    sql("set hive.plan.serialization.format=kryo")
+    // sql("set hive.plan.serialization.format=kryo")
     // Explicitly set fs to local fs.
-    sql(s"set fs.default.name=file://$testTempDir/")
+    // sql(s"set fs.default.name=file://$testTempDir/")
     // Ask Hive to run jobs in-process as a single map and reduce task.
-    sql("set mapred.job.tracker=local")
+    // sql("set mapred.job.tracker=local")
   }
 
   override def afterAll() {
@@ -838,17 +824,3 @@ abstract class HiveWindowFunctionQueryFileBaseSuite
     case (name, _) => realWhiteList.contains(name)
   }
 }
-
-class HiveWindowFunctionQueryFileWithoutCodeGenSuite extends HiveWindowFunctionQueryFileBaseSuite {
-  var originalCodegenEnabled: Boolean = _
-  override def beforeAll(): Unit = {
-    super.beforeAll()
-    originalCodegenEnabled = conf.codegenEnabled
-    sql("set spark.sql.codegen=false")
-  }
-
-  override def afterAll(): Unit = {
-    sql(s"set spark.sql.codegen=$originalCodegenEnabled")
-    super.afterAll()
-  }
-}

From 4d5a6e7b60b315968973e2298eeee5eb174ec721 Mon Sep 17 00:00:00 2001
From: HuJiayin <jiayin.hu@intel.com>
Date: Fri, 31 Jul 2015 16:05:26 -0700
Subject: [PATCH 535/549]  [SPARK-8271][SQL]string function: soundex

This PR brings SQL function soundex(), see https://issues.apache.org/jira/browse/HIVE-9738

It's based on #7115 , thanks to HuJiayin

Author: HuJiayin <jiayin.hu@intel.com>
Author: Davies Liu <davies@databricks.com>

Closes #7812 from davies/soundex and squashes the following commits:

fa75941 [Davies Liu] Merge branch 'master' of github.com:apache/spark into soundex
a4bd6d8 [Davies Liu] fix soundex
2538908 [HuJiayin] add codegen soundex
d15d329 [HuJiayin] add back ut
ded1a14 [HuJiayin] Merge branch 'master' of https://github.com/apache/spark
e2dec2c [HuJiayin] support soundex rebase code
---
 python/pyspark/sql/functions.py               | 17 ++++++
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../expressions/stringOperations.scala        | 16 ++++++
 .../expressions/StringExpressionsSuite.scala  | 28 ++++++++++
 .../org/apache/spark/sql/functions.scala      |  8 +++
 .../spark/sql/StringFunctionsSuite.scala      |  9 ++++
 .../apache/spark/unsafe/types/UTF8String.java | 53 +++++++++++++++++++
 .../spark/unsafe/types/UTF8StringSuite.java   | 48 +++++++++++++++++
 8 files changed, 180 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 8024a8de07c98..bb9926ce8ca85 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -63,6 +63,8 @@
     'year', 'quarter', 'month', 'hour', 'minute', 'second',
     'dayofmonth', 'dayofyear', 'weekofyear']
 
+__all__ += ['soundex']
+
 
 def _create_function(name, doc=""):
     """ Create a function for aggregator by name"""
@@ -922,6 +924,7 @@ def trunc(date, format):
 def size(col):
     """
     Collection function: returns the length of the array or map stored in the column.
+
     :param col: name of column or expression
 
     >>> df = sqlContext.createDataFrame([([1, 2, 3],),([1],),([],)], ['data'])
@@ -932,6 +935,20 @@ def size(col):
     return Column(sc._jvm.functions.size(_to_java_column(col)))
 
 
+@since
+@ignore_unicode_prefix
+def soundex(col):
+    """
+    Returns the SoundEx encoding for a string
+
+    >>> df = sqlContext.createDataFrame([("Peters",),("Uhrbach",)], ['name'])
+    >>> df.select(soundex(df.name).alias("soundex")).collect()
+    [Row(soundex=u'P362'), Row(soundex=u'U612')]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.size(_to_java_column(col)))
+
+
 class UserDefinedFunction(object):
     """
     User defined function in Python
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 1bf7204a2515c..3f61a9af1f075 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -194,6 +194,7 @@ object FunctionRegistry {
     expression[StringRepeat]("repeat"),
     expression[StringReverse]("reverse"),
     expression[StringTrimRight]("rtrim"),
+    expression[SoundEx]("soundex"),
     expression[StringSpace]("space"),
     expression[StringSplit]("split"),
     expression[Substring]("substr"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 684eac12bd6f0..160e72f384a9b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -718,6 +718,22 @@ case class Levenshtein(left: Expression, right: Expression) extends BinaryExpres
   }
 }
 
+/**
+ * A function that return soundex code of the given string expression.
+ */
+case class SoundEx(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[DataType] = Seq(StringType)
+
+  override def nullSafeEval(input: Any): Any = input.asInstanceOf[UTF8String].soundex()
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, c => s"$c.soundex()")
+  }
+}
+
 /**
  * Returns the numeric value of the first character of str.
  */
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 3ecd0d374c46b..fb72fe1714153 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -347,6 +347,34 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // scalastyle:on
   }
 
+  test("soundex unit test") {
+    checkEvaluation(SoundEx(Literal("ZIN")), "Z500")
+    checkEvaluation(SoundEx(Literal("SU")), "S000")
+    checkEvaluation(SoundEx(Literal("")), "")
+    checkEvaluation(SoundEx(Literal.create(null, StringType)), null)
+
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
+    checkEvaluation(SoundEx(Literal("测试")), "测试")
+    checkEvaluation(SoundEx(Literal("Tschüss")), "T220")
+    // scalastyle:on
+    checkEvaluation(SoundEx(Literal("zZ")), "Z000", create_row("s8"))
+    checkEvaluation(SoundEx(Literal("RAGSSEEESSSVEEWE")), "R221")
+    checkEvaluation(SoundEx(Literal("Ashcraft")), "A261")
+    checkEvaluation(SoundEx(Literal("Aswcraft")), "A261")
+    checkEvaluation(SoundEx(Literal("Tymczak")), "T522")
+    checkEvaluation(SoundEx(Literal("Pfister")), "P236")
+    checkEvaluation(SoundEx(Literal("Miller")), "M460")
+    checkEvaluation(SoundEx(Literal("Peterson")), "P362")
+    checkEvaluation(SoundEx(Literal("Peters")), "P362")
+    checkEvaluation(SoundEx(Literal("Auerbach")), "A612")
+    checkEvaluation(SoundEx(Literal("Uhrbach")), "U612")
+    checkEvaluation(SoundEx(Literal("Moskowitz")), "M232")
+    checkEvaluation(SoundEx(Literal("Moskovitz")), "M213")
+    checkEvaluation(SoundEx(Literal("relyheewsgeessg")), "R422")
+    checkEvaluation(SoundEx(Literal("!!")), "!!")
+  }
+
   test("TRIM/LTRIM/RTRIM") {
     val s = 'a.string.at(0)
     checkEvaluation(StringTrim(Literal(" aa  ")), "aa", create_row(" abdef "))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5d82a5eadd94d..89ffa9c50d6c7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1902,6 +1902,14 @@ object functions {
     StringRepeat(str.expr, lit(n).expr)
   }
 
+  /**
+   * * Return the soundex code for the specified expression.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def soundex(e: Column): Column = SoundEx(e.expr)
+
   /**
    * Splits str around pattern (pattern is a regular expression).
    * NOTE: pattern is a string represent the regular expression.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 8e0ea76d15881..b7f073cccb6ac 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -142,6 +142,15 @@ class StringFunctionsSuite extends QueryTest {
       Row("aa123cc"))
   }
 
+  test("soundex function") {
+    val df = Seq(("MARY", "SU")).toDF("l", "r")
+    checkAnswer(
+      df.select(soundex($"l"), soundex($"r")), Row("M600", "S000"))
+
+    checkAnswer(
+      df.selectExpr("SoundEx(l)", "SoundEx(r)"), Row("M600", "S000"))
+  }
+
   test("string instr function") {
     val df = Seq(("aaads", "aa", "zz")).toDF("a", "b", "c")
 
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index c38953f65d7d7..9d4998fd48a38 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -680,4 +680,57 @@ public int hashCode() {
     }
     return result;
   }
+
+  /**
+   * Soundex mapping table
+   */
+  private static final byte[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '7',
+    '0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '7', '2', '0', '2'};
+
+  /**
+   * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names,
+   * but can also be used as a general purpose scheme to find word with similar phonemes.
+   * https://en.wikipedia.org/wiki/Soundex
+   */
+  public UTF8String soundex() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
+    }
+
+    byte b = getByte(0);
+    if ('a' <= b && b <= 'z') {
+      b -= 32;
+    } else if (b < 'A' || 'Z' < b) {
+      // first character must be a letter
+      return this;
+    }
+    byte sx[] = {'0', '0', '0', '0'};
+    sx[0] = b;
+    int sxi = 1;
+    int idx = b - 'A';
+    byte lastCode = US_ENGLISH_MAPPING[idx];
+
+    for (int i = 1; i < numBytes; i++) {
+      b = getByte(i);
+      if ('a' <= b && b <= 'z') {
+        b -= 32;
+      } else if (b < 'A' || 'Z' < b) {
+        // not a letter, skip it
+        lastCode = '0';
+        continue;
+      }
+      idx = b - 'A';
+      byte code = US_ENGLISH_MAPPING[idx];
+      if (code == '7') {
+        // ignore it
+      } else {
+        if (code != '0' && code != lastCode) {
+          sx[sxi++] = code;
+          if (sxi > 3) break;
+        }
+        lastCode = code;
+      }
+    }
+    return UTF8String.fromBytes(sx);
+  }
 }
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index f2cc19ca6b172..c565210872322 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -341,4 +341,52 @@ public void createBlankString() {
     assertEquals(fromString("   "), blankString(3));
     assertEquals(fromString(""), blankString(0));
   }
+
+  @Test
+  public void soundex() {
+    assertEquals(fromString("Robert").soundex(), fromString("R163"));
+    assertEquals(fromString("Rupert").soundex(), fromString("R163"));
+    assertEquals(fromString("Rubin").soundex(), fromString("R150"));
+    assertEquals(fromString("Ashcraft").soundex(), fromString("A261"));
+    assertEquals(fromString("Ashcroft").soundex(), fromString("A261"));
+    assertEquals(fromString("Burroughs").soundex(), fromString("B620"));
+    assertEquals(fromString("Burrows").soundex(), fromString("B620"));
+    assertEquals(fromString("Ekzampul").soundex(), fromString("E251"));
+    assertEquals(fromString("Example").soundex(), fromString("E251"));
+    assertEquals(fromString("Ellery").soundex(), fromString("E460"));
+    assertEquals(fromString("Euler").soundex(), fromString("E460"));
+    assertEquals(fromString("Ghosh").soundex(), fromString("G200"));
+    assertEquals(fromString("Gauss").soundex(), fromString("G200"));
+    assertEquals(fromString("Gutierrez").soundex(), fromString("G362"));
+    assertEquals(fromString("Heilbronn").soundex(), fromString("H416"));
+    assertEquals(fromString("Hilbert").soundex(), fromString("H416"));
+    assertEquals(fromString("Jackson").soundex(), fromString("J250"));
+    assertEquals(fromString("Kant").soundex(), fromString("K530"));
+    assertEquals(fromString("Knuth").soundex(), fromString("K530"));
+    assertEquals(fromString("Lee").soundex(), fromString("L000"));
+    assertEquals(fromString("Lukasiewicz").soundex(), fromString("L222"));
+    assertEquals(fromString("Lissajous").soundex(), fromString("L222"));
+    assertEquals(fromString("Ladd").soundex(), fromString("L300"));
+    assertEquals(fromString("Lloyd").soundex(), fromString("L300"));
+    assertEquals(fromString("Moses").soundex(), fromString("M220"));
+    assertEquals(fromString("O'Hara").soundex(), fromString("O600"));
+    assertEquals(fromString("Pfister").soundex(), fromString("P236"));
+    assertEquals(fromString("Rubin").soundex(), fromString("R150"));
+    assertEquals(fromString("Robert").soundex(), fromString("R163"));
+    assertEquals(fromString("Rupert").soundex(), fromString("R163"));
+    assertEquals(fromString("Soundex").soundex(), fromString("S532"));
+    assertEquals(fromString("Sownteks").soundex(), fromString("S532"));
+    assertEquals(fromString("Tymczak").soundex(), fromString("T522"));
+    assertEquals(fromString("VanDeusen").soundex(), fromString("V532"));
+    assertEquals(fromString("Washington").soundex(), fromString("W252"));
+    assertEquals(fromString("Wheaton").soundex(), fromString("W350"));
+
+    assertEquals(fromString("a").soundex(), fromString("A000"));
+    assertEquals(fromString("ab").soundex(), fromString("A100"));
+    assertEquals(fromString("abc").soundex(), fromString("A120"));
+    assertEquals(fromString("abcd").soundex(), fromString("A123"));
+    assertEquals(fromString("").soundex(), fromString(""));
+    assertEquals(fromString("123").soundex(), fromString("123"));
+    assertEquals(fromString("世界千世").soundex(), fromString("世界千世"));
+  }
 }

From f51fd6fbb4d9822502f98b312251e317d757bc3a Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Fri, 31 Jul 2015 18:36:22 -0700
Subject: [PATCH 536/549] [SPARK-8936] [MLLIB] OnlineLDA document-topic
 Dirichlet hyperparameter optimization

Adds `alpha` (document-topic Dirichlet parameter) hyperparameter optimization to `OnlineLDAOptimizer` following Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters. Also introduces a private `setSampleWithReplacement` to `OnlineLDAOptimizer` for unit testing purposes.

Author: Feynman Liang <fliang@databricks.com>

Closes #7836 from feynmanliang/SPARK-8936-alpha-optimize and squashes the following commits:

4bef484 [Feynman Liang] Documentation improvements
c3c6c1d [Feynman Liang] Fix docs
151e859 [Feynman Liang] Fix style
fa77518 [Feynman Liang] Hyperparameter optimization
---
 .../spark/mllib/clustering/LDAOptimizer.scala | 75 ++++++++++++++++---
 .../spark/mllib/clustering/LDASuite.scala     | 34 +++++++++
 2 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index d6f8b29a43dfd..b0e14cb8296a6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -19,8 +19,8 @@ package org.apache.spark.mllib.clustering
 
 import java.util.Random
 
-import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum}
-import breeze.numerics.{abs, exp}
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, all, normalize, sum}
+import breeze.numerics.{trigamma, abs, exp}
 import breeze.stats.distributions.{Gamma, RandBasis}
 
 import org.apache.spark.annotation.DeveloperApi
@@ -239,22 +239,26 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   /** alias for docConcentration */
   private var alpha: Vector = Vectors.dense(0)
 
-  /** (private[clustering] for debugging)  Get docConcentration */
+  /** (for debugging)  Get docConcentration */
   private[clustering] def getAlpha: Vector = alpha
 
   /** alias for topicConcentration */
   private var eta: Double = 0
 
-  /** (private[clustering] for debugging)  Get topicConcentration */
+  /** (for debugging)  Get topicConcentration */
   private[clustering] def getEta: Double = eta
 
   private var randomGenerator: java.util.Random = null
 
+  /** (for debugging) Whether to sample mini-batches with replacement. (default = true) */
+  private var sampleWithReplacement: Boolean = true
+
   // Online LDA specific parameters
   // Learning rate is: (tau0 + t)^{-kappa}
   private var tau0: Double = 1024
   private var kappa: Double = 0.51
   private var miniBatchFraction: Double = 0.05
+  private var optimizeAlpha: Boolean = false
 
   // internal data structure
   private var docs: RDD[(Long, Vector)] = null
@@ -262,7 +266,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   /** Dirichlet parameter for the posterior over topics */
   private var lambda: BDM[Double] = null
 
-  /** (private[clustering] for debugging) Get parameter for topics */
+  /** (for debugging) Get parameter for topics */
   private[clustering] def getLambda: BDM[Double] = lambda
 
   /** Current iteration (count of invocations of [[next()]]) */
@@ -325,7 +329,22 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   }
 
   /**
-   * (private[clustering])
+   * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution)
+   * will be optimized during training.
+   */
+  def getOptimzeAlpha: Boolean = this.optimizeAlpha
+
+  /**
+   * Sets whether to optimize alpha parameter during training.
+   *
+   * Default: false
+   */
+  def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = {
+    this.optimizeAlpha = optimizeAlpha
+    this
+  }
+
+  /**
    * Set the Dirichlet parameter for the posterior over topics.
    * This is only used for testing now. In the future, it can help support training stop/resume.
    */
@@ -335,7 +354,6 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   }
 
   /**
-   * (private[clustering])
    * Used for random initialization of the variational parameters.
    * Larger value produces values closer to 1.0.
    * This is only used for testing currently.
@@ -345,6 +363,15 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     this
   }
 
+  /**
+   * Sets whether to sample mini-batches with or without replacement. (default = true)
+   * This is only used for testing currently.
+   */
+  private[clustering] def setSampleWithReplacement(replace: Boolean): this.type = {
+    this.sampleWithReplacement = replace
+    this
+  }
+
   override private[clustering] def initialize(
       docs: RDD[(Long, Vector)],
       lda: LDA): OnlineLDAOptimizer = {
@@ -376,7 +403,8 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   }
 
   override private[clustering] def next(): OnlineLDAOptimizer = {
-    val batch = docs.sample(withReplacement = true, miniBatchFraction, randomGenerator.nextLong())
+    val batch = docs.sample(withReplacement = sampleWithReplacement, miniBatchFraction,
+      randomGenerator.nextLong())
     if (batch.isEmpty()) return this
     submitMiniBatch(batch)
   }
@@ -418,6 +446,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
 
     // Note that this is an optimization to avoid batch.count
     updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt)
+    if (optimizeAlpha) updateAlpha(gammat)
     this
   }
 
@@ -433,13 +462,39 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
       weight * (stat * (corpusSize.toDouble / batchSize.toDouble) + eta)
   }
 
-  /** Calculates learning rate rho, which decays as a function of [[iteration]] */
+  /**
+   * Update alpha based on `gammat`, the inferred topic distributions for documents in the
+   * current mini-batch. Uses Newton-Rhapson method.
+   * @see Section 3.3, Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters
+   *      (http://jonathan-huang.org/research/dirichlet/dirichlet.pdf)
+   */
+  private def updateAlpha(gammat: BDM[Double]): Unit = {
+    val weight = rho()
+    val N = gammat.rows.toDouble
+    val alpha = this.alpha.toBreeze.toDenseVector
+    val logphat: BDM[Double] = sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)) / N
+    val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat.toDenseVector)
+
+    val c = N * trigamma(sum(alpha))
+    val q = -N * trigamma(alpha)
+    val b = sum(gradf / q) / (1D / c + sum(1D / q))
+
+    val dalpha = -(gradf - b) / q
+
+    if (all((weight * dalpha + alpha) :> 0D)) {
+      alpha :+= weight * dalpha
+      this.alpha = Vectors.dense(alpha.toArray)
+    }
+  }
+
+
+  /** Calculate learning rate rho for the current [[iteration]]. */
   private def rho(): Double = {
     math.pow(getTau0 + this.iteration, -getKappa)
   }
 
   /**
-   * Get a random matrix to initialize lambda
+   * Get a random matrix to initialize lambda.
    */
   private def getGammaMatrix(row: Int, col: Int): BDM[Double] = {
     val randBasis = new RandBasis(new org.apache.commons.math3.random.MersenneTwister(
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index f2b94707fd0ff..fdc2554ab853e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -400,6 +400,40 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
+  test("OnlineLDAOptimizer alpha hyperparameter optimization") {
+    val k = 2
+    val docs = sc.parallelize(toyData)
+    val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
+      .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false)
+    val lda = new LDA().setK(k)
+      .setDocConcentration(1D / k)
+      .setTopicConcentration(0.01)
+      .setMaxIterations(100)
+      .setOptimizer(op)
+      .setSeed(12345)
+    val ldaModel: LocalLDAModel = lda.run(docs).asInstanceOf[LocalLDAModel]
+
+    /* Verify the results with gensim:
+      import numpy as np
+      from gensim import models
+      corpus = [
+       [(0, 1.0), (1, 1.0)],
+       [(1, 1.0), (2, 1.0)],
+       [(0, 1.0), (2, 1.0)],
+       [(3, 1.0), (4, 1.0)],
+       [(3, 1.0), (5, 1.0)],
+       [(4, 1.0), (5, 1.0)]]
+      np.random.seed(2345)
+      lda = models.ldamodel.LdaModel(
+         corpus=corpus, alpha='auto', eta=0.01, num_topics=2, update_every=0, passes=100,
+         decay=0.51, offset=1024)
+      print(lda.alpha)
+      > [ 0.42582646  0.43511073]
+     */
+
+    assert(ldaModel.docConcentration ~== Vectors.dense(0.42582646, 0.43511073) absTol 0.05)
+  }
+
   test("model save/load") {
     // Test for LocalLDAModel.
     val localModel = new LocalLDAModel(tinyTopics,

From 8cb415a4b9bc1f82127ccce4a5579d433f4e8f83 Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 31 Jul 2015 19:19:27 -0700
Subject: [PATCH 537/549] [SPARK-9451] [SQL] Support entries larger than
 default page size in BytesToBytesMap & integrate with ShuffleMemoryManager

This patch adds support for entries larger than the default page size in BytesToBytesMap.  These large rows are handled by allocating special overflow pages to hold individual entries.

In addition, this patch integrates BytesToBytesMap with the ShuffleMemoryManager:

- Move BytesToBytesMap from `unsafe` to `core` so that it can import `ShuffleMemoryManager`.
- Before allocating new data pages, ask the ShuffleMemoryManager to reserve the memory:
  - `putNewKey()` now returns a boolean to indicate whether the insert succeeded or failed due to a lack of memory.  The caller can use this value to respond to the memory pressure (e.g. by spilling).
- `UnsafeFixedWidthAggregationMap. getAggregationBuffer()` now returns `null` to signal failure due to a lack of memory.
- Updated all uses of these classes to handle these error conditions.
- Added new tests for allocating large records and for allocations which fail due to memory pressure.
- Extended the `afterAll()` test teardown methods to detect ShuffleMemoryManager leaks.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #7762 from JoshRosen/large-rows and squashes the following commits:

ae7bc56 [Josh Rosen] Fix compilation
82fc657 [Josh Rosen] Merge remote-tracking branch 'origin/master' into large-rows
34ab943 [Josh Rosen] Remove semi
31a525a [Josh Rosen] Integrate BytesToBytesMap with ShuffleMemoryManager.
626b33c [Josh Rosen] Move code to sql/core and spark/core packages so that ShuffleMemoryManager can be integrated
ec4484c [Josh Rosen] Move BytesToBytesMap from unsafe package to core.
642ed69 [Josh Rosen] Rename size to numElements
bea1152 [Josh Rosen] Add basic test.
2cd3570 [Josh Rosen] Remove accidental duplicated code
07ff9ef [Josh Rosen] Basic support for large rows in BytesToBytesMap.
---
 .../spark/unsafe/map/BytesToBytesMap.java     | 170 ++++++++++-----
 .../unsafe/map/HashMapGrowthStrategy.java     |   0
 .../spark/shuffle/ShuffleMemoryManager.scala  |   8 +-
 .../map/AbstractBytesToBytesMapSuite.java     | 204 ++++++++++++++----
 .../map/BytesToBytesMapOffHeapSuite.java      |   0
 .../map/BytesToBytesMapOnHeapSuite.java       |   0
 .../UnsafeFixedWidthAggregationMap.java       |  25 ++-
 .../sql/execution/GeneratedAggregate.scala    |   6 +
 .../sql/execution/joins/HashedRelation.scala  |  27 ++-
 .../UnsafeFixedWidthAggregationMapSuite.scala |  36 ++--
 .../spark/unsafe/memory/MemoryBlock.java      |   2 +-
 .../unsafe/memory/TaskMemoryManager.java      |   1 +
 12 files changed, 353 insertions(+), 126 deletions(-)
 rename {unsafe => core}/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java (78%)
 rename {unsafe => core}/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java (100%)
 rename {unsafe => core}/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java (64%)
 rename {unsafe => core}/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOffHeapSuite.java (100%)
 rename {unsafe => core}/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOnHeapSuite.java (100%)
 rename sql/{catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions => core/src/main/java/org/apache/spark/sql/execution}/UnsafeFixedWidthAggregationMap.java (89%)
 rename sql/{catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions => core/src/test/scala/org/apache/spark/sql/execution}/UnsafeFixedWidthAggregationMapSuite.scala (78%)

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
similarity index 78%
rename from unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
rename to core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
index 198e0684f32f8..0f42950e6ed8b 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
+++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
@@ -17,6 +17,7 @@
 
 package org.apache.spark.unsafe.map;
 
+import java.io.IOException;
 import java.lang.Override;
 import java.lang.UnsupportedOperationException;
 import java.util.Iterator;
@@ -24,7 +25,10 @@
 import java.util.List;
 
 import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
+import org.apache.spark.shuffle.ShuffleMemoryManager;
 import org.apache.spark.unsafe.*;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.array.LongArray;
@@ -45,6 +49,8 @@
  */
 public final class BytesToBytesMap {
 
+  private final Logger logger = LoggerFactory.getLogger(BytesToBytesMap.class);
+
   private static final Murmur3_x86_32 HASHER = new Murmur3_x86_32(0);
 
   private static final HashMapGrowthStrategy growthStrategy = HashMapGrowthStrategy.DOUBLING;
@@ -54,7 +60,9 @@ public final class BytesToBytesMap {
    */
   private static final int END_OF_PAGE_MARKER = -1;
 
-  private final TaskMemoryManager memoryManager;
+  private final TaskMemoryManager taskMemoryManager;
+
+  private final ShuffleMemoryManager shuffleMemoryManager;
 
   /**
    * A linked list for tracking all allocated data pages so that we can free all of our memory.
@@ -120,7 +128,7 @@ public final class BytesToBytesMap {
   /**
    * Number of keys defined in the map.
    */
-  private int size;
+  private int numElements;
 
   /**
    * The map will be expanded once the number of keys exceeds this threshold.
@@ -150,12 +158,14 @@ public final class BytesToBytesMap {
   private long numHashCollisions = 0;
 
   public BytesToBytesMap(
-      TaskMemoryManager memoryManager,
+      TaskMemoryManager taskMemoryManager,
+      ShuffleMemoryManager shuffleMemoryManager,
       int initialCapacity,
       double loadFactor,
       long pageSizeBytes,
       boolean enablePerfMetrics) {
-    this.memoryManager = memoryManager;
+    this.taskMemoryManager = taskMemoryManager;
+    this.shuffleMemoryManager = shuffleMemoryManager;
     this.loadFactor = loadFactor;
     this.loc = new Location();
     this.pageSizeBytes = pageSizeBytes;
@@ -175,24 +185,32 @@ public BytesToBytesMap(
   }
 
   public BytesToBytesMap(
-      TaskMemoryManager memoryManager,
+      TaskMemoryManager taskMemoryManager,
+      ShuffleMemoryManager shuffleMemoryManager,
       int initialCapacity,
       long pageSizeBytes) {
-    this(memoryManager, initialCapacity, 0.70, pageSizeBytes, false);
+    this(taskMemoryManager, shuffleMemoryManager, initialCapacity, 0.70, pageSizeBytes, false);
   }
 
   public BytesToBytesMap(
-      TaskMemoryManager memoryManager,
+      TaskMemoryManager taskMemoryManager,
+      ShuffleMemoryManager shuffleMemoryManager,
       int initialCapacity,
       long pageSizeBytes,
       boolean enablePerfMetrics) {
-    this(memoryManager, initialCapacity, 0.70, pageSizeBytes, enablePerfMetrics);
+    this(
+      taskMemoryManager,
+      shuffleMemoryManager,
+      initialCapacity,
+      0.70,
+      pageSizeBytes,
+      enablePerfMetrics);
   }
 
   /**
    * Returns the number of keys defined in the map.
    */
-  public int size() { return size; }
+  public int numElements() { return numElements; }
 
   private static final class BytesToBytesMapIterator implements Iterator<Location> {
 
@@ -252,7 +270,7 @@ public void remove() {
    * `lookup()`, the behavior of the returned iterator is undefined.
    */
   public Iterator<Location> iterator() {
-    return new BytesToBytesMapIterator(size, dataPages.iterator(), loc);
+    return new BytesToBytesMapIterator(numElements, dataPages.iterator(), loc);
   }
 
   /**
@@ -330,7 +348,8 @@ public final class Location {
 
     private void updateAddressesAndSizes(long fullKeyAddress) {
       updateAddressesAndSizes(
-        memoryManager.getPage(fullKeyAddress), memoryManager.getOffsetInPage(fullKeyAddress));
+        taskMemoryManager.getPage(fullKeyAddress),
+        taskMemoryManager.getOffsetInPage(fullKeyAddress));
     }
 
     private void updateAddressesAndSizes(Object page, long keyOffsetInPage) {
@@ -411,7 +430,8 @@ public int getValueLength() {
     /**
      * Store a new key and value. This method may only be called once for a given key; if you want
      * to update the value associated with a key, then you can directly manipulate the bytes stored
-     * at the value address.
+     * at the value address. The return value indicates whether the put succeeded or whether it
+     * failed because additional memory could not be acquired.
      * <p>
      * It is only valid to call this method immediately after calling `lookup()` using the same key.
      * </p>
@@ -428,14 +448,19 @@ public int getValueLength() {
      * <pre>
      *   Location loc = map.lookup(keyBaseObject, keyBaseOffset, keyLengthInBytes);
      *   if (!loc.isDefined()) {
-     *     loc.putNewKey(keyBaseObject, keyBaseOffset, keyLengthInBytes, ...)
+     *     if (!loc.putNewKey(keyBaseObject, keyBaseOffset, keyLengthInBytes, ...)) {
+     *       // handle failure to grow map (by spilling, for example)
+     *     }
      *   }
      * </pre>
      * <p>
      * Unspecified behavior if the key is not defined.
      * </p>
+     *
+     * @return true if the put() was successful and false if the put() failed because memory could
+     *         not be acquired.
      */
-    public void putNewKey(
+    public boolean putNewKey(
         Object keyBaseObject,
         long keyBaseOffset,
         int keyLengthBytes,
@@ -445,63 +470,110 @@ public void putNewKey(
       assert (!isDefined) : "Can only set value once for a key";
       assert (keyLengthBytes % 8 == 0);
       assert (valueLengthBytes % 8 == 0);
-      if (size == MAX_CAPACITY) {
+      if (numElements == MAX_CAPACITY) {
         throw new IllegalStateException("BytesToBytesMap has reached maximum capacity");
       }
+
       // Here, we'll copy the data into our data pages. Because we only store a relative offset from
       // the key address instead of storing the absolute address of the value, the key and value
       // must be stored in the same memory page.
       // (8 byte key length) (key) (8 byte value length) (value)
       final long requiredSize = 8 + keyLengthBytes + 8 + valueLengthBytes;
-      assert (requiredSize <= pageSizeBytes - 8); // Reserve 8 bytes for the end-of-page marker.
-      size++;
-      bitset.set(pos);
 
-      // If there's not enough space in the current page, allocate a new page (8 bytes are reserved
-      // for the end-of-page marker).
-      if (currentDataPage == null || pageSizeBytes - 8 - pageCursor < requiredSize) {
+      // --- Figure out where to insert the new record ---------------------------------------------
+
+      final MemoryBlock dataPage;
+      final Object dataPageBaseObject;
+      final long dataPageInsertOffset;
+      boolean useOverflowPage = requiredSize > pageSizeBytes - 8;
+      if (useOverflowPage) {
+        // The record is larger than the page size, so allocate a special overflow page just to hold
+        // that record.
+        final long memoryRequested = requiredSize + 8;
+        final long memoryGranted = shuffleMemoryManager.tryToAcquire(memoryRequested);
+        if (memoryGranted != memoryRequested) {
+          shuffleMemoryManager.release(memoryGranted);
+          logger.debug("Failed to acquire {} bytes of memory", memoryRequested);
+          return false;
+        }
+        MemoryBlock overflowPage = taskMemoryManager.allocatePage(memoryRequested);
+        dataPages.add(overflowPage);
+        dataPage = overflowPage;
+        dataPageBaseObject = overflowPage.getBaseObject();
+        dataPageInsertOffset = overflowPage.getBaseOffset();
+      } else if (currentDataPage == null || pageSizeBytes - 8 - pageCursor < requiredSize) {
+        // The record can fit in a data page, but either we have not allocated any pages yet or
+        // the current page does not have enough space.
         if (currentDataPage != null) {
           // There wasn't enough space in the current page, so write an end-of-page marker:
           final Object pageBaseObject = currentDataPage.getBaseObject();
           final long lengthOffsetInPage = currentDataPage.getBaseOffset() + pageCursor;
           PlatformDependent.UNSAFE.putLong(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER);
         }
-        MemoryBlock newPage = memoryManager.allocatePage(pageSizeBytes);
+        final long memoryGranted = shuffleMemoryManager.tryToAcquire(pageSizeBytes);
+        if (memoryGranted != pageSizeBytes) {
+          shuffleMemoryManager.release(memoryGranted);
+          logger.debug("Failed to acquire {} bytes of memory", pageSizeBytes);
+          return false;
+        }
+        MemoryBlock newPage = taskMemoryManager.allocatePage(pageSizeBytes);
         dataPages.add(newPage);
         pageCursor = 0;
         currentDataPage = newPage;
+        dataPage = currentDataPage;
+        dataPageBaseObject = currentDataPage.getBaseObject();
+        dataPageInsertOffset = currentDataPage.getBaseOffset();
+      } else {
+        // There is enough space in the current data page.
+        dataPage = currentDataPage;
+        dataPageBaseObject = currentDataPage.getBaseObject();
+        dataPageInsertOffset = currentDataPage.getBaseOffset() + pageCursor;
       }
 
+      // --- Append the key and value data to the current data page --------------------------------
+
+      long insertCursor = dataPageInsertOffset;
+
       // Compute all of our offsets up-front:
-      final Object pageBaseObject = currentDataPage.getBaseObject();
-      final long pageBaseOffset = currentDataPage.getBaseOffset();
-      final long keySizeOffsetInPage = pageBaseOffset + pageCursor;
-      pageCursor += 8; // word used to store the key size
-      final long keyDataOffsetInPage = pageBaseOffset + pageCursor;
-      pageCursor += keyLengthBytes;
-      final long valueSizeOffsetInPage = pageBaseOffset + pageCursor;
-      pageCursor += 8; // word used to store the value size
-      final long valueDataOffsetInPage = pageBaseOffset + pageCursor;
-      pageCursor += valueLengthBytes;
+      final long keySizeOffsetInPage = insertCursor;
+      insertCursor += 8; // word used to store the key size
+      final long keyDataOffsetInPage = insertCursor;
+      insertCursor += keyLengthBytes;
+      final long valueSizeOffsetInPage = insertCursor;
+      insertCursor += 8; // word used to store the value size
+      final long valueDataOffsetInPage = insertCursor;
+      insertCursor += valueLengthBytes; // word used to store the value size
 
       // Copy the key
-      PlatformDependent.UNSAFE.putLong(pageBaseObject, keySizeOffsetInPage, keyLengthBytes);
+      PlatformDependent.UNSAFE.putLong(dataPageBaseObject, keySizeOffsetInPage, keyLengthBytes);
       PlatformDependent.copyMemory(
-        keyBaseObject, keyBaseOffset, pageBaseObject, keyDataOffsetInPage, keyLengthBytes);
+        keyBaseObject, keyBaseOffset, dataPageBaseObject, keyDataOffsetInPage, keyLengthBytes);
       // Copy the value
-      PlatformDependent.UNSAFE.putLong(pageBaseObject, valueSizeOffsetInPage, valueLengthBytes);
-      PlatformDependent.copyMemory(
-        valueBaseObject, valueBaseOffset, pageBaseObject, valueDataOffsetInPage, valueLengthBytes);
+      PlatformDependent.UNSAFE.putLong(dataPageBaseObject, valueSizeOffsetInPage, valueLengthBytes);
+      PlatformDependent.copyMemory(valueBaseObject, valueBaseOffset, dataPageBaseObject,
+        valueDataOffsetInPage, valueLengthBytes);
+
+      // --- Update bookeeping data structures -----------------------------------------------------
+
+      if (useOverflowPage) {
+        // Store the end-of-page marker at the end of the data page
+        PlatformDependent.UNSAFE.putLong(dataPageBaseObject, insertCursor, END_OF_PAGE_MARKER);
+      } else {
+        pageCursor += requiredSize;
+      }
 
-      final long storedKeyAddress = memoryManager.encodePageNumberAndOffset(
-        currentDataPage, keySizeOffsetInPage);
+      numElements++;
+      bitset.set(pos);
+      final long storedKeyAddress = taskMemoryManager.encodePageNumberAndOffset(
+        dataPage, keySizeOffsetInPage);
       longArray.set(pos * 2, storedKeyAddress);
       longArray.set(pos * 2 + 1, keyHashcode);
       updateAddressesAndSizes(storedKeyAddress);
       isDefined = true;
-      if (size > growthThreshold && longArray.size() < MAX_CAPACITY) {
+      if (numElements > growthThreshold && longArray.size() < MAX_CAPACITY) {
         growAndRehash();
       }
+      return true;
     }
   }
 
@@ -516,7 +588,7 @@ private void allocate(int capacity) {
     // The capacity needs to be divisible by 64 so that our bit set can be sized properly
     capacity = Math.max((int) Math.min(MAX_CAPACITY, nextPowerOf2(capacity)), 64);
     assert (capacity <= MAX_CAPACITY);
-    longArray = new LongArray(memoryManager.allocate(capacity * 8L * 2));
+    longArray = new LongArray(MemoryBlock.fromLongArray(new long[capacity * 2]));
     bitset = new BitSet(MemoryBlock.fromLongArray(new long[capacity / 64]));
 
     this.growthThreshold = (int) (capacity * loadFactor);
@@ -530,18 +602,14 @@ private void allocate(int capacity) {
    * This method is idempotent.
    */
   public void free() {
-    if (longArray != null) {
-      memoryManager.free(longArray.memoryBlock());
-      longArray = null;
-    }
-    if (bitset != null) {
-      // The bitset's heap memory isn't managed by a memory manager, so no need to free it here.
-      bitset = null;
-    }
+    longArray = null;
+    bitset = null;
     Iterator<MemoryBlock> dataPagesIterator = dataPages.iterator();
     while (dataPagesIterator.hasNext()) {
-      memoryManager.freePage(dataPagesIterator.next());
+      MemoryBlock dataPage = dataPagesIterator.next();
       dataPagesIterator.remove();
+      taskMemoryManager.freePage(dataPage);
+      shuffleMemoryManager.release(dataPage.size());
     }
     assert(dataPages.isEmpty());
   }
@@ -628,8 +696,6 @@ void growAndRehash() {
       }
     }
 
-    // Deallocate the old data structures.
-    memoryManager.free(oldLongArray.memoryBlock());
     if (enablePerfMetrics) {
       timeSpentResizingNs += System.nanoTime() - resizeStartTime;
     }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java b/core/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java
similarity index 100%
rename from unsafe/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java
rename to core/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java
diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala
index f038b722957b8..00c1e078a441c 100644
--- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala
+++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala
@@ -85,7 +85,7 @@ private[spark] class ShuffleMemoryManager(maxMemory: Long) extends Logging {
           return toGrant
         } else {
           logInfo(
-            s"Thread $taskAttemptId waiting for at least 1/2N of shuffle memory pool to be free")
+            s"TID $taskAttemptId waiting for at least 1/2N of shuffle memory pool to be free")
           wait()
         }
       } else {
@@ -116,6 +116,12 @@ private[spark] class ShuffleMemoryManager(maxMemory: Long) extends Logging {
     taskMemory.remove(taskAttemptId)
     notifyAll()  // Notify waiters who locked "this" in tryToAcquire that memory has been freed
   }
+
+  /** Returns the memory consumption, in bytes, for the current task */
+  def getMemoryConsumptionForThisTask(): Long = synchronized {
+    val taskAttemptId = currentTaskAttemptId()
+    taskMemory.getOrElse(taskAttemptId, 0L)
+  }
 }
 
 private object ShuffleMemoryManager {
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
similarity index 64%
rename from unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
rename to core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
index 0be94ad371255..60f483acbcb80 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
+++ b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
@@ -21,15 +21,14 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.*;
 import org.mockito.invocation.InvocationOnMock;
 import org.mockito.stubbing.Answer;
+import static org.hamcrest.Matchers.greaterThan;
 import static org.mockito.AdditionalMatchers.geq;
 import static org.mockito.Mockito.*;
 
+import org.apache.spark.shuffle.ShuffleMemoryManager;
 import org.apache.spark.unsafe.array.ByteArrayMethods;
 import org.apache.spark.unsafe.memory.*;
 import org.apache.spark.unsafe.PlatformDependent;
@@ -41,32 +40,39 @@ public abstract class AbstractBytesToBytesMapSuite {
 
   private final Random rand = new Random(42);
 
-  private TaskMemoryManager memoryManager;
-  private TaskMemoryManager sizeLimitedMemoryManager;
+  private ShuffleMemoryManager shuffleMemoryManager;
+  private TaskMemoryManager taskMemoryManager;
+  private TaskMemoryManager sizeLimitedTaskMemoryManager;
   private final long PAGE_SIZE_BYTES = 1L << 26; // 64 megabytes
 
   @Before
   public void setup() {
-    memoryManager = new TaskMemoryManager(new ExecutorMemoryManager(getMemoryAllocator()));
+    shuffleMemoryManager = new ShuffleMemoryManager(Long.MAX_VALUE);
+    taskMemoryManager = new TaskMemoryManager(new ExecutorMemoryManager(getMemoryAllocator()));
     // Mocked memory manager for tests that check the maximum array size, since actually allocating
     // such large arrays will cause us to run out of memory in our tests.
-    sizeLimitedMemoryManager = spy(memoryManager);
-    when(sizeLimitedMemoryManager.allocate(geq(1L << 20))).thenAnswer(new Answer<MemoryBlock>() {
-      @Override
-      public MemoryBlock answer(InvocationOnMock invocation) throws Throwable {
-        if (((Long) invocation.getArguments()[0] / 8) > Integer.MAX_VALUE) {
-          throw new OutOfMemoryError("Requested array size exceeds VM limit");
+    sizeLimitedTaskMemoryManager = mock(TaskMemoryManager.class);
+    when(sizeLimitedTaskMemoryManager.allocate(geq(1L << 20))).thenAnswer(
+      new Answer<MemoryBlock>() {
+        @Override
+        public MemoryBlock answer(InvocationOnMock invocation) throws Throwable {
+          if (((Long) invocation.getArguments()[0] / 8) > Integer.MAX_VALUE) {
+            throw new OutOfMemoryError("Requested array size exceeds VM limit");
+          }
+          return new MemoryBlock(null, 0, (Long) invocation.getArguments()[0]);
         }
-        return memoryManager.allocate(1L << 20);
       }
-    });
+    );
   }
 
   @After
   public void tearDown() {
-    if (memoryManager != null) {
-      memoryManager.cleanUpAllAllocatedMemory();
-      memoryManager = null;
+    if (taskMemoryManager != null) {
+      long leakedShuffleMemory = shuffleMemoryManager.getMemoryConsumptionForThisTask();
+      Assert.assertEquals(0, taskMemoryManager.cleanUpAllAllocatedMemory());
+      Assert.assertEquals(0, leakedShuffleMemory);
+      shuffleMemoryManager = null;
+      taskMemoryManager = null;
     }
   }
 
@@ -85,7 +91,7 @@ private static byte[] getByteArray(MemoryLocation loc, int size) {
   }
 
   private byte[] getRandomByteArray(int numWords) {
-    Assert.assertTrue(numWords > 0);
+    Assert.assertTrue(numWords >= 0);
     final int lengthInBytes = numWords * 8;
     final byte[] bytes = new byte[lengthInBytes];
     rand.nextBytes(bytes);
@@ -111,9 +117,10 @@ private static boolean arrayEquals(
 
   @Test
   public void emptyMap() {
-    BytesToBytesMap map = new BytesToBytesMap(memoryManager, 64, PAGE_SIZE_BYTES);
+    BytesToBytesMap map = new BytesToBytesMap(
+      taskMemoryManager, shuffleMemoryManager, 64, PAGE_SIZE_BYTES);
     try {
-      Assert.assertEquals(0, map.size());
+      Assert.assertEquals(0, map.numElements());
       final int keyLengthInWords = 10;
       final int keyLengthInBytes = keyLengthInWords * 8;
       final byte[] key = getRandomByteArray(keyLengthInWords);
@@ -126,7 +133,8 @@ public void emptyMap() {
 
   @Test
   public void setAndRetrieveAKey() {
-    BytesToBytesMap map = new BytesToBytesMap(memoryManager, 64, PAGE_SIZE_BYTES);
+    BytesToBytesMap map = new BytesToBytesMap(
+      taskMemoryManager, shuffleMemoryManager, 64, PAGE_SIZE_BYTES);
     final int recordLengthWords = 10;
     final int recordLengthBytes = recordLengthWords * 8;
     final byte[] keyData = getRandomByteArray(recordLengthWords);
@@ -135,14 +143,14 @@ public void setAndRetrieveAKey() {
       final BytesToBytesMap.Location loc =
         map.lookup(keyData, BYTE_ARRAY_OFFSET, recordLengthBytes);
       Assert.assertFalse(loc.isDefined());
-      loc.putNewKey(
+      Assert.assertTrue(loc.putNewKey(
         keyData,
         BYTE_ARRAY_OFFSET,
         recordLengthBytes,
         valueData,
         BYTE_ARRAY_OFFSET,
         recordLengthBytes
-      );
+      ));
       // After storing the key and value, the other location methods should return results that
       // reflect the result of this store without us having to call lookup() again on the same key.
       Assert.assertEquals(recordLengthBytes, loc.getKeyLength());
@@ -158,14 +166,14 @@ public void setAndRetrieveAKey() {
       Assert.assertArrayEquals(valueData, getByteArray(loc.getValueAddress(), recordLengthBytes));
 
       try {
-        loc.putNewKey(
+        Assert.assertTrue(loc.putNewKey(
           keyData,
           BYTE_ARRAY_OFFSET,
           recordLengthBytes,
           valueData,
           BYTE_ARRAY_OFFSET,
           recordLengthBytes
-        );
+        ));
         Assert.fail("Should not be able to set a new value for a key");
       } catch (AssertionError e) {
         // Expected exception; do nothing.
@@ -178,7 +186,8 @@ public void setAndRetrieveAKey() {
   @Test
   public void iteratorTest() throws Exception {
     final int size = 4096;
-    BytesToBytesMap map = new BytesToBytesMap(memoryManager, size / 2, PAGE_SIZE_BYTES);
+    BytesToBytesMap map = new BytesToBytesMap(
+      taskMemoryManager, shuffleMemoryManager, size / 2, PAGE_SIZE_BYTES);
     try {
       for (long i = 0; i < size; i++) {
         final long[] value = new long[] { i };
@@ -187,23 +196,23 @@ public void iteratorTest() throws Exception {
         Assert.assertFalse(loc.isDefined());
         // Ensure that we store some zero-length keys
         if (i % 5 == 0) {
-          loc.putNewKey(
+          Assert.assertTrue(loc.putNewKey(
             null,
             PlatformDependent.LONG_ARRAY_OFFSET,
             0,
             value,
             PlatformDependent.LONG_ARRAY_OFFSET,
             8
-          );
+          ));
         } else {
-          loc.putNewKey(
+          Assert.assertTrue(loc.putNewKey(
             value,
             PlatformDependent.LONG_ARRAY_OFFSET,
             8,
             value,
             PlatformDependent.LONG_ARRAY_OFFSET,
             8
-          );
+          ));
         }
       }
       final java.util.BitSet valuesSeen = new java.util.BitSet(size);
@@ -236,7 +245,8 @@ public void iteratingOverDataPagesWithWastedSpace() throws Exception {
     final int NUM_ENTRIES = 1000 * 1000;
     final int KEY_LENGTH = 16;
     final int VALUE_LENGTH = 40;
-    final BytesToBytesMap map = new BytesToBytesMap(memoryManager, NUM_ENTRIES, PAGE_SIZE_BYTES);
+    final BytesToBytesMap map = new BytesToBytesMap(
+      taskMemoryManager, shuffleMemoryManager, NUM_ENTRIES, PAGE_SIZE_BYTES);
     // Each record will take 8 + 8 + 16 + 40 = 72 bytes of space in the data page. Our 64-megabyte
     // pages won't be evenly-divisible by records of this size, which will cause us to waste some
     // space at the end of the page. This is necessary in order for us to take the end-of-record
@@ -251,14 +261,14 @@ public void iteratingOverDataPagesWithWastedSpace() throws Exception {
           KEY_LENGTH
         );
         Assert.assertFalse(loc.isDefined());
-        loc.putNewKey(
+        Assert.assertTrue(loc.putNewKey(
           key,
           LONG_ARRAY_OFFSET,
           KEY_LENGTH,
           value,
           LONG_ARRAY_OFFSET,
           VALUE_LENGTH
-        );
+        ));
       }
       Assert.assertEquals(2, map.getNumDataPages());
 
@@ -305,7 +315,8 @@ public void randomizedStressTest() {
     // Java arrays' hashCodes() aren't based on the arrays' contents, so we need to wrap arrays
     // into ByteBuffers in order to use them as keys here.
     final Map<ByteBuffer, byte[]> expected = new HashMap<ByteBuffer, byte[]>();
-    final BytesToBytesMap map = new BytesToBytesMap(memoryManager, size, PAGE_SIZE_BYTES);
+    final BytesToBytesMap map = new BytesToBytesMap(
+      taskMemoryManager, shuffleMemoryManager, size, PAGE_SIZE_BYTES);
 
     try {
       // Fill the map to 90% full so that we can trigger probing
@@ -320,14 +331,14 @@ public void randomizedStressTest() {
             key.length
           );
           Assert.assertFalse(loc.isDefined());
-          loc.putNewKey(
+          Assert.assertTrue(loc.putNewKey(
             key,
             BYTE_ARRAY_OFFSET,
             key.length,
             value,
             BYTE_ARRAY_OFFSET,
             value.length
-          );
+          ));
           // After calling putNewKey, the following should be true, even before calling
           // lookup():
           Assert.assertTrue(loc.isDefined());
@@ -351,10 +362,102 @@ public void randomizedStressTest() {
     }
   }
 
+  @Test
+  public void randomizedTestWithRecordsLargerThanPageSize() {
+    final long pageSizeBytes = 128;
+    final BytesToBytesMap map = new BytesToBytesMap(
+      taskMemoryManager, shuffleMemoryManager, 64, pageSizeBytes);
+    // Java arrays' hashCodes() aren't based on the arrays' contents, so we need to wrap arrays
+    // into ByteBuffers in order to use them as keys here.
+    final Map<ByteBuffer, byte[]> expected = new HashMap<ByteBuffer, byte[]>();
+    try {
+      for (int i = 0; i < 1000; i++) {
+        final byte[] key = getRandomByteArray(rand.nextInt(128));
+        final byte[] value = getRandomByteArray(rand.nextInt(128));
+        if (!expected.containsKey(ByteBuffer.wrap(key))) {
+          expected.put(ByteBuffer.wrap(key), value);
+          final BytesToBytesMap.Location loc = map.lookup(
+            key,
+            BYTE_ARRAY_OFFSET,
+            key.length
+          );
+          Assert.assertFalse(loc.isDefined());
+          Assert.assertTrue(loc.putNewKey(
+            key,
+            BYTE_ARRAY_OFFSET,
+            key.length,
+            value,
+            BYTE_ARRAY_OFFSET,
+            value.length
+          ));
+          // After calling putNewKey, the following should be true, even before calling
+          // lookup():
+          Assert.assertTrue(loc.isDefined());
+          Assert.assertEquals(key.length, loc.getKeyLength());
+          Assert.assertEquals(value.length, loc.getValueLength());
+          Assert.assertTrue(arrayEquals(key, loc.getKeyAddress(), key.length));
+          Assert.assertTrue(arrayEquals(value, loc.getValueAddress(), value.length));
+        }
+      }
+      for (Map.Entry<ByteBuffer, byte[]> entry : expected.entrySet()) {
+        final byte[] key = entry.getKey().array();
+        final byte[] value = entry.getValue();
+        final BytesToBytesMap.Location loc = map.lookup(key, BYTE_ARRAY_OFFSET, key.length);
+        Assert.assertTrue(loc.isDefined());
+        Assert.assertTrue(arrayEquals(key, loc.getKeyAddress(), loc.getKeyLength()));
+        Assert.assertTrue(arrayEquals(value, loc.getValueAddress(), loc.getValueLength()));
+      }
+    } finally {
+      map.free();
+    }
+  }
+
+  @Test
+  public void failureToAllocateFirstPage() {
+    shuffleMemoryManager = new ShuffleMemoryManager(1024);
+    BytesToBytesMap map =
+      new BytesToBytesMap(taskMemoryManager, shuffleMemoryManager, 1, PAGE_SIZE_BYTES);
+    try {
+      final long[] emptyArray = new long[0];
+      final BytesToBytesMap.Location loc =
+        map.lookup(emptyArray, PlatformDependent.LONG_ARRAY_OFFSET, 0);
+      Assert.assertFalse(loc.isDefined());
+      Assert.assertFalse(loc.putNewKey(
+        emptyArray, LONG_ARRAY_OFFSET, 0,
+        emptyArray, LONG_ARRAY_OFFSET, 0
+      ));
+    } finally {
+      map.free();
+    }
+  }
+
+
+  @Test
+  public void failureToGrow() {
+    shuffleMemoryManager = new ShuffleMemoryManager(1024 * 10);
+    BytesToBytesMap map = new BytesToBytesMap(taskMemoryManager, shuffleMemoryManager, 1, 1024);
+    try {
+      boolean success = true;
+      int i;
+      for (i = 0; i < 1024; i++) {
+        final long[] arr = new long[]{i};
+        final BytesToBytesMap.Location loc = map.lookup(arr, PlatformDependent.LONG_ARRAY_OFFSET, 8);
+        success = loc.putNewKey(arr, LONG_ARRAY_OFFSET, 8, arr, LONG_ARRAY_OFFSET, 8);
+        if (!success) {
+          break;
+        }
+      }
+      Assert.assertThat(i, greaterThan(0));
+      Assert.assertFalse(success);
+    } finally {
+      map.free();
+    }
+  }
+
   @Test
   public void initialCapacityBoundsChecking() {
     try {
-      new BytesToBytesMap(sizeLimitedMemoryManager, 0, PAGE_SIZE_BYTES);
+      new BytesToBytesMap(sizeLimitedTaskMemoryManager, shuffleMemoryManager, 0, PAGE_SIZE_BYTES);
       Assert.fail("Expected IllegalArgumentException to be thrown");
     } catch (IllegalArgumentException e) {
       // expected exception
@@ -362,23 +465,34 @@ public void initialCapacityBoundsChecking() {
 
     try {
       new BytesToBytesMap(
-        sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY + 1, PAGE_SIZE_BYTES);
+        sizeLimitedTaskMemoryManager,
+        shuffleMemoryManager,
+        BytesToBytesMap.MAX_CAPACITY + 1,
+        PAGE_SIZE_BYTES);
       Assert.fail("Expected IllegalArgumentException to be thrown");
     } catch (IllegalArgumentException e) {
       // expected exception
     }
 
-   // Can allocate _at_ the max capacity
-    BytesToBytesMap map =
-      new BytesToBytesMap(sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY, PAGE_SIZE_BYTES);
-    map.free();
+    // Ignored because this can OOM now that we allocate the long array w/o a TaskMemoryManager
+    // Can allocate _at_ the max capacity
+    //    BytesToBytesMap map = new BytesToBytesMap(
+    //      sizeLimitedTaskMemoryManager,
+    //      shuffleMemoryManager,
+    //      BytesToBytesMap.MAX_CAPACITY,
+    //      PAGE_SIZE_BYTES);
+    //    map.free();
   }
 
-  @Test
+  // Ignored because this can OOM now that we allocate the long array w/o a TaskMemoryManager
+  @Ignore
   public void resizingLargeMap() {
     // As long as a map's capacity is below the max, we should be able to resize up to the max
     BytesToBytesMap map = new BytesToBytesMap(
-      sizeLimitedMemoryManager, BytesToBytesMap.MAX_CAPACITY - 64, PAGE_SIZE_BYTES);
+      sizeLimitedTaskMemoryManager,
+      shuffleMemoryManager,
+      BytesToBytesMap.MAX_CAPACITY - 64,
+      PAGE_SIZE_BYTES);
     map.growAndRehash();
     map.free();
   }
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOffHeapSuite.java b/core/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOffHeapSuite.java
similarity index 100%
rename from unsafe/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOffHeapSuite.java
rename to core/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOffHeapSuite.java
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOnHeapSuite.java b/core/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOnHeapSuite.java
similarity index 100%
rename from unsafe/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOnHeapSuite.java
rename to core/src/test/java/org/apache/spark/unsafe/map/BytesToBytesMapOnHeapSuite.java
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java
similarity index 89%
rename from sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
rename to sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java
index f3b462778dc10..66012e3c94d27 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java
@@ -15,11 +15,15 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.catalyst.expressions;
+package org.apache.spark.sql.execution;
 
+import java.io.IOException;
 import java.util.Iterator;
 
+import org.apache.spark.shuffle.ShuffleMemoryManager;
 import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.sql.types.DecimalType;
 import org.apache.spark.sql.types.StructField;
@@ -87,7 +91,9 @@ public static boolean supportsAggregationBufferSchema(StructType schema) {
    * @param emptyAggregationBuffer the default value for new keys (a "zero" of the agg. function)
    * @param aggregationBufferSchema the schema of the aggregation buffer, used for row conversion.
    * @param groupingKeySchema the schema of the grouping key, used for row conversion.
-   * @param memoryManager the memory manager used to allocate our Unsafe memory structures.
+   * @param taskMemoryManager the memory manager used to allocate our Unsafe memory structures.
+   * @param shuffleMemoryManager the shuffle memory manager, for coordinating our memory usage with
+   *                             other tasks.
    * @param initialCapacity the initial capacity of the map (a sizing hint to avoid re-hashing).
    * @param pageSizeBytes the data page size, in bytes; limits the maximum record size.
    * @param enablePerfMetrics if true, performance metrics will be recorded (has minor perf impact)
@@ -96,15 +102,16 @@ public UnsafeFixedWidthAggregationMap(
       InternalRow emptyAggregationBuffer,
       StructType aggregationBufferSchema,
       StructType groupingKeySchema,
-      TaskMemoryManager memoryManager,
+      TaskMemoryManager taskMemoryManager,
+      ShuffleMemoryManager shuffleMemoryManager,
       int initialCapacity,
       long pageSizeBytes,
       boolean enablePerfMetrics) {
     this.aggregationBufferSchema = aggregationBufferSchema;
     this.groupingKeyProjection = UnsafeProjection.create(groupingKeySchema);
     this.groupingKeySchema = groupingKeySchema;
-    this.map =
-      new BytesToBytesMap(memoryManager, initialCapacity, pageSizeBytes, enablePerfMetrics);
+    this.map = new BytesToBytesMap(
+      taskMemoryManager, shuffleMemoryManager, initialCapacity, pageSizeBytes, enablePerfMetrics);
     this.enablePerfMetrics = enablePerfMetrics;
 
     // Initialize the buffer for aggregation value
@@ -116,7 +123,8 @@ public UnsafeFixedWidthAggregationMap(
 
   /**
    * Return the aggregation buffer for the current group. For efficiency, all calls to this method
-   * return the same object.
+   * return the same object. If additional memory could not be allocated, then this method will
+   * signal an error by returning null.
    */
   public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
     final UnsafeRow unsafeGroupingKeyRow = this.groupingKeyProjection.apply(groupingKey);
@@ -129,7 +137,7 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
     if (!loc.isDefined()) {
       // This is the first time that we've seen this grouping key, so we'll insert a copy of the
       // empty aggregation buffer into the map:
-      loc.putNewKey(
+      boolean putSucceeded = loc.putNewKey(
         unsafeGroupingKeyRow.getBaseObject(),
         unsafeGroupingKeyRow.getBaseOffset(),
         unsafeGroupingKeyRow.getSizeInBytes(),
@@ -137,6 +145,9 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
         PlatformDependent.BYTE_ARRAY_OFFSET,
         emptyAggregationBuffer.length
       );
+      if (!putSucceeded) {
+        return null;
+      }
     }
 
     // Reset the pointer to point to the value that we just stored or looked up:
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index d851eae3fcc71..469de6ca8e101 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution
 
+import java.io.IOException
+
 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
@@ -266,6 +268,7 @@ case class GeneratedAggregate(
           aggregationBufferSchema,
           groupKeySchema,
           TaskContext.get.taskMemoryManager(),
+          SparkEnv.get.shuffleMemoryManager,
           1024 * 16, // initial capacity
           pageSizeBytes,
           false // disable tracking of performance metrics
@@ -275,6 +278,9 @@ case class GeneratedAggregate(
           val currentRow: InternalRow = iter.next()
           val groupKey: InternalRow = groupProjection(currentRow)
           val aggregationBuffer = aggregationMap.getAggregationBuffer(groupKey)
+          if (aggregationBuffer == null) {
+            throw new IOException("Could not allocate memory to grow aggregation buffer")
+          }
           updateProjection.target(aggregationBuffer)(joinedRow(aggregationBuffer, currentRow))
         }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index f88a45f48aee9..cc8bbfd2f8943 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -17,10 +17,11 @@
 
 package org.apache.spark.sql.execution.joins
 
-import java.io.{Externalizable, ObjectInput, ObjectOutput}
+import java.io.{IOException, Externalizable, ObjectInput, ObjectOutput}
 import java.nio.ByteOrder
 import java.util.{HashMap => JavaHashMap}
 
+import org.apache.spark.shuffle.ShuffleMemoryManager
 import org.apache.spark.{SparkConf, SparkEnv, TaskContext}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
@@ -28,6 +29,7 @@ import org.apache.spark.sql.execution.SparkSqlSerializer
 import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.map.BytesToBytesMap
 import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager}
+import org.apache.spark.util.Utils
 import org.apache.spark.util.collection.CompactBuffer
 
 
@@ -217,7 +219,7 @@ private[joins] final class UnsafeHashedRelation(
     }
   }
 
-  override def writeExternal(out: ObjectOutput): Unit = {
+  override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
     out.writeInt(hashTable.size())
 
     val iter = hashTable.entrySet().iterator()
@@ -256,16 +258,26 @@ private[joins] final class UnsafeHashedRelation(
     }
   }
 
-  override def readExternal(in: ObjectInput): Unit = {
+  override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
     val nKeys = in.readInt()
     // This is used in Broadcast, shared by multiple tasks, so we use on-heap memory
-    val memoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP))
+    val taskMemoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP))
+
+    // Dummy shuffle memory manager which always grants all memory allocation requests.
+    // We use this because it doesn't make sense count shared broadcast variables' memory usage
+    // towards individual tasks' quotas. In the future, we should devise a better way of handling
+    // this.
+    val shuffleMemoryManager = new ShuffleMemoryManager(new SparkConf()) {
+      override def tryToAcquire(numBytes: Long): Long = numBytes
+      override def release(numBytes: Long): Unit = {}
+    }
 
     val pageSizeBytes = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
       .getSizeAsBytes("spark.buffer.pageSize", "64m")
 
     binaryMap = new BytesToBytesMap(
-      memoryManager,
+      taskMemoryManager,
+      shuffleMemoryManager,
       nKeys * 2, // reduce hash collision
       pageSizeBytes)
 
@@ -287,8 +299,11 @@ private[joins] final class UnsafeHashedRelation(
       // put it into binary map
       val loc = binaryMap.lookup(keyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, keySize)
       assert(!loc.isDefined, "Duplicated key found!")
-      loc.putNewKey(keyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, keySize,
+      val putSuceeded = loc.putNewKey(keyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, keySize,
         valuesBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, valuesSize)
+      if (!putSuceeded) {
+        throw new IOException("Could not allocate memory to grow BytesToBytesMap")
+      }
       i += 1
     }
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala
similarity index 78%
rename from sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
rename to sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala
index c6b4c729de2f9..79fd52dacda52 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala
@@ -15,17 +15,18 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.catalyst.expressions
+package org.apache.spark.sql.execution
+
+import org.scalatest.{BeforeAndAfterEach, Matchers}
 
 import scala.collection.JavaConverters._
 import scala.util.Random
 
-import org.scalatest.{BeforeAndAfterEach, Matchers}
-
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.shuffle.ShuffleMemoryManager
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, TaskMemoryManager, MemoryAllocator}
+import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator, TaskMemoryManager}
 import org.apache.spark.unsafe.types.UTF8String
 
 
@@ -41,16 +42,20 @@ class UnsafeFixedWidthAggregationMapSuite
   private def emptyAggregationBuffer: InternalRow = InternalRow(0)
   private val PAGE_SIZE_BYTES: Long = 1L << 26; // 64 megabytes
 
-  private var memoryManager: TaskMemoryManager = null
+  private var taskMemoryManager: TaskMemoryManager = null
+  private var shuffleMemoryManager: ShuffleMemoryManager = null
 
   override def beforeEach(): Unit = {
-    memoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP))
+    taskMemoryManager = new TaskMemoryManager(new ExecutorMemoryManager(MemoryAllocator.HEAP))
+    shuffleMemoryManager = new ShuffleMemoryManager(Long.MaxValue)
   }
 
   override def afterEach(): Unit = {
-    if (memoryManager != null) {
-      memoryManager.cleanUpAllAllocatedMemory()
-      memoryManager = null
+    if (taskMemoryManager != null) {
+      val leakedShuffleMemory = shuffleMemoryManager.getMemoryConsumptionForThisTask
+      assert(taskMemoryManager.cleanUpAllAllocatedMemory() === 0)
+      assert(leakedShuffleMemory === 0)
+      taskMemoryManager = null
     }
   }
 
@@ -69,7 +74,8 @@ class UnsafeFixedWidthAggregationMapSuite
       emptyAggregationBuffer,
       aggBufferSchema,
       groupKeySchema,
-      memoryManager,
+      taskMemoryManager,
+      shuffleMemoryManager,
       1024, // initial capacity,
       PAGE_SIZE_BYTES,
       false // disable perf metrics
@@ -83,7 +89,8 @@ class UnsafeFixedWidthAggregationMapSuite
       emptyAggregationBuffer,
       aggBufferSchema,
       groupKeySchema,
-      memoryManager,
+      taskMemoryManager,
+      shuffleMemoryManager,
       1024, // initial capacity
       PAGE_SIZE_BYTES,
       false // disable perf metrics
@@ -91,7 +98,7 @@ class UnsafeFixedWidthAggregationMapSuite
     val groupKey = InternalRow(UTF8String.fromString("cats"))
 
     // Looking up a key stores a zero-entry in the map (like Python Counters or DefaultDicts)
-    map.getAggregationBuffer(groupKey)
+    assert(map.getAggregationBuffer(groupKey) != null)
     val iter = map.iterator()
     val entry = iter.next()
     assert(!iter.hasNext)
@@ -110,7 +117,8 @@ class UnsafeFixedWidthAggregationMapSuite
       emptyAggregationBuffer,
       aggBufferSchema,
       groupKeySchema,
-      memoryManager,
+      taskMemoryManager,
+      shuffleMemoryManager,
       128, // initial capacity
       PAGE_SIZE_BYTES,
       false // disable perf metrics
@@ -118,7 +126,7 @@ class UnsafeFixedWidthAggregationMapSuite
     val rand = new Random(42)
     val groupKeys: Set[String] = Seq.fill(512)(rand.nextString(1024)).toSet
     groupKeys.foreach { keyString =>
-      map.getAggregationBuffer(InternalRow(UTF8String.fromString(keyString)))
+      assert(map.getAggregationBuffer(InternalRow(UTF8String.fromString(keyString))) != null)
     }
     val seenKeys: Set[String] = map.iterator().asScala.map { entry =>
       entry.key.getString(0)
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java
index 3dc82d8c2eb39..91be46ba21ff8 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java
@@ -34,7 +34,7 @@ public class MemoryBlock extends MemoryLocation {
    */
   int pageNumber = -1;
 
-  MemoryBlock(@Nullable Object obj, long offset, long length) {
+  public MemoryBlock(@Nullable Object obj, long offset, long length) {
     super(obj, offset);
     this.length = length;
   }
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java b/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java
index dd70df3b1f791..358bb37250158 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/memory/TaskMemoryManager.java
@@ -164,6 +164,7 @@ public void freePage(MemoryBlock page) {
    * top-level Javadoc for more details).
    */
   public MemoryBlock allocate(long size) throws OutOfMemoryError {
+    assert(size > 0) : "Size must be positive, but got " + size;
     final MemoryBlock memory = executorMemoryManager.allocate(size);
     allocatedNonPageMemory.add(memory);
     return memory;

From 712f5b7a9ab52c26e3d086629633950ec2fb7afc Mon Sep 17 00:00:00 2001
From: Hossein <hossein@databricks.com>
Date: Fri, 31 Jul 2015 19:24:00 -0700
Subject: [PATCH 538/549] [SPARK-9318] [SPARK-9320] [SPARKR] Aliases for merge
 and summary functions on DataFrames

This PR adds synonyms for ```merge``` and ```summary``` in SparkR DataFrame API.

cc shivaram

Author: Hossein <hossein@databricks.com>

Closes #7806 from falaki/SPARK-9320 and squashes the following commits:

72600f7 [Hossein] Updated docs
92a6e75 [Hossein] Fixed merge generic signature issue
4c2b051 [Hossein] Fixing naming with mllib summary
0f3a64c [Hossein] Added ... to generic for merge
30fbaf8 [Hossein] Merged master
ae1a4cf [Hossein] Merge branch 'master' into SPARK-9320
e8eb86f [Hossein] Add a generic for merge
fc01f2d [Hossein] Added unit test
8d92012 [Hossein] Added merge as an alias for join
5b8bedc [Hossein] Added unit test
632693d [Hossein] Added summary as an alias for describe for DataFrame
---
 R/pkg/NAMESPACE                  |  2 ++
 R/pkg/R/DataFrame.R              | 22 ++++++++++++++++++++++
 R/pkg/R/generics.R               |  8 ++++++++
 R/pkg/R/mllib.R                  |  8 ++++----
 R/pkg/inst/tests/test_sparkSQL.R | 14 ++++++++++++--
 5 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index ff116cb1fbde2..b2d92bdf4840e 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -46,6 +46,7 @@ exportMethods("arrange",
               "isLocal",
               "join",
               "limit",
+              "merge",
               "names",
               "ncol",
               "nrow",
@@ -69,6 +70,7 @@ exportMethods("arrange",
               "show",
               "showDF",
               "summarize",
+              "summary",
               "take",
               "unionAll",
               "unique",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index b4065d2944bdc..895603235011e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1279,6 +1279,15 @@ setMethod("join",
             dataFrame(sdf)
           })
 
+#' rdname merge
+#' aliases join
+setMethod("merge",
+          signature(x = "DataFrame", y = "DataFrame"),
+          function(x, y, joinExpr = NULL, joinType = NULL, ...) {
+            join(x, y, joinExpr, joinType)
+          })
+
+
 #' UnionAll
 #'
 #' Return a new DataFrame containing the union of rows in this DataFrame
@@ -1524,6 +1533,19 @@ setMethod("describe",
             dataFrame(sdf)
           })
 
+#' @title Summary
+#'
+#' @description Computes statistics for numeric columns of the DataFrame
+#'
+#' @rdname summary
+#' @aliases describe
+setMethod("summary",
+          signature(x = "DataFrame"),
+          function(x) {
+            describe(x)
+          })
+
+
 #' dropna
 #'
 #' Returns a new DataFrame omitting rows with null values.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 71d1e348c4efb..c43b947129e87 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -461,6 +461,10 @@ setGeneric("isLocal", function(x) { standardGeneric("isLocal") })
 #' @export
 setGeneric("limit", function(x, num) {standardGeneric("limit") })
 
+#' rdname merge
+#' @export
+setGeneric("merge")
+
 #' @rdname withColumn
 #' @export
 setGeneric("mutate", function(x, ...) {standardGeneric("mutate") })
@@ -531,6 +535,10 @@ setGeneric("showDF", function(x,...) { standardGeneric("showDF") })
 #' @export
 setGeneric("summarize", function(x,...) { standardGeneric("summarize") })
 
+##' rdname summary
+##' @export
+setGeneric("summary", function(x, ...) { standardGeneric("summary") })
+
 # @rdname tojson
 # @export
 setGeneric("toJSON", function(x) { standardGeneric("toJSON") })
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index efddcc1d8d71c..b524d1fd87496 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -86,12 +86,12 @@ setMethod("predict", signature(object = "PipelineModel"),
 #' model <- glm(y ~ x, trainingData)
 #' summary(model)
 #'}
-setMethod("summary", signature(object = "PipelineModel"),
-          function(object) {
+setMethod("summary", signature(x = "PipelineModel"),
+          function(x, ...) {
             features <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                   "getModelFeatures", object@model)
+                                   "getModelFeatures", x@model)
             weights <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                   "getModelWeights", object@model)
+                                   "getModelWeights", x@model)
             coefficients <- as.matrix(unlist(weights))
             colnames(coefficients) <- c("Estimate")
             rownames(coefficients) <- unlist(features)
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 9faee8d59c3af..7377fc8f1ca9c 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -765,7 +765,7 @@ test_that("filter() on a DataFrame", {
   expect_equal(count(filtered6), 2)
 })
 
-test_that("join() on a DataFrame", {
+test_that("join() and merge() on a DataFrame", {
   df <- jsonFile(sqlContext, jsonPath)
 
   mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}",
@@ -794,6 +794,12 @@ test_that("join() on a DataFrame", {
   expect_equal(names(joined4), c("newAge", "name", "test"))
   expect_equal(count(joined4), 4)
   expect_equal(collect(orderBy(joined4, joined4$name))$newAge[3], 24)
+
+  merged <- select(merge(df, df2, df$name == df2$name, "outer"),
+                   alias(df$age + 5, "newAge"), df$name, df2$test)
+  expect_equal(names(merged), c("newAge", "name", "test"))
+  expect_equal(count(merged), 4)
+  expect_equal(collect(orderBy(merged, joined4$name))$newAge[3], 24)
 })
 
 test_that("toJSON() returns an RDD of the correct values", {
@@ -899,7 +905,7 @@ test_that("parquetFile works with multiple input paths", {
   expect_equal(count(parquetDF), count(df) * 2)
 })
 
-test_that("describe() on a DataFrame", {
+test_that("describe() and summarize() on a DataFrame", {
   df <- jsonFile(sqlContext, jsonPath)
   stats <- describe(df, "age")
   expect_equal(collect(stats)[1, "summary"], "count")
@@ -908,6 +914,10 @@ test_that("describe() on a DataFrame", {
   stats <- describe(df)
   expect_equal(collect(stats)[4, "name"], "Andy")
   expect_equal(collect(stats)[5, "age"], "30")
+
+  stats2 <- summary(df)
+  expect_equal(collect(stats2)[4, "name"], "Andy")
+  expect_equal(collect(stats2)[5, "age"], "30")
 })
 
 test_that("dropna() on a DataFrame", {

From 03377d2522776267a07b7d6ae9bddf79a4e0f516 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 31 Jul 2015 21:09:00 -0700
Subject: [PATCH 539/549] [SPARK-9358][SQL] Code generation for UnsafeRow
 joiner.

This patch creates a code generated unsafe row concatenator that can be used to concatenate/join two UnsafeRows into a single UnsafeRow.

Since it is inherently hard to test these low level stuff, the test suites employ randomized testing heavily in order to guarantee correctness.

Author: Reynold Xin <rxin@databricks.com>

Closes #7821 from rxin/rowconcat and squashes the following commits:

8717f35 [Reynold Xin] Rebase and code review.
72c5d8e [Reynold Xin] Fixed a bug.
a84ed2e [Reynold Xin] Fixed offset.
40c3fb2 [Reynold Xin] Reset random data generator.
f0913aa [Reynold Xin] Test fixes.
6687b6f [Reynold Xin] Updated documentation.
00354b9 [Reynold Xin] Support concat data as well.
e9a4347 [Reynold Xin] Updated.
6269f96 [Reynold Xin] Fixed a bug .
0f89716 [Reynold Xin] [SPARK-9358][SQL][WIP] Code generation for UnsafeRow concat.
---
 .../sql/catalyst/expressions/UnsafeRow.java   |  19 ++
 .../expressions/codegen/CodeGenerator.scala   |   2 +
 .../codegen/GenerateUnsafeProjection.scala    |   6 +-
 .../codegen/GenerateUnsafeRowJoiner.scala     | 241 ++++++++++++++++++
 .../spark/sql/RandomDataGenerator.scala       |  15 +-
 .../GenerateUnsafeRowJoinerBitsetSuite.scala  | 147 +++++++++++
 .../GenerateUnsafeRowJoinerSuite.scala        | 114 +++++++++
 .../UnsafeFixedWidthAggregationMap.java       |   7 +-
 .../sql/execution/TungstenSortSuite.scala     |   3 +
 9 files changed, 544 insertions(+), 10 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
 create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index e7088edced1a1..24dc80b1a7932 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -85,6 +85,14 @@ public static int calculateBitSetWidthInBytes(int numFields) {
         })));
   }
 
+  public static boolean isFixedLength(DataType dt) {
+    if (dt instanceof DecimalType) {
+      return ((DecimalType) dt).precision() < Decimal.MAX_LONG_DIGITS();
+    } else {
+      return settableFieldTypes.contains(dt);
+    }
+  }
+
   //////////////////////////////////////////////////////////////////////////////
   // Private fields and methods
   //////////////////////////////////////////////////////////////////////////////
@@ -144,6 +152,17 @@ public void pointTo(Object baseObject, long baseOffset, int numFields, int sizeI
     this.sizeInBytes = sizeInBytes;
   }
 
+  /**
+   * Update this UnsafeRow to point to the underlying byte array.
+   *
+   * @param buf byte array to point to
+   * @param numFields the number of fields in this row
+   * @param sizeInBytes the number of bytes valid in the byte array
+   */
+  public void pointTo(byte[] buf, int numFields, int sizeInBytes) {
+    pointTo(buf, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, sizeInBytes);
+  }
+
   private void assertIndexIsValid(int index) {
     assert index >= 0 : "index (" + index + ") should >= 0";
     assert index < numFields : "index (" + index + ") should < " + numFields;
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index e50ec27fc2eb6..36f4e9c6be570 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -27,6 +27,7 @@ import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.PlatformDependent
 import org.apache.spark.unsafe.types._
 
 
@@ -293,6 +294,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
     val evaluator = new ClassBodyEvaluator()
     evaluator.setParentClassLoader(getClass.getClassLoader)
     evaluator.setDefaultImports(Array(
+      classOf[PlatformDependent].getName,
       classOf[InternalRow].getName,
       classOf[UnsafeRow].getName,
       classOf[UTF8String].getName,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
index 1d223986d9441..6c99086046683 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala
@@ -266,16 +266,16 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro
 
     val code = s"""
       public Object generate($exprType[] exprs) {
-        return new SpecificProjection(exprs);
+        return new SpecificUnsafeProjection(exprs);
       }
 
-      class SpecificProjection extends ${classOf[UnsafeProjection].getName} {
+      class SpecificUnsafeProjection extends ${classOf[UnsafeProjection].getName} {
 
         private $exprType[] expressions;
 
         ${declareMutableStates(ctx)}
 
-        public SpecificProjection($exprType[] expressions) {
+        public SpecificUnsafeProjection($exprType[] expressions) {
           this.expressions = expressions;
           ${initMutableStates(ctx)}
         }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala
new file mode 100644
index 0000000000000..645eb48d5a51b
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, Attribute}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.unsafe.PlatformDependent
+
+
+abstract class UnsafeRowJoiner {
+  def join(row1: UnsafeRow, row2: UnsafeRow): UnsafeRow
+}
+
+
+/**
+ * A code generator for concatenating two [[UnsafeRow]]s into a single [[UnsafeRow]].
+ *
+ * The high level algorithm is:
+ *
+ * 1. Concatenate the two bitsets together into a single one, taking padding into account.
+ * 2. Move fixed-length data.
+ * 3. Move variable-length data.
+ * 4. Update the offset position (i.e. the upper 32 bits in the fixed length part) for all
+ *    variable-length data.
+ */
+object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), UnsafeRowJoiner] {
+
+  def dump(word: Long): String = {
+    Seq.tabulate(64) { i => if ((word >> i) % 2 == 0) "0" else "1" }.reverse.mkString
+  }
+
+  override protected def create(in: (StructType, StructType)): UnsafeRowJoiner = {
+    create(in._1, in._2)
+  }
+
+  override protected def canonicalize(in: (StructType, StructType)): (StructType, StructType) = in
+
+  override protected def bind(in: (StructType, StructType), inputSchema: Seq[Attribute])
+    : (StructType, StructType) = {
+    in
+  }
+
+  def create(schema1: StructType, schema2: StructType): UnsafeRowJoiner = {
+    val ctx = newCodeGenContext()
+    val offset = PlatformDependent.BYTE_ARRAY_OFFSET
+
+    val bitset1Words = (schema1.size + 63) / 64
+    val bitset2Words = (schema2.size + 63) / 64
+    val outputBitsetWords = (schema1.size + schema2.size + 63) / 64
+    val bitset1Remainder = schema1.size % 64
+    val bitset2Remainder = schema2.size % 64
+
+    // The number of words we can reduce when we concat two rows together.
+    // The only reduction comes from merging the bitset portion of the two rows, saving 1 word.
+    val sizeReduction = bitset1Words + bitset2Words - outputBitsetWords
+
+    // --------------------- copy bitset from row 1 ----------------------- //
+    val copyBitset1 = Seq.tabulate(bitset1Words) { i =>
+      s"""
+         |PlatformDependent.UNSAFE.putLong(buf, ${offset + i * 8},
+         |  PlatformDependent.UNSAFE.getLong(obj1, ${offset + i * 8}));
+       """.stripMargin
+    }.mkString
+
+
+    // --------------------- copy bitset from row 2 ----------------------- //
+    var copyBitset2 = ""
+    if (bitset1Remainder == 0) {
+      copyBitset2 += Seq.tabulate(bitset2Words) { i =>
+        s"""
+           |PlatformDependent.UNSAFE.putLong(buf, ${offset + (bitset1Words + i) * 8},
+           |  PlatformDependent.UNSAFE.getLong(obj2, ${offset + i * 8}));
+         """.stripMargin
+      }.mkString
+    } else {
+      copyBitset2 = Seq.tabulate(bitset2Words) { i =>
+        s"""
+           |long bs2w$i = PlatformDependent.UNSAFE.getLong(obj2, ${offset + i * 8});
+           |long bs2w${i}p1 = (bs2w$i << $bitset1Remainder) & ~((1L << $bitset1Remainder) - 1);
+           |long bs2w${i}p2 = (bs2w$i >>> ${64 - bitset1Remainder});
+         """.stripMargin
+      }.mkString
+
+      copyBitset2 += Seq.tabulate(bitset2Words) { i =>
+        val currentOffset = offset + (bitset1Words + i - 1) * 8
+        if (i == 0) {
+          if (bitset1Words > 0) {
+            s"""
+               |PlatformDependent.UNSAFE.putLong(buf, $currentOffset,
+               |  bs2w${i}p1 | PlatformDependent.UNSAFE.getLong(obj1, $currentOffset));
+            """.stripMargin
+          } else {
+            s"""
+               |PlatformDependent.UNSAFE.putLong(buf, $currentOffset + 8, bs2w${i}p1);
+            """.stripMargin
+          }
+        } else {
+          s"""
+             |PlatformDependent.UNSAFE.putLong(buf, $currentOffset, bs2w${i}p1 | bs2w${i - 1}p2);
+          """.stripMargin
+        }
+      }.mkString("\n")
+
+      if (bitset2Words > 0 &&
+        (bitset2Remainder == 0 || bitset2Remainder > (64 - bitset1Remainder))) {
+        val lastWord = bitset2Words - 1
+        copyBitset2 +=
+          s"""
+             |PlatformDependent.UNSAFE.putLong(buf, ${offset + (outputBitsetWords - 1) * 8},
+             |  bs2w${lastWord}p2);
+          """.stripMargin
+      }
+    }
+
+    // --------------------- copy fixed length portion from row 1 ----------------------- //
+    var cursor = offset + outputBitsetWords * 8
+    val copyFixedLengthRow1 = s"""
+       |// Copy fixed length data for row1
+       |PlatformDependent.copyMemory(
+       |  obj1, offset1 + ${bitset1Words * 8},
+       |  buf, $cursor,
+       |  ${schema1.size * 8});
+     """.stripMargin
+    cursor += schema1.size * 8
+
+    // --------------------- copy fixed length portion from row 2 ----------------------- //
+    val copyFixedLengthRow2 = s"""
+       |// Copy fixed length data for row2
+       |PlatformDependent.copyMemory(
+       |  obj2, offset2 + ${bitset2Words * 8},
+       |  buf, $cursor,
+       |  ${schema2.size * 8});
+     """.stripMargin
+    cursor += schema2.size * 8
+
+    // --------------------- copy variable length portion from row 1 ----------------------- //
+    val copyVariableLengthRow1 = s"""
+       |// Copy variable length data for row1
+       |long numBytesBitsetAndFixedRow1 = ${(bitset1Words + schema1.size) * 8};
+       |long numBytesVariableRow1 = row1.getSizeInBytes() - numBytesBitsetAndFixedRow1;
+       |PlatformDependent.copyMemory(
+       |  obj1, offset1 + ${(bitset1Words + schema1.size) * 8},
+       |  buf, $cursor,
+       |  numBytesVariableRow1);
+     """.stripMargin
+
+    // --------------------- copy variable length portion from row 2 ----------------------- //
+    val copyVariableLengthRow2 = s"""
+       |// Copy variable length data for row2
+       |long numBytesBitsetAndFixedRow2 = ${(bitset2Words + schema2.size) * 8};
+       |long numBytesVariableRow2 = row2.getSizeInBytes() - numBytesBitsetAndFixedRow2;
+       |PlatformDependent.copyMemory(
+       |  obj2, offset2 + ${(bitset2Words + schema2.size) * 8},
+       |  buf, $cursor + numBytesVariableRow1,
+       |  numBytesVariableRow2);
+     """.stripMargin
+
+    // ------------- update fixed length data for variable length data type  --------------- //
+    val updateOffset = (schema1 ++ schema2).zipWithIndex.map { case (field, i) =>
+      // Skip fixed length data types, and only generate code for variable length data
+      if (UnsafeRow.isFixedLength(field.dataType)) {
+        ""
+      } else {
+        // Number of bytes to increase for the offset. Note that since in UnsafeRow we store the
+        // offset in the upper 32 bit of the words, we can just shift the offset to the left by
+        // 32 and increment that amount in place.
+        val shift =
+          if (i < schema1.size) {
+            s"${(outputBitsetWords - bitset1Words + schema2.size) * 8}L"
+          } else {
+            s"${(outputBitsetWords - bitset2Words + schema1.size) * 8}L + numBytesVariableRow1"
+          }
+        val cursor = offset + outputBitsetWords * 8 + i * 8
+        s"""
+           |PlatformDependent.UNSAFE.putLong(buf, $cursor,
+           |  PlatformDependent.UNSAFE.getLong(buf, $cursor) + ($shift << 32));
+         """.stripMargin
+      }
+    }.mkString
+
+    // ------------------------ Finally, put everything together  --------------------------- //
+    val code = s"""
+       |public Object generate($exprType[] exprs) {
+       |  return new SpecificUnsafeRowJoiner();
+       |}
+       |
+       |class SpecificUnsafeRowJoiner extends ${classOf[UnsafeRowJoiner].getName} {
+       |  private byte[] buf = new byte[64];
+       |  private UnsafeRow out = new UnsafeRow();
+       |
+       |  public UnsafeRow join(UnsafeRow row1, UnsafeRow row2) {
+       |    // row1: ${schema1.size} fields, $bitset1Words words in bitset
+       |    // row2: ${schema2.size}, $bitset2Words words in bitset
+       |    // output: ${schema1.size + schema2.size} fields, $outputBitsetWords words in bitset
+       |    final int sizeInBytes = row1.getSizeInBytes() + row2.getSizeInBytes();
+       |    if (sizeInBytes > buf.length) {
+       |      buf = new byte[sizeInBytes];
+       |    }
+       |
+       |    final Object obj1 = row1.getBaseObject();
+       |    final long offset1 = row1.getBaseOffset();
+       |    final Object obj2 = row2.getBaseObject();
+       |    final long offset2 = row2.getBaseOffset();
+       |
+       |    $copyBitset1
+       |    $copyBitset2
+       |    $copyFixedLengthRow1
+       |    $copyFixedLengthRow2
+       |    $copyVariableLengthRow1
+       |    $copyVariableLengthRow2
+       |    $updateOffset
+       |
+       |    out.pointTo(buf, ${schema1.size + schema2.size}, sizeInBytes - $sizeReduction);
+       |
+       |    return out;
+       |  }
+       |}
+     """.stripMargin
+
+    logDebug(s"SpecificUnsafeRowJoiner($schema1, $schema2):\n${CodeFormatter.format(code)}")
+    // println(CodeFormatter.format(code))
+
+    val c = compile(code)
+    c.generate(Array.empty).asInstanceOf[UnsafeRowJoiner]
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
index 75ae29d690770..81267dc915c10 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
@@ -65,6 +65,19 @@ object RandomDataGenerator {
     Some(f)
   }
 
+  /**
+   * Returns a randomly generated schema, based on the given accepted types.
+   *
+   * @param numFields the number of fields in this schema
+   * @param acceptedTypes types to draw from.
+   */
+  def randomSchema(numFields: Int, acceptedTypes: Seq[DataType]): StructType = {
+    StructType(Seq.tabulate(numFields) { i =>
+      val dt = acceptedTypes(Random.nextInt(acceptedTypes.size))
+      StructField("col_" + i, dt, nullable = true)
+    })
+  }
+
   /**
    * Returns a function which generates random values for the given [[DataType]], or `None` if no
    * random data generator is defined for that data type. The generated values will use an external
@@ -94,7 +107,7 @@ object RandomDataGenerator {
       case DateType => Some(() => new java.sql.Date(rand.nextInt()))
       case TimestampType => Some(() => new java.sql.Timestamp(rand.nextLong()))
       case DecimalType.Fixed(precision, scale) => Some(
-        () => BigDecimal.apply(rand.nextLong, rand.nextInt, new MathContext(precision)))
+        () => BigDecimal.apply(rand.nextLong(), rand.nextInt(), new MathContext(precision)))
       case DoubleType => randomNumeric[Double](
         rand, r => longBitsToDouble(r.nextLong()), Seq(Double.MinValue, Double.MinPositiveValue,
           Double.MaxValue, Double.PositiveInfinity, Double.NegativeInfinity, Double.NaN, 0.0))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
new file mode 100644
index 0000000000000..76d9d991ed0dc
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+import scala.util.Random
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow
+import org.apache.spark.sql.types._
+
+/**
+ * A test suite for the bitset portion of the row concatenation.
+ */
+class GenerateUnsafeRowJoinerBitsetSuite extends SparkFunSuite {
+
+  test("bitset concat: boundary size 0, 0") {
+    testBitsets(0, 0)
+  }
+
+  test("bitset concat: boundary size 0, 64") {
+    testBitsets(0, 64)
+  }
+
+  test("bitset concat: boundary size 64, 0") {
+    testBitsets(64, 0)
+  }
+
+  test("bitset concat: boundary size 64, 64") {
+    testBitsets(64, 64)
+  }
+
+  test("bitset concat: boundary size 0, 128") {
+    testBitsets(0, 128)
+  }
+
+  test("bitset concat: boundary size 128, 0") {
+    testBitsets(128, 0)
+  }
+
+  test("bitset concat: boundary size 128, 128") {
+    testBitsets(128, 128)
+  }
+
+  test("bitset concat: single word bitsets") {
+    testBitsets(10, 5)
+  }
+
+  test("bitset concat: first bitset larger than a word") {
+    testBitsets(67, 5)
+  }
+
+  test("bitset concat: second bitset larger than a word") {
+    testBitsets(6, 67)
+  }
+
+  test("bitset concat: no reduction in bitset size") {
+    testBitsets(33, 34)
+  }
+
+  test("bitset concat: two words") {
+    testBitsets(120, 95)
+  }
+
+  test("bitset concat: bitset 65, 128") {
+    testBitsets(65, 128)
+  }
+
+  test("bitset concat: randomized tests") {
+    for (i <- 1 until 20) {
+      val numFields1 = Random.nextInt(1000)
+      val numFields2 = Random.nextInt(1000)
+      testBitsetsOnce(numFields1, numFields2)
+    }
+  }
+
+  private def createUnsafeRow(numFields: Int): UnsafeRow = {
+    val row = new UnsafeRow
+    val sizeInBytes = numFields * 8 + ((numFields + 63) / 64) * 8
+    val buf = new Array[Byte](sizeInBytes)
+    row.pointTo(buf, numFields, sizeInBytes)
+    row
+  }
+
+  private def testBitsets(numFields1: Int, numFields2: Int): Unit = {
+    for (i <- 0 until 5) {
+      testBitsetsOnce(numFields1, numFields2)
+    }
+  }
+
+  private def testBitsetsOnce(numFields1: Int, numFields2: Int): Unit = {
+    info(s"num fields: $numFields1 and $numFields2")
+    val schema1 = StructType(Seq.tabulate(numFields1) { i => StructField(s"a_$i", IntegerType) })
+    val schema2 = StructType(Seq.tabulate(numFields2) { i => StructField(s"b_$i", IntegerType) })
+
+    val row1 = createUnsafeRow(numFields1)
+    val row2 = createUnsafeRow(numFields2)
+
+    if (numFields1 > 0) {
+      for (i <- 0 until Random.nextInt(numFields1)) {
+        row1.setNullAt(Random.nextInt(numFields1))
+      }
+    }
+    if (numFields2 > 0) {
+      for (i <- 0 until Random.nextInt(numFields2)) {
+        row2.setNullAt(Random.nextInt(numFields2))
+      }
+    }
+
+    val concater = GenerateUnsafeRowJoiner.create(schema1, schema2)
+    val output = concater.join(row1, row2)
+
+    def dumpDebug(): String = {
+      val set1 = Seq.tabulate(numFields1) { i => if (row1.isNullAt(i)) "1" else "0" }
+      val set2 = Seq.tabulate(numFields2) { i => if (row2.isNullAt(i)) "1" else "0" }
+      val out = Seq.tabulate(numFields1 + numFields2) { i => if (output.isNullAt(i)) "1" else "0" }
+
+      s"""
+         |input1: ${set1.mkString}
+         |input2: ${set2.mkString}
+         |output: ${out.mkString}
+       """.stripMargin
+    }
+
+    for (i <- 0 until (numFields1 + numFields2)) {
+      if (i < numFields1) {
+        assert(output.isNullAt(i) === row1.isNullAt(i), dumpDebug())
+      } else {
+        assert(output.isNullAt(i) === row2.isNullAt(i - numFields1), dumpDebug())
+      }
+    }
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala
new file mode 100644
index 0000000000000..59729e7646beb
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.codegen
+
+import scala.util.Random
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.RandomDataGenerator
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
+import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
+import org.apache.spark.sql.types._
+
+/**
+ * Test suite for [[GenerateUnsafeRowJoiner]].
+ *
+ * There is also a separate [[GenerateUnsafeRowJoinerBitsetSuite]] that tests specifically
+ * concatenation for the bitset portion, since that is the hardest one to get right.
+ */
+class GenerateUnsafeRowJoinerSuite extends SparkFunSuite {
+
+  private val fixed = Seq(IntegerType)
+  private val variable = Seq(IntegerType, StringType)
+
+  test("simple fixed width types") {
+    testConcat(0, 0, fixed)
+    testConcat(0, 1, fixed)
+    testConcat(1, 0, fixed)
+    testConcat(64, 0, fixed)
+    testConcat(0, 64, fixed)
+    testConcat(64, 64, fixed)
+  }
+
+  test("randomized fix width types") {
+    for (i <- 0 until 20) {
+      testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed)
+    }
+  }
+
+  test("simple variable width types") {
+    testConcat(0, 0, variable)
+    testConcat(0, 1, variable)
+    testConcat(1, 0, variable)
+    testConcat(64, 0, variable)
+    testConcat(0, 64, variable)
+    testConcat(64, 64, variable)
+  }
+
+  test("randomized variable width types") {
+    for (i <- 0 until 10) {
+      testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable)
+    }
+  }
+
+  private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = {
+    for (i <- 0 until 10) {
+      testConcatOnce(numFields1, numFields2, candidateTypes)
+    }
+  }
+
+  private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) {
+    info(s"schema size $numFields1, $numFields2")
+    val schema1 = RandomDataGenerator.randomSchema(numFields1, candidateTypes)
+    val schema2 = RandomDataGenerator.randomSchema(numFields2, candidateTypes)
+
+    // Create the converters needed to convert from external row to internal row and to UnsafeRows.
+    val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1)
+    val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2)
+    val converter1 = UnsafeProjection.create(schema1)
+    val converter2 = UnsafeProjection.create(schema2)
+
+    // Create the input rows, convert them into UnsafeRows.
+    val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply()
+    val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply()
+    val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow])
+    val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow])
+
+    // Run the joiner.
+    val mergedSchema = StructType(schema1 ++ schema2)
+    val concater = GenerateUnsafeRowJoiner.create(schema1, schema2)
+    val output = concater.join(row1, row2)
+
+    // Test everything equals ...
+    for (i <- mergedSchema.indices) {
+      if (i < schema1.size) {
+        assert(output.isNullAt(i) === row1.isNullAt(i))
+        if (!output.isNullAt(i)) {
+          assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType))
+        }
+      } else {
+        assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size))
+        if (!output.isNullAt(i)) {
+          assert(output.get(i, mergedSchema(i).dataType) ===
+            row2.get(i - schema1.size, mergedSchema(i).dataType))
+        }
+      }
+    }
+  }
+
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java
index 66012e3c94d27..08a98cdd94a4c 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java
@@ -73,12 +73,7 @@ public final class UnsafeFixedWidthAggregationMap {
    */
   public static boolean supportsAggregationBufferSchema(StructType schema) {
     for (StructField field: schema.fields()) {
-      if (field.dataType() instanceof DecimalType) {
-        DecimalType dt = (DecimalType) field.dataType();
-        if (dt.precision() > Decimal.MAX_LONG_DIGITS()) {
-          return false;
-        }
-      } else if (!UnsafeRow.settableFieldTypes.contains(field.dataType())) {
+      if (!UnsafeRow.isFixedLength(field.dataType())) {
         return false;
       }
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala
index 450963547c798..b3f821e0cdd37 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TungstenSortSuite.scala
@@ -26,6 +26,9 @@ import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.test.TestSQLContext
 import org.apache.spark.sql.types._
 
+/**
+ * A test suite that generates randomized data to test the [[TungstenSort]] operator.
+ */
 class TungstenSortSuite extends SparkPlanTest with BeforeAndAfterAll {
 
   override def beforeAll(): Unit = {

From 6996bd2e81bf6597dcda499d9a9a80927a43e30f Mon Sep 17 00:00:00 2001
From: "zhichao.li" <zhichao.li@intel.com>
Date: Fri, 31 Jul 2015 21:18:01 -0700
Subject: [PATCH 540/549] [SPARK-8264][SQL]add substring_index function

This PR is based on #7533 , thanks to zhichao-li

Closes #7533

Author: zhichao.li <zhichao.li@intel.com>
Author: Davies Liu <davies@databricks.com>

Closes #7843 from davies/str_index and squashes the following commits:

391347b [Davies Liu] add python api
3ce7802 [Davies Liu] fix substringIndex
f2d29a1 [Davies Liu] Merge branch 'master' of github.com:apache/spark into str_index
515519b [zhichao.li] add foldable and remove null checking
9546991 [zhichao.li] scala style
67c253a [zhichao.li] hide some apis and clean code
b19b013 [zhichao.li] add codegen and clean code
ac863e9 [zhichao.li] reduce the calling of numChars
12e108f [zhichao.li] refine unittest
d92951b [zhichao.li] add lastIndexOf
52d7b03 [zhichao.li] add substring_index function
---
 python/pyspark/sql/functions.py               | 19 +++++
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../expressions/stringOperations.scala        | 25 ++++++
 .../expressions/StringExpressionsSuite.scala  | 31 +++++++
 .../org/apache/spark/sql/functions.scala      | 12 ++-
 .../spark/sql/StringFunctionsSuite.scala      | 57 +++++++++++++
 .../apache/spark/unsafe/types/UTF8String.java | 80 ++++++++++++++++++-
 .../spark/unsafe/types/UTF8StringSuite.java   | 38 +++++++++
 8 files changed, 261 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index bb9926ce8ca85..89a2a5ceaa9bf 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -920,6 +920,25 @@ def trunc(date, format):
     return Column(sc._jvm.functions.trunc(_to_java_column(date), format))
 
 
+@since(1.5)
+@ignore_unicode_prefix
+def substring_index(str, delim, count):
+    """
+    Returns the substring from string str before count occurrences of the delimiter delim.
+    If count is positive, everything the left of the final delimiter (counting from left) is
+    returned. If count is negative, every to the right of the final delimiter (counting from the
+    right) is returned. substring_index performs a case-sensitive match when searching for delim.
+
+    >>> df = sqlContext.createDataFrame([('a.b.c.d',)], ['s'])
+    >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect()
+    [Row(s=u'a.b')]
+    >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()
+    [Row(s=u'b.c.d')]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.substring_index(_to_java_column(str), delim, count))
+
+
 @since(1.5)
 def size(col):
     """
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 3f61a9af1f075..ee44cbcba68e7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -199,6 +199,7 @@ object FunctionRegistry {
     expression[StringSplit]("split"),
     expression[Substring]("substr"),
     expression[Substring]("substring"),
+    expression[SubstringIndex]("substring_index"),
     expression[StringTrim]("trim"),
     expression[UnBase64]("unbase64"),
     expression[Upper]("ucase"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 160e72f384a9b..5dd387a418a1d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -421,6 +421,31 @@ case class StringInstr(str: Expression, substr: Expression)
   }
 }
 
+/**
+ * Returns the substring from string str before count occurrences of the delimiter delim.
+ * If count is positive, everything the left of the final delimiter (counting from left) is
+ * returned. If count is negative, every to the right of the final delimiter (counting from the
+ * right) is returned. substring_index performs a case-sensitive match when searching for delim.
+ */
+case class SubstringIndex(strExpr: Expression, delimExpr: Expression, countExpr: Expression)
+ extends TernaryExpression with ImplicitCastInputTypes {
+
+  override def dataType: DataType = StringType
+  override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
+  override def children: Seq[Expression] = Seq(strExpr, delimExpr, countExpr)
+  override def prettyName: String = "substring_index"
+
+  override def nullSafeEval(str: Any, delim: Any, count: Any): Any = {
+    str.asInstanceOf[UTF8String].subStringIndex(
+      delim.asInstanceOf[UTF8String],
+      count.asInstanceOf[Int])
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    defineCodeGen(ctx, ev, (str, delim, count) => s"$str.subStringIndex($delim, $count)")
+  }
+}
+
 /**
  * A function that returns the position of the first occurrence of substr
  * in given string after position pos.
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index fb72fe1714153..ad87ab36fd645 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 
 class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
@@ -187,6 +188,36 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(s.substring(0), "example", row)
   }
 
+  test("string substring_index function") {
+    checkEvaluation(
+      SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(3)), "www.apache.org")
+    checkEvaluation(
+      SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(2)), "www.apache")
+    checkEvaluation(
+      SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(1)), "www")
+    checkEvaluation(
+      SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(0)), "")
+    checkEvaluation(
+      SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(-3)), "www.apache.org")
+    checkEvaluation(
+      SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(-2)), "apache.org")
+    checkEvaluation(
+      SubstringIndex(Literal("www.apache.org"), Literal("."), Literal(-1)), "org")
+    checkEvaluation(
+      SubstringIndex(Literal(""), Literal("."), Literal(-2)), "")
+    checkEvaluation(
+      SubstringIndex(Literal.create(null, StringType), Literal("."), Literal(-2)), null)
+    checkEvaluation(SubstringIndex(
+        Literal("www.apache.org"), Literal.create(null, StringType), Literal(-2)), null)
+    // non ascii chars
+    // scalastyle:off
+    checkEvaluation(
+      SubstringIndex(Literal("大千世界大千世界"), Literal( "千"), Literal(2)), "大千世界大")
+    // scalastyle:on
+    checkEvaluation(
+      SubstringIndex(Literal("www||apache||org"), Literal( "||"), Literal(2)), "www||apache")
+  }
+
   test("LIKE literal Regular Expression") {
     checkEvaluation(Literal.create(null, StringType).like("a"), null)
     checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 89ffa9c50d6c7..57bb00a7417af 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1788,8 +1788,18 @@ object functions {
   def instr(str: Column, substring: String): Column = StringInstr(str.expr, lit(substring).expr)
 
   /**
-   * Locate the position of the first occurrence of substr in a string column.
+   * Returns the substring from string str before count occurrences of the delimiter delim.
+   * If count is positive, everything the left of the final delimiter (counting from left) is
+   * returned. If count is negative, every to the right of the final delimiter (counting from the
+   * right) is returned. substring_index performs a case-sensitive match when searching for delim.
    *
+   * @group string_funcs
+   */
+  def substring_index(str: Column, delim: String, count: Int): Column =
+    SubstringIndex(str.expr, lit(delim).expr, lit(count).expr)
+
+  /**
+   * Locate the position of the first occurrence of substr.
    * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
    * could not be found in str.
    *
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index b7f073cccb6ac..628da95298aa9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -163,6 +163,63 @@ class StringFunctionsSuite extends QueryTest {
       Row(1))
   }
 
+  test("string substring_index function") {
+    val df = Seq(("www.apache.org", ".", "zz")).toDF("a", "b", "c")
+    checkAnswer(
+      df.select(substring_index($"a", ".", 3)),
+      Row("www.apache.org"))
+    checkAnswer(
+      df.select(substring_index($"a", ".", 2)),
+      Row("www.apache"))
+    checkAnswer(
+      df.select(substring_index($"a", ".", 1)),
+      Row("www"))
+    checkAnswer(
+      df.select(substring_index($"a", ".", 0)),
+      Row(""))
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), ".", -1)),
+      Row("org"))
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), ".", -2)),
+      Row("apache.org"))
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), ".", -3)),
+      Row("www.apache.org"))
+    // str is empty string
+    checkAnswer(
+      df.select(substring_index(lit(""), ".", 1)),
+      Row(""))
+    // empty string delim
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), "", 1)),
+      Row(""))
+    // delim does not exist in str
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), "#", 1)),
+      Row("www.apache.org"))
+    // delim is 2 chars
+    checkAnswer(
+      df.select(substring_index(lit("www||apache||org"), "||", 2)),
+      Row("www||apache"))
+    checkAnswer(
+      df.select(substring_index(lit("www||apache||org"), "||", -2)),
+      Row("apache||org"))
+    // null
+    checkAnswer(
+      df.select(substring_index(lit(null), "||", 2)),
+      Row(null))
+    checkAnswer(
+      df.select(substring_index(lit("www.apache.org"), null, 2)),
+      Row(null))
+    // non ascii chars
+    // scalastyle:off
+    checkAnswer(
+      df.selectExpr("""substring_index("大千世界大千世界", "千", 2)"""),
+      Row("大千世界大"))
+    // scalastyle:on
+  }
+
   test("string locate function") {
     val df = Seq(("aaads", "aa", "zz", 1)).toDF("a", "b", "c", "d")
 
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 9d4998fd48a38..2561c1c2a1932 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -198,7 +198,7 @@ public byte[] getBytes() {
    */
   public UTF8String substring(final int start, final int until) {
     if (until <= start || start >= numBytes) {
-      return fromBytes(new byte[0]);
+      return UTF8String.EMPTY_UTF8;
     }
 
     int i = 0;
@@ -406,6 +406,84 @@ public int indexOf(UTF8String v, int start) {
     return -1;
   }
 
+  /**
+   * Find the `str` from left to right.
+   */
+  private int find(UTF8String str, int start) {
+    assert (str.numBytes > 0);
+    while (start <= numBytes - str.numBytes) {
+      if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) {
+        return start;
+      }
+      start += 1;
+    }
+    return -1;
+  }
+
+  /**
+   * Find the `str` from right to left.
+   */
+  private int rfind(UTF8String str, int start) {
+    assert (str.numBytes > 0);
+    while (start >= 0) {
+      if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) {
+        return start;
+      }
+      start -= 1;
+    }
+    return -1;
+  }
+
+  /**
+   * Returns the substring from string str before count occurrences of the delimiter delim.
+   * If count is positive, everything the left of the final delimiter (counting from left) is
+   * returned. If count is negative, every to the right of the final delimiter (counting from the
+   * right) is returned. subStringIndex performs a case-sensitive match when searching for delim.
+   */
+  public UTF8String subStringIndex(UTF8String delim, int count) {
+    if (delim.numBytes == 0 || count == 0) {
+      return EMPTY_UTF8;
+    }
+    if (count > 0) {
+      int idx = -1;
+      while (count > 0) {
+        idx = find(delim, idx + 1);
+        if (idx >= 0) {
+          count --;
+        } else {
+          // can not find enough delim
+          return this;
+        }
+      }
+      if (idx == 0) {
+        return EMPTY_UTF8;
+      }
+      byte[] bytes = new byte[idx];
+      copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, idx);
+      return fromBytes(bytes);
+
+    } else {
+      int idx = numBytes - delim.numBytes + 1;
+      count = -count;
+      while (count > 0) {
+        idx = rfind(delim, idx - 1);
+        if (idx >= 0) {
+          count --;
+        } else {
+          // can not find enough delim
+          return this;
+        }
+      }
+      if (idx + delim.numBytes == numBytes) {
+        return EMPTY_UTF8;
+      }
+      int size = numBytes - delim.numBytes - idx;
+      byte[] bytes = new byte[size];
+      copyMemory(base, offset + idx + delim.numBytes, bytes, BYTE_ARRAY_OFFSET, size);
+      return fromBytes(bytes);
+    }
+  }
+
   /**
    * Returns str, right-padded with pad to a length of len
    * For example:
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index c565210872322..43eed7063298d 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -240,6 +240,44 @@ public void indexOf() {
     assertEquals(3, fromString("数据砖头").indexOf(fromString("头"), 0));
   }
 
+  @Test
+  public void substring_index() {
+    assertEquals(fromString("www.apache.org"),
+      fromString("www.apache.org").subStringIndex(fromString("."), 3));
+    assertEquals(fromString("www.apache"),
+      fromString("www.apache.org").subStringIndex(fromString("."), 2));
+    assertEquals(fromString("www"),
+      fromString("www.apache.org").subStringIndex(fromString("."), 1));
+    assertEquals(fromString(""),
+      fromString("www.apache.org").subStringIndex(fromString("."), 0));
+    assertEquals(fromString("org"),
+      fromString("www.apache.org").subStringIndex(fromString("."), -1));
+    assertEquals(fromString("apache.org"),
+      fromString("www.apache.org").subStringIndex(fromString("."), -2));
+    assertEquals(fromString("www.apache.org"),
+      fromString("www.apache.org").subStringIndex(fromString("."), -3));
+    // str is empty string
+    assertEquals(fromString(""),
+      fromString("").subStringIndex(fromString("."), 1));
+    // empty string delim
+    assertEquals(fromString(""),
+      fromString("www.apache.org").subStringIndex(fromString(""), 1));
+    // delim does not exist in str
+    assertEquals(fromString("www.apache.org"),
+      fromString("www.apache.org").subStringIndex(fromString("#"), 2));
+    // delim is 2 chars
+    assertEquals(fromString("www||apache"),
+      fromString("www||apache||org").subStringIndex(fromString("||"), 2));
+    assertEquals(fromString("apache||org"),
+      fromString("www||apache||org").subStringIndex(fromString("||"), -2));
+    // non ascii chars
+    assertEquals(fromString("大千世界大"),
+      fromString("大千世界大千世界").subStringIndex(fromString("千"), 2));
+    // overlapped delim
+    assertEquals(fromString("||"), fromString("||||||").subStringIndex(fromString("|||"), 3));
+    assertEquals(fromString("|||"), fromString("||||||").subStringIndex(fromString("|||"), -4));
+  }
+
   @Test
   public void reverse() {
     assertEquals(fromString("olleh"), fromString("hello").reverse());

From 14f263448471f182123fc84619559df90e7ae52c Mon Sep 17 00:00:00 2001
From: Josh Rosen <joshrosen@databricks.com>
Date: Fri, 31 Jul 2015 21:19:23 -0700
Subject: [PATCH 541/549] [SPARK-9464][SQL] Property checks for UTF8String

This PR is based on the original work by JoshRosen in #7780, which adds ScalaCheck property-based tests for UTF8String.

Author: Josh Rosen <joshrosen@databricks.com>
Author: Yijie Shen <henry.yijieshen@gmail.com>

Closes #7830 from yjshen/utf8-property-checks and squashes the following commits:

593da3a [Yijie Shen] resolve comments
c0800e6 [Yijie Shen] Finish all todos in suite
52f51a0 [Josh Rosen] Add some more failing tests
49ed0697 [Josh Rosen] Rename suite
9209c64 [Josh Rosen] UTF8String Property Checks.
---
 unsafe/pom.xml                                |  10 +
 .../apache/spark/unsafe/types/UTF8String.java |  19 +-
 .../spark/unsafe/types/UTF8StringSuite.java   |  13 +-
 .../types/UTF8StringPropertyCheckSuite.scala  | 249 ++++++++++++++++++
 4 files changed, 280 insertions(+), 11 deletions(-)
 create mode 100644 unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala

diff --git a/unsafe/pom.xml b/unsafe/pom.xml
index 33782c6c66f90..89475ee3cf5a1 100644
--- a/unsafe/pom.xml
+++ b/unsafe/pom.xml
@@ -70,6 +70,16 @@
       <artifactId>mockito-core</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.scalacheck</groupId>
+      <artifactId>scalacheck_${scala.binary.version}</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-lang3</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 2561c1c2a1932..f6dafe94c617a 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -301,10 +301,9 @@ public UTF8String trim() {
     int s = 0;
     int e = this.numBytes - 1;
     // skip all of the space (0x20) in the left side
-    while (s < this.numBytes && getByte(s) == 0x20) s++;
+    while (s < this.numBytes && getByte(s) <= 0x20 && getByte(s) >= 0x00) s++;
     // skip all of the space (0x20) in the right side
-    while (e >= 0 && getByte(e) == 0x20) e--;
-
+    while (e >= 0 && getByte(e) <= 0x20 && getByte(e) >= 0x00) e--;
     if (s > e) {
       // empty string
       return UTF8String.fromBytes(new byte[0]);
@@ -316,7 +315,7 @@ public UTF8String trim() {
   public UTF8String trimLeft() {
     int s = 0;
     // skip all of the space (0x20) in the left side
-    while (s < this.numBytes && getByte(s) == 0x20) s++;
+    while (s < this.numBytes && getByte(s) <= 0x20 && getByte(s) >= 0x00) s++;
     if (s == this.numBytes) {
       // empty string
       return UTF8String.fromBytes(new byte[0]);
@@ -328,7 +327,7 @@ public UTF8String trimLeft() {
   public UTF8String trimRight() {
     int e = numBytes - 1;
     // skip all of the space (0x20) in the right side
-    while (e >= 0 && getByte(e) == 0x20) e--;
+    while (e >= 0 && getByte(e) <= 0x20 && getByte(e) >= 0x00) e--;
 
     if (e < 0) {
       // empty string
@@ -354,7 +353,7 @@ public UTF8String reverse() {
   }
 
   public UTF8String repeat(int times) {
-    if (times <=0) {
+    if (times <= 0) {
       return EMPTY_UTF8;
     }
 
@@ -492,7 +491,7 @@ public UTF8String subStringIndex(UTF8String delim, int count) {
    */
   public UTF8String rpad(int len, UTF8String pad) {
     int spaces = len - this.numChars(); // number of char need to pad
-    if (spaces <= 0) {
+    if (spaces <= 0 || pad.numBytes() == 0) {
       // no padding at all, return the substring of the current string
       return substring(0, len);
     } else {
@@ -507,7 +506,7 @@ public UTF8String rpad(int len, UTF8String pad) {
       int idx = 0;
       while (idx < count) {
         copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes);
-        ++idx;
+        ++ idx;
         offset += pad.numBytes;
       }
       copyMemory(remain.base, remain.offset, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes);
@@ -524,7 +523,7 @@ public UTF8String rpad(int len, UTF8String pad) {
    */
   public UTF8String lpad(int len, UTF8String pad) {
     int spaces = len - this.numChars(); // number of char need to pad
-    if (spaces <= 0) {
+    if (spaces <= 0 || pad.numBytes() == 0) {
       // no padding at all, return the substring of the current string
       return substring(0, len);
     } else {
@@ -539,7 +538,7 @@ public UTF8String lpad(int len, UTF8String pad) {
       int idx = 0;
       while (idx < count) {
         copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes);
-        ++idx;
+        ++ idx;
         offset += pad.numBytes;
       }
       copyMemory(remain.base, remain.offset, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes);
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 43eed7063298d..ed50cdcb29af8 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -309,7 +309,6 @@ public void pad() {
     assertEquals(fromString("hello?????"), fromString("hello").rpad(10, fromString("?????")));
     assertEquals(fromString("???????"), EMPTY_UTF8.rpad(7, fromString("?????")));
 
-
     assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, fromString("????")));
     assertEquals(fromString("?数据砖头"), fromString("数据砖头").lpad(5, fromString("????")));
     assertEquals(fromString("??数据砖头"), fromString("数据砖头").lpad(6, fromString("????")));
@@ -327,6 +326,18 @@ public void pad() {
     assertEquals(
       fromString("数据砖头孙行者孙行者孙行"),
       fromString("数据砖头").rpad(12, fromString("孙行者")));
+
+    assertEquals(EMPTY_UTF8, fromString("数据砖头").lpad(-10, fromString("孙行者")));
+    assertEquals(EMPTY_UTF8, fromString("数据砖头").lpad(-10, EMPTY_UTF8));
+    assertEquals(fromString("数据砖头"), fromString("数据砖头").lpad(5, EMPTY_UTF8));
+    assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, EMPTY_UTF8));
+    assertEquals(EMPTY_UTF8, EMPTY_UTF8.lpad(3, EMPTY_UTF8));
+
+    assertEquals(EMPTY_UTF8, fromString("数据砖头").rpad(-10, fromString("孙行者")));
+    assertEquals(EMPTY_UTF8, fromString("数据砖头").rpad(-10, EMPTY_UTF8));
+    assertEquals(fromString("数据砖头"), fromString("数据砖头").rpad(5, EMPTY_UTF8));
+    assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, EMPTY_UTF8));
+    assertEquals(EMPTY_UTF8, EMPTY_UTF8.rpad(3, EMPTY_UTF8));
   }
 
   @Test
diff --git a/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
new file mode 100644
index 0000000000000..12a002befa0ac
--- /dev/null
+++ b/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
@@ -0,0 +1,249 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.types
+
+import org.apache.commons.lang3.StringUtils
+
+import org.scalacheck.{Arbitrary, Gen}
+import org.scalatest.prop.GeneratorDrivenPropertyChecks
+// scalastyle:off
+import org.scalatest.{FunSuite, Matchers}
+
+import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8}
+
+/**
+ * This TestSuite utilize ScalaCheck to generate randomized inputs for UTF8String testing.
+ */
+class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenPropertyChecks with Matchers {
+// scalastyle:on
+
+  test("toString") {
+    forAll { (s: String) =>
+      assert(toUTF8(s).toString() === s)
+    }
+  }
+
+  test("numChars") {
+    forAll { (s: String) =>
+      assert(toUTF8(s).numChars() === s.length)
+    }
+  }
+
+  test("startsWith") {
+    forAll { (s: String) =>
+      val utf8 = toUTF8(s)
+      assert(utf8.startsWith(utf8))
+      for (i <- 1 to s.length) {
+        assert(utf8.startsWith(toUTF8(s.dropRight(i))))
+      }
+    }
+  }
+
+  test("endsWith") {
+    forAll { (s: String) =>
+      val utf8 = toUTF8(s)
+      assert(utf8.endsWith(utf8))
+      for (i <- 1 to s.length) {
+        assert(utf8.endsWith(toUTF8(s.drop(i))))
+      }
+    }
+  }
+
+  test("toUpperCase") {
+    forAll { (s: String) =>
+      assert(toUTF8(s).toUpperCase === toUTF8(s.toUpperCase))
+    }
+  }
+
+  test("toLowerCase") {
+    forAll { (s: String) =>
+      assert(toUTF8(s).toLowerCase === toUTF8(s.toLowerCase))
+    }
+  }
+
+  test("compare") {
+    forAll { (s1: String, s2: String) =>
+      assert(Math.signum(toUTF8(s1).compareTo(toUTF8(s2))) === Math.signum(s1.compareTo(s2)))
+    }
+  }
+
+  test("substring") {
+    forAll { (s: String) =>
+      for (start <- 0 to s.length; end <- 0 to s.length; if start <= end) {
+        assert(toUTF8(s).substring(start, end).toString === s.substring(start, end))
+      }
+    }
+  }
+
+  test("contains") {
+    forAll { (s: String) =>
+      for (start <- 0 to s.length; end <- 0 to s.length; if start <= end) {
+        val substring = s.substring(start, end)
+        assert(toUTF8(s).contains(toUTF8(substring)) === s.contains(substring))
+      }
+    }
+  }
+
+  val whitespaceChar: Gen[Char] = Gen.choose(0x00, 0x20).map(_.toChar)
+  val whitespaceString: Gen[String] = Gen.listOf(whitespaceChar).map(_.mkString)
+  val randomString: Gen[String] = Arbitrary.arbString.arbitrary
+
+  test("trim, trimLeft, trimRight") {
+    // lTrim and rTrim are both modified from java.lang.String.trim
+    def lTrim(s: String): String = {
+      var st = 0
+      val array: Array[Char] = s.toCharArray
+      while ((st < s.length) && (array(st) <= ' ')) {
+        st += 1
+      }
+      if (st > 0) s.substring(st, s.length) else s
+    }
+    def rTrim(s: String): String = {
+      var len = s.length
+      val array: Array[Char] = s.toCharArray
+      while ((len > 0) && (array(len - 1) <= ' ')) {
+        len -= 1
+      }
+      if (len < s.length) s.substring(0, len) else s
+    }
+
+    forAll(
+        whitespaceString,
+        randomString,
+        whitespaceString
+    ) { (start: String, middle: String, end: String) =>
+      val s = start + middle + end
+      assert(toUTF8(s).trim() === toUTF8(s.trim()))
+      assert(toUTF8(s).trimLeft() === toUTF8(lTrim(s)))
+      assert(toUTF8(s).trimRight() === toUTF8(rTrim(s)))
+    }
+  }
+
+  test("reverse") {
+    forAll { (s: String) =>
+      assert(toUTF8(s).reverse === toUTF8(s.reverse))
+    }
+  }
+
+  test("indexOf") {
+    forAll { (s: String) =>
+      for (start <- 0 to s.length; end <- 0 to s.length; if start <= end) {
+        val substring = s.substring(start, end)
+        assert(toUTF8(s).indexOf(toUTF8(substring), 0) === s.indexOf(substring))
+      }
+    }
+  }
+
+  val randomInt = Gen.choose(-100, 100)
+
+  test("repeat") {
+    def repeat(str: String, times: Int): String = {
+      if (times > 0) str * times else ""
+    }
+    // ScalaCheck always generating too large repeat times which might hang the test forever.
+    forAll(randomString, randomInt) { (s: String, times: Int) =>
+      assert(toUTF8(s).repeat(times) === toUTF8(repeat(s, times)))
+    }
+  }
+
+  test("lpad, rpad") {
+    def padding(origin: String, pad: String, length: Int, isLPad: Boolean): String = {
+      if (length <= 0) return ""
+      if (length <= origin.length) {
+        if (length <= 0) "" else origin.substring(0, length)
+      } else {
+        if (pad.length == 0) return origin
+        val toPad = length - origin.length
+        val partPad = if (toPad % pad.length == 0) "" else pad.substring(0, toPad % pad.length)
+        if (isLPad) {
+          pad * (toPad / pad.length) + partPad + origin
+        } else {
+          origin + pad * (toPad / pad.length) + partPad
+        }
+      }
+    }
+
+    forAll (
+      randomString,
+      randomString,
+      randomInt
+    ) { (s: String, pad: String, length: Int) =>
+      assert(toUTF8(s).lpad(length, toUTF8(pad)) ===
+        toUTF8(padding(s, pad, length, true)))
+      assert(toUTF8(s).rpad(length, toUTF8(pad)) ===
+        toUTF8(padding(s, pad, length, false)))
+    }
+  }
+
+  val nullalbeSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString))
+
+  test("concat") {
+    def concat(orgin: Seq[String]): String =
+      if (orgin.exists(_ == null)) null else orgin.mkString
+
+    forAll { (inputs: Seq[String]) =>
+      assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(inputs.mkString))
+    }
+    forAll (nullalbeSeq) { (inputs: Seq[String]) =>
+      assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(concat(inputs)))
+    }
+  }
+
+  test("concatWs") {
+    def concatWs(sep: String, inputs: Seq[String]): String = {
+      if (sep == null) return null
+      inputs.filter(_ != null).mkString(sep)
+    }
+
+    forAll { (sep: String, inputs: Seq[String]) =>
+      assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) ===
+        toUTF8(inputs.mkString(sep)))
+    }
+    forAll(randomString, nullalbeSeq) {(sep: String, inputs: Seq[String]) =>
+      assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) ===
+        toUTF8(concatWs(sep, inputs)))
+    }
+  }
+
+  // TODO: enable this when we find a proper way to generate valid patterns
+  ignore("split") {
+    forAll { (s: String, pattern: String, limit: Int) =>
+      assert(toUTF8(s).split(toUTF8(pattern), limit) ===
+        s.split(pattern, limit).map(toUTF8(_)))
+    }
+  }
+
+  test("levenshteinDistance") {
+    forAll { (one: String, another: String) =>
+      assert(toUTF8(one).levenshteinDistance(toUTF8(another)) ===
+        StringUtils.getLevenshteinDistance(one, another))
+    }
+  }
+
+  test("hashCode") {
+    forAll { (s: String) =>
+      assert(toUTF8(s).hashCode() === toUTF8(s).hashCode())
+    }
+  }
+
+  test("equals") {
+    forAll { (one: String, another: String) =>
+      assert(toUTF8(one).equals(toUTF8(another)) === one.equals(another))
+    }
+  }
+}

From 3320b0ba262159c0c7209ce39b353c93c597077d Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@appier.com>
Date: Fri, 31 Jul 2015 22:26:30 -0700
Subject: [PATCH 542/549] [SPARK-9415][SQL] Throw AnalysisException when using
 MapType on Join and Aggregate

JIRA: https://issues.apache.org/jira/browse/SPARK-9415

Following up #7787. We shouldn't use MapType as grouping keys and join keys too.

Author: Liang-Chi Hsieh <viirya@appier.com>

Closes #7819 from viirya/map_join_groupby and squashes the following commits:

005ee0c [Liang-Chi Hsieh] For comments.
7463398 [Liang-Chi Hsieh] MapType can't be used as join keys, grouping keys.
---
 .../sql/catalyst/analysis/CheckAnalysis.scala | 16 +++--
 .../analysis/AnalysisErrorSuite.scala         | 68 ++++++++++++++++++-
 .../spark/sql/DataFrameAggregateSuite.scala   | 10 ---
 .../org/apache/spark/sql/JoinSuite.scala      |  8 ---
 4 files changed, 77 insertions(+), 25 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 0ebc3d180a780..364569d8f0b40 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -92,8 +92,11 @@ trait CheckAnalysis {
               case p: Predicate =>
                 p.asInstanceOf[Expression].children.foreach(checkValidJoinConditionExprs)
               case e if e.dataType.isInstanceOf[BinaryType] =>
-                failAnalysis(s"expression ${e.prettyString} in join condition " +
-                  s"'${condition.prettyString}' can't be binary type.")
+                failAnalysis(s"binary type expression ${e.prettyString} cannot be used " +
+                  "in join conditions")
+              case e if e.dataType.isInstanceOf[MapType] =>
+                failAnalysis(s"map type expression ${e.prettyString} cannot be used " +
+                  "in join conditions")
               case _ => // OK
             }
 
@@ -114,13 +117,16 @@ trait CheckAnalysis {
 
             def checkValidGroupingExprs(expr: Expression): Unit = expr.dataType match {
               case BinaryType =>
-                failAnalysis(s"grouping expression '${expr.prettyString}' in aggregate can " +
-                  s"not be binary type.")
+                failAnalysis(s"binary type expression ${expr.prettyString} cannot be used " +
+                  "in grouping expression")
+              case m: MapType =>
+                failAnalysis(s"map type expression ${expr.prettyString} cannot be used " +
+                  "in grouping expression")
               case _ => // OK
             }
 
             aggregateExprs.foreach(checkValidAggregateExpression)
-            aggregateExprs.foreach(checkValidGroupingExprs)
+            groupingExprs.foreach(checkValidGroupingExprs)
 
           case Sort(orders, _, _) =>
             orders.foreach { order =>
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
index 2588df98246dd..aa19cdce31ec6 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -181,7 +181,71 @@ class AnalysisErrorSuite extends SparkFunSuite with BeforeAndAfter {
     val error = intercept[AnalysisException] {
       SimpleAnalyzer.checkAnalysis(join)
     }
-    error.message.contains("Failure when resolving conflicting references in Join")
-    error.message.contains("Conflicting attributes")
+    assert(error.message.contains("Failure when resolving conflicting references in Join"))
+    assert(error.message.contains("Conflicting attributes"))
+  }
+
+  test("aggregation can't work on binary and map types") {
+    val plan =
+      Aggregate(
+        AttributeReference("a", BinaryType)(exprId = ExprId(2)) :: Nil,
+        Alias(Sum(AttributeReference("b", IntegerType)(exprId = ExprId(1))), "c")() :: Nil,
+        LocalRelation(
+          AttributeReference("a", BinaryType)(exprId = ExprId(2)),
+          AttributeReference("b", IntegerType)(exprId = ExprId(1))))
+
+    val error = intercept[AnalysisException] {
+      caseSensitiveAnalyze(plan)
+    }
+    assert(error.message.contains("binary type expression a cannot be used in grouping expression"))
+
+    val plan2 =
+      Aggregate(
+        AttributeReference("a", MapType(IntegerType, StringType))(exprId = ExprId(2)) :: Nil,
+        Alias(Sum(AttributeReference("b", IntegerType)(exprId = ExprId(1))), "c")() :: Nil,
+        LocalRelation(
+          AttributeReference("a", MapType(IntegerType, StringType))(exprId = ExprId(2)),
+          AttributeReference("b", IntegerType)(exprId = ExprId(1))))
+
+    val error2 = intercept[AnalysisException] {
+      caseSensitiveAnalyze(plan2)
+    }
+    assert(error2.message.contains("map type expression a cannot be used in grouping expression"))
+  }
+
+  test("Join can't work on binary and map types") {
+    val plan =
+      Join(
+        LocalRelation(
+          AttributeReference("a", BinaryType)(exprId = ExprId(2)),
+          AttributeReference("b", IntegerType)(exprId = ExprId(1))),
+        LocalRelation(
+          AttributeReference("c", BinaryType)(exprId = ExprId(4)),
+          AttributeReference("d", IntegerType)(exprId = ExprId(3))),
+        Inner,
+        Some(EqualTo(AttributeReference("a", BinaryType)(exprId = ExprId(2)),
+          AttributeReference("c", BinaryType)(exprId = ExprId(4)))))
+
+    val error = intercept[AnalysisException] {
+      caseSensitiveAnalyze(plan)
+    }
+    assert(error.message.contains("binary type expression a cannot be used in join conditions"))
+
+    val plan2 =
+      Join(
+        LocalRelation(
+          AttributeReference("a", MapType(IntegerType, StringType))(exprId = ExprId(2)),
+          AttributeReference("b", IntegerType)(exprId = ExprId(1))),
+        LocalRelation(
+          AttributeReference("c", MapType(IntegerType, StringType))(exprId = ExprId(4)),
+          AttributeReference("d", IntegerType)(exprId = ExprId(3))),
+        Inner,
+        Some(EqualTo(AttributeReference("a", MapType(IntegerType, StringType))(exprId = ExprId(2)),
+          AttributeReference("c", MapType(IntegerType, StringType))(exprId = ExprId(4)))))
+
+    val error2 = intercept[AnalysisException] {
+      caseSensitiveAnalyze(plan2)
+    }
+    assert(error2.message.contains("map type expression a cannot be used in join conditions"))
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 228ece8065151..f9cff7440a76e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -190,14 +190,4 @@ class DataFrameAggregateSuite extends QueryTest {
       emptyTableData.agg(sumDistinct('a)),
       Row(null))
   }
-
-  test("aggregation can't work on binary type") {
-    val df = Seq(1, 1, 2, 2).map(i => Tuple1(i.toString)).toDF("c").select($"c" cast BinaryType)
-    intercept[AnalysisException] {
-      df.groupBy("c").agg(count("*"))
-    }
-    intercept[AnalysisException] {
-      df.distinct
-    }
-  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index 27c08f64649ee..5bef1d8966031 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -490,12 +490,4 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach {
         Row(3, 2) :: Nil)
 
   }
-
-  test("Join can't work on binary type") {
-    val left = Seq(1, 1, 2, 2).map(i => Tuple1(i.toString)).toDF("c").select($"c" cast BinaryType)
-    val right = Seq(1, 1, 2, 2).map(i => Tuple1(i.toString)).toDF("d").select($"d" cast BinaryType)
-    intercept[AnalysisException] {
-      left.join(right, ($"left.N" === $"right.N"), "full")
-    }
-  }
 }

From 67ad4e21fc68336b0ad6f9a363fb5ebb51f592bf Mon Sep 17 00:00:00 2001
From: Cheng Hao <hao.cheng@intel.com>
Date: Fri, 31 Jul 2015 23:11:22 -0700
Subject: [PATCH 543/549] [SPARK-8232] [SQL] Add sort_array support

Add expression `sort_array` support.

Author: Cheng Hao <hao.cheng@intel.com>

This patch had conflicts when merged, resolved by
Committer: Davies Liu <davies.liu@gmail.com>

Closes #7581 from chenghao-intel/sort_array and squashes the following commits:

664c960 [Cheng Hao] update the sort_array by using the ArrayData
276d2d5 [Cheng Hao] add empty line
0edab9c [Cheng Hao] Add asending/descending support for sort_array
80fc0f8 [Cheng Hao] Add type checking
a42b678 [Cheng Hao] Add sort_array support
---
 python/pyspark/sql/functions.py               | 20 +++++
 .../catalyst/analysis/FunctionRegistry.scala  |  1 +
 .../expressions/collectionOperations.scala    | 80 ++++++++++++++++++-
 .../CollectionFunctionsSuite.scala            | 22 +++++
 .../org/apache/spark/sql/functions.scala      | 19 ++++-
 .../spark/sql/DataFrameFunctionsSuite.scala   | 51 +++++++++++-
 6 files changed, 186 insertions(+), 7 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 89a2a5ceaa9bf..fb542e6cff81a 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -51,6 +51,7 @@
     'sha1',
     'sha2',
     'size',
+    'sort_array',
     'sparkPartitionId',
     'struct',
     'udf',
@@ -570,8 +571,10 @@ def length(col):
 def format_number(col, d):
     """Formats the number X to a format like '#,###,###.##', rounded to d decimal places,
        and returns the result as a string.
+
     :param col: the column name of the numeric value to be formatted
     :param d: the N decimal places
+
     >>> sqlContext.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect()
     [Row(v=u'5.0000')]
     """
@@ -968,6 +971,23 @@ def soundex(col):
     return Column(sc._jvm.functions.size(_to_java_column(col)))
 
 
+@since(1.5)
+def sort_array(col, asc=True):
+    """
+    Collection function: sorts the input array for the given column in ascending order.
+
+    :param col: name of column or expression
+
+    >>> df = sqlContext.createDataFrame([([2, 1, 3],),([1],),([],)], ['data'])
+    >>> df.select(sort_array(df.data).alias('r')).collect()
+    [Row(r=[1, 2, 3]), Row(r=[1]), Row(r=[])]
+    >>> df.select(sort_array(df.data, asc=False).alias('r')).collect()
+    [Row(r=[3, 2, 1]), Row(r=[1]), Row(r=[])]
+     """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.sort_array(_to_java_column(col), asc))
+
+
 class UserDefinedFunction(object):
     """
     User defined function in Python
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index ee44cbcba68e7..6e144518bb009 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -233,6 +233,7 @@ object FunctionRegistry {
 
     // collection functions
     expression[Size]("size"),
+    expression[SortArray]("sort_array"),
 
     // misc functions
     expression[Crc32]("crc32"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 1a00dbc254de1..0a530596a98c6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -16,7 +16,10 @@
  */
 package org.apache.spark.sql.catalyst.expressions
 
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
+import java.util.Comparator
+
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenFallback, CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.types._
 
 /**
@@ -39,3 +42,78 @@ case class Size(child: Expression) extends UnaryExpression with ExpectsInputType
     nullSafeCodeGen(ctx, ev, c => s"${ev.primitive} = ($c).$sizeCall;")
   }
 }
+
+/**
+ * Sorts the input array in ascending / descending order according to the natural ordering of
+ * the array elements and returns it.
+ */
+case class SortArray(base: Expression, ascendingOrder: Expression)
+  extends BinaryExpression with ExpectsInputTypes with CodegenFallback {
+
+  def this(e: Expression) = this(e, Literal(true))
+
+  override def left: Expression = base
+  override def right: Expression = ascendingOrder
+  override def dataType: DataType = base.dataType
+  override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType, BooleanType)
+
+  override def checkInputDataTypes(): TypeCheckResult = base.dataType match {
+    case _ @ ArrayType(n: AtomicType, _) => TypeCheckResult.TypeCheckSuccess
+    case _ @ ArrayType(n, _) => TypeCheckResult.TypeCheckFailure(
+                    s"Type $n is not the AtomicType, we can not perform the ordering operations")
+    case other =>
+      TypeCheckResult.TypeCheckFailure(s"ArrayType(AtomicType) is expected, but we got $other")
+  }
+
+  @transient
+  private lazy val lt = {
+    val ordering = base.dataType match {
+      case _ @ ArrayType(n: AtomicType, _) => n.ordering.asInstanceOf[Ordering[Any]]
+    }
+
+    new Comparator[Any]() {
+      override def compare(o1: Any, o2: Any): Int = {
+        if (o1 == null && o2 == null) {
+          0
+        } else if (o1 == null) {
+          -1
+        } else if (o2 == null) {
+          1
+        } else {
+          ordering.compare(o1, o2)
+        }
+      }
+    }
+  }
+
+  @transient
+  private lazy val gt = {
+    val ordering = base.dataType match {
+      case _ @ ArrayType(n: AtomicType, _) => n.ordering.asInstanceOf[Ordering[Any]]
+    }
+
+    new Comparator[Any]() {
+      override def compare(o1: Any, o2: Any): Int = {
+        if (o1 == null && o2 == null) {
+          0
+        } else if (o1 == null) {
+          1
+        } else if (o2 == null) {
+          -1
+        } else {
+          -ordering.compare(o1, o2)
+        }
+      }
+    }
+  }
+
+  override def nullSafeEval(array: Any, ascending: Any): Any = {
+    val data = array.asInstanceOf[ArrayData].toArray().asInstanceOf[Array[AnyRef]]
+    java.util.Arrays.sort(
+      data,
+      if (ascending.asInstanceOf[Boolean]) lt else gt)
+    new GenericArrayData(data.asInstanceOf[Array[Any]])
+  }
+
+  override def prettyName: String = "sort_array"
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala
index 28c41b57169f9..2c7e85c446ec6 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala
@@ -43,4 +43,26 @@ class CollectionFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Literal.create(null, MapType(StringType, StringType)), null)
     checkEvaluation(Literal.create(null, ArrayType(StringType)), null)
   }
+
+  test("Sort Array") {
+    val a0 = Literal.create(Seq(2, 1, 3), ArrayType(IntegerType))
+    val a1 = Literal.create(Seq[Integer](), ArrayType(IntegerType))
+    val a2 = Literal.create(Seq("b", "a"), ArrayType(StringType))
+    val a3 = Literal.create(Seq("b", null, "a"), ArrayType(StringType))
+
+    checkEvaluation(new SortArray(a0), Seq(1, 2, 3))
+    checkEvaluation(new SortArray(a1), Seq[Integer]())
+    checkEvaluation(new SortArray(a2), Seq("a", "b"))
+    checkEvaluation(new SortArray(a3), Seq(null, "a", "b"))
+    checkEvaluation(SortArray(a0, Literal(true)), Seq(1, 2, 3))
+    checkEvaluation(SortArray(a1, Literal(true)), Seq[Integer]())
+    checkEvaluation(SortArray(a2, Literal(true)), Seq("a", "b"))
+    checkEvaluation(new SortArray(a3, Literal(true)), Seq(null, "a", "b"))
+    checkEvaluation(SortArray(a0, Literal(false)), Seq(3, 2, 1))
+    checkEvaluation(SortArray(a1, Literal(false)), Seq[Integer]())
+    checkEvaluation(SortArray(a2, Literal(false)), Seq("b", "a"))
+    checkEvaluation(new SortArray(a3, Literal(false)), Seq("b", "a", null))
+
+    checkEvaluation(Literal.create(null, ArrayType(StringType)), null)
+  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 57bb00a7417af..3c9421f5cd14b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2223,19 +2223,30 @@ object functions {
   //////////////////////////////////////////////////////////////////////////////////////////////
 
   /**
-   * Returns length of array or map
+   * Returns length of array or map.
+   *
    * @group collection_funcs
    * @since 1.5.0
    */
-  def size(columnName: String): Column = size(Column(columnName))
+  def size(e: Column): Column = Size(e.expr)
 
   /**
-   * Returns length of array or map
+   * Sorts the input array for the given column in ascending order,
+   * according to the natural ordering of the array elements.
+   *
    * @group collection_funcs
    * @since 1.5.0
    */
-  def size(column: Column): Column = Size(column.expr)
+  def sort_array(e: Column): Column = sort_array(e, true)
 
+  /**
+   * Sorts the input array for the given column in ascending / descending order,
+   * according to the natural ordering of the array elements.
+   *
+   * @group collection_funcs
+   * @since 1.5.0
+   */
+  def sort_array(e: Column, asc: Boolean): Column = SortArray(e.expr, lit(asc).expr)
 
   //////////////////////////////////////////////////////////////////////////////////////////////
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 1baec5d37699d..46921d14256b9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -267,6 +267,53 @@ class DataFrameFunctionsSuite extends QueryTest {
     )
   }
 
+  test("sort_array function") {
+    val df = Seq(
+      (Array[Int](2, 1, 3), Array("b", "c", "a")),
+      (Array[Int](), Array[String]()),
+      (null, null)
+    ).toDF("a", "b")
+    checkAnswer(
+      df.select(sort_array($"a"), sort_array($"b")),
+      Seq(
+        Row(Seq(1, 2, 3), Seq("a", "b", "c")),
+        Row(Seq[Int](), Seq[String]()),
+        Row(null, null))
+    )
+    checkAnswer(
+      df.select(sort_array($"a", false), sort_array($"b", false)),
+      Seq(
+        Row(Seq(3, 2, 1), Seq("c", "b", "a")),
+        Row(Seq[Int](), Seq[String]()),
+        Row(null, null))
+    )
+    checkAnswer(
+      df.selectExpr("sort_array(a)", "sort_array(b)"),
+      Seq(
+        Row(Seq(1, 2, 3), Seq("a", "b", "c")),
+        Row(Seq[Int](), Seq[String]()),
+        Row(null, null))
+    )
+    checkAnswer(
+      df.selectExpr("sort_array(a, true)", "sort_array(b, false)"),
+      Seq(
+        Row(Seq(1, 2, 3), Seq("c", "b", "a")),
+        Row(Seq[Int](), Seq[String]()),
+        Row(null, null))
+    )
+
+    val df2 = Seq((Array[Array[Int]](Array(2)), "x")).toDF("a", "b")
+    assert(intercept[AnalysisException] {
+      df2.selectExpr("sort_array(a)").collect()
+    }.getMessage().contains("Type ArrayType(IntegerType,false) is not the AtomicType, " +
+      "we can not perform the ordering operations"))
+
+    val df3 = Seq(("xxx", "x")).toDF("a", "b")
+    assert(intercept[AnalysisException] {
+      df3.selectExpr("sort_array(a)").collect()
+    }.getMessage().contains("ArrayType(AtomicType) is expected, but we got StringType"))
+  }
+
   test("array size function") {
     val df = Seq(
       (Array[Int](1, 2), "x"),
@@ -274,7 +321,7 @@ class DataFrameFunctionsSuite extends QueryTest {
       (Array[Int](1, 2, 3), "z")
     ).toDF("a", "b")
     checkAnswer(
-      df.select(size("a")),
+      df.select(size($"a")),
       Seq(Row(2), Row(0), Row(3))
     )
     checkAnswer(
@@ -290,7 +337,7 @@ class DataFrameFunctionsSuite extends QueryTest {
       (Map[Int, Int](1 -> 1, 2 -> 2, 3 -> 3), "z")
     ).toDF("a", "b")
     checkAnswer(
-      df.select(size("a")),
+      df.select(size($"a")),
       Seq(Row(2), Row(0), Row(3))
     )
     checkAnswer(

From d90f2cf7a2a1d1e69f9ab385f35f62d4091b5302 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@databricks.com>
Date: Fri, 31 Jul 2015 23:55:16 -0700
Subject: [PATCH 544/549] [SPARK-9517][SQL] BytesToBytesMap should encode data
 the same way as UnsafeExternalSorter

BytesToBytesMap current encodes key/value data in the following format:
```
8B key length, key data, 8B value length, value data
```

UnsafeExternalSorter, on the other hand, encodes data this way:
```
4B record length, data
```

As a result, we cannot pass records encoded by BytesToBytesMap directly into UnsafeExternalSorter for sorting. However, if we rearrange data slightly, we can then pass the key/value records directly into UnsafeExternalSorter:
```
4B key+value length, 4B key length, key data, value data
```

Author: Reynold Xin <rxin@databricks.com>

Closes #7845 from rxin/kvsort-rebase and squashes the following commits:

5716b59 [Reynold Xin] Fixed test.
2e62ccb [Reynold Xin] Updated BytesToBytesMap's data encoding to put the key first.
a51b641 [Reynold Xin] Added a KV sorter interface.
---
 .../spark/unsafe/map/BytesToBytesMap.java     | 58 ++++++++-------
 .../unsafe/sort/UnsafeExternalSorter.java     | 15 ++++
 .../map/AbstractBytesToBytesMapSuite.java     |  6 +-
 .../sql/execution/UnsafeKeyValueSorter.java   | 30 ++++++++
 .../UnsafeFixedWidthAggregationMap.java       | 73 +++++++++----------
 .../sql/execution/GeneratedAggregate.scala    | 27 ++++---
 .../UnsafeFixedWidthAggregationMapSuite.scala | 27 ++++---
 .../org/apache/spark/unsafe/KVIterator.java   | 29 ++++++++
 8 files changed, 175 insertions(+), 90 deletions(-)
 create mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeKeyValueSorter.java
 create mode 100644 unsafe/src/main/java/org/apache/spark/unsafe/KVIterator.java

diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
index 0f42950e6ed8b..481375f493a50 100644
--- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
+++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
@@ -17,7 +17,6 @@
 
 package org.apache.spark.unsafe.map;
 
-import java.io.IOException;
 import java.lang.Override;
 import java.lang.UnsupportedOperationException;
 import java.util.Iterator;
@@ -212,7 +211,7 @@ public BytesToBytesMap(
    */
   public int numElements() { return numElements; }
 
-  private static final class BytesToBytesMapIterator implements Iterator<Location> {
+  public static final class BytesToBytesMapIterator implements Iterator<Location> {
 
     private final int numRecords;
     private final Iterator<MemoryBlock> dataPagesIterator;
@@ -222,7 +221,8 @@ private static final class BytesToBytesMapIterator implements Iterator<Location>
     private Object pageBaseObject;
     private long offsetInPage;
 
-    BytesToBytesMapIterator(int numRecords, Iterator<MemoryBlock> dataPagesIterator, Location loc) {
+    private BytesToBytesMapIterator(
+        int numRecords, Iterator<MemoryBlock> dataPagesIterator, Location loc) {
       this.numRecords = numRecords;
       this.dataPagesIterator = dataPagesIterator;
       this.loc = loc;
@@ -244,13 +244,13 @@ public boolean hasNext() {
 
     @Override
     public Location next() {
-      int keyLength = (int) PlatformDependent.UNSAFE.getLong(pageBaseObject, offsetInPage);
-      if (keyLength == END_OF_PAGE_MARKER) {
+      int totalLength = PlatformDependent.UNSAFE.getInt(pageBaseObject, offsetInPage);
+      if (totalLength == END_OF_PAGE_MARKER) {
         advanceToNextPage();
-        keyLength = (int) PlatformDependent.UNSAFE.getLong(pageBaseObject, offsetInPage);
+        totalLength = PlatformDependent.UNSAFE.getInt(pageBaseObject, offsetInPage);
       }
       loc.with(pageBaseObject, offsetInPage);
-      offsetInPage += 8 + 8 + keyLength + loc.getValueLength();
+      offsetInPage += 8 + totalLength;
       currentRecordNumber++;
       return loc;
     }
@@ -269,7 +269,7 @@ public void remove() {
    * If any other lookups or operations are performed on this map while iterating over it, including
    * `lookup()`, the behavior of the returned iterator is undefined.
    */
-  public Iterator<Location> iterator() {
+  public BytesToBytesMapIterator iterator() {
     return new BytesToBytesMapIterator(numElements, dataPages.iterator(), loc);
   }
 
@@ -352,15 +352,18 @@ private void updateAddressesAndSizes(long fullKeyAddress) {
         taskMemoryManager.getOffsetInPage(fullKeyAddress));
     }
 
-    private void updateAddressesAndSizes(Object page, long keyOffsetInPage) {
-        long position = keyOffsetInPage;
-        keyLength = (int) PlatformDependent.UNSAFE.getLong(page, position);
-        position += 8; // word used to store the key size
-        keyMemoryLocation.setObjAndOffset(page, position);
-        position += keyLength;
-        valueLength = (int) PlatformDependent.UNSAFE.getLong(page, position);
-        position += 8; // word used to store the key size
-        valueMemoryLocation.setObjAndOffset(page, position);
+    private void updateAddressesAndSizes(final Object page, final long keyOffsetInPage) {
+      long position = keyOffsetInPage;
+      final int totalLength = PlatformDependent.UNSAFE.getInt(page, position);
+      position += 4;
+      keyLength = PlatformDependent.UNSAFE.getInt(page, position);
+      position += 4;
+      valueLength = totalLength - keyLength;
+
+      keyMemoryLocation.setObjAndOffset(page, position);
+
+      position += keyLength;
+      valueMemoryLocation.setObjAndOffset(page, position);
     }
 
     Location with(int pos, int keyHashcode, boolean isDefined) {
@@ -478,7 +481,7 @@ public boolean putNewKey(
       // the key address instead of storing the absolute address of the value, the key and value
       // must be stored in the same memory page.
       // (8 byte key length) (key) (8 byte value length) (value)
-      final long requiredSize = 8 + keyLengthBytes + 8 + valueLengthBytes;
+      final long requiredSize = 8 + keyLengthBytes + valueLengthBytes;
 
       // --- Figure out where to insert the new record ---------------------------------------------
 
@@ -508,7 +511,7 @@ public boolean putNewKey(
           // There wasn't enough space in the current page, so write an end-of-page marker:
           final Object pageBaseObject = currentDataPage.getBaseObject();
           final long lengthOffsetInPage = currentDataPage.getBaseOffset() + pageCursor;
-          PlatformDependent.UNSAFE.putLong(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER);
+          PlatformDependent.UNSAFE.putInt(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER);
         }
         final long memoryGranted = shuffleMemoryManager.tryToAcquire(pageSizeBytes);
         if (memoryGranted != pageSizeBytes) {
@@ -535,21 +538,22 @@ public boolean putNewKey(
       long insertCursor = dataPageInsertOffset;
 
       // Compute all of our offsets up-front:
-      final long keySizeOffsetInPage = insertCursor;
-      insertCursor += 8; // word used to store the key size
+      final long totalLengthOffset = insertCursor;
+      insertCursor += 4;
+      final long keyLengthOffset = insertCursor;
+      insertCursor += 4;
       final long keyDataOffsetInPage = insertCursor;
       insertCursor += keyLengthBytes;
-      final long valueSizeOffsetInPage = insertCursor;
-      insertCursor += 8; // word used to store the value size
       final long valueDataOffsetInPage = insertCursor;
       insertCursor += valueLengthBytes; // word used to store the value size
 
+      PlatformDependent.UNSAFE.putInt(dataPageBaseObject, totalLengthOffset,
+        keyLengthBytes + valueLengthBytes);
+      PlatformDependent.UNSAFE.putInt(dataPageBaseObject, keyLengthOffset, keyLengthBytes);
       // Copy the key
-      PlatformDependent.UNSAFE.putLong(dataPageBaseObject, keySizeOffsetInPage, keyLengthBytes);
       PlatformDependent.copyMemory(
         keyBaseObject, keyBaseOffset, dataPageBaseObject, keyDataOffsetInPage, keyLengthBytes);
       // Copy the value
-      PlatformDependent.UNSAFE.putLong(dataPageBaseObject, valueSizeOffsetInPage, valueLengthBytes);
       PlatformDependent.copyMemory(valueBaseObject, valueBaseOffset, dataPageBaseObject,
         valueDataOffsetInPage, valueLengthBytes);
 
@@ -557,7 +561,7 @@ public boolean putNewKey(
 
       if (useOverflowPage) {
         // Store the end-of-page marker at the end of the data page
-        PlatformDependent.UNSAFE.putLong(dataPageBaseObject, insertCursor, END_OF_PAGE_MARKER);
+        PlatformDependent.UNSAFE.putInt(dataPageBaseObject, insertCursor, END_OF_PAGE_MARKER);
       } else {
         pageCursor += requiredSize;
       }
@@ -565,7 +569,7 @@ public boolean putNewKey(
       numElements++;
       bitset.set(pos);
       final long storedKeyAddress = taskMemoryManager.encodePageNumberAndOffset(
-        dataPage, keySizeOffsetInPage);
+        dataPage, totalLengthOffset);
       longArray.set(pos * 2, storedKeyAddress);
       longArray.set(pos * 2 + 1, keyHashcode);
       updateAddressesAndSizes(storedKeyAddress);
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
index 866e0b4151577..c05f2c332eee3 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
@@ -282,6 +282,21 @@ public void insertRecord(
     sorter.insertRecord(recordAddress, prefix);
   }
 
+  /**
+   * Write a record to the sorter. The record is broken down into two different parts, and
+   *
+   */
+  public void insertRecord(
+      Object recordBaseObject1,
+      long recordBaseOffset1,
+      int lengthInBytes1,
+      Object recordBaseObject2,
+      long recordBaseOffset2,
+      int lengthInBytes2,
+      long prefix) throws IOException {
+
+  }
+
   public UnsafeSorterIterator getSortedIterator() throws IOException {
     final UnsafeSorterIterator inMemoryIterator = sorter.getSortedIterator();
     int numIteratorsToMerge = spillWriters.size() + (inMemoryIterator.hasNext() ? 1 : 0);
diff --git a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
index 60f483acbcb80..70f8ca4d21345 100644
--- a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
+++ b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
@@ -243,17 +243,17 @@ public void iteratorTest() throws Exception {
   @Test
   public void iteratingOverDataPagesWithWastedSpace() throws Exception {
     final int NUM_ENTRIES = 1000 * 1000;
-    final int KEY_LENGTH = 16;
+    final int KEY_LENGTH = 24;
     final int VALUE_LENGTH = 40;
     final BytesToBytesMap map = new BytesToBytesMap(
       taskMemoryManager, shuffleMemoryManager, NUM_ENTRIES, PAGE_SIZE_BYTES);
-    // Each record will take 8 + 8 + 16 + 40 = 72 bytes of space in the data page. Our 64-megabyte
+    // Each record will take 8 + 24 + 40 = 72 bytes of space in the data page. Our 64-megabyte
     // pages won't be evenly-divisible by records of this size, which will cause us to waste some
     // space at the end of the page. This is necessary in order for us to take the end-of-record
     // handling branch in iterator().
     try {
       for (int i = 0; i < NUM_ENTRIES; i++) {
-        final long[] key = new long[] { i, i };  // 2 * 8 = 16 bytes
+        final long[] key = new long[] { i, i, i };  // 3 * 8 = 24 bytes
         final long[] value = new long[] { i, i, i, i, i }; // 5 * 8 = 40 bytes
         final BytesToBytesMap.Location loc = map.lookup(
           key,
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeKeyValueSorter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeKeyValueSorter.java
new file mode 100644
index 0000000000000..59c774da74acf
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeKeyValueSorter.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution;
+
+import java.io.IOException;
+
+import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
+import org.apache.spark.unsafe.KVIterator;
+
+public abstract class UnsafeKeyValueSorter {
+
+  public abstract void insert(UnsafeRow key, UnsafeRow value);
+
+  public abstract KVIterator<UnsafeRow, UnsafeRow> sort() throws IOException;
+}
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java
index 08a98cdd94a4c..c18b6dea6b2e1 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMap.java
@@ -17,9 +17,6 @@
 
 package org.apache.spark.sql.execution;
 
-import java.io.IOException;
-import java.util.Iterator;
-
 import org.apache.spark.shuffle.ShuffleMemoryManager;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
@@ -28,6 +25,7 @@
 import org.apache.spark.sql.types.DecimalType;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
+import org.apache.spark.unsafe.KVIterator;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.map.BytesToBytesMap;
 import org.apache.spark.unsafe.memory.MemoryLocation;
@@ -156,54 +154,55 @@ public UnsafeRow getAggregationBuffer(InternalRow groupingKey) {
     return currentAggregationBuffer;
   }
 
-  /**
-   * Mutable pair object returned by {@link UnsafeFixedWidthAggregationMap#iterator()}.
-   */
-  public static class MapEntry {
-    private MapEntry() { };
-    public final UnsafeRow key = new UnsafeRow();
-    public final UnsafeRow value = new UnsafeRow();
-  }
-
   /**
    * Returns an iterator over the keys and values in this map.
    *
    * For efficiency, each call returns the same object.
    */
-  public Iterator<MapEntry> iterator() {
-    return new Iterator<MapEntry>() {
+  public KVIterator<UnsafeRow, UnsafeRow> iterator() {
+    return new KVIterator<UnsafeRow, UnsafeRow>() {
+
+      private final BytesToBytesMap.BytesToBytesMapIterator mapLocationIterator = map.iterator();
+      private final UnsafeRow key = new UnsafeRow();
+      private final UnsafeRow value = new UnsafeRow();
 
-      private final MapEntry entry = new MapEntry();
-      private final Iterator<BytesToBytesMap.Location> mapLocationIterator = map.iterator();
+      @Override
+      public boolean next() {
+        if (mapLocationIterator.hasNext()) {
+          final BytesToBytesMap.Location loc = mapLocationIterator.next();
+          final MemoryLocation keyAddress = loc.getKeyAddress();
+          final MemoryLocation valueAddress = loc.getValueAddress();
+          key.pointTo(
+            keyAddress.getBaseObject(),
+            keyAddress.getBaseOffset(),
+            groupingKeySchema.length(),
+            loc.getKeyLength()
+          );
+          value.pointTo(
+            valueAddress.getBaseObject(),
+            valueAddress.getBaseOffset(),
+            aggregationBufferSchema.length(),
+            loc.getValueLength()
+          );
+          return true;
+        } else {
+          return false;
+        }
+      }
 
       @Override
-      public boolean hasNext() {
-        return mapLocationIterator.hasNext();
+      public UnsafeRow getKey() {
+        return key;
       }
 
       @Override
-      public MapEntry next() {
-        final BytesToBytesMap.Location loc = mapLocationIterator.next();
-        final MemoryLocation keyAddress = loc.getKeyAddress();
-        final MemoryLocation valueAddress = loc.getValueAddress();
-        entry.key.pointTo(
-          keyAddress.getBaseObject(),
-          keyAddress.getBaseOffset(),
-          groupingKeySchema.length(),
-          loc.getKeyLength()
-        );
-        entry.value.pointTo(
-          valueAddress.getBaseObject(),
-          valueAddress.getBaseOffset(),
-          aggregationBufferSchema.length(),
-          loc.getValueLength()
-        );
-        return entry;
+      public UnsafeRow getValue() {
+        return value;
       }
 
       @Override
-      public void remove() {
-        throw new UnsupportedOperationException();
+      public void close() {
+        // Do nothing.
       }
     };
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
index 469de6ca8e101..cd87b8deba0c2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala
@@ -287,21 +287,26 @@ case class GeneratedAggregate(
         new Iterator[InternalRow] {
           private[this] val mapIterator = aggregationMap.iterator()
           private[this] val resultProjection = resultProjectionBuilder()
+          private[this] var _hasNext = mapIterator.next()
 
-          def hasNext: Boolean = mapIterator.hasNext
+          def hasNext: Boolean = _hasNext
 
           def next(): InternalRow = {
-            val entry = mapIterator.next()
-            val result = resultProjection(joinedRow(entry.key, entry.value))
-            if (hasNext) {
-              result
+            if (_hasNext) {
+              val result = resultProjection(joinedRow(mapIterator.getKey, mapIterator.getValue))
+              _hasNext = mapIterator.next()
+              if (_hasNext) {
+                result
+              } else {
+                // This is the last element in the iterator, so let's free the buffer. Before we do,
+                // though, we need to make a defensive copy of the result so that we don't return an
+                // object that might contain dangling pointers to the freed memory
+                val resultCopy = result.copy()
+                aggregationMap.free()
+                resultCopy
+              }
             } else {
-              // This is the last element in the iterator, so let's free the buffer. Before we do,
-              // though, we need to make a defensive copy of the result so that we don't return an
-              // object that might contain dangling pointers to the freed memory
-              val resultCopy = result.copy()
-              aggregationMap.free()
-              resultCopy
+              throw new java.util.NoSuchElementException
             }
           }
         }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala
index 79fd52dacda52..6a2c51ca88ac3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution
 import org.scalatest.{BeforeAndAfterEach, Matchers}
 
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 import scala.util.Random
 
 import org.apache.spark.SparkFunSuite
@@ -52,7 +53,7 @@ class UnsafeFixedWidthAggregationMapSuite
 
   override def afterEach(): Unit = {
     if (taskMemoryManager != null) {
-      val leakedShuffleMemory = shuffleMemoryManager.getMemoryConsumptionForThisTask
+      val leakedShuffleMemory = shuffleMemoryManager.getMemoryConsumptionForThisTask()
       assert(taskMemoryManager.cleanUpAllAllocatedMemory() === 0)
       assert(leakedShuffleMemory === 0)
       taskMemoryManager = null
@@ -80,7 +81,7 @@ class UnsafeFixedWidthAggregationMapSuite
       PAGE_SIZE_BYTES,
       false // disable perf metrics
     )
-    assert(!map.iterator().hasNext)
+    assert(!map.iterator().next())
     map.free()
   }
 
@@ -100,13 +101,13 @@ class UnsafeFixedWidthAggregationMapSuite
     // Looking up a key stores a zero-entry in the map (like Python Counters or DefaultDicts)
     assert(map.getAggregationBuffer(groupKey) != null)
     val iter = map.iterator()
-    val entry = iter.next()
-    assert(!iter.hasNext)
-    entry.key.getString(0) should be ("cats")
-    entry.value.getInt(0) should be (0)
+    assert(iter.next())
+    iter.getKey.getString(0) should be ("cats")
+    iter.getValue.getInt(0) should be (0)
+    assert(!iter.next())
 
     // Modifications to rows retrieved from the map should update the values in the map
-    entry.value.setInt(0, 42)
+    iter.getValue.setInt(0, 42)
     map.getAggregationBuffer(groupKey).getInt(0) should be (42)
 
     map.free()
@@ -128,12 +129,14 @@ class UnsafeFixedWidthAggregationMapSuite
     groupKeys.foreach { keyString =>
       assert(map.getAggregationBuffer(InternalRow(UTF8String.fromString(keyString))) != null)
     }
-    val seenKeys: Set[String] = map.iterator().asScala.map { entry =>
-      entry.key.getString(0)
-    }.toSet
-    seenKeys.size should be (groupKeys.size)
-    seenKeys should be (groupKeys)
 
+    val seenKeys = new mutable.HashSet[String]
+    val iter = map.iterator()
+    while (iter.next()) {
+      seenKeys += iter.getKey.getString(0)
+    }
+    assert(seenKeys.size === groupKeys.size)
+    assert(seenKeys === groupKeys)
     map.free()
   }
 
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/KVIterator.java b/unsafe/src/main/java/org/apache/spark/unsafe/KVIterator.java
new file mode 100644
index 0000000000000..fb163401c0d27
--- /dev/null
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/KVIterator.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe;
+
+public abstract class KVIterator<K, V> {
+
+  public abstract boolean next();
+
+  public abstract K getKey();
+
+  public abstract V getValue();
+
+  public abstract void close();
+}

From 1d59a4162bf5142af270ed7f4b3eab42870c87b7 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@outlook.com>
Date: Sat, 1 Aug 2015 00:17:15 -0700
Subject: [PATCH 545/549] [SPARK-9480][SQL] add MapData and cleanup internal
 row stuff

This PR adds a `MapData` as internal representation of map type in Spark SQL, and provides a default implementation with just 2 `ArrayData`.

After that, we have specialized getters for all internal type, so I removed generic getter in `ArrayData` and added specialized `toArray` for it.
Also did some refactor and cleanup for `InternalRow` and its subclasses.

Author: Wenchen Fan <cloud0fan@outlook.com>

Closes #7799 from cloud-fan/map-data and squashes the following commits:

77d482f [Wenchen Fan] fix python
e8f6682 [Wenchen Fan] skip MapData equality check in HiveInspectorSuite
40cc9db [Wenchen Fan] add toString
6e06ec9 [Wenchen Fan] some more cleanup
a90aca1 [Wenchen Fan] add MapData
---
 .../apache/spark/mllib/linalg/Matrices.scala  |   6 +-
 .../apache/spark/mllib/linalg/Vectors.scala   |   6 +-
 .../expressions/SpecializedGetters.java       |   6 +
 .../sql/catalyst/expressions/UnsafeRow.java   |  12 +-
 .../sql/catalyst/CatalystTypeConverters.scala |  79 +++++----
 .../spark/sql/catalyst/InternalRow.scala      | 117 ++++---------
 .../catalyst/expressions/BoundAttribute.scala |   3 +
 .../spark/sql/catalyst/expressions/Cast.scala | 101 +++++-------
 .../GenericSpecializedGetters.scala           |  61 +++++++
 .../sql/catalyst/expressions/Projection.scala |  69 ++++----
 .../expressions/SpecificMutableRow.scala      |  13 +-
 .../expressions/codegen/CodeGenerator.scala   |  11 +-
 .../codegen/GenerateProjection.scala          |   2 +-
 .../expressions/collectionOperations.scala    |   8 +-
 .../expressions/complexTypeExtractors.scala   |  81 ++++++---
 .../sql/catalyst/expressions/generators.scala |  29 +++-
 .../spark/sql/catalyst/expressions/rows.scala |  44 ++---
 .../expressions/stringOperations.scala        |   2 +-
 .../spark/sql/types/ArrayBasedMapData.scala   |  51 ++++++
 .../apache/spark/sql/types/ArrayData.scala    | 155 +++++++++---------
 .../spark/sql/types/GenericArrayData.scala    | 116 +++++++++----
 .../org/apache/spark/sql/types/MapData.scala  |  38 +++++
 .../expressions/ComplexTypeSuite.scala        |   4 -
 .../expressions/UnsafeRowConverterSuite.scala |   2 +-
 .../spark/sql/execution/debug/package.scala   |  14 +-
 .../spark/sql/execution/pythonUDFs.scala      |  31 ++--
 .../apache/spark/sql/json/JacksonParser.scala |  21 ++-
 .../sql/parquet/CatalystRowConverter.scala    |  16 +-
 .../spark/sql/parquet/ParquetConverter.scala  |   8 +-
 .../sql/parquet/ParquetTableSupport.scala     |  23 +--
 .../org/apache/spark/sql/DataFrameSuite.scala |   1 +
 .../scala/org/apache/spark/sql/RowSuite.scala |   4 +-
 .../org/apache/spark/sql/SQLQuerySuite.scala  |  34 ++--
 .../scala/org/apache/spark/sql/TestData.scala |   4 +-
 .../spark/sql/UserDefinedTypeSuite.scala      |   2 +-
 .../spark/sql/hive/HiveInspectors.scala       |  89 ++++++----
 .../hive/execution/InsertIntoHiveTable.scala  |   4 +-
 .../org/apache/spark/sql/hive/hiveUDFs.scala  |   4 +-
 .../spark/sql/hive/orc/OrcRelation.scala      |   7 +-
 .../spark/sql/hive/HiveInspectorSuite.scala   |   4 +-
 40 files changed, 750 insertions(+), 532 deletions(-)
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/GenericSpecializedGetters.scala
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayBasedMapData.scala
 create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapData.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 88914fa875990..1c858348bf20e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -179,12 +179,12 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] {
         val tpe = row.getByte(0)
         val numRows = row.getInt(1)
         val numCols = row.getInt(2)
-        val values = row.getArray(5).toArray.map(_.asInstanceOf[Double])
+        val values = row.getArray(5).toDoubleArray()
         val isTransposed = row.getBoolean(6)
         tpe match {
           case 0 =>
-            val colPtrs = row.getArray(3).toArray.map(_.asInstanceOf[Int])
-            val rowIndices = row.getArray(4).toArray.map(_.asInstanceOf[Int])
+            val colPtrs = row.getArray(3).toIntArray()
+            val rowIndices = row.getArray(4).toIntArray()
             new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed)
           case 1 =>
             new DenseMatrix(numRows, numCols, values, isTransposed)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 89a1818db0d1d..96d1f48ba2ba3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -209,11 +209,11 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
         tpe match {
           case 0 =>
             val size = row.getInt(1)
-            val indices = row.getArray(2).toArray().map(_.asInstanceOf[Int])
-            val values = row.getArray(3).toArray().map(_.asInstanceOf[Double])
+            val indices = row.getArray(2).toIntArray()
+            val values = row.getArray(3).toDoubleArray()
             new SparseVector(size, indices, values)
           case 1 =>
-            val values = row.getArray(3).toArray().map(_.asInstanceOf[Double])
+            val values = row.getArray(3).toDoubleArray()
             new DenseVector(values)
         }
     }
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
index e3d3ba7a9ccc0..8f1027f3164c8 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/SpecializedGetters.java
@@ -19,7 +19,9 @@
 
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.types.ArrayData;
+import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.sql.types.MapData;
 import org.apache.spark.unsafe.types.CalendarInterval;
 import org.apache.spark.unsafe.types.UTF8String;
 
@@ -52,4 +54,8 @@ public interface SpecializedGetters {
   InternalRow getStruct(int ordinal, int numFields);
 
   ArrayData getArray(int ordinal);
+
+  MapData getMap(int ordinal);
+
+  Object get(int ordinal, DataType dataType);
 }
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
index 24dc80b1a7932..5a19aa8920c2f 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -118,6 +118,11 @@ private long getFieldOffset(int ordinal) {
     return baseOffset + bitSetWidthInBytes + ordinal * 8L;
   }
 
+  private void assertIndexIsValid(int index) {
+    assert index >= 0 : "index (" + index + ") should >= 0";
+    assert index < numFields : "index (" + index + ") should < " + numFields;
+  }
+
   //////////////////////////////////////////////////////////////////////////////
   // Public methods
   //////////////////////////////////////////////////////////////////////////////
@@ -163,11 +168,6 @@ public void pointTo(byte[] buf, int numFields, int sizeInBytes) {
     pointTo(buf, PlatformDependent.BYTE_ARRAY_OFFSET, numFields, sizeInBytes);
   }
 
-  private void assertIndexIsValid(int index) {
-    assert index >= 0 : "index (" + index + ") should >= 0";
-    assert index < numFields : "index (" + index + ") should < " + numFields;
-  }
-
   @Override
   public void setNullAt(int i) {
     assertIndexIsValid(i);
@@ -254,7 +254,7 @@ public void setDecimal(int ordinal, Decimal value, int precision) {
   }
 
   @Override
-  public Object get(int ordinal) {
+  public Object genericGet(int ordinal) {
     throw new UnsupportedOperationException();
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
index 7ca20fe97fbef..c666864e43abd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala
@@ -23,7 +23,6 @@ import java.sql.{Date, Timestamp}
 import java.util.{Map => JavaMap}
 import javax.annotation.Nullable
 
-import scala.collection.mutable.HashMap
 import scala.language.existentials
 
 import org.apache.spark.sql.Row
@@ -53,12 +52,6 @@ object CatalystTypeConverters {
     }
   }
 
-  private def isWholePrimitive(dt: DataType): Boolean = dt match {
-    case dt if isPrimitive(dt) => true
-    case MapType(keyType, valueType, _) => isWholePrimitive(keyType) && isWholePrimitive(valueType)
-    case _ => false
-  }
-
   private def getConverterForType(dataType: DataType): CatalystTypeConverter[Any, Any, Any] = {
     val converter = dataType match {
       case udt: UserDefinedType[_] => UDTConverter(udt)
@@ -157,8 +150,6 @@ object CatalystTypeConverters {
 
     private[this] val elementConverter = getConverterForType(elementType)
 
-    private[this] val isNoChange = isWholePrimitive(elementType)
-
     override def toCatalystImpl(scalaValue: Any): ArrayData = {
       scalaValue match {
         case a: Array[_] =>
@@ -179,10 +170,14 @@ object CatalystTypeConverters {
     override def toScala(catalystValue: ArrayData): Seq[Any] = {
       if (catalystValue == null) {
         null
-      } else if (isNoChange) {
-        catalystValue.toArray()
+      } else if (isPrimitive(elementType)) {
+        catalystValue.toArray[Any](elementType)
       } else {
-        catalystValue.toArray().map(elementConverter.toScala)
+        val result = new Array[Any](catalystValue.numElements())
+        catalystValue.foreach(elementType, (i, e) => {
+          result(i) = elementConverter.toScala(e)
+        })
+        result
       }
     }
 
@@ -193,44 +188,58 @@ object CatalystTypeConverters {
   private case class MapConverter(
       keyType: DataType,
       valueType: DataType)
-    extends CatalystTypeConverter[Any, Map[Any, Any], Map[Any, Any]] {
+    extends CatalystTypeConverter[Any, Map[Any, Any], MapData] {
 
     private[this] val keyConverter = getConverterForType(keyType)
     private[this] val valueConverter = getConverterForType(valueType)
 
-    private[this] val isNoChange = isWholePrimitive(keyType) && isWholePrimitive(valueType)
-
-    override def toCatalystImpl(scalaValue: Any): Map[Any, Any] = scalaValue match {
+    override def toCatalystImpl(scalaValue: Any): MapData = scalaValue match {
       case m: Map[_, _] =>
-        m.map { case (k, v) =>
-          keyConverter.toCatalyst(k) -> valueConverter.toCatalyst(v)
+        val length = m.size
+        val convertedKeys = new Array[Any](length)
+        val convertedValues = new Array[Any](length)
+
+        var i = 0
+        for ((key, value) <- m) {
+          convertedKeys(i) = keyConverter.toCatalyst(key)
+          convertedValues(i) = valueConverter.toCatalyst(value)
+          i += 1
         }
+        ArrayBasedMapData(convertedKeys, convertedValues)
 
       case jmap: JavaMap[_, _] =>
+        val length = jmap.size()
+        val convertedKeys = new Array[Any](length)
+        val convertedValues = new Array[Any](length)
+
+        var i = 0
         val iter = jmap.entrySet.iterator
-        val convertedMap: HashMap[Any, Any] = HashMap()
         while (iter.hasNext) {
           val entry = iter.next()
-          val key = keyConverter.toCatalyst(entry.getKey)
-          convertedMap(key) = valueConverter.toCatalyst(entry.getValue)
+          convertedKeys(i) = keyConverter.toCatalyst(entry.getKey)
+          convertedValues(i) = valueConverter.toCatalyst(entry.getValue)
+          i += 1
         }
-        convertedMap
+        ArrayBasedMapData(convertedKeys, convertedValues)
     }
 
-    override def toScala(catalystValue: Map[Any, Any]): Map[Any, Any] = {
+    override def toScala(catalystValue: MapData): Map[Any, Any] = {
       if (catalystValue == null) {
         null
-      } else if (isNoChange) {
-        catalystValue
       } else {
-        catalystValue.map { case (k, v) =>
-          keyConverter.toScala(k) -> valueConverter.toScala(v)
-        }
+        val keys = catalystValue.keyArray().toArray[Any](keyType)
+        val values = catalystValue.valueArray().toArray[Any](valueType)
+        val convertedKeys =
+          if (isPrimitive(keyType)) keys else keys.map(keyConverter.toScala)
+        val convertedValues =
+          if (isPrimitive(valueType)) values else values.map(valueConverter.toScala)
+
+        convertedKeys.zip(convertedValues).toMap
       }
     }
 
     override def toScalaImpl(row: InternalRow, column: Int): Map[Any, Any] =
-      toScala(row.get(column, MapType(keyType, valueType)).asInstanceOf[Map[Any, Any]])
+      toScala(row.getMap(column))
   }
 
   private case class StructConverter(
@@ -410,7 +419,17 @@ object CatalystTypeConverters {
     case r: Row => InternalRow(r.toSeq.map(convertToCatalyst): _*)
     case arr: Array[Any] => new GenericArrayData(arr.map(convertToCatalyst))
     case m: Map[_, _] =>
-      m.map { case (k, v) => (convertToCatalyst(k), convertToCatalyst(v)) }.toMap
+      val length = m.size
+      val convertedKeys = new Array[Any](length)
+      val convertedValues = new Array[Any](length)
+
+      var i = 0
+      for ((key, value) <- m) {
+        convertedKeys(i) = convertToCatalyst(key)
+        convertedValues(i) = convertToCatalyst(value)
+        i += 1
+      }
+      ArrayBasedMapData(convertedKeys, convertedValues)
     case other => other
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
index b19bf4386b0ba..7656d054dc361 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/InternalRow.scala
@@ -19,71 +19,25 @@ package org.apache.spark.sql.catalyst
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types._
-import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
 
 /**
  * An abstract class for row used internal in Spark SQL, which only contain the columns as
  * internal types.
  */
-abstract class InternalRow extends Serializable with SpecializedGetters {
+// todo: make InternalRow just extends SpecializedGetters, remove generic getter
+abstract class InternalRow extends GenericSpecializedGetters with Serializable {
 
   def numFields: Int
 
-  def get(ordinal: Int): Any = get(ordinal, null)
-
-  def genericGet(ordinal: Int): Any = get(ordinal, null)
-
-  def get(ordinal: Int, dataType: DataType): Any
-
-  def getAs[T](ordinal: Int, dataType: DataType): T = get(ordinal, dataType).asInstanceOf[T]
-
-  override def isNullAt(ordinal: Int): Boolean = get(ordinal) == null
-
-  override def getBoolean(ordinal: Int): Boolean = getAs[Boolean](ordinal, BooleanType)
-
-  override def getByte(ordinal: Int): Byte = getAs[Byte](ordinal, ByteType)
-
-  override def getShort(ordinal: Int): Short = getAs[Short](ordinal, ShortType)
-
-  override def getInt(ordinal: Int): Int = getAs[Int](ordinal, IntegerType)
-
-  override def getLong(ordinal: Int): Long = getAs[Long](ordinal, LongType)
-
-  override def getFloat(ordinal: Int): Float = getAs[Float](ordinal, FloatType)
-
-  override def getDouble(ordinal: Int): Double = getAs[Double](ordinal, DoubleType)
-
-  override def getUTF8String(ordinal: Int): UTF8String = getAs[UTF8String](ordinal, StringType)
-
-  override def getBinary(ordinal: Int): Array[Byte] = getAs[Array[Byte]](ordinal, BinaryType)
-
-  override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal =
-    getAs[Decimal](ordinal, DecimalType(precision, scale))
-
-  override def getInterval(ordinal: Int): CalendarInterval =
-    getAs[CalendarInterval](ordinal, CalendarIntervalType)
-
   // This is only use for test and will throw a null pointer exception if the position is null.
   def getString(ordinal: Int): String = getUTF8String(ordinal).toString
 
-  /**
-   * Returns a struct from ordinal position.
-   *
-   * @param ordinal position to get the struct from.
-   * @param numFields number of fields the struct type has
-   */
-  override def getStruct(ordinal: Int, numFields: Int): InternalRow =
-    getAs[InternalRow](ordinal, null)
-
-  override def getArray(ordinal: Int): ArrayData = getAs(ordinal, null)
-
-  override def toString: String = s"[${this.mkString(",")}]"
+  override def toString: String = mkString("[", ",", "]")
 
   /**
    * Make a copy of the current [[InternalRow]] object.
    */
-  def copy(): InternalRow = this
+  def copy(): InternalRow
 
   /** Returns true if there are any NULL values in this row. */
   def anyNull: Boolean = {
@@ -117,8 +71,8 @@ abstract class InternalRow extends Serializable with SpecializedGetters {
         return false
       }
       if (!isNullAt(i)) {
-        val o1 = get(i)
-        val o2 = other.get(i)
+        val o1 = genericGet(i)
+        val o2 = other.genericGet(i)
         o1 match {
           case b1: Array[Byte] =>
             if (!o2.isInstanceOf[Array[Byte]] ||
@@ -143,34 +97,6 @@ abstract class InternalRow extends Serializable with SpecializedGetters {
     true
   }
 
-  /* ---------------------- utility methods for Scala ---------------------- */
-
-  /**
-   * Return a Scala Seq representing the row. Elements are placed in the same order in the Seq.
-   */
-  def toSeq: Seq[Any] = {
-    val n = numFields
-    val values = new Array[Any](n)
-    var i = 0
-    while (i < n) {
-      values.update(i, get(i))
-      i += 1
-    }
-    values.toSeq
-  }
-
-  /** Displays all elements of this sequence in a string (without a separator). */
-  def mkString: String = toSeq.mkString
-
-  /** Displays all elements of this sequence in a string using a separator string. */
-  def mkString(sep: String): String = toSeq.mkString(sep)
-
-  /**
-   * Displays all elements of this traversable or iterator in a string using
-   * start, end, and separator strings.
-   */
-  def mkString(start: String, sep: String, end: String): String = toSeq.mkString(start, sep, end)
-
   // Custom hashCode function that matches the efficient code generated version.
   override def hashCode: Int = {
     var result: Int = 37
@@ -181,7 +107,7 @@ abstract class InternalRow extends Serializable with SpecializedGetters {
         if (isNullAt(i)) {
           0
         } else {
-          get(i) match {
+          genericGet(i) match {
             case b: Boolean => if (b) 0 else 1
             case b: Byte => b.toInt
             case s: Short => s.toInt
@@ -200,6 +126,35 @@ abstract class InternalRow extends Serializable with SpecializedGetters {
     }
     result
   }
+
+  /* ---------------------- utility methods for Scala ---------------------- */
+
+  /**
+   * Return a Scala Seq representing the row. Elements are placed in the same order in the Seq.
+   */
+  // todo: remove this as it needs the generic getter
+  def toSeq: Seq[Any] = {
+    val n = numFields
+    val values = new Array[Any](n)
+    var i = 0
+    while (i < n) {
+      values.update(i, genericGet(i))
+      i += 1
+    }
+    values
+  }
+
+  /** Displays all elements of this sequence in a string (without a separator). */
+  def mkString: String = toSeq.mkString
+
+  /** Displays all elements of this sequence in a string using a separator string. */
+  def mkString(sep: String): String = toSeq.mkString(sep)
+
+  /**
+   * Displays all elements of this traversable or iterator in a string using
+   * start, end, and separator strings.
+   */
+  def mkString(start: String, sep: String, end: String): String = toSeq.mkString(start, sep, end)
 }
 
 object InternalRow {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
index 45709c1c8f554..473b9b787058c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala
@@ -49,7 +49,10 @@ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean)
         case StringType => input.getUTF8String(ordinal)
         case BinaryType => input.getBinary(ordinal)
         case CalendarIntervalType => input.getInterval(ordinal)
+        case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale)
         case t: StructType => input.getStruct(ordinal, t.size)
+        case _: ArrayType => input.getArray(ordinal)
+        case _: MapType => input.getMap(ordinal)
         case _ => input.get(ordinal, dataType)
       }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 43be11c48ae7c..88429bb84b1e3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -361,30 +361,29 @@ case class Cast(child: Expression, dataType: DataType)
       b => x.numeric.asInstanceOf[Numeric[Any]].toFloat(b)
   }
 
-  private[this] def castArray(from: ArrayType, to: ArrayType): Any => Any = {
-    val elementCast = cast(from.elementType, to.elementType)
+  private[this] def castArray(fromType: DataType, toType: DataType): Any => Any = {
+    val elementCast = cast(fromType, toType)
     // TODO: Could be faster?
     buildCast[ArrayData](_, array => {
-      val length = array.numElements()
-      val values = new Array[Any](length)
-      var i = 0
-      while (i < length) {
-        if (array.isNullAt(i)) {
+      val values = new Array[Any](array.numElements())
+      array.foreach(fromType, (i, e) => {
+        if (e == null) {
           values(i) = null
         } else {
-          values(i) = elementCast(array.get(i))
+          values(i) = elementCast(e)
         }
-        i += 1
-      }
+      })
       new GenericArrayData(values)
     })
   }
 
   private[this] def castMap(from: MapType, to: MapType): Any => Any = {
-    val keyCast = cast(from.keyType, to.keyType)
-    val valueCast = cast(from.valueType, to.valueType)
-    buildCast[Map[Any, Any]](_, _.map {
-      case (key, value) => (keyCast(key), if (value == null) null else valueCast(value))
+    val keyCast = castArray(from.keyType, to.keyType)
+    val valueCast = castArray(from.valueType, to.valueType)
+    buildCast[MapData](_, map => {
+      val keys = keyCast(map.keyArray()).asInstanceOf[ArrayData]
+      val values = valueCast(map.valueArray()).asInstanceOf[ArrayData]
+      new ArrayBasedMapData(keys, values)
     })
   }
 
@@ -420,7 +419,7 @@ case class Cast(child: Expression, dataType: DataType)
     case FloatType => castToFloat(from)
     case LongType => castToLong(from)
     case DoubleType => castToDouble(from)
-    case array: ArrayType => castArray(from.asInstanceOf[ArrayType], array)
+    case array: ArrayType => castArray(from.asInstanceOf[ArrayType].elementType, array.elementType)
     case map: MapType => castMap(from.asInstanceOf[MapType], map)
     case struct: StructType => castStruct(from.asInstanceOf[StructType], struct)
   }
@@ -461,7 +460,8 @@ case class Cast(child: Expression, dataType: DataType)
     case LongType => castToLongCode(from)
     case DoubleType => castToDoubleCode(from)
 
-    case array: ArrayType => castArrayCode(from.asInstanceOf[ArrayType], array, ctx)
+    case array: ArrayType =>
+      castArrayCode(from.asInstanceOf[ArrayType].elementType, array.elementType, ctx)
     case map: MapType => castMapCode(from.asInstanceOf[MapType], map, ctx)
     case struct: StructType => castStructCode(from.asInstanceOf[StructType], struct, ctx)
   }
@@ -801,8 +801,8 @@ case class Cast(child: Expression, dataType: DataType)
   }
 
   private[this] def castArrayCode(
-      from: ArrayType, to: ArrayType, ctx: CodeGenContext): CastFunction = {
-    val elementCast = nullSafeCastFunction(from.elementType, to.elementType, ctx)
+      fromType: DataType, toType: DataType, ctx: CodeGenContext): CastFunction = {
+    val elementCast = nullSafeCastFunction(fromType, toType, ctx)
     val arrayClass = classOf[GenericArrayData].getName
     val fromElementNull = ctx.freshName("feNull")
     val fromElementPrim = ctx.freshName("fePrim")
@@ -821,10 +821,10 @@ case class Cast(child: Expression, dataType: DataType)
             $values[$j] = null;
           } else {
             boolean $fromElementNull = false;
-            ${ctx.javaType(from.elementType)} $fromElementPrim =
-              ${ctx.getValue(c, from.elementType, j)};
+            ${ctx.javaType(fromType)} $fromElementPrim =
+              ${ctx.getValue(c, fromType, j)};
             ${castCode(ctx, fromElementPrim,
-              fromElementNull, toElementPrim, toElementNull, to.elementType, elementCast)}
+              fromElementNull, toElementPrim, toElementNull, toType, elementCast)}
             if ($toElementNull) {
               $values[$j] = null;
             } else {
@@ -837,48 +837,29 @@ case class Cast(child: Expression, dataType: DataType)
   }
 
   private[this] def castMapCode(from: MapType, to: MapType, ctx: CodeGenContext): CastFunction = {
-    val keyCast = nullSafeCastFunction(from.keyType, to.keyType, ctx)
-    val valueCast = nullSafeCastFunction(from.valueType, to.valueType, ctx)
-
-    val hashMapClass = classOf[mutable.HashMap[Any, Any]].getName
-    val fromKeyPrim = ctx.freshName("fkp")
-    val fromKeyNull = ctx.freshName("fkn")
-    val fromValuePrim = ctx.freshName("fvp")
-    val fromValueNull = ctx.freshName("fvn")
-    val toKeyPrim = ctx.freshName("tkp")
-    val toKeyNull = ctx.freshName("tkn")
-    val toValuePrim = ctx.freshName("tvp")
-    val toValueNull = ctx.freshName("tvn")
-    val result = ctx.freshName("result")
+    val keysCast = castArrayCode(from.keyType, to.keyType, ctx)
+    val valuesCast = castArrayCode(from.valueType, to.valueType, ctx)
+
+    val mapClass = classOf[ArrayBasedMapData].getName
+
+    val keys = ctx.freshName("keys")
+    val convertedKeys = ctx.freshName("convertedKeys")
+    val convertedKeysNull = ctx.freshName("convertedKeysNull")
+
+    val values = ctx.freshName("values")
+    val convertedValues = ctx.freshName("convertedValues")
+    val convertedValuesNull = ctx.freshName("convertedValuesNull")
 
     (c, evPrim, evNull) =>
       s"""
-        final $hashMapClass $result = new $hashMapClass();
-        scala.collection.Iterator iter = $c.iterator();
-        while (iter.hasNext()) {
-          scala.Tuple2 kv = (scala.Tuple2) iter.next();
-          boolean $fromKeyNull = false;
-          ${ctx.javaType(from.keyType)} $fromKeyPrim =
-            (${ctx.boxedType(from.keyType)}) kv._1();
-          ${castCode(ctx, fromKeyPrim,
-            fromKeyNull, toKeyPrim, toKeyNull, to.keyType, keyCast)}
-
-          boolean $fromValueNull = kv._2() == null;
-          if ($fromValueNull) {
-            $result.put($toKeyPrim, null);
-          } else {
-            ${ctx.javaType(from.valueType)} $fromValuePrim =
-              (${ctx.boxedType(from.valueType)}) kv._2();
-            ${castCode(ctx, fromValuePrim,
-              fromValueNull, toValuePrim, toValueNull, to.valueType, valueCast)}
-            if ($toValueNull) {
-              $result.put($toKeyPrim, null);
-            } else {
-              $result.put($toKeyPrim, $toValuePrim);
-            }
-          }
-        }
-        $evPrim = $result;
+        final ArrayData $keys = $c.keyArray();
+        final ArrayData $values = $c.valueArray();
+        ${castCode(ctx, keys, "false",
+          convertedKeys, convertedKeysNull, ArrayType(to.keyType), keysCast)}
+        ${castCode(ctx, values, "false",
+          convertedValues, convertedValuesNull, ArrayType(to.valueType), valuesCast)}
+
+        $evPrim = new $mapClass($convertedKeys, $convertedValues);
       """
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/GenericSpecializedGetters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/GenericSpecializedGetters.scala
new file mode 100644
index 0000000000000..6e957928e02a8
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/GenericSpecializedGetters.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.types.{DataType, MapData, ArrayData, Decimal}
+import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
+
+trait GenericSpecializedGetters extends SpecializedGetters {
+
+  def genericGet(ordinal: Int): Any
+
+  private def getAs[T](ordinal: Int) = genericGet(ordinal).asInstanceOf[T]
+
+  override def isNullAt(ordinal: Int): Boolean = getAs[AnyRef](ordinal) eq null
+
+  override def get(ordinal: Int, elementType: DataType): AnyRef = getAs(ordinal)
+
+  override def getBoolean(ordinal: Int): Boolean = getAs(ordinal)
+
+  override def getByte(ordinal: Int): Byte = getAs(ordinal)
+
+  override def getShort(ordinal: Int): Short = getAs(ordinal)
+
+  override def getInt(ordinal: Int): Int = getAs(ordinal)
+
+  override def getLong(ordinal: Int): Long = getAs(ordinal)
+
+  override def getFloat(ordinal: Int): Float = getAs(ordinal)
+
+  override def getDouble(ordinal: Int): Double = getAs(ordinal)
+
+  override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = getAs(ordinal)
+
+  override def getUTF8String(ordinal: Int): UTF8String = getAs(ordinal)
+
+  override def getBinary(ordinal: Int): Array[Byte] = getAs(ordinal)
+
+  override def getInterval(ordinal: Int): CalendarInterval = getAs(ordinal)
+
+  override def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs(ordinal)
+
+  override def getArray(ordinal: Int): ArrayData = getAs(ordinal)
+
+  override def getMap(ordinal: Int): MapData = getAs(ordinal)
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
index 7c7664e4c1a91..d79325aea87e3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala
@@ -19,8 +19,8 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateUnsafeProjection, GenerateMutableProjection}
-import org.apache.spark.sql.types.{Decimal, StructType, DataType}
-import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
 
 /**
  * A [[Projection]] that is calculated by calling the `eval` of each of the specified expressions.
@@ -190,45 +190,55 @@ class JoinedRow extends InternalRow {
 
   override def numFields: Int = row1.numFields + row2.numFields
 
-  override def getUTF8String(i: Int): UTF8String = {
-    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
-  }
-
-  override def getBinary(i: Int): Array[Byte] = {
-    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
-  }
-
-  override def get(i: Int, dataType: DataType): Any =
-    if (i < row1.numFields) row1.get(i) else row2.get(i - row1.numFields)
+  override def genericGet(i: Int): Any =
+    if (i < row1.numFields) row1.genericGet(i) else row2.genericGet(i - row1.numFields)
 
   override def isNullAt(i: Int): Boolean =
     if (i < row1.numFields) row1.isNullAt(i) else row2.isNullAt(i - row1.numFields)
 
+  override def getBoolean(i: Int): Boolean =
+    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
+
+  override def getByte(i: Int): Byte =
+    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
+
+  override def getShort(i: Int): Short =
+    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
+
   override def getInt(i: Int): Int =
     if (i < row1.numFields) row1.getInt(i) else row2.getInt(i - row1.numFields)
 
   override def getLong(i: Int): Long =
     if (i < row1.numFields) row1.getLong(i) else row2.getLong(i - row1.numFields)
 
+  override def getFloat(i: Int): Float =
+    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
+
   override def getDouble(i: Int): Double =
     if (i < row1.numFields) row1.getDouble(i) else row2.getDouble(i - row1.numFields)
 
-  override def getBoolean(i: Int): Boolean =
-    if (i < row1.numFields) row1.getBoolean(i) else row2.getBoolean(i - row1.numFields)
+  override def getDecimal(i: Int, precision: Int, scale: Int): Decimal = {
+    if (i < row1.numFields) {
+      row1.getDecimal(i, precision, scale)
+    } else {
+      row2.getDecimal(i - row1.numFields, precision, scale)
+    }
+  }
 
-  override def getShort(i: Int): Short =
-    if (i < row1.numFields) row1.getShort(i) else row2.getShort(i - row1.numFields)
+  override def getUTF8String(i: Int): UTF8String =
+    if (i < row1.numFields) row1.getUTF8String(i) else row2.getUTF8String(i - row1.numFields)
 
-  override def getByte(i: Int): Byte =
-    if (i < row1.numFields) row1.getByte(i) else row2.getByte(i - row1.numFields)
+  override def getBinary(i: Int): Array[Byte] =
+    if (i < row1.numFields) row1.getBinary(i) else row2.getBinary(i - row1.numFields)
 
-  override def getFloat(i: Int): Float =
-    if (i < row1.numFields) row1.getFloat(i) else row2.getFloat(i - row1.numFields)
+  override def getArray(i: Int): ArrayData =
+    if (i < row1.numFields) row1.getArray(i) else row2.getArray(i - row1.numFields)
 
-  override def getDecimal(i: Int, precision: Int, scale: Int): Decimal = {
-    if (i < row1.numFields) row1.getDecimal(i, precision, scale)
-    else row2.getDecimal(i - row1.numFields, precision, scale)
-  }
+  override def getInterval(i: Int): CalendarInterval =
+    if (i < row1.numFields) row1.getInterval(i) else row2.getInterval(i - row1.numFields)
+
+  override def getMap(i: Int): MapData =
+    if (i < row1.numFields) row1.getMap(i) else row2.getMap(i - row1.numFields)
 
   override def getStruct(i: Int, numFields: Int): InternalRow = {
     if (i < row1.numFields) {
@@ -239,14 +249,9 @@ class JoinedRow extends InternalRow {
   }
 
   override def copy(): InternalRow = {
-    val totalSize = row1.numFields + row2.numFields
-    val copiedValues = new Array[Any](totalSize)
-    var i = 0
-    while(i < totalSize) {
-      copiedValues(i) = get(i)
-      i += 1
-    }
-    new GenericInternalRow(copiedValues)
+    val copy1 = row1.copy()
+    val copy2 = row2.copy()
+    new JoinedRow(copy1, copy2)
   }
 
   override def toString: String = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
index b877ce47c083f..d149a5b179b63 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala
@@ -213,18 +213,12 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
 
   override def numFields: Int = values.length
 
-  override def toSeq: Seq[Any] = values.map(_.boxed).toSeq
+  override def toSeq: Seq[Any] = values.map(_.boxed)
 
   override def setNullAt(i: Int): Unit = {
     values(i).isNull = true
   }
 
-  override def get(i: Int, dataType: DataType): Any = values(i).boxed
-
-  override def getStruct(ordinal: Int, numFields: Int): InternalRow = {
-    values(ordinal).boxed.asInstanceOf[InternalRow]
-  }
-
   override def isNullAt(i: Int): Boolean = values(i).isNull
 
   override def copy(): InternalRow = {
@@ -238,6 +232,8 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
     new GenericInternalRow(newValues)
   }
 
+  override def genericGet(i: Int): Any = values(i).boxed
+
   override def update(ordinal: Int, value: Any) {
     if (value == null) {
       setNullAt(ordinal)
@@ -246,9 +242,6 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
     }
   }
 
-  override def setString(ordinal: Int, value: String): Unit =
-    update(ordinal, UTF8String.fromString(value))
-
   override def setInt(ordinal: Int, value: Int): Unit = {
     val currentValue = values(ordinal).asInstanceOf[MutableInt]
     currentValue.isNull = false
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
index 36f4e9c6be570..fc7cfee989666 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -112,8 +112,10 @@ class CodeGenContext {
       case BinaryType => s"$getter.getBinary($ordinal)"
       case CalendarIntervalType => s"$getter.getInterval($ordinal)"
       case t: StructType => s"$getter.getStruct($ordinal, ${t.size})"
-      case a: ArrayType => s"$getter.getArray($ordinal)"
-      case _ => s"($jt)$getter.get($ordinal)" // todo: remove generic getter.
+      case _: ArrayType => s"$getter.getArray($ordinal)"
+      case _: MapType => s"$getter.getMap($ordinal)"
+      case NullType => "null"
+      case _ => s"($jt)$getter.get($ordinal, null)"
     }
   }
 
@@ -156,7 +158,7 @@ class CodeGenContext {
     case CalendarIntervalType => "CalendarInterval"
     case _: StructType => "InternalRow"
     case _: ArrayType => "ArrayData"
-    case _: MapType => "scala.collection.Map"
+    case _: MapType => "MapData"
     case dt: OpenHashSetUDT if dt.elementType == IntegerType => classOf[IntegerHashSet].getName
     case dt: OpenHashSetUDT if dt.elementType == LongType => classOf[LongHashSet].getName
     case _ => "Object"
@@ -300,7 +302,8 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin
       classOf[UTF8String].getName,
       classOf[Decimal].getName,
       classOf[CalendarInterval].getName,
-      classOf[ArrayData].getName
+      classOf[ArrayData].getName,
+      classOf[MapData].getName
     ))
     evaluator.setExtendedClass(classOf[GeneratedClass])
     try {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
index 35920147105ff..6f9acda071a62 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateProjection.scala
@@ -183,7 +183,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
         public void setNullAt(int i) { nullBits[i] = true; }
         public boolean isNullAt(int i) { return nullBits[i]; }
 
-        public Object get(int i, ${classOf[DataType].getName} dataType) {
+        public Object genericGet(int i) {
           if (isNullAt(i)) return null;
           switch (i) {
           $getCases
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 0a530596a98c6..1156797b2c37d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -31,15 +31,11 @@ case class Size(child: Expression) extends UnaryExpression with ExpectsInputType
 
   override def nullSafeEval(value: Any): Int = child.dataType match {
     case _: ArrayType => value.asInstanceOf[ArrayData].numElements()
-    case _: MapType => value.asInstanceOf[Map[Any, Any]].size
+    case _: MapType => value.asInstanceOf[MapData].numElements()
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
-    val sizeCall = child.dataType match {
-      case _: ArrayType => "numElements()"
-      case _: MapType => "size()"
-    }
-    nullSafeCodeGen(ctx, ev, c => s"${ev.primitive} = ($c).$sizeCall;")
+    nullSafeCodeGen(ctx, ev, c => s"${ev.primitive} = ($c).numElements();")
   }
 }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
index 99393c9c76ab6..9927da21b052e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import scala.collection.Map
-
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis._
@@ -41,7 +39,7 @@ object ExtractValue {
    *    Struct      |   Literal String   |        GetStructField
    * Array[Struct]  |   Literal String   |     GetArrayStructFields
    *    Array       |   Integral type    |         GetArrayItem
-   *     Map        |      Any type      |         GetMapValue
+   *     Map        |   map key type     |         GetMapValue
    */
   def apply(
       child: Expression,
@@ -60,18 +58,14 @@ object ExtractValue {
         GetArrayStructFields(child, fields(ordinal).copy(name = fieldName),
           ordinal, fields.length, containsNull)
 
-      case (_: ArrayType, _) if extraction.dataType.isInstanceOf[IntegralType] =>
-        GetArrayItem(child, extraction)
+      case (_: ArrayType, _) => GetArrayItem(child, extraction)
 
-      case (_: MapType, _) =>
-        GetMapValue(child, extraction)
+      case (MapType(kt, _, _), _) => GetMapValue(child, extraction)
 
       case (otherType, _) =>
         val errorMsg = otherType match {
-          case StructType(_) | ArrayType(StructType(_), _) =>
+          case StructType(_) =>
             s"Field name should be String Literal, but it's $extraction"
-          case _: ArrayType =>
-            s"Array index should be integral type, but it's ${extraction.dataType}"
           case other =>
             s"Can't extract value from $child"
         }
@@ -190,9 +184,13 @@ case class GetArrayStructFields(
 /**
  * Returns the field at `ordinal` in the Array `child`.
  *
- * No need to do type checking since it is handled by [[ExtractValue]].
+ * We need to do type checking here as `ordinal` expression maybe unresolved.
  */
-case class GetArrayItem(child: Expression, ordinal: Expression) extends BinaryExpression {
+case class GetArrayItem(child: Expression, ordinal: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  // We have done type checking for child in `ExtractValue`, so only need to check the `ordinal`.
+  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, IntegralType)
 
   override def toString: String = s"$child[$ordinal]"
 
@@ -205,14 +203,12 @@ case class GetArrayItem(child: Expression, ordinal: Expression) extends BinaryEx
   override def dataType: DataType = child.dataType.asInstanceOf[ArrayType].elementType
 
   protected override def nullSafeEval(value: Any, ordinal: Any): Any = {
-    // TODO: consider using Array[_] for ArrayType child to avoid
-    // boxing of primitives
     val baseValue = value.asInstanceOf[ArrayData]
     val index = ordinal.asInstanceOf[Number].intValue()
     if (index >= baseValue.numElements() || index < 0) {
       null
     } else {
-      baseValue.get(index)
+      baseValue.get(index, dataType)
     }
   }
 
@@ -233,9 +229,15 @@ case class GetArrayItem(child: Expression, ordinal: Expression) extends BinaryEx
 /**
  * Returns the value of key `key` in Map `child`.
  *
- * No need to do type checking since it is handled by [[ExtractValue]].
+ * We need to do type checking here as `key` expression maybe unresolved.
  */
-case class GetMapValue(child: Expression, key: Expression) extends BinaryExpression {
+case class GetMapValue(child: Expression, key: Expression)
+  extends BinaryExpression with ExpectsInputTypes {
+
+  private def keyType = child.dataType.asInstanceOf[MapType].keyType
+
+  // We have done type checking for child in `ExtractValue`, so only need to check the `key`.
+  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, keyType)
 
   override def toString: String = s"$child[$key]"
 
@@ -247,16 +249,53 @@ case class GetMapValue(child: Expression, key: Expression) extends BinaryExpress
 
   override def dataType: DataType = child.dataType.asInstanceOf[MapType].valueType
 
+  // todo: current search is O(n), improve it.
   protected override def nullSafeEval(value: Any, ordinal: Any): Any = {
-    val baseValue = value.asInstanceOf[Map[Any, _]]
-    baseValue.get(ordinal).orNull
+    val map = value.asInstanceOf[MapData]
+    val length = map.numElements()
+    val keys = map.keyArray()
+
+    var i = 0
+    var found = false
+    while (i < length && !found) {
+      if (keys.get(i, keyType) == ordinal) {
+        found = true
+      } else {
+        i += 1
+      }
+    }
+
+    if (!found) {
+      null
+    } else {
+      map.valueArray().get(i, dataType)
+    }
   }
 
   override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val index = ctx.freshName("index")
+    val length = ctx.freshName("length")
+    val keys = ctx.freshName("keys")
+    val found = ctx.freshName("found")
+    val key = ctx.freshName("key")
     nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
       s"""
-        if ($eval1.contains($eval2)) {
-          ${ev.primitive} = (${ctx.boxedType(dataType)})$eval1.apply($eval2);
+        final int $length = $eval1.numElements();
+        final ArrayData $keys = $eval1.keyArray();
+
+        int $index = 0;
+        boolean $found = false;
+        while ($index < $length && !$found) {
+          final ${ctx.javaType(keyType)} $key = ${ctx.getValue(keys, keyType, index)};
+          if (${ctx.genEqual(keyType, key, eval2)}) {
+            $found = true;
+          } else {
+            $index++;
+          }
+        }
+
+        if ($found) {
+          ${ev.primitive} = ${ctx.getValue(eval1 + ".valueArray()", dataType, index)};
         } else {
           ${ev.isNull} = true;
         }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index 8064235c64ef9..d474853355e5b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -120,13 +120,30 @@ case class Explode(child: Expression) extends UnaryExpression with Generator wit
 
   override def eval(input: InternalRow): TraversableOnce[InternalRow] = {
     child.dataType match {
-      case ArrayType(_, _) =>
+      case ArrayType(et, _) =>
         val inputArray = child.eval(input).asInstanceOf[ArrayData]
-        if (inputArray == null) Nil else inputArray.toArray().map(v => InternalRow(v))
-      case MapType(_, _, _) =>
-        val inputMap = child.eval(input).asInstanceOf[Map[Any, Any]]
-        if (inputMap == null) Nil
-        else inputMap.map { case (k, v) => InternalRow(k, v) }
+        if (inputArray == null) {
+          Nil
+        } else {
+          val rows = new Array[InternalRow](inputArray.numElements())
+          inputArray.foreach(et, (i, e) => {
+            rows(i) = InternalRow(e)
+          })
+          rows
+        }
+      case MapType(kt, vt, _) =>
+        val inputMap = child.eval(input).asInstanceOf[MapData]
+        if (inputMap == null) {
+          Nil
+        } else {
+          val rows = new Array[InternalRow](inputMap.numElements())
+          var i = 0
+          inputMap.foreach(kt, vt, (k, v) => {
+            rows(i) = InternalRow(k, v)
+            i += 1
+          })
+          rows
+        }
     }
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
index df6ea586c87ba..73f6b7a5500e6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala
@@ -32,28 +32,14 @@ abstract class MutableRow extends InternalRow {
   def update(i: Int, value: Any)
 
   // default implementation (slow)
-  def setInt(i: Int, value: Int): Unit = { update(i, value) }
-  def setLong(i: Int, value: Long): Unit = { update(i, value) }
-  def setDouble(i: Int, value: Double): Unit = { update(i, value) }
   def setBoolean(i: Int, value: Boolean): Unit = { update(i, value) }
-  def setShort(i: Int, value: Short): Unit = { update(i, value) }
   def setByte(i: Int, value: Byte): Unit = { update(i, value) }
+  def setShort(i: Int, value: Short): Unit = { update(i, value) }
+  def setInt(i: Int, value: Int): Unit = { update(i, value) }
+  def setLong(i: Int, value: Long): Unit = { update(i, value) }
   def setFloat(i: Int, value: Float): Unit = { update(i, value) }
+  def setDouble(i: Int, value: Double): Unit = { update(i, value) }
   def setDecimal(i: Int, value: Decimal, precision: Int) { update(i, value) }
-  def setString(i: Int, value: String): Unit = {
-    update(i, UTF8String.fromString(value))
-  }
-
-  override def copy(): InternalRow = {
-    val n = numFields
-    val arr = new Array[Any](n)
-    var i = 0
-    while (i < n) {
-      arr(i) = get(i)
-      i += 1
-    }
-    new GenericInternalRow(arr)
-  }
 }
 
 /**
@@ -96,17 +82,13 @@ class GenericInternalRow(protected[sql] val values: Array[Any]) extends Internal
 
   def this(size: Int) = this(new Array[Any](size))
 
-  override def toSeq: Seq[Any] = values.toSeq
-
-  override def numFields: Int = values.length
+  override def genericGet(ordinal: Int): Any = values(ordinal)
 
-  override def get(i: Int, dataType: DataType): Any = values(i)
+  override def toSeq: Seq[Any] = values
 
-  override def getStruct(ordinal: Int, numFields: Int): InternalRow = {
-    values(ordinal).asInstanceOf[InternalRow]
-  }
+  override def numFields: Int = values.length
 
-  override def copy(): InternalRow = this
+  override def copy(): InternalRow = new GenericInternalRow(values.clone())
 }
 
 /**
@@ -127,15 +109,11 @@ class GenericMutableRow(val values: Array[Any]) extends MutableRow {
 
   def this(size: Int) = this(new Array[Any](size))
 
-  override def toSeq: Seq[Any] = values.toSeq
-
-  override def numFields: Int = values.length
+  override def genericGet(ordinal: Int): Any = values(ordinal)
 
-  override def get(i: Int, dataType: DataType): Any = values(i)
+  override def toSeq: Seq[Any] = values
 
-  override def getStruct(ordinal: Int, numFields: Int): InternalRow = {
-    values(ordinal).asInstanceOf[InternalRow]
-  }
+  override def numFields: Int = values.length
 
   override def setNullAt(i: Int): Unit = { values(i) = null}
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index 5dd387a418a1d..3ce5d6a9c7e02 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -95,7 +95,7 @@ case class ConcatWs(children: Seq[Expression])
     val flatInputs = children.flatMap { child =>
       child.eval(input) match {
         case s: UTF8String => Iterator(s)
-        case arr: ArrayData => arr.toArray().map(_.asInstanceOf[UTF8String])
+        case arr: ArrayData => arr.toArray[UTF8String](StringType)
         case null => Iterator(null.asInstanceOf[UTF8String])
       }
     }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayBasedMapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayBasedMapData.scala
new file mode 100644
index 0000000000000..db4876355daec
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayBasedMapData.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.types
+
+class ArrayBasedMapData(val keyArray: ArrayData, val valueArray: ArrayData) extends MapData {
+  require(keyArray.numElements() == valueArray.numElements())
+
+  override def numElements(): Int = keyArray.numElements()
+
+  override def equals(o: Any): Boolean = {
+    if (!o.isInstanceOf[ArrayBasedMapData]) {
+      return false
+    }
+
+    val other = o.asInstanceOf[ArrayBasedMapData]
+    if (other eq null) {
+      return false
+    }
+
+    this.keyArray == other.keyArray && this.valueArray == other.valueArray
+  }
+
+  override def hashCode: Int = {
+    keyArray.hashCode() * 37 + valueArray.hashCode()
+  }
+
+  override def toString(): String = {
+    s"keys: $keyArray\nvalues: $valueArray"
+  }
+}
+
+object ArrayBasedMapData {
+  def apply(keys: Array[Any], values: Array[Any]): ArrayBasedMapData = {
+    new ArrayBasedMapData(new GenericArrayData(keys), new GenericArrayData(values))
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala
index 14a7285877622..c99fc233255e5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala
@@ -20,102 +20,111 @@ package org.apache.spark.sql.types
 import org.apache.spark.sql.catalyst.expressions.SpecializedGetters
 
 abstract class ArrayData extends SpecializedGetters with Serializable {
-  // todo: remove this after we handle all types.(map type need special getter)
-  def get(ordinal: Int): Any
-
   def numElements(): Int
 
-  // todo: need a more efficient way to iterate array type.
-  def toArray(): Array[Any] = {
-    val n = numElements()
-    val values = new Array[Any](n)
+  def toBooleanArray(): Array[Boolean] = {
+    val size = numElements()
+    val values = new Array[Boolean](size)
     var i = 0
-    while (i < n) {
-      if (isNullAt(i)) {
-        values(i) = null
-      } else {
-        values(i) = get(i)
-      }
+    while (i < size) {
+      values(i) = getBoolean(i)
       i += 1
     }
     values
   }
 
-  override def toString(): String = toArray.mkString("[", ",", "]")
+  def toByteArray(): Array[Byte] = {
+    val size = numElements()
+    val values = new Array[Byte](size)
+    var i = 0
+    while (i < size) {
+      values(i) = getByte(i)
+      i += 1
+    }
+    values
+  }
 
-  override def equals(o: Any): Boolean = {
-    if (!o.isInstanceOf[ArrayData]) {
-      return false
+  def toShortArray(): Array[Short] = {
+    val size = numElements()
+    val values = new Array[Short](size)
+    var i = 0
+    while (i < size) {
+      values(i) = getShort(i)
+      i += 1
     }
+    values
+  }
 
-    val other = o.asInstanceOf[ArrayData]
-    if (other eq null) {
-      return false
+  def toIntArray(): Array[Int] = {
+    val size = numElements()
+    val values = new Array[Int](size)
+    var i = 0
+    while (i < size) {
+      values(i) = getInt(i)
+      i += 1
     }
+    values
+  }
 
-    val len = numElements()
-    if (len != other.numElements()) {
-      return false
+  def toLongArray(): Array[Long] = {
+    val size = numElements()
+    val values = new Array[Long](size)
+    var i = 0
+    while (i < size) {
+      values(i) = getLong(i)
+      i += 1
     }
+    values
+  }
 
+  def toFloatArray(): Array[Float] = {
+    val size = numElements()
+    val values = new Array[Float](size)
     var i = 0
-    while (i < len) {
-      if (isNullAt(i) != other.isNullAt(i)) {
-        return false
-      }
-      if (!isNullAt(i)) {
-        val o1 = get(i)
-        val o2 = other.get(i)
-        o1 match {
-          case b1: Array[Byte] =>
-            if (!o2.isInstanceOf[Array[Byte]] ||
-              !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
-              return false
-            }
-          case f1: Float if java.lang.Float.isNaN(f1) =>
-            if (!o2.isInstanceOf[Float] || ! java.lang.Float.isNaN(o2.asInstanceOf[Float])) {
-              return false
-            }
-          case d1: Double if java.lang.Double.isNaN(d1) =>
-            if (!o2.isInstanceOf[Double] || ! java.lang.Double.isNaN(o2.asInstanceOf[Double])) {
-              return false
-            }
-          case _ => if (o1 != o2) {
-            return false
-          }
-        }
+    while (i < size) {
+      values(i) = getFloat(i)
+      i += 1
+    }
+    values
+  }
+
+  def toDoubleArray(): Array[Double] = {
+    val size = numElements()
+    val values = new Array[Double](size)
+    var i = 0
+    while (i < size) {
+      values(i) = getDouble(i)
+      i += 1
+    }
+    values
+  }
+
+  def toArray[T](elementType: DataType): Array[T] = {
+    val size = numElements()
+    val values = new Array[Any](size)
+    var i = 0
+    while (i < size) {
+      if (isNullAt(i)) {
+        values(i) = null
+      } else {
+        values(i) = get(i, elementType)
       }
       i += 1
     }
-    true
+    values.asInstanceOf[Array[T]]
   }
 
-  override def hashCode: Int = {
-    var result: Int = 37
+  // todo: specialize this.
+  def foreach(elementType: DataType, f: (Int, Any) => Unit): Unit = {
+    val size = numElements()
     var i = 0
-    val len = numElements()
-    while (i < len) {
-      val update: Int =
-        if (isNullAt(i)) {
-          0
-        } else {
-          get(i) match {
-            case b: Boolean => if (b) 0 else 1
-            case b: Byte => b.toInt
-            case s: Short => s.toInt
-            case i: Int => i
-            case l: Long => (l ^ (l >>> 32)).toInt
-            case f: Float => java.lang.Float.floatToIntBits(f)
-            case d: Double =>
-              val b = java.lang.Double.doubleToLongBits(d)
-              (b ^ (b >>> 32)).toInt
-            case a: Array[Byte] => java.util.Arrays.hashCode(a)
-            case other => other.hashCode()
-          }
-        }
-      result = 37 * result + update
+    while (i < size) {
+      if (isNullAt(i)) {
+        f(i, null)
+      } else {
+        f(i, get(i, elementType))
+      }
       i += 1
     }
-    result
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala
index 35ace673fb3da..b3e75f8bad502 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala
@@ -17,43 +17,91 @@
 
 package org.apache.spark.sql.types
 
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.unsafe.types.{UTF8String, CalendarInterval}
+import org.apache.spark.sql.catalyst.expressions.GenericSpecializedGetters
 
-class GenericArrayData(array: Array[Any]) extends ArrayData {
-  private def getAs[T](ordinal: Int) = get(ordinal).asInstanceOf[T]
+class GenericArrayData(array: Array[Any]) extends ArrayData with GenericSpecializedGetters {
 
-  override def toArray(): Array[Any] = array
+  override def genericGet(ordinal: Int): Any = array(ordinal)
 
-  override def get(ordinal: Int): Any = array(ordinal)
-
-  override def isNullAt(ordinal: Int): Boolean = get(ordinal) == null
-
-  override def getBoolean(ordinal: Int): Boolean = getAs(ordinal)
-
-  override def getByte(ordinal: Int): Byte = getAs(ordinal)
-
-  override def getShort(ordinal: Int): Short = getAs(ordinal)
-
-  override def getInt(ordinal: Int): Int = getAs(ordinal)
-
-  override def getLong(ordinal: Int): Long = getAs(ordinal)
-
-  override def getFloat(ordinal: Int): Float = getAs(ordinal)
-
-  override def getDouble(ordinal: Int): Double = getAs(ordinal)
-
-  override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = getAs(ordinal)
-
-  override def getUTF8String(ordinal: Int): UTF8String = getAs(ordinal)
-
-  override def getBinary(ordinal: Int): Array[Byte] = getAs(ordinal)
-
-  override def getInterval(ordinal: Int): CalendarInterval = getAs(ordinal)
-
-  override def getStruct(ordinal: Int, numFields: Int): InternalRow = getAs(ordinal)
-
-  override def getArray(ordinal: Int): ArrayData = getAs(ordinal)
+  override def toArray[T](elementType: DataType): Array[T] = array.asInstanceOf[Array[T]]
 
   override def numElements(): Int = array.length
+
+  override def toString(): String = array.mkString("[", ",", "]")
+
+  override def equals(o: Any): Boolean = {
+    if (!o.isInstanceOf[GenericArrayData]) {
+      return false
+    }
+
+    val other = o.asInstanceOf[GenericArrayData]
+    if (other eq null) {
+      return false
+    }
+
+    val len = numElements()
+    if (len != other.numElements()) {
+      return false
+    }
+
+    var i = 0
+    while (i < len) {
+      if (isNullAt(i) != other.isNullAt(i)) {
+        return false
+      }
+      if (!isNullAt(i)) {
+        val o1 = genericGet(i)
+        val o2 = other.genericGet(i)
+        o1 match {
+          case b1: Array[Byte] =>
+            if (!o2.isInstanceOf[Array[Byte]] ||
+              !java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
+              return false
+            }
+          case f1: Float if java.lang.Float.isNaN(f1) =>
+            if (!o2.isInstanceOf[Float] || ! java.lang.Float.isNaN(o2.asInstanceOf[Float])) {
+              return false
+            }
+          case d1: Double if java.lang.Double.isNaN(d1) =>
+            if (!o2.isInstanceOf[Double] || ! java.lang.Double.isNaN(o2.asInstanceOf[Double])) {
+              return false
+            }
+          case _ => if (o1 != o2) {
+            return false
+          }
+        }
+      }
+      i += 1
+    }
+    true
+  }
+
+  override def hashCode: Int = {
+    var result: Int = 37
+    var i = 0
+    val len = numElements()
+    while (i < len) {
+      val update: Int =
+        if (isNullAt(i)) {
+          0
+        } else {
+          genericGet(i) match {
+            case b: Boolean => if (b) 0 else 1
+            case b: Byte => b.toInt
+            case s: Short => s.toInt
+            case i: Int => i
+            case l: Long => (l ^ (l >>> 32)).toInt
+            case f: Float => java.lang.Float.floatToIntBits(f)
+            case d: Double =>
+              val b = java.lang.Double.doubleToLongBits(d)
+              (b ^ (b >>> 32)).toInt
+            case a: Array[Byte] => java.util.Arrays.hashCode(a)
+            case other => other.hashCode()
+          }
+        }
+      result = 37 * result + update
+      i += 1
+    }
+    result
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapData.scala
new file mode 100644
index 0000000000000..5514c3cd8546a
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/MapData.scala
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.types
+
+abstract class MapData extends Serializable {
+
+  def numElements(): Int
+
+  def keyArray(): ArrayData
+
+  def valueArray(): ArrayData
+
+  def foreach(keyType: DataType, valueType: DataType, f: (Any, Any) => Unit): Unit = {
+    val length = numElements()
+    val keys = keyArray()
+    val values = valueArray()
+    var i = 0
+    while (i < length) {
+      f(keys.get(i, keyType), values.get(i, valueType))
+      i += 1
+    }
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
index 3fa246b69d1f1..e60990aeb423f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala
@@ -171,8 +171,6 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
 
   test("error message of ExtractValue") {
     val structType = StructType(StructField("a", StringType, true) :: Nil)
-    val arrayStructType = ArrayType(structType)
-    val arrayType = ArrayType(StringType)
     val otherType = StringType
 
     def checkErrorMessage(
@@ -189,8 +187,6 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
     }
 
     checkErrorMessage(structType, IntegerType, "Field name should be String Literal")
-    checkErrorMessage(arrayStructType, BooleanType, "Field name should be String Literal")
-    checkErrorMessage(arrayType, StringType, "Array index should be integral type")
     checkErrorMessage(otherType, StringType, "Can't extract value from")
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
index a0e1701339ea7..44f845620a109 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala
@@ -87,7 +87,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers {
 
     val row = new SpecificMutableRow(fieldTypes)
     row.setLong(0, 0)
-    row.setString(1, "Hello")
+    row.update(1, UTF8String.fromString("Hello"))
     row.update(2, DateTimeUtils.fromJavaDate(Date.valueOf("1970-01-01")))
     row.update(3, DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-05-08 08:10:25")))
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
index f26f41fb75d57..c37007f1eece7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala
@@ -159,10 +159,16 @@ package object debug {
       case (row: InternalRow, StructType(fields)) =>
         row.toSeq.zip(fields.map(_.dataType)).foreach { case(d, t) => typeCheck(d, t) }
       case (a: ArrayData, ArrayType(elemType, _)) =>
-        a.toArray().foreach(typeCheck(_, elemType))
-      case (m: Map[_, _], MapType(keyType, valueType, _)) =>
-        m.keys.foreach(typeCheck(_, keyType))
-        m.values.foreach(typeCheck(_, valueType))
+        a.foreach(elemType, (_, e) => {
+          typeCheck(e, elemType)
+        })
+      case (m: MapData, MapType(keyType, valueType, _)) =>
+        m.keyArray().foreach(keyType, (_, e) => {
+          typeCheck(e, keyType)
+        })
+        m.valueArray().foreach(valueType, (_, e) => {
+          typeCheck(e, valueType)
+        })
 
       case (_: Long, LongType) =>
       case (_: Int, IntegerType) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
index ef1c6e57dc08a..aade2e769ccd6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUDFs.scala
@@ -135,22 +135,18 @@ object EvaluatePython {
       new GenericInternalRowWithSchema(values, struct)
 
     case (a: ArrayData, array: ArrayType) =>
-      val length = a.numElements()
-      val values = new java.util.ArrayList[Any](length)
-      var i = 0
-      while (i < length) {
-        if (a.isNullAt(i)) {
-          values.add(null)
-        } else {
-          values.add(toJava(a.get(i), array.elementType))
-        }
-        i += 1
-      }
+      val values = new java.util.ArrayList[Any](a.numElements())
+      a.foreach(array.elementType, (_, e) => {
+        values.add(toJava(e, array.elementType))
+      })
       values
 
-    case (obj: Map[_, _], mt: MapType) => obj.map {
-      case (k, v) => (toJava(k, mt.keyType), toJava(v, mt.valueType))
-    }.asJava
+    case (map: MapData, mt: MapType) =>
+      val jmap = new java.util.HashMap[Any, Any](map.numElements())
+      map.foreach(mt.keyType, mt.valueType, (k, v) => {
+        jmap.put(toJava(k, mt.keyType), toJava(v, mt.valueType))
+      })
+      jmap
 
     case (ud, udt: UserDefinedType[_]) => toJava(ud, udt.sqlType)
 
@@ -206,9 +202,10 @@ object EvaluatePython {
     case (c, ArrayType(elementType, _)) if c.getClass.isArray =>
       new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType)))
 
-    case (c: java.util.Map[_, _], MapType(keyType, valueType, _)) => c.map {
-      case (key, value) => (fromJava(key, keyType), fromJava(value, valueType))
-    }.toMap
+    case (c: java.util.Map[_, _], MapType(keyType, valueType, _)) =>
+      val keys = c.keysIterator.map(fromJava(_, keyType)).toArray
+      val values = c.valuesIterator.map(fromJava(_, valueType)).toArray
+      ArrayBasedMapData(keys, values)
 
     case (c, StructType(fields)) if c.getClass.isArray =>
       new GenericInternalRow(c.asInstanceOf[Array[_]].zip(fields).map {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
index 1c309f8794ef3..bf0448ee9645d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JacksonParser.scala
@@ -19,10 +19,10 @@ package org.apache.spark.sql.json
 
 import java.io.ByteArrayOutputStream
 
-import scala.collection.Map
-
 import com.fasterxml.jackson.core._
 
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
@@ -31,7 +31,6 @@ import org.apache.spark.sql.json.JacksonUtils.nextUntil
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
-
 private[sql] object JacksonParser {
   def apply(
       json: RDD[String],
@@ -160,21 +159,21 @@ private[sql] object JacksonParser {
   private def convertMap(
       factory: JsonFactory,
       parser: JsonParser,
-      valueType: DataType): Map[UTF8String, Any] = {
-    val builder = Map.newBuilder[UTF8String, Any]
+      valueType: DataType): MapData = {
+    val keys = ArrayBuffer.empty[UTF8String]
+    val values = ArrayBuffer.empty[Any]
     while (nextUntil(parser, JsonToken.END_OBJECT)) {
-      builder +=
-        UTF8String.fromString(parser.getCurrentName) -> convertField(factory, parser, valueType)
+      keys += UTF8String.fromString(parser.getCurrentName)
+      values += convertField(factory, parser, valueType)
     }
-
-    builder.result()
+    ArrayBasedMapData(keys.toArray, values.toArray)
   }
 
   private def convertArray(
       factory: JsonFactory,
       parser: JsonParser,
       elementType: DataType): ArrayData = {
-    val values = scala.collection.mutable.ArrayBuffer.empty[Any]
+    val values = ArrayBuffer.empty[Any]
     while (nextUntil(parser, JsonToken.END_ARRAY)) {
       values += convertField(factory, parser, elementType)
     }
@@ -213,7 +212,7 @@ private[sql] object JacksonParser {
               if (array.numElements() == 0) {
                 Nil
               } else {
-                array.toArray().map(_.asInstanceOf[InternalRow])
+                array.toArray[InternalRow](schema)
               }
             case _ =>
               sys.error(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
index 172db8362afb6..6938b071065cd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala
@@ -385,7 +385,8 @@ private[parquet] class CatalystRowConverter(
       updater: ParentContainerUpdater)
     extends GroupConverter {
 
-    private var currentMap: mutable.Map[Any, Any] = _
+    private var currentKeys: ArrayBuffer[Any] = _
+    private var currentValues: ArrayBuffer[Any] = _
 
     private val keyValueConverter = {
       val repeatedType = parquetType.getType(0).asGroupType()
@@ -398,12 +399,16 @@ private[parquet] class CatalystRowConverter(
 
     override def getConverter(fieldIndex: Int): Converter = keyValueConverter
 
-    override def end(): Unit = updater.set(currentMap)
+    override def end(): Unit =
+      updater.set(ArrayBasedMapData(currentKeys.toArray, currentValues.toArray))
 
     // NOTE: We can't reuse the mutable Map here and must instantiate a new `Map` for the next
     // value.  `Row.copy()` only copies row cells, it doesn't do deep copy to objects stored in row
     // cells.
-    override def start(): Unit = currentMap = mutable.Map.empty[Any, Any]
+    override def start(): Unit = {
+      currentKeys = ArrayBuffer.empty[Any]
+      currentValues = ArrayBuffer.empty[Any]
+    }
 
     /** Parquet converter for key-value pairs within the map. */
     private final class KeyValueConverter(
@@ -430,7 +435,10 @@ private[parquet] class CatalystRowConverter(
 
       override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)
 
-      override def end(): Unit = currentMap(currentKey) = currentValue
+      override def end(): Unit = {
+        currentKeys += currentKey
+        currentValues += currentValue
+      }
 
       override def start(): Unit = {
         currentKey = null
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
index 2332a36468dbc..6ed3580af0729 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.parquet
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.types.ArrayData
+import org.apache.spark.sql.types.{MapData, ArrayData}
 
 // TODO Removes this while fixing SPARK-8848
 private[sql] object CatalystConverter {
@@ -33,7 +33,7 @@ private[sql] object CatalystConverter {
   val MAP_SCHEMA_NAME = "map"
 
   // TODO: consider using Array[T] for arrays to avoid boxing of primitive types
-  type ArrayScalaType[T] = ArrayData
-  type StructScalaType[T] = InternalRow
-  type MapScalaType[K, V] = Map[K, V]
+  type ArrayScalaType = ArrayData
+  type StructScalaType = InternalRow
+  type MapScalaType = MapData
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index ec8da38a3d427..9cd0250f9c510 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -88,13 +88,13 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
         case t: UserDefinedType[_] => writeValue(t.sqlType, value)
         case t @ ArrayType(_, _) => writeArray(
           t,
-          value.asInstanceOf[CatalystConverter.ArrayScalaType[_]])
+          value.asInstanceOf[CatalystConverter.ArrayScalaType])
         case t @ MapType(_, _, _) => writeMap(
           t,
-          value.asInstanceOf[CatalystConverter.MapScalaType[_, _]])
+          value.asInstanceOf[CatalystConverter.MapScalaType])
         case t @ StructType(_) => writeStruct(
           t,
-          value.asInstanceOf[CatalystConverter.StructScalaType[_]])
+          value.asInstanceOf[CatalystConverter.StructScalaType])
         case _ => writePrimitive(schema.asInstanceOf[AtomicType], value)
       }
     }
@@ -124,7 +124,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
 
   private[parquet] def writeStruct(
       schema: StructType,
-      struct: CatalystConverter.StructScalaType[_]): Unit = {
+      struct: CatalystConverter.StructScalaType): Unit = {
     if (struct != null) {
       val fields = schema.fields.toArray
       writer.startGroup()
@@ -143,7 +143,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
 
   private[parquet] def writeArray(
       schema: ArrayType,
-      array: CatalystConverter.ArrayScalaType[_]): Unit = {
+      array: CatalystConverter.ArrayScalaType): Unit = {
     val elementType = schema.elementType
     writer.startGroup()
     if (array.numElements() > 0) {
@@ -154,7 +154,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
           writer.startGroup()
           if (!array.isNullAt(i)) {
             writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0)
-            writeValue(elementType, array.get(i))
+            writeValue(elementType, array.get(i, elementType))
             writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0)
           }
           writer.endGroup()
@@ -165,7 +165,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
         writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0)
         var i = 0
         while (i < array.numElements()) {
-          writeValue(elementType, array.get(i))
+          writeValue(elementType, array.get(i, elementType))
           i = i + 1
         }
         writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0)
@@ -176,11 +176,12 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
 
   private[parquet] def writeMap(
       schema: MapType,
-      map: CatalystConverter.MapScalaType[_, _]): Unit = {
+      map: CatalystConverter.MapScalaType): Unit = {
     writer.startGroup()
-    if (map.size > 0) {
+    val length = map.numElements()
+    if (length > 0) {
       writer.startField(CatalystConverter.MAP_SCHEMA_NAME, 0)
-      for ((key, value) <- map) {
+      map.foreach(schema.keyType, schema.valueType, (key, value) => {
         writer.startGroup()
         writer.startField(CatalystConverter.MAP_KEY_SCHEMA_NAME, 0)
         writeValue(schema.keyType, key)
@@ -191,7 +192,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Lo
           writer.endField(CatalystConverter.MAP_VALUE_SCHEMA_NAME, 1)
         }
         writer.endGroup()
-      }
+      })
       writer.endField(CatalystConverter.MAP_SCHEMA_NAME, 0)
     }
     writer.endGroup()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 97beae2f85c50..aef940a526675 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -620,6 +620,7 @@ class DataFrameSuite extends QueryTest with SQLTestUtils {
     assert(complexData.filter(complexData("m")("1") === 1).count() == 1)
     assert(complexData.filter(complexData("s")("key") === 1).count() == 1)
     assert(complexData.filter(complexData("m")(complexData("s")("value")) === 1).count() == 1)
+    assert(complexData.filter(complexData("a")(complexData("s")("key")) === 1).count() == 1)
   }
 
   test("SPARK-7551: support backticks for DataFrame attribute resolution") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
index 01b7c21e84159..8a679c7865d6a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/RowSuite.scala
@@ -19,9 +19,9 @@ package org.apache.spark.sql
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.execution.SparkSqlSerializer
-
 import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, SpecificMutableRow}
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 class RowSuite extends SparkFunSuite {
 
@@ -31,7 +31,7 @@ class RowSuite extends SparkFunSuite {
   test("create row") {
     val expected = new GenericMutableRow(4)
     expected.setInt(0, 2147483647)
-    expected.setString(1, "this is a string")
+    expected.update(1, UTF8String.fromString("this is a string"))
     expected.setBoolean(2, false)
     expected.setNullAt(3)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 535011fe3db5b..51fe9d9d98bf3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -581,42 +581,28 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
   }
 
   test("sorting") {
-    val before = sqlContext.conf.externalSortEnabled
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, false)
-    sortTest()
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, before)
+    withSQLConf(SQLConf.EXTERNAL_SORT.key -> "false") {
+      sortTest()
+    }
   }
 
   test("external sorting") {
-    val before = sqlContext.conf.externalSortEnabled
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, true)
-    sortTest()
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, before)
+    withSQLConf(SQLConf.EXTERNAL_SORT.key -> "true") {
+      sortTest()
+    }
   }
 
   test("SPARK-6927 sorting with codegen on") {
-    val externalbefore = sqlContext.conf.externalSortEnabled
-    val codegenbefore = sqlContext.conf.codegenEnabled
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, false)
-    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, true)
-    try{
+    withSQLConf(SQLConf.EXTERNAL_SORT.key -> "false",
+      SQLConf.CODEGEN_ENABLED.key -> "true") {
       sortTest()
-    } finally {
-      sqlContext.setConf(SQLConf.EXTERNAL_SORT, externalbefore)
-      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, codegenbefore)
     }
   }
 
   test("SPARK-6927 external sorting with codegen on") {
-    val externalbefore = sqlContext.conf.externalSortEnabled
-    val codegenbefore = sqlContext.conf.codegenEnabled
-    sqlContext.setConf(SQLConf.CODEGEN_ENABLED, true)
-    sqlContext.setConf(SQLConf.EXTERNAL_SORT, true)
-    try {
+    withSQLConf(SQLConf.EXTERNAL_SORT.key -> "true",
+      SQLConf.CODEGEN_ENABLED.key -> "true") {
       sortTest()
-    } finally {
-      sqlContext.setConf(SQLConf.EXTERNAL_SORT, externalbefore)
-      sqlContext.setConf(SQLConf.CODEGEN_ENABLED, codegenbefore)
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
index e340f54850bcc..bd9729c431f30 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
@@ -190,8 +190,8 @@ object TestData {
   case class ComplexData(m: Map[String, Int], s: TestData, a: Seq[Int], b: Boolean)
   val complexData =
     TestSQLContext.sparkContext.parallelize(
-      ComplexData(Map("1" -> 1), TestData(1, "1"), Seq(1), true)
-        :: ComplexData(Map("2" -> 2), TestData(2, "2"), Seq(2), false)
+      ComplexData(Map("1" -> 1), TestData(1, "1"), Seq(1, 1, 1), true)
+        :: ComplexData(Map("2" -> 2), TestData(2, "2"), Seq(2, 2, 2), false)
         :: Nil).toDF()
   complexData.registerTempTable("complexData")
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
index 77ed4a9c0d5ae..f29935224e5b6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala
@@ -57,7 +57,7 @@ private[sql] class MyDenseVectorUDT extends UserDefinedType[MyDenseVector] {
   override def deserialize(datum: Any): MyDenseVector = {
     datum match {
       case data: ArrayData =>
-        new MyDenseVector(data.toArray.map(_.asInstanceOf[Double]))
+        new MyDenseVector(data.toDoubleArray())
     }
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 5926ef9aa388b..39d798d072aeb 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -51,7 +51,7 @@ import scala.collection.JavaConversions._
  *     java.sql.Date
  *     java.sql.Timestamp
  *  Complex Types =>
- *    Map: scala.collection.immutable.Map
+ *    Map: [[org.apache.spark.sql.types.MapData]]
  *    List: [[org.apache.spark.sql.types.ArrayData]]
  *    Struct: [[org.apache.spark.sql.catalyst.InternalRow]]
  *    Union: NOT SUPPORTED YET
@@ -290,10 +290,10 @@ private[hive] trait HiveInspectors {
       DateTimeUtils.fromJavaDate(poi.getWritableConstantValue.get())
     case mi: StandardConstantMapObjectInspector =>
       // take the value from the map inspector object, rather than the input data
-      mi.getWritableConstantValue.map { case (k, v) =>
-        (unwrap(k, mi.getMapKeyObjectInspector),
-          unwrap(v, mi.getMapValueObjectInspector))
-      }.toMap
+      val map = mi.getWritableConstantValue
+      val keys = map.keysIterator.map(unwrap(_, mi.getMapKeyObjectInspector)).toArray
+      val values = map.valuesIterator.map(unwrap(_, mi.getMapValueObjectInspector)).toArray
+      ArrayBasedMapData(keys, values)
     case li: StandardConstantListObjectInspector =>
       // take the value from the list inspector object, rather than the input data
       val values = li.getWritableConstantValue
@@ -347,12 +347,14 @@ private[hive] trait HiveInspectors {
         }
         .orNull
     case mi: MapObjectInspector =>
-      Option(mi.getMap(data)).map(
-        _.map {
-          case (k, v) =>
-            (unwrap(k, mi.getMapKeyObjectInspector),
-              unwrap(v, mi.getMapValueObjectInspector))
-        }.toMap).orNull
+      val map = mi.getMap(data)
+      if (map == null) {
+        null
+      } else {
+        val keys = map.keysIterator.map(unwrap(_, mi.getMapKeyObjectInspector)).toArray
+        val values = map.valuesIterator.map(unwrap(_, mi.getMapValueObjectInspector)).toArray
+        ArrayBasedMapData(keys, values)
+      }
     // currently, hive doesn't provide the ConstantStructObjectInspector
     case si: StructObjectInspector =>
       val allRefs = si.getAllStructFieldRefs
@@ -365,7 +367,7 @@ private[hive] trait HiveInspectors {
    * Wraps with Hive types based on object inspector.
    * TODO: Consolidate all hive OI/data interface code.
    */
-  protected def wrapperFor(oi: ObjectInspector): Any => Any = oi match {
+  protected def wrapperFor(oi: ObjectInspector, dataType: DataType): Any => Any = oi match {
     case _: JavaHiveVarcharObjectInspector =>
       (o: Any) =>
         val s = o.asInstanceOf[UTF8String].toString
@@ -381,7 +383,10 @@ private[hive] trait HiveInspectors {
       (o: Any) => DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long])
 
     case soi: StandardStructObjectInspector =>
-      val wrappers = soi.getAllStructFieldRefs.map(ref => wrapperFor(ref.getFieldObjectInspector))
+      val schema = dataType.asInstanceOf[StructType]
+      val wrappers = soi.getAllStructFieldRefs.zip(schema.fields).map { case (ref, field) =>
+        wrapperFor(ref.getFieldObjectInspector, field.dataType)
+      }
       (o: Any) => {
         if (o != null) {
           val struct = soi.create()
@@ -395,27 +400,34 @@ private[hive] trait HiveInspectors {
       }
 
     case loi: ListObjectInspector =>
-      val wrapper = wrapperFor(loi.getListElementObjectInspector)
+      val elementType = dataType.asInstanceOf[ArrayType].elementType
+      val wrapper = wrapperFor(loi.getListElementObjectInspector, elementType)
       (o: Any) => {
         if (o != null) {
-          seqAsJavaList(o.asInstanceOf[ArrayData].toArray().map(wrapper))
+          val array = o.asInstanceOf[ArrayData]
+          val values = new java.util.ArrayList[Any](array.numElements())
+          array.foreach(elementType, (_, e) => {
+            values.add(wrapper(e))
+          })
+          values
         } else {
           null
         }
       }
 
     case moi: MapObjectInspector =>
-      // The Predef.Map is scala.collection.immutable.Map.
-      // Since the map values can be mutable, we explicitly import scala.collection.Map at here.
-      import scala.collection.Map
+      val mt = dataType.asInstanceOf[MapType]
+      val keyWrapper = wrapperFor(moi.getMapKeyObjectInspector, mt.keyType)
+      val valueWrapper = wrapperFor(moi.getMapValueObjectInspector, mt.valueType)
 
-      val keyWrapper = wrapperFor(moi.getMapKeyObjectInspector)
-      val valueWrapper = wrapperFor(moi.getMapValueObjectInspector)
       (o: Any) => {
         if (o != null) {
-          mapAsJavaMap(o.asInstanceOf[Map[_, _]].map { case (key, value) =>
-            keyWrapper(key) -> valueWrapper(value)
+          val map = o.asInstanceOf[MapData]
+          val jmap = new java.util.HashMap[Any, Any](map.numElements())
+          map.foreach(mt.keyType, mt.valueType, (k, v) => {
+            jmap.put(keyWrapper(k), valueWrapper(v))
           })
+          jmap
         } else {
           null
         }
@@ -531,18 +543,21 @@ private[hive] trait HiveInspectors {
     case x: ListObjectInspector =>
       val list = new java.util.ArrayList[Object]
       val tpe = dataType.asInstanceOf[ArrayType].elementType
-      a.asInstanceOf[ArrayData].toArray().foreach {
-        v => list.add(wrap(v, x.getListElementObjectInspector, tpe))
-      }
+      a.asInstanceOf[ArrayData].foreach(tpe, (_, e) => {
+        list.add(wrap(e, x.getListElementObjectInspector, tpe))
+      })
       list
     case x: MapObjectInspector =>
       val keyType = dataType.asInstanceOf[MapType].keyType
       val valueType = dataType.asInstanceOf[MapType].valueType
+      val map = a.asInstanceOf[MapData]
+
       // Some UDFs seem to assume we pass in a HashMap.
-      val hashMap = new java.util.HashMap[AnyRef, AnyRef]()
-      hashMap.putAll(a.asInstanceOf[Map[_, _]].map { case (k, v) =>
-        wrap(k, x.getMapKeyObjectInspector, keyType) ->
-          wrap(v, x.getMapValueObjectInspector, valueType)
+      val hashMap = new java.util.HashMap[Any, Any](map.numElements())
+
+      map.foreach(keyType, valueType, (k, v) => {
+        hashMap.put(wrap(k, x.getMapKeyObjectInspector, keyType),
+          wrap(v, x.getMapValueObjectInspector, valueType))
       })
 
       hashMap
@@ -645,8 +660,9 @@ private[hive] trait HiveInspectors {
         ObjectInspectorFactory.getStandardConstantListObjectInspector(listObjectInspector, null)
       } else {
         val list = new java.util.ArrayList[Object]()
-        value.asInstanceOf[ArrayData].toArray()
-          .foreach(v => list.add(wrap(v, listObjectInspector, dt)))
+        value.asInstanceOf[ArrayData].foreach(dt, (_, e) => {
+          list.add(wrap(e, listObjectInspector, dt))
+        })
         ObjectInspectorFactory.getStandardConstantListObjectInspector(listObjectInspector, list)
       }
     case Literal(value, MapType(keyType, valueType, _)) =>
@@ -655,11 +671,14 @@ private[hive] trait HiveInspectors {
       if (value == null) {
         ObjectInspectorFactory.getStandardConstantMapObjectInspector(keyOI, valueOI, null)
       } else {
-        val map = new java.util.HashMap[Object, Object]()
-        value.asInstanceOf[Map[_, _]].foreach (entry => {
-          map.put(wrap(entry._1, keyOI, keyType), wrap(entry._2, valueOI, valueType))
+        val map = value.asInstanceOf[MapData]
+        val jmap = new java.util.HashMap[Any, Any](map.numElements())
+
+        map.foreach(keyType, valueType, (k, v) => {
+          jmap.put(wrap(k, keyOI, keyType), wrap(v, valueOI, valueType))
         })
-        ObjectInspectorFactory.getStandardConstantMapObjectInspector(keyOI, valueOI, map)
+
+        ObjectInspectorFactory.getStandardConstantMapObjectInspector(keyOI, valueOI, jmap)
       }
     // We will enumerate all of the possible constant expressions, throw exception if we missed
     case Literal(_, dt) => sys.error(s"Hive doesn't support the constant type [$dt].")
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
index e4944caeff924..40a6a32156687 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -95,9 +95,9 @@ case class InsertIntoHiveTable(
         .asInstanceOf[StructObjectInspector]
 
       val fieldOIs = standardOI.getAllStructFieldRefs.map(_.getFieldObjectInspector).toArray
-      val wrappers = fieldOIs.map(wrapperFor)
-      val outputData = new Array[Any](fieldOIs.length)
       val dataTypes: Array[DataType] = child.output.map(_.dataType).toArray
+      val wrappers = fieldOIs.zip(dataTypes).map { case (f, dt) => wrapperFor(f, dt)}
+      val outputData = new Array[Any](fieldOIs.length)
 
       writerContainer.executorSideSetup(context.stageId, context.partitionId, context.attemptNumber)
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
index 4a13022eddf60..abe5c69003130 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala
@@ -428,10 +428,10 @@ private[hive] case class HiveWindowFunction(
       // if pivotResult is false, we will get a single value for all rows in the frame.
       outputBuffer
     } else {
-      // if pivotResult is true, we will get a Seq having the same size with the size
+      // if pivotResult is true, we will get a ArrayData having the same size with the size
       // of the window frame. At here, we will return the result at the position of
       // index in the output buffer.
-      outputBuffer.asInstanceOf[ArrayData].get(index)
+      outputBuffer.asInstanceOf[ArrayData].get(index, dataType)
     }
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 924f4d37ce21f..6fa599734892b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -95,9 +95,10 @@ private[orc] class OrcOutputWriter(
   private val reusableOutputBuffer = new Array[Any](dataSchema.length)
 
   // Used to convert Catalyst values into Hadoop `Writable`s.
-  private val wrappers = structOI.getAllStructFieldRefs.map { ref =>
-    wrapperFor(ref.getFieldObjectInspector)
-  }.toArray
+  private val wrappers = structOI.getAllStructFieldRefs.zip(dataSchema.fields.map(_.dataType))
+    .map { case (ref, dt) =>
+      wrapperFor(ref.getFieldObjectInspector, dt)
+    }.toArray
 
   // `OrcRecordWriter.close()` creates an empty file if no rows are written at all.  We use this
   // flag to decide whether `OrcRecordWriter.close()` needs to be called.
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
index f719f2e06ab63..99e95fb921301 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala
@@ -147,6 +147,8 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
       case (r1: Array[Byte], r2: Array[Byte])
         if r1 != null && r2 != null && r1.length == r2.length =>
         r1.zip(r2).foreach { case (b1, b2) => assert(b1 === b2) }
+      // We don't support equality & ordering for map type, so skip it.
+      case (r1: MapData, r2: MapData) =>
       case (r1, r2) => assert(r1 === r2)
     }
   }
@@ -230,7 +232,7 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors {
   test("wrap / unwrap Map Type") {
     val dt = MapType(dataTypes(0), dataTypes(1))
 
-    val d = Map(row(0) -> row(1))
+    val d = ArrayBasedMapData(Array(row(0)), Array(row(1)))
     checkValue(d, unwrap(wrap(d, toInspector(dt), dt), toInspector(dt)))
     checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt)))
     checkValue(d,

From 60ea7ab4bbfaea29a6cdf4e0e71ddc56afd04de6 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies.liu@gmail.com>
Date: Sat, 1 Aug 2015 00:41:15 -0700
Subject: [PATCH 546/549] Revert "[SPARK-8232] [SQL] Add sort_array support"

This reverts commit 67ad4e21fc68336b0ad6f9a363fb5ebb51f592bf.
---
 python/pyspark/sql/functions.py               | 20 -----
 .../catalyst/analysis/FunctionRegistry.scala  |  1 -
 .../expressions/collectionOperations.scala    | 80 +------------------
 .../CollectionFunctionsSuite.scala            | 22 -----
 .../org/apache/spark/sql/functions.scala      | 19 +----
 .../spark/sql/DataFrameFunctionsSuite.scala   | 51 +-----------
 6 files changed, 7 insertions(+), 186 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index fb542e6cff81a..89a2a5ceaa9bf 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -51,7 +51,6 @@
     'sha1',
     'sha2',
     'size',
-    'sort_array',
     'sparkPartitionId',
     'struct',
     'udf',
@@ -571,10 +570,8 @@ def length(col):
 def format_number(col, d):
     """Formats the number X to a format like '#,###,###.##', rounded to d decimal places,
        and returns the result as a string.
-
     :param col: the column name of the numeric value to be formatted
     :param d: the N decimal places
-
     >>> sqlContext.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect()
     [Row(v=u'5.0000')]
     """
@@ -971,23 +968,6 @@ def soundex(col):
     return Column(sc._jvm.functions.size(_to_java_column(col)))
 
 
-@since(1.5)
-def sort_array(col, asc=True):
-    """
-    Collection function: sorts the input array for the given column in ascending order.
-
-    :param col: name of column or expression
-
-    >>> df = sqlContext.createDataFrame([([2, 1, 3],),([1],),([],)], ['data'])
-    >>> df.select(sort_array(df.data).alias('r')).collect()
-    [Row(r=[1, 2, 3]), Row(r=[1]), Row(r=[])]
-    >>> df.select(sort_array(df.data, asc=False).alias('r')).collect()
-    [Row(r=[3, 2, 1]), Row(r=[1]), Row(r=[])]
-     """
-    sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.sort_array(_to_java_column(col), asc))
-
-
 class UserDefinedFunction(object):
     """
     User defined function in Python
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 6e144518bb009..ee44cbcba68e7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -233,7 +233,6 @@ object FunctionRegistry {
 
     // collection functions
     expression[Size]("size"),
-    expression[SortArray]("sort_array"),
 
     // misc functions
     expression[Crc32]("crc32"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 1156797b2c37d..015bbb658266d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -16,10 +16,7 @@
  */
 package org.apache.spark.sql.catalyst.expressions
 
-import java.util.Comparator
-
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenFallback, CodeGenContext, GeneratedExpressionCode}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
 import org.apache.spark.sql.types._
 
 /**
@@ -38,78 +35,3 @@ case class Size(child: Expression) extends UnaryExpression with ExpectsInputType
     nullSafeCodeGen(ctx, ev, c => s"${ev.primitive} = ($c).numElements();")
   }
 }
-
-/**
- * Sorts the input array in ascending / descending order according to the natural ordering of
- * the array elements and returns it.
- */
-case class SortArray(base: Expression, ascendingOrder: Expression)
-  extends BinaryExpression with ExpectsInputTypes with CodegenFallback {
-
-  def this(e: Expression) = this(e, Literal(true))
-
-  override def left: Expression = base
-  override def right: Expression = ascendingOrder
-  override def dataType: DataType = base.dataType
-  override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType, BooleanType)
-
-  override def checkInputDataTypes(): TypeCheckResult = base.dataType match {
-    case _ @ ArrayType(n: AtomicType, _) => TypeCheckResult.TypeCheckSuccess
-    case _ @ ArrayType(n, _) => TypeCheckResult.TypeCheckFailure(
-                    s"Type $n is not the AtomicType, we can not perform the ordering operations")
-    case other =>
-      TypeCheckResult.TypeCheckFailure(s"ArrayType(AtomicType) is expected, but we got $other")
-  }
-
-  @transient
-  private lazy val lt = {
-    val ordering = base.dataType match {
-      case _ @ ArrayType(n: AtomicType, _) => n.ordering.asInstanceOf[Ordering[Any]]
-    }
-
-    new Comparator[Any]() {
-      override def compare(o1: Any, o2: Any): Int = {
-        if (o1 == null && o2 == null) {
-          0
-        } else if (o1 == null) {
-          -1
-        } else if (o2 == null) {
-          1
-        } else {
-          ordering.compare(o1, o2)
-        }
-      }
-    }
-  }
-
-  @transient
-  private lazy val gt = {
-    val ordering = base.dataType match {
-      case _ @ ArrayType(n: AtomicType, _) => n.ordering.asInstanceOf[Ordering[Any]]
-    }
-
-    new Comparator[Any]() {
-      override def compare(o1: Any, o2: Any): Int = {
-        if (o1 == null && o2 == null) {
-          0
-        } else if (o1 == null) {
-          1
-        } else if (o2 == null) {
-          -1
-        } else {
-          -ordering.compare(o1, o2)
-        }
-      }
-    }
-  }
-
-  override def nullSafeEval(array: Any, ascending: Any): Any = {
-    val data = array.asInstanceOf[ArrayData].toArray().asInstanceOf[Array[AnyRef]]
-    java.util.Arrays.sort(
-      data,
-      if (ascending.asInstanceOf[Boolean]) lt else gt)
-    new GenericArrayData(data.asInstanceOf[Array[Any]])
-  }
-
-  override def prettyName: String = "sort_array"
-}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala
index 2c7e85c446ec6..28c41b57169f9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionFunctionsSuite.scala
@@ -43,26 +43,4 @@ class CollectionFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Literal.create(null, MapType(StringType, StringType)), null)
     checkEvaluation(Literal.create(null, ArrayType(StringType)), null)
   }
-
-  test("Sort Array") {
-    val a0 = Literal.create(Seq(2, 1, 3), ArrayType(IntegerType))
-    val a1 = Literal.create(Seq[Integer](), ArrayType(IntegerType))
-    val a2 = Literal.create(Seq("b", "a"), ArrayType(StringType))
-    val a3 = Literal.create(Seq("b", null, "a"), ArrayType(StringType))
-
-    checkEvaluation(new SortArray(a0), Seq(1, 2, 3))
-    checkEvaluation(new SortArray(a1), Seq[Integer]())
-    checkEvaluation(new SortArray(a2), Seq("a", "b"))
-    checkEvaluation(new SortArray(a3), Seq(null, "a", "b"))
-    checkEvaluation(SortArray(a0, Literal(true)), Seq(1, 2, 3))
-    checkEvaluation(SortArray(a1, Literal(true)), Seq[Integer]())
-    checkEvaluation(SortArray(a2, Literal(true)), Seq("a", "b"))
-    checkEvaluation(new SortArray(a3, Literal(true)), Seq(null, "a", "b"))
-    checkEvaluation(SortArray(a0, Literal(false)), Seq(3, 2, 1))
-    checkEvaluation(SortArray(a1, Literal(false)), Seq[Integer]())
-    checkEvaluation(SortArray(a2, Literal(false)), Seq("b", "a"))
-    checkEvaluation(new SortArray(a3, Literal(false)), Seq("b", "a", null))
-
-    checkEvaluation(Literal.create(null, ArrayType(StringType)), null)
-  }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 3c9421f5cd14b..57bb00a7417af 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2223,30 +2223,19 @@ object functions {
   //////////////////////////////////////////////////////////////////////////////////////////////
 
   /**
-   * Returns length of array or map.
-   *
+   * Returns length of array or map
    * @group collection_funcs
    * @since 1.5.0
    */
-  def size(e: Column): Column = Size(e.expr)
+  def size(columnName: String): Column = size(Column(columnName))
 
   /**
-   * Sorts the input array for the given column in ascending order,
-   * according to the natural ordering of the array elements.
-   *
+   * Returns length of array or map
    * @group collection_funcs
    * @since 1.5.0
    */
-  def sort_array(e: Column): Column = sort_array(e, true)
+  def size(column: Column): Column = Size(column.expr)
 
-  /**
-   * Sorts the input array for the given column in ascending / descending order,
-   * according to the natural ordering of the array elements.
-   *
-   * @group collection_funcs
-   * @since 1.5.0
-   */
-  def sort_array(e: Column, asc: Boolean): Column = SortArray(e.expr, lit(asc).expr)
 
   //////////////////////////////////////////////////////////////////////////////////////////////
   //////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 46921d14256b9..1baec5d37699d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -267,53 +267,6 @@ class DataFrameFunctionsSuite extends QueryTest {
     )
   }
 
-  test("sort_array function") {
-    val df = Seq(
-      (Array[Int](2, 1, 3), Array("b", "c", "a")),
-      (Array[Int](), Array[String]()),
-      (null, null)
-    ).toDF("a", "b")
-    checkAnswer(
-      df.select(sort_array($"a"), sort_array($"b")),
-      Seq(
-        Row(Seq(1, 2, 3), Seq("a", "b", "c")),
-        Row(Seq[Int](), Seq[String]()),
-        Row(null, null))
-    )
-    checkAnswer(
-      df.select(sort_array($"a", false), sort_array($"b", false)),
-      Seq(
-        Row(Seq(3, 2, 1), Seq("c", "b", "a")),
-        Row(Seq[Int](), Seq[String]()),
-        Row(null, null))
-    )
-    checkAnswer(
-      df.selectExpr("sort_array(a)", "sort_array(b)"),
-      Seq(
-        Row(Seq(1, 2, 3), Seq("a", "b", "c")),
-        Row(Seq[Int](), Seq[String]()),
-        Row(null, null))
-    )
-    checkAnswer(
-      df.selectExpr("sort_array(a, true)", "sort_array(b, false)"),
-      Seq(
-        Row(Seq(1, 2, 3), Seq("c", "b", "a")),
-        Row(Seq[Int](), Seq[String]()),
-        Row(null, null))
-    )
-
-    val df2 = Seq((Array[Array[Int]](Array(2)), "x")).toDF("a", "b")
-    assert(intercept[AnalysisException] {
-      df2.selectExpr("sort_array(a)").collect()
-    }.getMessage().contains("Type ArrayType(IntegerType,false) is not the AtomicType, " +
-      "we can not perform the ordering operations"))
-
-    val df3 = Seq(("xxx", "x")).toDF("a", "b")
-    assert(intercept[AnalysisException] {
-      df3.selectExpr("sort_array(a)").collect()
-    }.getMessage().contains("ArrayType(AtomicType) is expected, but we got StringType"))
-  }
-
   test("array size function") {
     val df = Seq(
       (Array[Int](1, 2), "x"),
@@ -321,7 +274,7 @@ class DataFrameFunctionsSuite extends QueryTest {
       (Array[Int](1, 2, 3), "z")
     ).toDF("a", "b")
     checkAnswer(
-      df.select(size($"a")),
+      df.select(size("a")),
       Seq(Row(2), Row(0), Row(3))
     )
     checkAnswer(
@@ -337,7 +290,7 @@ class DataFrameFunctionsSuite extends QueryTest {
       (Map[Int, Int](1 -> 1, 2 -> 2, 3 -> 3), "z")
     ).toDF("a", "b")
     checkAnswer(
-      df.select(size($"a")),
+      df.select(size("a")),
       Seq(Row(2), Row(0), Row(3))
     )
     checkAnswer(

From 65038973a17904e0e04d453799ec108af240fbab Mon Sep 17 00:00:00 2001
From: Holden Karau <holden@pigscanfly.ca>
Date: Sat, 1 Aug 2015 01:09:38 -0700
Subject: [PATCH 547/549] [SPARK-7446] [MLLIB] Add inverse transform for string
 indexer

It is useful to convert the encoded indices back to their string representation for result inspection. We can add a function which creates an inverse transformation.

Author: Holden Karau <holden@pigscanfly.ca>

Closes #6339 from holdenk/SPARK-7446-inverse-transform-for-string-indexer and squashes the following commits:

7cdf915 [Holden Karau] scala style comment fix
b9cffb6 [Holden Karau] Update the labels param to have the metadata note
6a38edb [Holden Karau] Setting the default needs to come after the value gets defined
9e241d8 [Holden Karau] use Array.empty
21c8cfa [Holden Karau] Merge branch 'master' into SPARK-7446-inverse-transform-for-string-indexer
64dd3a3 [Holden Karau] Merge branch 'master' into SPARK-7446-inverse-transform-for-string-indexer
4f06c59 [Holden Karau] Fix comment styles, use empty array as the default, etc.
a60c0e3 [Holden Karau] CR feedback (remove old constructor, add a note about use of setLabels)
1987b95 [Holden Karau] Use default copy
71e8d66 [Holden Karau] Make labels a local param for StringIndexerInverse
8450d0b [Holden Karau] Use the labels param in StringIndexerInverse
7464019 [Holden Karau] Add a labels param
868b1a9 [Holden Karau] Update scaladoc since we don't have labelsCol anymore
5aa38bf [Holden Karau] Add an inverse test using only meta data, pass labels when calling inverse method
f3e0c64 [Holden Karau] CR feedback
ebed932 [Holden Karau] Add Experimental tag and some scaladocs. Also don't require that the inputCol has the metadata on it, instead have the labelsCol specified when creating the inverse.
03ebf95 [Holden Karau] Add explicit type for invert function
ecc65e0 [Holden Karau] Read the metadata correctly, use the array, pass the test
a42d773 [Holden Karau] Fix test to supply cols as per new invert method
16cc3c3 [Holden Karau] Add an invert method
d4bcb20 [Holden Karau] Make the inverse string indexer into a transformer (still needs test updates but compiles)
e8bf3ad [Holden Karau] Merge branch 'master' into SPARK-7446-inverse-transform-for-string-indexer
c3fdee1 [Holden Karau] Some WIP refactoring based on jkbradley's CR feedback. Definite work-in-progress
557bef8 [Holden Karau] Instead of using a private inverse transform, add an invert function so we can use it in a pipeline
88779c1 [Holden Karau] fix long line
78b28c1 [Holden Karau] Finish reverse part and add a test :)
bb16a6a [Holden Karau] Some progress
---
 .../spark/ml/feature/StringIndexer.scala      | 108 +++++++++++++++++-
 .../spark/ml/feature/StringIndexerSuite.scala |  13 +++
 2 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index bf7be363b8224..ebfa972532358 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -20,13 +20,14 @@ package org.apache.spark.ml.feature
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{Estimator, Model}
-import org.apache.spark.ml.attribute.NominalAttribute
+import org.apache.spark.ml.attribute.{Attribute, NominalAttribute}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.{NumericType, StringType, StructType}
+import org.apache.spark.sql.types.{DoubleType, NumericType, StringType, StructType}
 import org.apache.spark.util.collection.OpenHashMap
 
 /**
@@ -151,4 +152,105 @@ class StringIndexerModel private[ml] (
     val copied = new StringIndexerModel(uid, labels)
     copyValues(copied, extra)
   }
+
+  /**
+   * Return a model to perform the inverse transformation.
+   * Note: By default we keep the original columns during this transformation, so the inverse
+   * should only be used on new columns such as predicted labels.
+   */
+  def invert(inputCol: String, outputCol: String): StringIndexerInverse = {
+    new StringIndexerInverse()
+      .setInputCol(inputCol)
+      .setOutputCol(outputCol)
+      .setLabels(labels)
+  }
+}
+
+/**
+ * :: Experimental ::
+ * Transform a provided column back to the original input types using either the metadata
+ * on the input column, or if provided using the labels supplied by the user.
+ * Note: By default we keep the original columns during this transformation,
+ * so the inverse should only be used on new columns such as predicted labels.
+ */
+@Experimental
+class StringIndexerInverse private[ml] (
+  override val uid: String) extends Transformer
+    with HasInputCol with HasOutputCol {
+
+  def this() =
+    this(Identifiable.randomUID("strIdxInv"))
+
+  /** @group setParam */
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  /**
+   * Optional labels to be provided by the user, if not supplied column
+   * metadata is read for labels. The default value is an empty array,
+   * but the empty array is ignored and column metadata used instead.
+   * @group setParam
+   */
+  def setLabels(value: Array[String]): this.type = set(labels, value)
+
+  /**
+   * Param for array of labels.
+   * Optional labels to be provided by the user, if not supplied column
+   * metadata is read for labels.
+   * @group param
+   */
+  final val labels: StringArrayParam = new StringArrayParam(this, "labels",
+    "array of labels, if not provided metadata from inputCol is used instead.")
+  setDefault(labels, Array.empty[String])
+
+  /**
+   * Optional labels to be provided by the user, if not supplied column
+   * metadata is read for labels.
+   * @group getParam
+   */
+  final def getLabels: Array[String] = $(labels)
+
+  /** Transform the schema for the inverse transformation */
+  override def transformSchema(schema: StructType): StructType = {
+    val inputColName = $(inputCol)
+    val inputDataType = schema(inputColName).dataType
+    require(inputDataType.isInstanceOf[NumericType],
+      s"The input column $inputColName must be a numeric type, " +
+        s"but got $inputDataType.")
+    val inputFields = schema.fields
+    val outputColName = $(outputCol)
+    require(inputFields.forall(_.name != outputColName),
+      s"Output column $outputColName already exists.")
+    val attr = NominalAttribute.defaultAttr.withName($(outputCol))
+    val outputFields = inputFields :+ attr.toStructField()
+    StructType(outputFields)
+  }
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    val inputColSchema = dataset.schema($(inputCol))
+    // If the labels array is empty use column metadata
+    val values = if ($(labels).isEmpty) {
+      Attribute.fromStructField(inputColSchema)
+        .asInstanceOf[NominalAttribute].values.get
+    } else {
+      $(labels)
+    }
+    val indexer = udf { index: Double =>
+      val idx = index.toInt
+      if (0 <= idx && idx < values.size) {
+        values(idx)
+      } else {
+        throw new SparkException(s"Unseen index: $index ??")
+      }
+    }
+    val outputColName = $(outputCol)
+    dataset.select(col("*"),
+      indexer(dataset($(inputCol)).cast(DoubleType)).as(outputColName))
+  }
+
+  override def copy(extra: ParamMap): StringIndexerInverse = {
+    defaultCopy(extra)
+  }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
index 99f82bea42688..d0295a0fe2fc1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -47,6 +47,19 @@ class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext {
     // a -> 0, b -> 2, c -> 1
     val expected = Set((0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0))
     assert(output === expected)
+    // convert reverse our transform
+    val reversed = indexer.invert("labelIndex", "label2")
+      .transform(transformed)
+      .select("id", "label2")
+    assert(df.collect().map(r => (r.getInt(0), r.getString(1))).toSet ===
+      reversed.collect().map(r => (r.getInt(0), r.getString(1))).toSet)
+    // Check invert using only metadata
+    val inverse2 = new StringIndexerInverse()
+      .setInputCol("labelIndex")
+      .setOutputCol("label2")
+    val reversed2 = inverse2.transform(transformed).select("id", "label2")
+    assert(df.collect().map(r => (r.getInt(0), r.getString(1))).toSet ===
+      reversed2.collect().map(r => (r.getInt(0), r.getString(1))).toSet)
   }
 
   test("StringIndexer with a numeric input column") {

From d2a9b66f6c0de89d6d16370af1c77c7f51b11d3e Mon Sep 17 00:00:00 2001
From: zhangjiajin <zhangjiajin@huawei.com>
Date: Sat, 1 Aug 2015 01:56:27 -0700
Subject: [PATCH 548/549] [SPARK-8999] [MLLIB] PrefixSpan non-temporal
 sequences

mengxr Extends PrefixSpan to non-temporal itemsets. Continues work by zhangjiajin

 * Internal API uses List[Set[Int]] which is likely not efficient; will need to refactor during QA

Closes #7646

Author: zhangjiajin <zhangjiajin@huawei.com>
Author: Feynman Liang <fliang@databricks.com>
Author: zhang jiajin <zhangjiajin@huawei.com>

Closes #7818 from feynmanliang/SPARK-8999-nonTemporal and squashes the following commits:

4ded81d [Feynman Liang] Replace all filters to filter nonempty
350e67e [Feynman Liang] Code review feedback
03156ca [Feynman Liang] Fix tests, drop delimiters at boundaries of sequences
d1fe0ed [Feynman Liang] Remove comments
86ca4e5 [Feynman Liang] Fix style
7c7bf39 [Feynman Liang] Fixed itemSet sequences
6073b10 [Feynman Liang] Basic itemset functionality, failing test
1a7fb48 [Feynman Liang] Add delimiter to results
5db00aa [Feynman Liang] Working for items, not itemsets
6787716 [Feynman Liang] Working on temporal sequences
f1114b9 [Feynman Liang] Add -1 delimiter
00fe756 [Feynman Liang] Reset base files for rebase
f486dcd [zhangjiajin] change maxLocalProjDBSize and fix a bug (remove -3 from frequent items).
60a0b76 [zhangjiajin] fixed a scala style error.
740c203 [zhangjiajin] fixed a scala style error.
5785cb8 [zhangjiajin] support non-temporal sequence
a5d649d [zhangjiajin] restore original version
09dc409 [zhangjiajin] Merge branch 'master' of https://github.com/apache/spark into multiItems_2
ae8c02d [zhangjiajin] Fixed some Scala style errors.
216ab0c [zhangjiajin] Support non-temporal sequence in PrefixSpan
b572f54 [zhangjiajin] initialize file before rebase.
f06772f [zhangjiajin] fix a scala style error.
a7e50d4 [zhangjiajin] Add feature: Collect enough frequent prefixes before projection in PrefixSpan.
c1d13d0 [zhang jiajin] Delete PrefixspanSuite.scala
d9d8137 [zhang jiajin] Delete Prefixspan.scala
c6ceb63 [zhangjiajin] Add new algorithm PrefixSpan and test file.
---
 .../spark/mllib/fpm/LocalPrefixSpan.scala     |  46 ++--
 .../apache/spark/mllib/fpm/PrefixSpan.scala   | 111 +++++---
 .../spark/mllib/fpm/PrefixSpanSuite.scala     | 237 +++++++++++++++---
 3 files changed, 302 insertions(+), 92 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
index 0ea792081086d..ccebf951c850d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
@@ -25,7 +25,7 @@ import org.apache.spark.Logging
  * Calculate all patterns of a projected database in local.
  */
 private[fpm] object LocalPrefixSpan extends Logging with Serializable {
-
+  import PrefixSpan._
   /**
    * Calculate all patterns of a projected database.
    * @param minCount minimum count
@@ -39,12 +39,19 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
   def run(
       minCount: Long,
       maxPatternLength: Int,
-      prefixes: List[Int],
-      database: Iterable[Array[Int]]): Iterator[(List[Int], Long)] = {
-    if (prefixes.length == maxPatternLength || database.isEmpty) return Iterator.empty
-    val frequentItemAndCounts = getFreqItemAndCounts(minCount, database)
-    val filteredDatabase = database.map(x => x.filter(frequentItemAndCounts.contains))
-    frequentItemAndCounts.iterator.flatMap { case (item, count) =>
+      prefixes: List[Set[Int]],
+      database: Iterable[List[Set[Int]]]): Iterator[(List[Set[Int]], Long)] = {
+    if (prefixes.length == maxPatternLength || database.isEmpty) {
+      return Iterator.empty
+    }
+    val freqItemSetsAndCounts = getFreqItemAndCounts(minCount, database)
+    val freqItems = freqItemSetsAndCounts.keys.flatten.toSet
+    val filteredDatabase = database.map { suffix =>
+      suffix
+        .map(item => freqItems.intersect(item))
+        .filter(_.nonEmpty)
+    }
+    freqItemSetsAndCounts.iterator.flatMap { case (item, count) =>
       val newPrefixes = item :: prefixes
       val newProjected = project(filteredDatabase, item)
       Iterator.single((newPrefixes, count)) ++
@@ -54,20 +61,23 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
 
   /**
    * Calculate suffix sequence immediately after the first occurrence of an item.
-   * @param item item to get suffix after
+   * @param item itemset to get suffix after
    * @param sequence sequence to extract suffix from
    * @return suffix sequence
    */
-  def getSuffix(item: Int, sequence: Array[Int]): Array[Int] = {
-    val index = sequence.indexOf(item)
+  def getSuffix(item: Set[Int], sequence: List[Set[Int]]): List[Set[Int]] = {
+    val itemsetSeq = sequence
+    val index = itemsetSeq.indexWhere(item.subsetOf(_))
     if (index == -1) {
-      Array()
+      List()
     } else {
-      sequence.drop(index + 1)
+      itemsetSeq.drop(index + 1)
     }
   }
 
-  def project(database: Iterable[Array[Int]], prefix: Int): Iterable[Array[Int]] = {
+  def project(
+      database: Iterable[List[Set[Int]]],
+      prefix: Set[Int]): Iterable[List[Set[Int]]] = {
     database
       .map(getSuffix(prefix, _))
       .filter(_.nonEmpty)
@@ -81,14 +91,16 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
    */
   private def getFreqItemAndCounts(
       minCount: Long,
-      database: Iterable[Array[Int]]): mutable.Map[Int, Long] = {
+      database: Iterable[List[Set[Int]]]): Map[Set[Int], Long] = {
     // TODO: use PrimitiveKeyOpenHashMap
-    val counts = mutable.Map[Int, Long]().withDefaultValue(0L)
+    val counts = mutable.Map[Set[Int], Long]().withDefaultValue(0L)
     database.foreach { sequence =>
-      sequence.distinct.foreach { item =>
+      sequence.flatMap(nonemptySubsets(_)).distinct.foreach { item =>
         counts(item) += 1L
       }
     }
-    counts.filter(_._2 >= minCount)
+    counts
+      .filter { case (_, count) => count >= minCount }
+      .toMap
   }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
index e6752332cdeeb..22b4ddb8b3495 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.fpm
 
-import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.ArrayBuilder
 
 import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
@@ -44,13 +44,14 @@ import org.apache.spark.storage.StorageLevel
 class PrefixSpan private (
     private var minSupport: Double,
     private var maxPatternLength: Int) extends Logging with Serializable {
+  import PrefixSpan._
 
   /**
    * The maximum number of items allowed in a projected database before local processing. If a
    * projected database exceeds this size, another iteration of distributed PrefixSpan is run.
    */
-  // TODO: make configurable with a better default value, 10000 may be too small
-  private val maxLocalProjDBSize: Long = 10000
+  // TODO: make configurable with a better default value
+  private val maxLocalProjDBSize: Long = 32000000L
 
   /**
    * Constructs a default instance with default parameters
@@ -90,35 +91,41 @@ class PrefixSpan private (
 
   /**
    * Find the complete set of sequential patterns in the input sequences.
-   * @param sequences input data set, contains a set of sequences,
-   *                  a sequence is an ordered list of elements.
+   * @param data ordered sequences of itemsets. Items are represented by non-negative integers.
+   *                  Each itemset has one or more items and is delimited by [[DELIMITER]].
    * @return a set of sequential pattern pairs,
    *         the key of pair is pattern (a list of elements),
    *         the value of pair is the pattern's count.
    */
-  def run(sequences: RDD[Array[Int]]): RDD[(Array[Int], Long)] = {
-    val sc = sequences.sparkContext
+  // TODO: generalize to arbitrary item-types and use mapping to Ints for internal algorithm
+  def run(data: RDD[Array[Int]]): RDD[(Array[Int], Long)] = {
+    val sc = data.sparkContext
 
-    if (sequences.getStorageLevel == StorageLevel.NONE) {
+    if (data.getStorageLevel == StorageLevel.NONE) {
       logWarning("Input data is not cached.")
     }
 
+    // Use List[Set[Item]] for internal computation
+    val sequences = data.map { seq => splitSequence(seq.toList) }
+
     // Convert min support to a min number of transactions for this dataset
     val minCount = if (minSupport == 0) 0L else math.ceil(sequences.count() * minSupport).toLong
 
     // (Frequent items -> number of occurrences, all items here satisfy the `minSupport` threshold
     val freqItemCounts = sequences
-      .flatMap(seq => seq.distinct.map(item => (item, 1L)))
+      .flatMap(seq => seq.flatMap(nonemptySubsets(_)).distinct.map(item => (item, 1L)))
       .reduceByKey(_ + _)
-      .filter(_._2 >= minCount)
+      .filter { case (item, count) => (count >= minCount) }
       .collect()
+      .toMap
 
     // Pairs of (length 1 prefix, suffix consisting of frequent items)
     val itemSuffixPairs = {
-      val freqItems = freqItemCounts.map(_._1).toSet
+      val freqItemSets = freqItemCounts.keys.toSet
+      val freqItems = freqItemSets.flatten
       sequences.flatMap { seq =>
-        val filteredSeq = seq.filter(freqItems.contains(_))
-        freqItems.flatMap { item =>
+        val filteredSeq = seq.map(item => freqItems.intersect(item)).filter(_.nonEmpty)
+        freqItemSets.flatMap { item =>
           val candidateSuffix = LocalPrefixSpan.getSuffix(item, filteredSeq)
           candidateSuffix match {
             case suffix if !suffix.isEmpty => Some((List(item), suffix))
@@ -130,14 +137,15 @@ class PrefixSpan private (
 
     // Accumulator for the computed results to be returned, initialized to the frequent items (i.e.
     // frequent length-one prefixes)
-    var resultsAccumulator = freqItemCounts.map(x => (List(x._1), x._2))
+    var resultsAccumulator = freqItemCounts.map { case (item, count) => (List(item), count) }.toList
 
     // Remaining work to be locally and distributively processed respectfully
     var (pairsForLocal, pairsForDistributed) = partitionByProjDBSize(itemSuffixPairs)
 
     // Continue processing until no pairs for distributed processing remain (i.e. all prefixes have
-    // projected database sizes <= `maxLocalProjDBSize`)
-    while (pairsForDistributed.count() != 0) {
+    // projected database sizes <= `maxLocalProjDBSize`) or `maxPatternLength` is reached
+    var patternLength = 1
+    while (pairsForDistributed.count() != 0 && patternLength < maxPatternLength) {
       val (nextPatternAndCounts, nextPrefixSuffixPairs) =
         extendPrefixes(minCount, pairsForDistributed)
       pairsForDistributed.unpersist()
@@ -146,6 +154,7 @@ class PrefixSpan private (
       pairsForDistributed.persist(StorageLevel.MEMORY_AND_DISK)
       pairsForLocal ++= smallerPairsPart
       resultsAccumulator ++= nextPatternAndCounts.collect()
+      patternLength += 1 // pattern length grows one per iteration
     }
 
     // Process the small projected databases locally
@@ -153,7 +162,7 @@ class PrefixSpan private (
       minCount, sc.parallelize(pairsForLocal, 1).groupByKey())
 
     (sc.parallelize(resultsAccumulator, 1) ++ remainingResults)
-      .map { case (pattern, count) => (pattern.toArray, count) }
+      .map { case (pattern, count) => (flattenSequence(pattern.reverse).toArray, count) }
   }
 
 
@@ -163,8 +172,8 @@ class PrefixSpan private (
    * @return prefix-suffix pairs partitioned by whether their projected database size is <= or
    *         greater than [[maxLocalProjDBSize]]
    */
-  private def partitionByProjDBSize(prefixSuffixPairs: RDD[(List[Int], Array[Int])])
-    : (Array[(List[Int], Array[Int])], RDD[(List[Int], Array[Int])]) = {
+  private def partitionByProjDBSize(prefixSuffixPairs: RDD[(List[Set[Int]], List[Set[Int]])])
+    : (List[(List[Set[Int]], List[Set[Int]])], RDD[(List[Set[Int]], List[Set[Int]])]) = {
     val prefixToSuffixSize = prefixSuffixPairs
       .aggregateByKey(0)(
         seqOp = { case (count, suffix) => count + suffix.length },
@@ -176,12 +185,12 @@ class PrefixSpan private (
       .toSet
     val small = prefixSuffixPairs.filter { case (prefix, _) => smallPrefixes.contains(prefix) }
     val large = prefixSuffixPairs.filter { case (prefix, _) => !smallPrefixes.contains(prefix) }
-    (small.collect(), large)
+    (small.collect().toList, large)
   }
 
   /**
-   * Extends all prefixes by one item from their suffix and computes the resulting frequent prefixes
-   * and remaining work.
+   * Extends all prefixes by one itemset from their suffix and computes the resulting frequent
+   * prefixes and remaining work.
    * @param minCount minimum count
    * @param prefixSuffixPairs prefix (length N) and suffix pairs,
    * @return (frequent length N+1 extended prefix, count) pairs and (frequent length N+1 extended
@@ -189,15 +198,16 @@ class PrefixSpan private (
    */
   private def extendPrefixes(
       minCount: Long,
-      prefixSuffixPairs: RDD[(List[Int], Array[Int])])
-    : (RDD[(List[Int], Long)], RDD[(List[Int], Array[Int])]) = {
+      prefixSuffixPairs: RDD[(List[Set[Int]], List[Set[Int]])])
+    : (RDD[(List[Set[Int]], Long)], RDD[(List[Set[Int]], List[Set[Int]])]) = {
 
-    // (length N prefix, item from suffix) pairs and their corresponding number of occurrences
+    // (length N prefix, itemset from suffix) pairs and their corresponding number of occurrences
     // Every (prefix :+ suffix) is guaranteed to have support exceeding `minSupport`
     val prefixItemPairAndCounts = prefixSuffixPairs
-      .flatMap { case (prefix, suffix) => suffix.distinct.map(y => ((prefix, y), 1L)) }
+      .flatMap { case (prefix, suffix) =>
+      suffix.flatMap(nonemptySubsets(_)).distinct.map(y => ((prefix, y), 1L)) }
       .reduceByKey(_ + _)
-      .filter(_._2 >= minCount)
+      .filter { case (item, count) => (count >= minCount) }
 
     // Map from prefix to set of possible next items from suffix
     val prefixToNextItems = prefixItemPairAndCounts
@@ -207,7 +217,6 @@ class PrefixSpan private (
       .collect()
       .toMap
 
-
     // Frequent patterns with length N+1 and their corresponding counts
     val extendedPrefixAndCounts = prefixItemPairAndCounts
       .map { case ((prefix, item), count) => (item :: prefix, count) }
@@ -216,9 +225,12 @@ class PrefixSpan private (
     val extendedPrefixAndSuffix = prefixSuffixPairs
       .filter(x => prefixToNextItems.contains(x._1))
       .flatMap { case (prefix, suffix) =>
-        val frequentNextItems = prefixToNextItems(prefix)
-        val filteredSuffix = suffix.filter(frequentNextItems.contains(_))
-        frequentNextItems.flatMap { item =>
+        val frequentNextItemSets = prefixToNextItems(prefix)
+        val frequentNextItems = frequentNextItemSets.flatten
+        val filteredSuffix = suffix
+          .map(item => frequentNextItems.intersect(item))
+          .filter(_.nonEmpty)
+        frequentNextItemSets.flatMap { item =>
           LocalPrefixSpan.getSuffix(item, filteredSuffix) match {
             case suffix if !suffix.isEmpty => Some(item :: prefix, suffix)
             case _ => None
@@ -237,13 +249,38 @@ class PrefixSpan private (
    */
   private def getPatternsInLocal(
       minCount: Long,
-      data: RDD[(List[Int], Iterable[Array[Int]])]): RDD[(List[Int], Long)] = {
+      data: RDD[(List[Set[Int]], Iterable[List[Set[Int]]])]): RDD[(List[Set[Int]], Long)] = {
     data.flatMap {
-      case (prefix, projDB) =>
-        LocalPrefixSpan.run(minCount, maxPatternLength, prefix.toList.reverse, projDB)
-          .map { case (pattern: List[Int], count: Long) =>
-          (pattern.reverse, count)
-        }
+      case (prefix, projDB) => LocalPrefixSpan.run(minCount, maxPatternLength, prefix, projDB)
+    }
+  }
+
+}
+
+private[fpm] object PrefixSpan {
+  private[fpm] val DELIMITER = -1
+
+  /** Splits a sequence of itemsets delimited by [[DELIMITER]]. */
+  private[fpm] def splitSequence(sequence: List[Int]): List[Set[Int]] = {
+    sequence.span(_ != DELIMITER) match {
+      case (x, xs) if xs.length > 1 => x.toSet :: splitSequence(xs.tail)
+      case (x, xs) => List(x.toSet)
+    }
+  }
+
+  /** Flattens a sequence of itemsets into an Array, inserting[[DELIMITER]] between itemsets. */
+  private[fpm] def flattenSequence(sequence: List[Set[Int]]): List[Int] = {
+    val builder = ArrayBuilder.make[Int]()
+    for (itemSet <- sequence) {
+      builder += DELIMITER
+      builder ++= itemSet.toSeq.sorted
     }
+    builder.result().toList.drop(1) // drop trailing delimiter
+  }
+
+  /** Returns an iterator over all non-empty subsets of `itemSet` */
+  private[fpm] def nonemptySubsets(itemSet: Set[Int]): Iterator[Set[Int]] = {
+    // TODO: improve complexity by using partial prefixes, considering one item at a time
+    itemSet.subsets.filter(_ != Set.empty[Int])
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
index 6dd2dc926acc5..457f32670fd4e 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
@@ -21,7 +21,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
 
 class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
 
-  test("PrefixSpan using Integer type") {
+  test("PrefixSpan using Integer type, singleton itemsets") {
 
     /*
       library("arulesSequences")
@@ -35,12 +35,12 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
     */
 
     val sequences = Array(
-      Array(1, 3, 4, 5),
-      Array(2, 3, 1),
-      Array(2, 4, 1),
-      Array(3, 1, 3, 4, 5),
-      Array(3, 4, 4, 3),
-      Array(6, 5, 3))
+      Array(1, -1, 3, -1, 4, -1, 5),
+      Array(2, -1, 3, -1, 1),
+      Array(2, -1, 4, -1, 1),
+      Array(3, -1, 1, -1, 3, -1, 4, -1, 5),
+      Array(3, -1, 4, -1, 4, -1, 3),
+      Array(6, -1, 5, -1, 3))
 
     val rdd = sc.parallelize(sequences, 2).cache()
 
@@ -50,64 +50,225 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
     val result1 = prefixspan.run(rdd)
     val expectedValue1 = Array(
       (Array(1), 4L),
-      (Array(1, 3), 2L),
-      (Array(1, 3, 4), 2L),
-      (Array(1, 3, 4, 5), 2L),
-      (Array(1, 3, 5), 2L),
-      (Array(1, 4), 2L),
-      (Array(1, 4, 5), 2L),
-      (Array(1, 5), 2L),
+      (Array(1, -1, 3), 2L),
+      (Array(1, -1, 3, -1, 4), 2L),
+      (Array(1, -1, 3, -1, 4, -1, 5), 2L),
+      (Array(1, -1, 3, -1, 5), 2L),
+      (Array(1, -1, 4), 2L),
+      (Array(1, -1, 4, -1, 5), 2L),
+      (Array(1, -1, 5), 2L),
       (Array(2), 2L),
-      (Array(2, 1), 2L),
+      (Array(2, -1, 1), 2L),
       (Array(3), 5L),
-      (Array(3, 1), 2L),
-      (Array(3, 3), 2L),
-      (Array(3, 4), 3L),
-      (Array(3, 4, 5), 2L),
-      (Array(3, 5), 2L),
+      (Array(3, -1, 1), 2L),
+      (Array(3, -1, 3), 2L),
+      (Array(3, -1, 4), 3L),
+      (Array(3, -1, 4, -1, 5), 2L),
+      (Array(3, -1, 5), 2L),
       (Array(4), 4L),
-      (Array(4, 5), 2L),
+      (Array(4, -1, 5), 2L),
       (Array(5), 3L)
     )
-    assert(compareResults(expectedValue1, result1.collect()))
+    compareResults(expectedValue1, result1.collect())
 
     prefixspan.setMinSupport(0.5).setMaxPatternLength(50)
     val result2 = prefixspan.run(rdd)
     val expectedValue2 = Array(
       (Array(1), 4L),
       (Array(3), 5L),
-      (Array(3, 4), 3L),
+      (Array(3, -1, 4), 3L),
       (Array(4), 4L),
       (Array(5), 3L)
     )
-    assert(compareResults(expectedValue2, result2.collect()))
+    compareResults(expectedValue2, result2.collect())
 
     prefixspan.setMinSupport(0.33).setMaxPatternLength(2)
     val result3 = prefixspan.run(rdd)
     val expectedValue3 = Array(
       (Array(1), 4L),
-      (Array(1, 3), 2L),
-      (Array(1, 4), 2L),
-      (Array(1, 5), 2L),
-      (Array(2, 1), 2L),
+      (Array(1, -1, 3), 2L),
+      (Array(1, -1, 4), 2L),
+      (Array(1, -1, 5), 2L),
+      (Array(2, -1, 1), 2L),
       (Array(2), 2L),
       (Array(3), 5L),
-      (Array(3, 1), 2L),
-      (Array(3, 3), 2L),
-      (Array(3, 4), 3L),
-      (Array(3, 5), 2L),
+      (Array(3, -1, 1), 2L),
+      (Array(3, -1, 3), 2L),
+      (Array(3, -1, 4), 3L),
+      (Array(3, -1, 5), 2L),
       (Array(4), 4L),
-      (Array(4, 5), 2L),
+      (Array(4, -1, 5), 2L),
       (Array(5), 3L)
     )
-    assert(compareResults(expectedValue3, result3.collect()))
+    compareResults(expectedValue3, result3.collect())
+  }
+
+  test("PrefixSpan using Integer type, variable-size itemsets") {
+    val sequences = Array(
+      Array(1, -1, 1, 2, 3, -1, 1, 3, -1, 4, -1, 3, 6),
+      Array(1, 4, -1, 3, -1, 2, 3, -1, 1, 5),
+      Array(5, 6, -1, 1, 2, -1, 4, 6, -1, 3, -1, 2),
+      Array(5, -1, 7, -1, 1, 6, -1, 3, -1, 2, -1, 3))
+    val rdd = sc.parallelize(sequences, 2).cache()
+    val prefixspan = new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5)
+    val result = prefixspan.run(rdd)
+
+    /*
+      To verify results, create file "prefixSpanSeqs" with content
+      (format = (transactionID, idxInTransaction, numItemsinItemset, itemset)):
+        1 1 1 1
+        1 2 3 1 2 3
+        1 3 2 1 3
+        1 4 1 4
+        1 5 2 3 6
+        2 1 2 1 4
+        2 2 1 3
+        2 3 2 2 3
+        2 4 2 1 5
+        3 1 2 5 6
+        3 2 2 1 2
+        3 3 2 4 6
+        3 4 1 3
+        3 5 1 2
+        4 1 1 5
+        4 2 1 7
+        4 3 2 1 6
+        4 4 1 3
+        4 5 1 2
+        4 6 1 3
+      In R, run:
+        library("arulesSequences")
+        prefixSpanSeqs = read_baskets("prefixSpanSeqs", info = c("sequenceID","eventID","SIZE"))
+        freqItemSeq = cspade(prefixSpanSeqs,
+                             parameter = list(support = 0.5, maxlen = 5 ))
+        resSeq = as(freqItemSeq, "data.frame")
+        resSeq
+
+                    sequence support
+        1              <{1}>    1.00
+        2              <{2}>    1.00
+        3              <{3}>    1.00
+        4              <{4}>    0.75
+        5              <{5}>    0.75
+        6              <{6}>    0.75
+        7          <{1},{6}>    0.50
+        8          <{2},{6}>    0.50
+        9          <{5},{6}>    0.50
+        10       <{1,2},{6}>    0.50
+        11         <{1},{4}>    0.50
+        12         <{2},{4}>    0.50
+        13       <{1,2},{4}>    0.50
+        14         <{1},{3}>    1.00
+        15         <{2},{3}>    0.75
+        16           <{2,3}>    0.50
+        17         <{3},{3}>    0.75
+        18         <{4},{3}>    0.75
+        19         <{5},{3}>    0.50
+        20         <{6},{3}>    0.50
+        21     <{5},{6},{3}>    0.50
+        22     <{6},{2},{3}>    0.50
+        23     <{5},{2},{3}>    0.50
+        24     <{5},{1},{3}>    0.50
+        25     <{2},{4},{3}>    0.50
+        26     <{1},{4},{3}>    0.50
+        27   <{1,2},{4},{3}>    0.50
+        28     <{1},{3},{3}>    0.75
+        29       <{1,2},{3}>    0.50
+        30     <{1},{2},{3}>    0.50
+        31       <{1},{2,3}>    0.50
+        32         <{1},{2}>    1.00
+        33           <{1,2}>    0.50
+        34         <{3},{2}>    0.75
+        35         <{4},{2}>    0.50
+        36         <{5},{2}>    0.50
+        37         <{6},{2}>    0.50
+        38     <{5},{6},{2}>    0.50
+        39     <{6},{3},{2}>    0.50
+        40     <{5},{3},{2}>    0.50
+        41     <{5},{1},{2}>    0.50
+        42     <{4},{3},{2}>    0.50
+        43     <{1},{3},{2}>    0.75
+        44 <{5},{6},{3},{2}>    0.50
+        45 <{5},{1},{3},{2}>    0.50
+        46         <{1},{1}>    0.50
+        47         <{2},{1}>    0.50
+        48         <{3},{1}>    0.50
+        49         <{5},{1}>    0.50
+        50       <{2,3},{1}>    0.50
+        51     <{1},{3},{1}>    0.50
+        52   <{1},{2,3},{1}>    0.50
+        53     <{1},{2},{1}>    0.50
+     */
+    val expectedValue = Array(
+      (Array(1), 4L),
+      (Array(2), 4L),
+      (Array(3), 4L),
+      (Array(4), 3L),
+      (Array(5), 3L),
+      (Array(6), 3L),
+      (Array(1, -1, 6), 2L),
+      (Array(2, -1, 6), 2L),
+      (Array(5, -1, 6), 2L),
+      (Array(1, 2, -1, 6), 2L),
+      (Array(1, -1, 4), 2L),
+      (Array(2, -1, 4), 2L),
+      (Array(1, 2, -1, 4), 2L),
+      (Array(1, -1, 3), 4L),
+      (Array(2, -1, 3), 3L),
+      (Array(2, 3), 2L),
+      (Array(3, -1, 3), 3L),
+      (Array(4, -1, 3), 3L),
+      (Array(5, -1, 3), 2L),
+      (Array(6, -1, 3), 2L),
+      (Array(5, -1, 6, -1, 3), 2L),
+      (Array(6, -1, 2, -1, 3), 2L),
+      (Array(5, -1, 2, -1, 3), 2L),
+      (Array(5, -1, 1, -1, 3), 2L),
+      (Array(2, -1, 4, -1, 3), 2L),
+      (Array(1, -1, 4, -1, 3), 2L),
+      (Array(1, 2, -1, 4, -1, 3), 2L),
+      (Array(1, -1, 3, -1, 3), 3L),
+      (Array(1, 2, -1, 3), 2L),
+      (Array(1, -1, 2, -1, 3), 2L),
+      (Array(1, -1, 2, 3), 2L),
+      (Array(1, -1, 2), 4L),
+      (Array(1, 2), 2L),
+      (Array(3, -1, 2), 3L),
+      (Array(4, -1, 2), 2L),
+      (Array(5, -1, 2), 2L),
+      (Array(6, -1, 2), 2L),
+      (Array(5, -1, 6, -1, 2), 2L),
+      (Array(6, -1, 3, -1, 2), 2L),
+      (Array(5, -1, 3, -1, 2), 2L),
+      (Array(5, -1, 1, -1, 2), 2L),
+      (Array(4, -1, 3, -1, 2), 2L),
+      (Array(1, -1, 3, -1, 2), 3L),
+      (Array(5, -1, 6, -1, 3, -1, 2), 2L),
+      (Array(5, -1, 1, -1, 3, -1, 2), 2L),
+      (Array(1, -1, 1), 2L),
+      (Array(2, -1, 1), 2L),
+      (Array(3, -1, 1), 2L),
+      (Array(5, -1, 1), 2L),
+      (Array(2, 3, -1, 1), 2L),
+      (Array(1, -1, 3, -1, 1), 2L),
+      (Array(1, -1, 2, 3, -1, 1), 2L),
+      (Array(1, -1, 2, -1, 1), 2L))
+
+    compareResults(expectedValue, result.collect())
   }
 
   private def compareResults(
-    expectedValue: Array[(Array[Int], Long)],
-    actualValue: Array[(Array[Int], Long)]): Boolean = {
-    expectedValue.map(x => (x._1.toSeq, x._2)).toSet ==
-      actualValue.map(x => (x._1.toSeq, x._2)).toSet
+      expectedValue: Array[(Array[Int], Long)],
+      actualValue: Array[(Array[Int], Long)]): Unit = {
+    val expectedSet = expectedValue.map(x => (x._1.toSeq, x._2)).toSet
+    val actualSet = actualValue.map(x => (x._1.toSeq, x._2)).toSet
+    assert(expectedSet === actualSet)
+  }
+
+  private def insertDelimiter(sequence: Array[Int]): Array[Int] = {
+    sequence.zip(Seq.fill(sequence.length)(PrefixSpan.DELIMITER)).map { case (a, b) =>
+      List(a, b)
+    }.flatten
   }
 
 }

From 8765665015ef47a23e00f7d01d4d280c31bb236d Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Sat, 1 Aug 2015 02:31:28 -0700
Subject: [PATCH 549/549] [SPARK-8169] [ML] Add StopWordsRemover as a
 transformer

jira: https://issues.apache.org/jira/browse/SPARK-8169

stop words: http://en.wikipedia.org/wiki/Stop_words

StopWordsRemover takes a string array column and outputs a string array column with all defined stop words removed. The transformer should also come with a standard set of stop words as default.

Currently I used a minimum stop words set since on some [case](http://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html), small set of stop words is preferred.
ASCII char has been tested, Yet I cannot check it in due to style check.

Further thought,
1. Maybe I should use OpenHashSet. Is it recommended?
2. Currently I leave the null in input array untouched, i.e. Array(null, null) => Array(null, null).
3. If the current stop words set looks too limited, any suggestion for replacement? We can have something similar to the one in [SKlearn](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/stop_words.py).

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #6742 from hhbyyh/stopwords and squashes the following commits:

fa959d8 [Yuhao Yang] separating udf
f190217 [Yuhao Yang] replace default list and other small fix
04403ab [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into stopwords
b3aa957 [Yuhao Yang] add stopWordsRemover
---
 .../spark/ml/feature/StopWordsRemover.scala   | 155 ++++++++++++++++++
 .../ml/feature/StopWordsRemoverSuite.scala    |  80 +++++++++
 2 files changed, 235 insertions(+)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
 create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
new file mode 100644
index 0000000000000..3cc41424460f2
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
+import org.apache.spark.ml.param.{ParamMap, BooleanParam, Param}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.types.{StringType, StructField, ArrayType, StructType}
+import org.apache.spark.sql.functions.{col, udf}
+
+/**
+ * stop words list
+ */
+private object StopWords {
+
+  /**
+   * Use the same default stopwords list as scikit-learn.
+   * The original list can be found from "Glasgow Information Retrieval Group"
+   * [[http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words]]
+   */
+  val EnglishStopWords = Array( "a", "about", "above", "across", "after", "afterwards", "again",
+    "against", "all", "almost", "alone", "along", "already", "also", "although", "always",
+    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
+    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
+    "around", "as", "at", "back", "be", "became", "because", "become",
+    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
+    "below", "beside", "besides", "between", "beyond", "bill", "both",
+    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
+    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
+    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
+    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
+    "everything", "everywhere", "except", "few", "fifteen", "fify", "fill",
+    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
+    "found", "four", "from", "front", "full", "further", "get", "give", "go",
+    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
+    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
+    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
+    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
+    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
+    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
+    "move", "much", "must", "my", "myself", "name", "namely", "neither",
+    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
+    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
+    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
+    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
+    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
+    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
+    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
+    "something", "sometime", "sometimes", "somewhere", "still", "such",
+    "system", "take", "ten", "than", "that", "the", "their", "them",
+    "themselves", "then", "thence", "there", "thereafter", "thereby",
+    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
+    "third", "this", "those", "though", "three", "through", "throughout",
+    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
+    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
+    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
+    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
+    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
+    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
+    "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves")
+}
+
+/**
+ * :: Experimental ::
+ * A feature transformer that filters out stop words from input.
+ * Note: null values from input array are preserved unless adding null to stopWords explicitly.
+ * @see [[http://en.wikipedia.org/wiki/Stop_words]]
+ */
+@Experimental
+class StopWordsRemover(override val uid: String)
+  extends Transformer with HasInputCol with HasOutputCol {
+
+  def this() = this(Identifiable.randomUID("stopWords"))
+
+  /** @group setParam */
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  /**
+   * the stop words set to be filtered out
+   * @group param
+   */
+  val stopWords: Param[Array[String]] = new Param(this, "stopWords", "stop words")
+
+  /** @group setParam */
+  def setStopWords(value: Array[String]): this.type = set(stopWords, value)
+
+  /** @group getParam */
+  def getStopWords: Array[String] = $(stopWords)
+
+  /**
+   * whether to do a case sensitive comparison over the stop words
+   * @group param
+   */
+  val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive",
+    "whether to do case-sensitive comparison during filtering")
+
+  /** @group setParam */
+  def setCaseSensitive(value: Boolean): this.type = set(caseSensitive, value)
+
+  /** @group getParam */
+  def getCaseSensitive: Boolean = $(caseSensitive)
+
+  setDefault(stopWords -> StopWords.EnglishStopWords, caseSensitive -> false)
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    val outputSchema = transformSchema(dataset.schema)
+    val t = if ($(caseSensitive)) {
+        val stopWordsSet = $(stopWords).toSet
+        udf { terms: Seq[String] =>
+          terms.filter(s => !stopWordsSet.contains(s))
+        }
+      } else {
+        val toLower = (s: String) => if (s != null) s.toLowerCase else s
+        val lowerStopWords = $(stopWords).map(toLower(_)).toSet
+        udf { terms: Seq[String] =>
+          terms.filter(s => !lowerStopWords.contains(toLower(s)))
+        }
+    }
+
+    val metadata = outputSchema($(outputCol)).metadata
+    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    val inputType = schema($(inputCol)).dataType
+    require(inputType.sameType(ArrayType(StringType)),
+      s"Input type must be ArrayType(StringType) but got $inputType.")
+    val outputFields = schema.fields :+
+      StructField($(outputCol), inputType, schema($(inputCol)).nullable)
+    StructType(outputFields)
+  }
+
+  override def copy(extra: ParamMap): StopWordsRemover = defaultCopy(extra)
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
new file mode 100644
index 0000000000000..f01306f89cb5f
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.sql.{DataFrame, Row}
+
+object StopWordsRemoverSuite extends SparkFunSuite {
+  def testStopWordsRemover(t: StopWordsRemover, dataset: DataFrame): Unit = {
+    t.transform(dataset)
+      .select("filtered", "expected")
+      .collect()
+      .foreach { case Row(tokens, wantedTokens) =>
+        assert(tokens === wantedTokens)
+    }
+  }
+}
+
+class StopWordsRemoverSuite extends SparkFunSuite with MLlibTestSparkContext {
+  import StopWordsRemoverSuite._
+
+  test("StopWordsRemover default") {
+    val remover = new StopWordsRemover()
+      .setInputCol("raw")
+      .setOutputCol("filtered")
+    val dataSet = sqlContext.createDataFrame(Seq(
+      (Seq("test", "test"), Seq("test", "test")),
+      (Seq("a", "b", "c", "d"), Seq("b", "c", "d")),
+      (Seq("a", "the", "an"), Seq()),
+      (Seq("A", "The", "AN"), Seq()),
+      (Seq(null), Seq(null)),
+      (Seq(), Seq())
+    )).toDF("raw", "expected")
+
+    testStopWordsRemover(remover, dataSet)
+  }
+
+  test("StopWordsRemover case sensitive") {
+    val remover = new StopWordsRemover()
+      .setInputCol("raw")
+      .setOutputCol("filtered")
+      .setCaseSensitive(true)
+    val dataSet = sqlContext.createDataFrame(Seq(
+      (Seq("A"), Seq("A")),
+      (Seq("The", "the"), Seq("The"))
+    )).toDF("raw", "expected")
+
+    testStopWordsRemover(remover, dataSet)
+  }
+
+  test("StopWordsRemover with additional words") {
+    val stopWords = StopWords.EnglishStopWords ++ Array("python", "scala")
+    val remover = new StopWordsRemover()
+      .setInputCol("raw")
+      .setOutputCol("filtered")
+      .setStopWords(stopWords)
+    val dataSet = sqlContext.createDataFrame(Seq(
+      (Seq("python", "scala", "a"), Seq()),
+      (Seq("Python", "Scala", "swift"), Seq("swift"))
+    )).toDF("raw", "expected")
+
+    testStopWordsRemover(remover, dataSet)
+  }
+}